check_teamredminer/check_teamredminer.py

340 lines
9.6 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
import socket
import json
import logging
import argparse
import nagiosplugin
import sys
from nagiosplugin import (
Check,
Context,
Metric,
Performance,
Resource,
ScalarContext,
Summary,
)
from nagiosplugin.state import Critical, Ok, Unknown, Warn
logger = logging.getLogger(__name__)
class ApiError(Exception):
    """Error raised when the miner API answers with a W, E or F status."""
class TeamRedMinerApi:
    """Small client for the JSON-over-TCP API exposed by TeamRedMiner.

    Every request opens a fresh connection, sends a single JSON command and
    reads the reply until the peer closes the connection.
    """

    # Keys of a reply that carry metadata rather than the requested data.
    _META_KEYS = ("STATUS", "id")

    def __init__(self, host, port, timeout):
        """Store the connection settings; no connection is opened here.

        Args:
            host: Address of the miner API.
            port: TCP port of the miner API.
            timeout: Socket timeout in seconds (connect, send and receive).
        """
        self.host = host
        self.port = port
        self.timeout = timeout

    def request(self, command):
        """Send ``command`` to the miner and return the data section of the reply.

        Args:
            command: API command name, e.g. ``"summary"`` or ``"devs"``.

        Returns:
            The value of the first non-metadata key of the JSON reply.

        Raises:
            ApiError: If the reply carries a W, E or F status, or has no data
                section.
            OSError: On connection failures or timeouts.
            ValueError: If the reply is not valid UTF-8 encoded JSON.
        """
        payload = json.dumps({"command": command}).encode()
        # create_connection resolves host names and is not limited to IPv4.
        with socket.create_connection(
            (self.host, self.port), timeout=self.timeout
        ) as client:
            client.sendall(payload)
            chunks = []
            while True:
                data = client.recv(4096)
                if not data:
                    break
                chunks.append(data)
        # Decode once over the whole reply: decoding chunk by chunk fails when
        # a multi-byte character straddles a recv() boundary.
        response = self._decode(b"".join(chunks))
        self.raise_for_status(response)
        logger.debug(response)
        return self._extract_payload(response)

    @staticmethod
    def _decode(raw):
        """Turn the raw reply bytes into a Python object.

        Trailing NUL bytes are dropped before parsing; some cgminer-style
        APIs terminate their reply with one (NOTE(review): confirm for
        TeamRedMiner), and ``json.loads`` rejects it as extra data.
        """
        return json.loads(raw.rstrip(b"\x00").decode("utf-8"))

    @classmethod
    def _extract_payload(cls, response):
        """Return the first value of ``response`` that is not metadata.

        Raises:
            ApiError: If the reply only contains metadata keys.
        """
        for key, value in response.items():
            if key not in cls._META_KEYS:
                return value
        raise ApiError("API error: response contains no data section")

    @staticmethod
    def raise_for_status(response):
        """Raise ``ApiError`` if any status entry of ``response`` is W, E or F."""
        for entry in response["STATUS"]:
            status = entry["STATUS"]
            code = entry["Code"]
            message = entry["Msg"]
            if status in ("W", "E", "F"):
                raise ApiError(f"API error: {message} (code {code})")
class TeamRedMiner(Resource):
    """nagiosplugin resource that collects metrics from a TeamRedMiner API.

    Produces ``hashrate`` and ``uptime`` metrics from the ``summary`` command
    and, for every GPU listed by ``devs``, ``alive_<id>``,
    ``temperature_<id>`` and ``memory_temperature_<id>`` metrics. Each metric
    is only emitted when the corresponding field is present in the reply.
    """

    def __init__(self, host, port, timeout):
        """Remember how to reach the miner API (host, port, timeout in seconds)."""
        self.host = host
        self.port = port
        self.timeout = timeout

    def probe(self):
        """Query the miner and return the list of collected ``Metric`` objects."""
        client = TeamRedMinerApi(host=self.host, port=self.port, timeout=self.timeout)
        metrics = self._summary_metrics(client.request("summary")[0])
        for device in client.request("devs"):
            metrics.extend(self._device_metrics(device))
        return metrics

    @staticmethod
    def _summary_metrics(summary):
        """Build the hashrate and uptime metrics from one summary entry."""
        metrics = []
        if "MHS 30s" in summary:
            hashrate = summary["MHS 30s"]
            logger.info("Hashrate is %s MH/s", hashrate)
            metrics.append(Metric("hashrate", hashrate, uom="MH/s", context="hashrate"))
        if "Elapsed" in summary:
            uptime = summary["Elapsed"]
            # Only exactly one second is singular; 0 must read "0 seconds".
            seconds = "second" if uptime == 1 else "seconds"
            logger.info("Uptime is %s %s", uptime, seconds)
            metrics.append(Metric("uptime", uptime, uom="s", context="uptime"))
        return metrics

    @staticmethod
    def _device_metrics(device):
        """Build the liveness and temperature metrics for one device entry.

        Entries without a ``"GPU"`` field are skipped: without an id the
        metrics could not be named, and reusing the id of a previous device
        would attribute the values to the wrong GPU.
        """
        if "GPU" not in device:
            logger.debug("Skipping device entry without GPU id: %s", device)
            return []
        gpu_id = device["GPU"]
        metrics = []
        if "Status" in device:
            alive = device["Status"] == "Alive"
            if alive:
                logger.info("GPU %s is alive", gpu_id)
            else:
                logger.info("GPU %s is dead!", gpu_id)
            metrics.append(Metric(f"alive_{gpu_id}", alive, context="alive"))
        # Both temperature readings are reported under the "temperature"
        # context; they are told apart by their metric name prefix.
        for field, prefix, label in (
            ("Temperature", "temperature", "temperature"),
            ("TemperatureMem", "memory_temperature", "memory temperature"),
        ):
            if field in device:
                value = device[field]
                logger.info("GPU %s: %s is %sC", gpu_id, label, value)
                metrics.append(
                    Metric(
                        f"{prefix}_{gpu_id}",
                        value,
                        uom="C",
                        context="temperature",
                    )
                )
        return metrics
class BelowThresholdContext(Context):
    """Context that alerts when a metric is at or below a threshold.

    Args:
        name: Context name that metrics refer to.
        warning: Value at or below which the result is a warning; ``None``
            disables the warning check.
        critical: Value at or below which the result is critical; ``None``
            disables the critical check.
    """

    def __init__(self, name, warning=None, critical=None):
        super().__init__(name)
        self.warning = warning
        self.critical = critical

    def evaluate(self, metric, resource):
        """Return a Critical, Warn or Ok result for ``metric``.

        The critical threshold is checked first. Thresholds are compared
        against ``None`` rather than by truthiness so that a threshold of 0
        is honoured.
        """
        # An absent unit must not show up as the text "None" in the hint.
        unit = metric.uom or ""
        value = metric.value
        if self.critical is not None and value <= self.critical:
            return self.result_cls(
                Critical, f"{value}<={self.critical}{unit}", metric
            )
        if self.warning is not None and value <= self.warning:
            return self.result_cls(Warn, f"{value}<={self.warning}{unit}", metric)
        return self.result_cls(Ok, None, metric)

    def performance(self, metric, resource):
        """Return the performance data for ``metric`` with both thresholds."""
        return Performance(
            metric.name,
            metric.value,
            metric.uom,
            self.warning,
            self.critical,
            metric.min,
            metric.max,
        )
class BooleanContext(Context):
    """Context that compares a metric value with an expected object by identity.

    ``critical`` takes precedence over ``warning``; when neither is set a
    mismatch is still reported as Ok, but with an explanatory hint.
    """

    def __init__(self, name, expected=True, warning=False, critical=False):
        super().__init__(name)
        self.expected = expected
        self.warning = warning
        self.critical = critical

    def evaluate(self, metric, resource):
        """Return Ok on a match, otherwise the configured severity with a hint."""
        if metric.value is self.expected:
            return self.result_cls(Ok, None, metric)

        if self.critical:
            severity = Critical
        elif self.warning:
            severity = Warn
        else:
            severity = Ok
        hint = f"{metric.name} is not {self.expected}"
        return self.result_cls(severity, hint, metric)
class TeamRedMinerSummary(Summary):
    """Summary that lists every non-ok result on the problem line."""

    def problem(self, results):
        """Return ``"<metric> <state>: <hint>"`` for each non-ok result, comma-separated."""
        failing = (entry for entry in results if str(entry.state) != "ok")
        return ", ".join(
            f"{entry.metric.name} {entry.state}: {entry.hint}" for entry in failing
        )
def parse_arguments(argv=None):
    """Parse the plugin's command-line options.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads the process arguments.

    Returns:
        The ``argparse.Namespace`` holding ``loglevel``, ``show_version``,
        ``host``, ``port``, ``timeout`` and the threshold options.
    """
    parser = argparse.ArgumentParser()
    # -v and -d write to the same destination. argparse takes the default of
    # the first action registered for a destination, so the default has to be
    # present on both to reliably end up as WARNING.
    parser.add_argument(
        "-v",
        "--verbose",
        dest="loglevel",
        action="store_const",
        const=logging.INFO,
        default=logging.WARNING,
        help="Print more output",
    )
    parser.add_argument(
        "-d",
        "--debug",
        dest="loglevel",
        action="store_const",
        const=logging.DEBUG,
        default=logging.WARNING,
        help="Print even more output",
    )
    parser.add_argument(
        "--version",
        dest="show_version",
        action="store_true",
        help="Print version and exit",
    )
    parser.add_argument(
        "--host",
        dest="host",
        type=str,
        help="Host address of TeamRedMiner API",
        default="127.0.0.1",
    )
    parser.add_argument(
        "--port",
        dest="port",
        type=int,
        help="Port of TeamRedMiner API",
        default=4028,
    )
    parser.add_argument(
        "--timeout",
        dest="timeout",
        type=int,
        help="Timeout, in seconds, when requesting TeamRedMiner API",
        default=1,
    )
    parser.add_argument(
        "--hashrate-warning",
        dest="hashrate_warning",
        type=int,
        help="Raise warning if hashrate goes below this threshold",
    )
    parser.add_argument(
        "--hashrate-critical",
        dest="hashrate_critical",
        type=int,
        help="Raise critical if hashrate goes below this threshold",
    )
    parser.add_argument(
        "--uptime-warning",
        dest="uptime_warning",
        type=int,
        help="Raise warning if uptime goes below this threshold",
    )
    parser.add_argument(
        "--uptime-critical",
        dest="uptime_critical",
        type=int,
        help="Raise critical if uptime goes below this threshold",
    )
    parser.add_argument(
        "--temperature-warning",
        dest="temperature_warning",
        type=int,
        help="Raise warning if temperature goes over this threshold",
        default=70,
    )
    parser.add_argument(
        "--temperature-critical",
        dest="temperature_critical",
        type=int,
        help="Raise critical if temperature goes over this threshold",
        default=90,
    )
    parser.add_argument(
        "--memory-temperature-warning",
        dest="memory_temperature_warning",
        type=int,
        help="Raise warning if memory temperature goes over this threshold",
        default=90,
    )
    parser.add_argument(
        "--memory-temperature-critical",
        dest="memory_temperature_critical",
        type=int,
        help="Raise critical if memory temperature goes over this threshold",
        default=110,
    )
    return parser.parse_args(argv)
def setup_logging(args):
    """Configure root logging with the level stored in ``args.loglevel``."""
    log_format = "%(levelname)s: %(message)s"
    logging.basicConfig(level=args.loglevel, format=log_format)
def show_version():
    """Print the plugin version to standard output."""
    version = "1.0.0"
    print(version)
class _TemperatureContext(ScalarContext):
    """Scalar context with separate thresholds for core and memory temperature.

    The resource reports both readings under the single ``temperature``
    context and distinguishes them by metric name
    (``temperature_<id>`` / ``memory_temperature_<id>``). Metrics whose name
    starts with ``memory_temperature_`` are judged against the memory
    thresholds, all others against the regular ones.
    """

    _MEMORY_PREFIX = "memory_temperature_"

    def __init__(
        self,
        name,
        warning=None,
        critical=None,
        memory_warning=None,
        memory_critical=None,
    ):
        super().__init__(name, warning=warning, critical=critical)
        # A second scalar context carries the memory thresholds so evaluation
        # and performance data can simply be delegated to it.
        self._memory = ScalarContext(
            name, warning=memory_warning, critical=memory_critical
        )

    def _is_memory(self, metric):
        """Tell whether ``metric`` is a memory temperature reading."""
        return metric.name.startswith(self._MEMORY_PREFIX)

    def evaluate(self, metric, resource):
        """Evaluate ``metric`` against the thresholds matching its kind."""
        if self._is_memory(metric):
            return self._memory.evaluate(metric, resource)
        return super().evaluate(metric, resource)

    def performance(self, metric, resource):
        """Return performance data carrying the thresholds matching its kind."""
        if self._is_memory(metric):
            return self._memory.performance(metric, resource)
        return super().performance(metric, resource)


def main():
    """Entry point: parse options, then run the Nagios check.

    With ``--version`` only the version is printed. Any exception raised
    while building or running the check is reported on standard output and
    the process exits with the UNKNOWN state code.
    """
    args = parse_arguments()
    setup_logging(args)
    if args.show_version:
        show_version()
        return
    try:
        check = Check(
            TeamRedMiner(host=args.host, port=args.port, timeout=args.timeout),
            BelowThresholdContext(
                "hashrate",
                warning=args.hashrate_warning,
                critical=args.hashrate_critical,
            ),
            BelowThresholdContext(
                "uptime", warning=args.uptime_warning, critical=args.uptime_critical
            ),
            # The memory temperature options used to be parsed but ignored,
            # so memory readings were checked against the core thresholds.
            _TemperatureContext(
                "temperature",
                warning=args.temperature_warning,
                critical=args.temperature_critical,
                memory_warning=args.memory_temperature_warning,
                memory_critical=args.memory_temperature_critical,
            ),
            BooleanContext("alive", expected=True, critical=True),
            TeamRedMinerSummary(),
        )
        check.main()
    except Exception as err:
        # Top-level boundary: report the failure in Nagios terms.
        print(f"Failed to execute check: {str(err)}")
        logger.debug(err, exc_info=True)
        sys.exit(Unknown.code)
# Script entry point: run the check on direct execution only, not on import.
if __name__ == "__main__":
    main()