Release 1.0.0

Signed-off-by: Julien Riou <julien@riou.xyz>
This commit is contained in:
Julien Riou 2022-05-14 01:04:49 +02:00
parent 5df4ce0ea6
commit 4c633e47aa
No known key found for this signature in database
GPG key ID: FF42D23B580C89F7
4 changed files with 359 additions and 1 deletions

20
.pre-commit-config.yaml Normal file
View file

@ -0,0 +1,20 @@
---
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.2.0
hooks:
- id: check-executables-have-shebangs
- id: check-merge-conflict
- id: end-of-file-fixer
- id: fix-encoding-pragma
args: ['--remove']
- id: requirements-txt-fixer
- id: trailing-whitespace
- repo: https://github.com/psf/black
rev: 22.3.0
hooks:
- id: black
- repo: https://github.com/pycqa/isort
rev: 5.10.1
hooks:
- id: isort

View file

@ -1,2 +1,32 @@
# check_trex # check_trex
Nagios check for T-Rex miner
Nagios check for T-Rex miner.
# Installation
Using pip:
```
python3 -m venv venv
. ./venv/bin/activate
pip install -r requirements.txt
```
Using debian package manager:
```
sudo apt-get install python3-nagiosplugin python3-requests
```
# Usage
```
./check_trex --help
```
# Contributing
```
pip install pre-commit
pre-commit run --files check_trex.py
```

302
check_trex.py Executable file
View file

@ -0,0 +1,302 @@
#!/usr/bin/env python3
import argparse
import logging
import sys
import requests
from nagiosplugin import (Check, Context, Metric, Performance, Resource,
ScalarContext, Summary)
from nagiosplugin.state import Critical, Ok, Unknown, Warn
logger = logging.getLogger(__name__)
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument(
"-v",
"--verbose",
dest="loglevel",
action="store_const",
const=logging.INFO,
help="Print more output",
)
parser.add_argument(
"-d",
"--debug",
dest="loglevel",
action="store_const",
const=logging.DEBUG,
default=logging.WARNING,
help="Print even more output",
)
parser.add_argument(
"--version",
dest="show_version",
action="store_true",
help="Print version and exit",
)
parser.add_argument(
"--url",
dest="url",
type=str,
help="API URL of T-Rex miner",
default="http://127.0.0.1:4067",
)
parser.add_argument(
"--timeout",
dest="timeout",
type=int,
help="Timeout when requesting T-Rex API",
default=3,
)
parser.add_argument(
"--hashrate-warning",
dest="hashrate_warning",
type=int,
help="Raise warning if hashrate goes below this threshold",
)
parser.add_argument(
"--hashrate-critical",
dest="hashrate_critical",
type=int,
help="Raise critical if hashrate goes below this threshold",
)
parser.add_argument(
"--uptime-warning",
dest="uptime_warning",
type=int,
help="Raise warning if uptime goes below this threshold",
)
parser.add_argument(
"--uptime-critical",
dest="uptime_critical",
type=int,
help="Raise critical if uptime goes below this threshold",
)
parser.add_argument(
"--paused-warning",
dest="paused_warning",
action="store_true",
help="Raise warning when T-Rex is paused",
)
parser.add_argument(
"--paused-critical",
dest="paused_critical",
action="store_true",
help="Raise critical when T-Rex is paused",
)
parser.add_argument(
"--temperature-warning",
dest="temperature_warning",
type=int,
help="Raise warning if temperature goes over this threshold",
default=70,
)
parser.add_argument(
"--temperature-critical",
dest="temperature_critical",
type=int,
help="Raise critcal if temperature goes over this threshold",
default=90,
)
parser.add_argument(
"--memory-temperature-warning",
dest="memory_temperature_warning",
type=int,
help="Raise warning if memory temperature goes over this threshold",
default=90,
)
parser.add_argument(
"--memory-temperature-critical",
dest="memory_temperature_critical",
type=int,
help="Raise critcal if memory temperature goes over this threshold",
default=110,
)
args = parser.parse_args()
return args
def setup_logging(args):
logging.basicConfig(format="%(levelname)s: %(message)s", level=args.loglevel)
def show_version():
print("1.0.0")
class BelowThresholdContext(Context):
def __init__(self, name, warning=None, critical=None):
super().__init__(name)
self.warning = warning
self.critical = critical
def evaluate(self, metric, resource):
if self.critical and metric.value <= self.critical:
return self.result_cls(Critical, f"{metric.value}<={self.critical}", metric)
elif self.warning and metric.value <= self.warning:
return self.result_cls(Warn, f"{metric.value}<={self.warning}", metric)
else:
return self.result_cls(Ok, None, metric)
def performance(self, metric, resource):
return Performance(
metric.name,
metric.value,
metric.uom,
self.warning,
self.critical,
metric.min,
metric.max,
)
class BooleanContext(Context):
def __init__(self, name, expected=True, warning=False, critical=False):
super().__init__(name)
self.expected = expected
self.warning = warning
self.critical = critical
def evaluate(self, metric, resource):
if not metric.value is self.expected:
result_type = Ok
if self.critical:
result_type = Critical
elif self.warning:
result_type = Warn
return self.result_cls(
result_type, f"{metric.name} is not {self.expected}", metric
)
else:
return self.result_cls(Ok, None, metric)
class Trex(Resource):
def __init__(self, url, timeout):
self.url = url
self.timeout = timeout
def probe(self):
r = requests.get(f"{self.url}/summary", timeout=self.timeout)
r.raise_for_status()
data = r.json()
logger.debug("Response:")
logger.debug(data)
metrics = []
if "hashrate" in data:
hashrate = data["hashrate"]
logger.debug(f"Hashrate is {hashrate}")
metrics.append(Metric("hashrate", hashrate, context="hashrate"))
if "success" in data:
success = bool(data["success"])
if success:
logger.debug("T-Rex is successfully started")
else:
logger.debug("T-Rex is not successfully started")
metrics.append(Metric("success", success, context="success"))
if "paused" in data:
paused = bool(data["paused"])
if paused:
logger.debug("T-Rex is paused")
else:
logger.debug("T-Rex is not paused")
metrics.append(Metric("paused", paused, context="paused"))
if "uptime" in data:
uptime = data["uptime"]
seconds = "seconds" if uptime > 1 else "second"
logger.debug(f"Uptime is {uptime} {seconds}")
metrics.append(Metric("uptime", uptime, context="uptime"))
for gpu in data.get("gpus"):
name = gpu["name"]
id = gpu["gpu_id"]
if "temperature" in gpu:
temperature = gpu["temperature"]
logger.debug(f"Temperature of {name} ({id}) is {temperature}C")
metrics.append(
Metric("temperature", temperature, context="temperature")
)
if "memory_temperature" in gpu:
temperature = gpu["memory_temperature"]
logger.debug(
f"Memory temperature of {name} ({id}) is {memory_temperature}C"
)
metrics.append(
Metric(
"memory_temperature",
memory_temperature,
context="memory_temperature",
)
)
return metrics
class TrexSummary(Summary):
def problem(self, results):
return ", ".join(
[
f"{result.metric.name} {result.state}: {result.hint}"
for result in results
if str(result.state) != "ok"
]
)
def main():
args = parse_arguments()
setup_logging(args)
if args.show_version:
show_version()
return
try:
check = Check(
Trex(url=args.url, timeout=args.timeout),
BooleanContext("success", expected=True),
BooleanContext(
"paused",
expected=True,
warning=args.paused_warning,
critical=args.paused_critical,
),
BelowThresholdContext(
"hashrate",
warning=args.hashrate_warning,
critical=args.hashrate_critical,
),
BelowThresholdContext(
"uptime", warning=args.uptime_warning, critical=args.uptime_critical
),
ScalarContext(
"temperature",
warning=args.temperature_warning,
critical=args.temperature_critical,
),
TrexSummary(),
)
check.main()
except Exception as err:
print(f"Failed to execute check: {str(err)}")
logger.debug(err, exc_info=True)
sys.exit(Unknown.code)
if __name__ == "__main__":
main()

6
requirements.txt Normal file
View file

@ -0,0 +1,6 @@
certifi==2021.10.8
charset-normalizer==2.0.12
idna==3.3
nagiosplugin==1.3.3
requests==2.27.1
urllib3==1.26.9