From 3b392700d3a3c909b7619fbeb4c5d9e56308cd6c Mon Sep 17 00:00:00 2001 From: Julien Riou Date: Wed, 31 Aug 2022 18:40:52 +0200 Subject: [PATCH] feat: Add -monitor option Signed-off-by: Julien Riou --- .gitignore | 1 + README.md | 1 + main.go | 8 ++++ monitoring.go | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 128 insertions(+) create mode 100644 monitoring.go diff --git a/.gitignore b/.gitignore index bed624e..ffd6401 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ restockbot.pid ferret.log shop.fql *.bak +venv diff --git a/README.md b/README.md index e033722..353f5d7 100644 --- a/README.md +++ b/README.md @@ -203,6 +203,7 @@ docker run -it --name restockbot --rm --link chromium:chromium -v $(pwd):/root/ There are two modes: * **default**: without special argument, the bot parses websites and manage its own database * **API**: using the `-api` argument, the bot starts the HTTP API to expose data from the database +* **monitor**: using the `-monitor` argument (optionally with the `-monitor-warning-timeout` and `-monitor-critical-timeout` arguments), the bot checks the last execution time of each shop and returns a Nagios-compatible output ## How to contribute diff --git a/main.go b/main.go index 162696e..ccff985 100644 --- a/main.go +++ b/main.go @@ -53,6 +53,9 @@ func main() { pidWaitTimeout := flag.Int("pid-wait-timeout", 0, "Seconds to wait before giving up when another instance is running") retention := flag.Int("retention", 0, "Automatically remove products from the database with this number of days old (disabled by default)") api := flag.Bool("api", false, "Start the HTTP API") + monitor := flag.Bool("monitor", false, "Perform health check with Nagios output") + warningTimeout := flag.Int("monitor-warning-timeout", 300, "Raise a warning alert when the last execution time has reached this number of seconds (see -monitor)") + criticalTimeout := flag.Int("monitor-critical-timeout", 600, "Raise a critical alert when the last execution time has 
reached this number of seconds (see -monitor)") flag.Parse() @@ -138,6 +141,11 @@ func main() { } } + // start monitoring + if *monitor { + os.Exit(Monitor(db, *warningTimeout, *criticalTimeout)) + } + // start the api if *api { log.Fatal(StartAPI(db, config.APIConfig))
diff --git a/monitoring.go b/monitoring.go
new file mode 100644
index 0000000..014ad5d
--- /dev/null
+++ b/monitoring.go
@@ -0,0 +1,118 @@
+package main
+
+import (
+	"fmt"
+	"strings"
+	"time"
+
+	log "github.com/sirupsen/logrus"
+	"gorm.io/gorm"
+)
+
+const (
+	// NagiosOk is the Nagios OK return code (see https://nagios-plugins.org/doc/guidelines.html#AEN78)
+	NagiosOk = 0
+	// NagiosWarning is the Nagios WARNING return code (see https://nagios-plugins.org/doc/guidelines.html#AEN78)
+	NagiosWarning = 1
+	// NagiosCritical is the Nagios CRITICAL return code (see https://nagios-plugins.org/doc/guidelines.html#AEN78)
+	NagiosCritical = 2
+	// NagiosUnknown is the Nagios UNKNOWN return code (see https://nagios-plugins.org/doc/guidelines.html#AEN78)
+	NagiosUnknown = 3
+)
+
+// MonitoringResult stores the outcome of the Nagios check for a single shop
+type MonitoringResult struct {
+	ShopName   string    // name of the checked shop
+	UpdatedAt  time.Time // update time of the product the check is based on
+	ReturnCode int       // Nagios return code computed for this shop
+}
+
+// String renders the result as "<shop> (<n> seconds ago)", where n is the age
+// of UpdatedAt in whole seconds ("second" is used when n is exactly 1)
+func (m MonitoringResult) String() string {
+	// Convert to whole seconds: a time.Duration printed with %d shows nanoseconds
+	seconds := int(time.Since(m.UpdatedAt).Seconds())
+
+	wording := "seconds"
+	if seconds == 1 {
+		wording = "second"
+	}
+
+	return fmt.Sprintf("%s (%d %s ago)", m.ShopName, seconds, wording)
+}
+
+// FormatMonitoringResults joins the String form of every result with ", "
+func FormatMonitoringResults(results []MonitoringResult) string {
+	s := make([]string, 0, len(results))
+	for _, result := range results {
+		s = append(s, result.String())
+	}
+	return strings.Join(s, ", ")
+}
+
+// Monitor compares the most recent product update of every shop with the
+// warning and critical timeouts (in seconds), prints a Nagios status line and
+// returns the matching return code (critical outranks warning). It returns
+// NagiosUnknown on database errors and NagiosCritical for a shop without product.
+func Monitor(db *gorm.DB, warningTimeout int, criticalTimeout int) (rc int) {
+	// Find date and time thresholds
+	warningTime := time.Now().Add(-time.Duration(warningTimeout) * time.Second)
+	criticalTime := time.Now().Add(-time.Duration(criticalTimeout) * time.Second)
+
+	// Map to sort monitoring result by status code
+	resultMap := make(map[int][]MonitoringResult)
+
+	// List shops
+	var shops []Shop
+	trx := db.Find(&shops)
+	if trx.Error != nil {
+		fmt.Printf("%s\n", trx.Error)
+		return NagiosUnknown
+	}
+
+	for _, shop := range shops {
+		// Fetch last execution time, i.e. the most recently updated product of the shop
+		var product Product
+		trx := db.Where(Product{ShopID: shop.ID}).Order("updated_at desc").First(&product)
+		if trx.Error == gorm.ErrRecordNotFound {
+			fmt.Printf("No product found for shop %s\n", shop.Name)
+			return NagiosCritical
+		}
+		if trx.Error != nil {
+			fmt.Printf("%s\n", trx.Error)
+			return NagiosUnknown
+		}
+
+		// Compare to thresholds and add to result map
+		result := MonitoringResult{ShopName: shop.Name, UpdatedAt: product.UpdatedAt, ReturnCode: NagiosOk}
+		switch {
+		case product.UpdatedAt.Before(criticalTime):
+			log.Infof("%s has been updated at %s (before time of %s) (crit)", shop.Name, product.UpdatedAt, criticalTime)
+			result.ReturnCode = NagiosCritical
+		case product.UpdatedAt.Before(warningTime):
+			log.Infof("%s has been updated at %s (before time of %s) (warn)", shop.Name, product.UpdatedAt, warningTime)
+			result.ReturnCode = NagiosWarning
+		default:
+			log.Infof("%s has been updated at %s (after %s) (ok)", shop.Name, product.UpdatedAt, warningTime)
+		}
+		resultMap[result.ReturnCode] = append(resultMap[result.ReturnCode], result)
+	}
+
+	// Report the most severe state found: critical must be tested before warning
+	var message, prefix string
+	switch {
+	case len(resultMap[NagiosCritical]) > 0:
+		rc, prefix = NagiosCritical, "CRIT"
+		message = FormatMonitoringResults(resultMap[NagiosCritical])
+	case len(resultMap[NagiosWarning]) > 0:
+		rc, prefix = NagiosWarning, "WARN"
+		message = FormatMonitoringResults(resultMap[NagiosWarning])
+	default:
+		rc, prefix = NagiosOk, "OK"
+		message = "All shops have been updated recently"
+	}
+
+	// Print output
+	fmt.Printf("%s - %s\n", prefix, message)
+	return rc
+}