feat: Add -monitor option
Signed-off-by: Julien Riou <julien@riou.xyz>
This commit is contained in:
parent
cb53106e1f
commit
3b392700d3
4 changed files with 128 additions and 0 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -8,3 +8,4 @@ restockbot.pid
|
||||||
ferret.log
|
ferret.log
|
||||||
shop.fql
|
shop.fql
|
||||||
*.bak
|
*.bak
|
||||||
|
venv
|
||||||
|
|
|
@ -203,6 +203,7 @@ docker run -it --name restockbot --rm --link chromium:chromium -v $(pwd):/root/
|
||||||
There are two modes:
|
There are three modes:
|
||||||
* **default**: without special argument, the bot parses websites and manages its own database
|
* **default**: without special argument, the bot parses websites and manages its own database
|
||||||
* **API**: using the `-api` argument, the bot starts the HTTP API to expose data from the database
|
* **API**: using the `-api` argument, the bot starts the HTTP API to expose data from the database
|
||||||
|
* **monitor**: using the `-monitor` argument (optionally with the `-monitor-warning-timeout` and `-monitor-critical-timeout` arguments), the bot checks the last execution time of each shop and returns a Nagios-compatible output
|
||||||
|
|
||||||
## How to contribute
|
## How to contribute
|
||||||
|
|
||||||
|
|
8
main.go
8
main.go
|
@ -53,6 +53,9 @@ func main() {
|
||||||
pidWaitTimeout := flag.Int("pid-wait-timeout", 0, "Seconds to wait before giving up when another instance is running")
|
pidWaitTimeout := flag.Int("pid-wait-timeout", 0, "Seconds to wait before giving up when another instance is running")
|
||||||
retention := flag.Int("retention", 0, "Automatically remove products from the database with this number of days old (disabled by default)")
|
retention := flag.Int("retention", 0, "Automatically remove products from the database with this number of days old (disabled by default)")
|
||||||
api := flag.Bool("api", false, "Start the HTTP API")
|
api := flag.Bool("api", false, "Start the HTTP API")
|
||||||
|
monitor := flag.Bool("monitor", false, "Perform health check with Nagios output")
|
||||||
|
warningTimeout := flag.Int("monitor-warning-timeout", 300, "Raise a warning alert when the last execution time has reached this number of seconds (see -monitor)")
|
||||||
|
criticalTimeout := flag.Int("monitor-critical-timeout", 600, "Raise a critical alert when the last execution time has reached this number of seconds (see -monitor)")
|
||||||
|
|
||||||
flag.Parse()
|
flag.Parse()
|
||||||
|
|
||||||
|
@ -138,6 +141,11 @@ func main() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// start monitoring
|
||||||
|
if *monitor {
|
||||||
|
os.Exit(Monitor(db, *warningTimeout, *criticalTimeout))
|
||||||
|
}
|
||||||
|
|
||||||
// start the api
|
// start the api
|
||||||
if *api {
|
if *api {
|
||||||
log.Fatal(StartAPI(db, config.APIConfig))
|
log.Fatal(StartAPI(db, config.APIConfig))
|
||||||
|
|
118
monitoring.go
Normal file
118
monitoring.go
Normal file
|
@ -0,0 +1,118 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
"gorm.io/gorm"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Nagios plugin return codes, in the order defined by the plugin guidelines
// (see https://nagios-plugins.org/doc/guidelines.html#AEN78).
const (
	// NagiosOk is the Nagios OK return code (0).
	NagiosOk = iota
	// NagiosWarning is the Nagios WARNING return code (1).
	NagiosWarning
	// NagiosCritical is the Nagios CRITICAL return code (2).
	NagiosCritical
	// NagiosUnknown is the Nagios UNKNOWN return code (3).
	NagiosUnknown
)
|
||||||
|
|
||||||
|
// MonitoringResult stores the outcome of the Nagios check for a single shop.
type MonitoringResult struct {
	// ShopName is the name of the shop that was checked.
	ShopName string
	// UpdatedAt is the update time the shop was judged on.
	UpdatedAt time.Time
	// ReturnCode is the Nagios return code assigned to the shop.
	ReturnCode int
}

// String renders the result as "<shop> (<n> second(s) ago)", where n is the
// number of whole seconds elapsed since UpdatedAt.
func (m MonitoringResult) String() string {
	// Convert the elapsed duration to whole seconds explicitly: formatting a
	// time.Duration with %d would print its nanosecond count.
	seconds := int64(time.Since(m.UpdatedAt).Seconds())

	// Only exactly one second takes the singular form.
	wording := "seconds"
	if seconds == 1 {
		wording = "second"
	}

	return fmt.Sprintf("%s (%d %s ago)", m.ShopName, seconds, wording)
}
|
||||||
|
|
||||||
|
// FormatMonitoringResults to print a list of MonitoringResult nicely
|
||||||
|
func FormatMonitoringResults(results []MonitoringResult) string {
|
||||||
|
var s []string
|
||||||
|
for _, result := range results {
|
||||||
|
s = append(s, result.String())
|
||||||
|
}
|
||||||
|
return strings.Join(s, ", ")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Monitor will check for last execution time for each shop and return either
|
||||||
|
// a warning or critical alert when the threshold has been reached
|
||||||
|
func Monitor(db *gorm.DB, warningTimeout int, criticalTimeout int) (rc int) {
|
||||||
|
|
||||||
|
// Find date and time thresholds
|
||||||
|
warningTime := time.Now().Add(-time.Duration(warningTimeout) * time.Second)
|
||||||
|
criticalTime := time.Now().Add(-time.Duration(criticalTimeout) * time.Second)
|
||||||
|
|
||||||
|
// Map to sort monitoring result by status code
|
||||||
|
resultMap := make(map[int][]MonitoringResult)
|
||||||
|
|
||||||
|
// List shops
|
||||||
|
var shops []Shop
|
||||||
|
trx := db.Find(&shops)
|
||||||
|
if trx.Error != nil {
|
||||||
|
fmt.Printf("%s\n", trx.Error)
|
||||||
|
return NagiosUnknown
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, shop := range shops {
|
||||||
|
// Fetch last execution time
|
||||||
|
var product Product
|
||||||
|
trx := db.Where(Product{ShopID: shop.ID}).Order("updated_at asc").First(&product)
|
||||||
|
if trx.Error == gorm.ErrRecordNotFound {
|
||||||
|
fmt.Printf("%s\n", fmt.Errorf("No product found for shop %s", shop.Name))
|
||||||
|
return NagiosCritical
|
||||||
|
}
|
||||||
|
if trx.Error != nil {
|
||||||
|
fmt.Printf("%s\n", trx.Error)
|
||||||
|
return NagiosUnknown
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compare to thresholds and add to result map
|
||||||
|
result := MonitoringResult{ShopName: shop.Name, UpdatedAt: product.UpdatedAt, ReturnCode: NAGIOS_OK}
|
||||||
|
if product.UpdatedAt.Before(criticalTime) {
|
||||||
|
log.Infof("%s has been updated at %s (before time of %s) (crit)", shop.Name, product.UpdatedAt, criticalTime)
|
||||||
|
result.ReturnCode = NagiosCritical
|
||||||
|
} else if product.UpdatedAt.Before(warningTime) {
|
||||||
|
log.Infof("%s has been updated at %s (before time of %s) (warn)", shop.Name, product.UpdatedAt, warningTime)
|
||||||
|
result.ReturnCode = NagiosWarning
|
||||||
|
} else {
|
||||||
|
log.Infof("%s has been updated at %s (after %s) (ok)", shop.Name, product.UpdatedAt, warningTime)
|
||||||
|
}
|
||||||
|
resultMap[result.ReturnCode] = append(resultMap[result.ReturnCode], result)
|
||||||
|
}
|
||||||
|
|
||||||
|
var message, prefix string
|
||||||
|
|
||||||
|
if len(resultMap[NagiosWarning]) > 0 {
|
||||||
|
rc = NagiosWarning
|
||||||
|
prefix = "WARN"
|
||||||
|
message = FormatMonitoringResults(resultMap[NagiosWarning])
|
||||||
|
} else if len(resultMap[NagiosCritical]) > 0 {
|
||||||
|
rc = NagiosCritical
|
||||||
|
prefix = "CRIT"
|
||||||
|
message = FormatMonitoringResults(resultMap[NagiosCritical])
|
||||||
|
} else {
|
||||||
|
rc = NagiosOK
|
||||||
|
prefix = "OK"
|
||||||
|
message = "All shops have been updated recently"
|
||||||
|
}
|
||||||
|
|
||||||
|
// Print output
|
||||||
|
fmt.Printf("%s - %s\n", prefix, message)
|
||||||
|
return rc
|
||||||
|
}
|
Reference in a new issue