From e67ab63ca884494e6e1b3850f9c3c9b47dd082b4 Mon Sep 17 00:00:00 2001 From: Julien Riou Date: Tue, 23 Mar 2021 09:00:10 +0100 Subject: [PATCH] Prepare for new parsers - Rename "Parser" to "URLParser" - Make "Parse" function generic - Rename "crawlShop" function to "handleProducts" - Reduce "handleProducts" footprint a little bit Signed-off-by: Julien Riou --- main.go | 194 +++++++++++++-------------- parser.go => parser_url.go | 26 ++-- parser_test.go => parser_url_test.go | 4 +- 3 files changed, 115 insertions(+), 109 deletions(-) rename parser.go => parser_url.go (95%) rename parser_test.go => parser_url_test.go (96%) diff --git a/main.go b/main.go index 31fa387..f420b83 100644 --- a/main.go +++ b/main.go @@ -99,12 +99,6 @@ func main() { defer removePid(*pidFile) } - // create parser - parser, err := NewParser(config.BrowserAddress, config.IncludeRegex, config.ExcludeRegex) - if err != nil { - log.Fatalf("could not create parser: %s", err) - } - // connect to the database db, err := gorm.Open(sqlite.Open(*databaseFileName), &gorm.Config{}) if err != nil { @@ -145,128 +139,134 @@ func main() { } } - // crawl shops asynchronously + // parse asynchronously var wg sync.WaitGroup jobsCount := 0 + + // start with URLs for shopName, shopLinks := range ShopsMap { - if jobsCount < *workers { - wg.Add(1) - jobsCount++ - go crawlShop(parser, shopName, shopLinks, notifiers, db, &wg) - } else { - log.Debugf("waiting for intermediate jobs to end") - wg.Wait() - jobsCount = 0 + + // read shop from database or create it + var shop Shop + trx := db.Where(Shop{Name: shopName}).FirstOrCreate(&shop) + if trx.Error != nil { + log.Errorf("cannot create or select shop %s to/from database: %s", shopName, trx.Error) + continue + } + + for _, link := range shopLinks { + if jobsCount < *workers { + // create parser + parser, err := NewURLParser(link, config.BrowserAddress, config.IncludeRegex, config.ExcludeRegex) + if err != nil { + log.Warnf("could not create URL parser for %s", link) + continue + } + wg.Add(1) + jobsCount++ + go handleProducts(shop, parser, notifiers, db, &wg) + } else { + log.Debugf("waiting for intermediate jobs to end") + wg.Wait() + jobsCount = 0 + } } } + log.Debugf("waiting for all jobs to end") wg.Wait() } -// For a given shop, fetch and parse all the dependent URLs, then eventually send notifications -func crawlShop(parser *Parser, shopName string, shopLinks []string, notifiers []Notifier, db *gorm.DB, wg *sync.WaitGroup) { +// For a given shop, fetch and parse its URL, then eventually send notifications +func handleProducts(shop Shop, parser *URLParser, notifiers []Notifier, db *gorm.DB, wg *sync.WaitGroup) { defer wg.Done() - log.Debugf("parsing shop %s", shopName) - // read shop from database or create it - var shop Shop - trx := db.Where(Shop{Name: shopName}).FirstOrCreate(&shop) - if trx.Error != nil { - log.Errorf("cannot create or select shop %s to/from database: %s", shopName, trx.Error) + log.Debugf("parsing with %s", parser) + products, err := parser.Parse() + if err != nil { + log.Warnf("cannot parse: %s", err) return } + log.Debugf("parsed") - for _, link := range shopLinks { + // upsert products to database + for _, product := range products { - log.Debugf("parsing url %s", link) - products, err := parser.Parse(link) - if err != nil { - log.Warnf("cannot parse %s: %s", link, err) + log.Debugf("detected product %+v", product) + + if !product.IsValid() { + log.Warnf("parsed malformatted product: %+v", product) continue } - log.Debugf("url %s parsed", link) - // upsert products to database - for _, product := range products { + // check if product is already in the database + // sometimes new products are detected on the website, directly available, without reference in the database + // the bot has to send a notification instead of blindly creating it in the database and check availability afterwards + var count int64 + trx := db.Model(&Product{}).Where(Product{URL: product.URL}).Count(&count) + if trx.Error != nil { + log.Warnf("cannot see if product %s already exists in the database: %s", product.Name, trx.Error) + continue + } - log.Debugf("detected product %+v", product) + // fetch product from database or create it if it doesn't exist + var dbProduct Product + trx = db.Where(Product{URL: product.URL}).Attrs(Product{Name: product.Name, Shop: shop, Price: product.Price, PriceCurrency: product.PriceCurrency, Available: product.Available}).FirstOrCreate(&dbProduct) + if trx.Error != nil { + log.Warnf("cannot fetch product %s from database: %s", product.Name, trx.Error) + continue + } + log.Debugf("product %s found in database", dbProduct.Name) - if !product.IsValid() { - log.Warnf("parsed malformatted product: %+v", product) - continue - } + // detect availability change + duration := time.Now().Sub(dbProduct.UpdatedAt).Truncate(time.Second) + createThread := false + closeThread := false - // check if product is already in the database - // sometimes new products are detected on the website, directly available, without reference in the database - // the bot has to send a notification instead of blindly creating it in the database and check availability afterwards - var count int64 - trx = db.Model(&Product{}).Where(Product{URL: product.URL}).Count(&count) - if trx.Error != nil { - log.Warnf("cannot see if product %s already exists in the database: %s", product.Name, trx.Error) - continue - } + // non-existing product directly available + if count == 0 && product.Available { + log.Infof("product %s on %s is now available", product.Name, shop.Name) + createThread = true + } - // fetch product from database or create it if it doesn't exist - var dbProduct Product - trx = db.Where(Product{URL: product.URL}).Attrs(Product{Name: product.Name, Shop: shop, Price: product.Price, PriceCurrency: product.PriceCurrency, Available: product.Available}).FirstOrCreate(&dbProduct) - if trx.Error != nil { - log.Warnf("cannot fetch product %s from database: %s", product.Name, trx.Error) - continue - } - log.Debugf("product %s found in database", dbProduct.Name) - - // detect availability change - duration := time.Now().Sub(dbProduct.UpdatedAt).Truncate(time.Second) - createThread := false - closeThread := false - - // non-existing product directly available - if count == 0 && product.Available { - log.Infof("product %s on %s is now available", product.Name, shopName) + // existing product with availability change + if count > 0 && (dbProduct.Available != product.Available) { + if product.Available { + log.Infof("product %s on %s is now available", product.Name, shop.Name) createThread = true + } else { + log.Infof("product %s on %s is not available anymore", product.Name, shop.Name) + closeThread = true } + } - // existing product with availability change - if count > 0 && (dbProduct.Available != product.Available) { - if product.Available { - log.Infof("product %s on %s is now available", product.Name, shopName) - createThread = true - } else { - log.Infof("product %s on %s is not available anymore", product.Name, shopName) - closeThread = true + // update product in database before sending notification + // if there is a database failure, we don't want the bot to send a notification at each run + if dbProduct.ToMerge(product) { + dbProduct.Merge(product) + trx = db.Save(&dbProduct) + if trx.Error != nil { + log.Warnf("cannot save product %s to database: %s", dbProduct.Name, trx.Error) + continue + } + log.Debugf("product %s updated in database", dbProduct.Name) + } + + // send notifications + if createThread { + for _, notifier := range notifiers { + if err := notifier.NotifyWhenAvailable(shop.Name, dbProduct.Name, dbProduct.Price, dbProduct.PriceCurrency, dbProduct.URL); err != nil { + log.Errorf("%s", err) } } - - // update product in database before sending notification - // if there is a database failure, we don't want the bot to send a notification at each run - if dbProduct.ToMerge(product) { - dbProduct.Merge(product) - trx = db.Save(&dbProduct) - if trx.Error != nil { - log.Warnf("cannot save product %s to database: %s", dbProduct.Name, trx.Error) - continue - } - log.Debugf("product %s updated in database", dbProduct.Name) - } - - // send notifications - if createThread { - for _, notifier := range notifiers { - if err := notifier.NotifyWhenAvailable(shop.Name, dbProduct.Name, dbProduct.Price, dbProduct.PriceCurrency, dbProduct.URL); err != nil { - log.Errorf("%s", err) - } - } - } else if closeThread { - for _, notifier := range notifiers { - if err := notifier.NotifyWhenNotAvailable(dbProduct.URL, duration); err != nil { - log.Errorf("%s", err) - } + } else if closeThread { + for _, notifier := range notifiers { + if err := notifier.NotifyWhenNotAvailable(dbProduct.URL, duration); err != nil { + log.Errorf("%s", err) } } } } - - log.Debugf("shop %s parsed", shopName) } func showVersion() { diff --git a/parser.go b/parser_url.go similarity index 95% rename from parser.go rename to parser_url.go index f97c8a1..7089a52 100644 --- a/parser.go +++ b/parser_url.go @@ -14,15 +14,20 @@ import ( "github.com/MontFerret/ferret/pkg/drivers/http" ) -// Parser structure to handle websites parsing logic -type Parser struct { +// URLParser structure to handle websites parsing logic +type URLParser struct { + url string includeRegex *regexp.Regexp excludeRegex *regexp.Regexp ctx context.Context } -// NewParser to create a new Parser instance -func NewParser(browserAddress string, includeRegex string, excludeRegex string) (*Parser, error) { +func (p *URLParser) String() string { + return fmt.Sprintf("URLParser<%s>", p.url) +} + +// NewURLParser to create a new URLParser instance +func NewURLParser(url string, browserAddress string, includeRegex string, excludeRegex string) (*URLParser, error) { var err error var includeRegexCompiled, excludeRegexCompiled *regexp.Regexp @@ -47,7 +52,8 @@ func NewParser(browserAddress string, includeRegex string, excludeRegex string) ctx = drivers.WithContext(ctx, cdp.NewDriver(cdp.WithAddress(browserAddress))) ctx = drivers.WithContext(ctx, http.NewDriver(), drivers.AsDefault()) - return &Parser{ + return &URLParser{ + url: url, includeRegex: includeRegexCompiled, excludeRegex: excludeRegexCompiled, ctx: ctx, @@ -56,13 +62,13 @@ func NewParser(browserAddress string, includeRegex string, excludeRegex string) // Parse a website to return list of products // TODO: redirect output to logger -func (p *Parser) Parse(url string) ([]*Product, error) { - shopName, err := ExtractShopName(url) +func (p *URLParser) Parse() ([]*Product, error) { + shopName, err := ExtractShopName(p.url) if err != nil { return nil, err } - query, err := createQuery(shopName, url) + query, err := createQuery(shopName, p.url) if err != nil { return nil, err } @@ -90,7 +96,7 @@ func (p *Parser) Parse(url string) ([]*Product, error) { } // filterInclusive returns a list of products matching the include regex -func (p *Parser) filterInclusive(products []*Product) []*Product { +func (p *URLParser) filterInclusive(products []*Product) []*Product { var filtered []*Product if p.includeRegex != nil { for _, product := range products { @@ -107,7 +113,7 @@ func (p *Parser) filterInclusive(products []*Product) []*Product { } // filterExclusive returns a list of products that don't match the exclude regex -func (p *Parser) filterExclusive(products []*Product) []*Product { +func (p *URLParser) filterExclusive(products []*Product) []*Product { var filtered []*Product if p.excludeRegex != nil { for _, product := range products { diff --git a/parser_test.go b/parser_url_test.go similarity index 96% rename from parser_test.go rename to parser_url_test.go index 9070c6d..fe091e7 100644 --- a/parser_test.go +++ b/parser_url_test.go @@ -18,7 +18,7 @@ func TestFilterInclusive(t *testing.T) { for i, tc := range tests { t.Run(fmt.Sprintf("TestFilterInclusive#%d", i), func(t *testing.T) { - p, err := NewParser("", tc.regex, "") + p, err := NewURLParser("", "", tc.regex, "") if err != nil { t.Errorf("failed to initialize parser: %s", err) } else { @@ -58,7 +58,7 @@ func TestFilterExclusive(t *testing.T) { for i, tc := range tests { t.Run(fmt.Sprintf("TestFilterExclusive#%d", i), func(t *testing.T) { - p, err := NewParser("", "", tc.regex) + p, err := NewURLParser("", "", "", tc.regex) if err != nil { t.Errorf("failed to initialize parser: %s", err) } else {