Archived
1
0
Fork 0
This repository has been archived on 2024-12-18. You can view files and clone it, but cannot push or open issues or pull requests.
restockbot/parser.go
Julien Riou 42f79c03d4
Add Micro Center support (#2)
As a good start, only the "shippable items" are parsed. Next enhancement would
be to configure local shops.

Signed-off-by: Julien Riou <julien@riou.xyz>
2021-02-28 10:33:28 +01:00

376 lines
11 KiB
Go
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package main
import (
"context"
"encoding/json"
"fmt"
"regexp"
log "github.com/sirupsen/logrus"
"github.com/MontFerret/ferret/pkg/compiler"
"github.com/MontFerret/ferret/pkg/drivers"
"github.com/MontFerret/ferret/pkg/drivers/cdp"
"github.com/MontFerret/ferret/pkg/drivers/http"
)
// Parser holds the state shared by every parsing run: the two product name
// filters and the context used to execute the scraping queries.
type Parser struct {
// includeRegex, when non-nil, is matched against product names by
// filterInclusive; only matching products are kept.
includeRegex *regexp.Regexp
// excludeRegex is matched against product names by filterExclusive.
excludeRegex *regexp.Regexp
// ctx is the context built by NewParser with the cdp and http drivers
// registered on it; Parse hands it to the compiled program's Run.
ctx context.Context
}
// NewParser creates a Parser that filters product names with the given
// patterns and executes queries with the Ferret cdp and http drivers (http
// being registered as the default one).
//
// An empty pattern means "no filter": the corresponding field is left nil,
// which is the case the filter methods check for. Compiling an empty pattern
// instead would yield a regex matching every name, which for the exclude
// pattern means every product.
//
// An error is returned when a non-empty pattern is not a valid regular
// expression.
func NewParser(includeRegex string, excludeRegex string) (*Parser, error) {
	var include, exclude *regexp.Regexp

	if includeRegex != "" {
		log.Debugf("compiling include name regex")
		compiled, err := regexp.Compile(includeRegex)
		if err != nil {
			return nil, fmt.Errorf("compiling include name regex: %w", err)
		}
		include = compiled
	}

	if excludeRegex != "" {
		log.Debugf("compiling exclude name regex")
		compiled, err := regexp.Compile(excludeRegex)
		if err != nil {
			return nil, fmt.Errorf("compiling exclude name regex: %w", err)
		}
		exclude = compiled
	}

	log.Debugf("creating context with headless browser drivers")
	ctx := context.Background()
	ctx = drivers.WithContext(ctx, cdp.NewDriver())
	ctx = drivers.WithContext(ctx, http.NewDriver(), drivers.AsDefault())

	return &Parser{
		includeRegex: include,
		excludeRegex: exclude,
		ctx:          ctx,
	}, nil
}
// Parse scrapes the listing at url and returns the products found on it.
//
// The shop name extracted from url selects the query to run; the query is
// compiled and executed with the parser's context, its JSON output is decoded
// into products, and the include filter followed by the exclude filter is
// applied. The first error met along the way is returned as is.
//
// TODO: redirect output to logger
func (p *Parser) Parse(url string) ([]*Product, error) {
	shop, err := ExtractShopName(url)
	if err != nil {
		return nil, err
	}

	fql, err := createQuery(shop, url)
	if err != nil {
		return nil, err
	}

	c := compiler.New()
	prog, err := c.Compile(fql)
	if err != nil {
		return nil, err
	}

	raw, err := prog.Run(p.ctx)
	if err != nil {
		return nil, err
	}

	var found []*Product
	if err := json.Unmarshal(raw, &found); err != nil {
		return nil, err
	}

	// inclusive filter first, exclusive filter on its result
	return p.filterExclusive(p.filterInclusive(found)), nil
}
// filterInclusive keeps the products whose name matches the include regex.
// When no include regex is set, the input slice is returned untouched.
func (p *Parser) filterInclusive(products []*Product) []*Product {
	if p.includeRegex == nil {
		return products
	}

	var kept []*Product
	for _, candidate := range products {
		if !p.includeRegex.MatchString(candidate.Name) {
			log.Debugf("product %s excluded because it does not match the include regex", candidate.Name)
			continue
		}
		log.Debugf("product %s included because it matches the include regex", candidate.Name)
		kept = append(kept, candidate)
	}
	return kept
}
// filterExclusive returns the products whose name does not match the exclude
// regex, in their original order.
//
// The filter is skipped, and the input slice returned untouched, when no
// exclude regex is set or when its pattern is empty: an empty pattern matches
// every name and would otherwise discard every product.
func (p *Parser) filterExclusive(products []*Product) []*Product {
	if p.excludeRegex == nil || p.excludeRegex.String() == "" {
		return products
	}

	var filtered []*Product
	for _, product := range products {
		if p.excludeRegex.MatchString(product.Name) {
			log.Debugf("product %s excluded because it matches the exclude regex", product.Name)
			continue
		}
		log.Debugf("product %s included because it does not match the exclude regex", product.Name)
		filtered = append(filtered, product)
	}
	// Return the filtered list, not the input: returning the input made the
	// exclude regex a no-op.
	return filtered
}
// createQuery returns the FQL query scraping url for the given shop.
// An error is returned when no query builder exists for shopName.
func createQuery(shopName string, url string) (string, error) {
	var build func(string) string

	switch shopName {
	case "cybertek.fr":
		build = createQueryForCybertek
	case "ldlc.com":
		build = createQueryForLDLC
	case "materiel.net":
		build = createQueryForMaterielNet
	case "microcenter.com":
		build = createQueryForMicroCenter
	case "mediamarkt.ch":
		build = createQueryForMediamarktCh
	case "topachat.com":
		build = createQueryForTopachat
	}

	if build == nil {
		return "", fmt.Errorf("shop %s not supported", shopName)
	}
	return build(url), nil
}
// createQueryForLDLC builds the FQL query scraping an ldlc.com listing.
//
// The query opens url with the cdp driver, collects the links found in
// ".pagination" (prefixed with the site root), adds url itself and visits each
// distinct page. Every ".pdt-item" element yields one product: its name comes
// from the "h3" text, its link from the first anchor, its price from the
// ".price" text (the euro sign becomes the decimal point, the separator
// character is dropped) and it is reported available unless the ".stock-web"
// text contains "RUPTURE". Prices are in EUR.
//
// url is inserted verbatim between single quotes; it is not escaped.
func createQueryForLDLC(url string) string {
	// FQL text located before the injected URL.
	const beforeURL = `
// gather first page
LET first_page = '`

	// FQL text located after the injected URL.
	const afterURL = `'
LET doc = DOCUMENT(first_page, {driver: "cdp"})
// discover next pages
LET pagination = ELEMENT(doc, ".pagination")
LET next_pages = (
FOR url in ELEMENTS(pagination, "a")
RETURN "https://www.ldlc.com" + url.attributes.href
)
// append first page to pagination and remove duplicates
LET pages = SORTED_UNIQUE(APPEND(next_pages, first_page))
// create a result array containing an one array of products per page
LET results = (
FOR page IN pages
NAVIGATE(doc, page)
LET products = (
FOR el IN ELEMENTS(doc, ".pdt-item")
LET url = ELEMENT(el, "a")
LET name = INNER_TEXT(ELEMENT(el, "h3"))
LET price = TO_FLOAT(SUBSTITUTE(SUBSTITUTE(INNER_TEXT(ELEMENT(el, ".price")), "€", "."), " ", ""))
LET available = !CONTAINS(INNER_TEXT(ELEMENT(el, ".stock-web"), 'span'), "RUPTURE")
RETURN {
name: name,
url: "https://www.ldlc.com" + url.attributes.href,
price: price,
price_currency: "EUR",
available: available,
}
)
RETURN products
)
// combine all arrays to a single one
RETURN FLATTEN(results)
`

	return beforeURL + url + afterURL
}
// createQueryForMaterielNet builds the FQL query scraping a materiel.net
// listing.
//
// The query opens url with the cdp driver, collects the links found in
// ".pagination" (prefixed with the site root), adds url itself and visits each
// distinct page, waiting for a "div .o-product__price" element before reading
// it. Every "div .ajax-product-item" element yields one product: its name is
// the alt text of its image, its link comes from the first anchor, its price
// from the "div .o-product__price" text (the euro sign becomes the decimal
// point, the separator character is dropped) and it is reported available
// unless the "div .o-availability__value" element contains "Rupture". Prices
// are in EUR.
//
// url is inserted verbatim between single quotes; it is not escaped.
func createQueryForMaterielNet(url string) string {
	// FQL text located before the injected URL.
	const beforeURL = `
// gather first page
LET first_page = '`

	// FQL text located after the injected URL.
	const afterURL = `'
LET doc = DOCUMENT(first_page, {driver: "cdp"})
// discover next pages
LET pagination = ELEMENT(doc, ".pagination")
LET next_pages = (
FOR url in ELEMENTS(pagination, "a")
RETURN "https://www.materiel.net" + url.attributes.href
)
// append first page to pagination and remove duplicates
LET pages = SORTED_UNIQUE(APPEND(next_pages, first_page))
// create a result array containing an one array of products per page
LET results = (
FOR page IN pages
NAVIGATE(doc, page)
WAIT_ELEMENT(doc, "div .o-product__price")
LET products = (
FOR el IN ELEMENTS(doc, "div .ajax-product-item")
LET image = ELEMENT(el, "img")
LET url = ELEMENT(el, "a")
LET price = TO_FLOAT(SUBSTITUTE(SUBSTITUTE(INNER_TEXT(ELEMENT(el, "div .o-product__price")), "€", "."), " ", ""))
LET available = !CONTAINS(ELEMENT(el, "div .o-availability__value"), "Rupture")
RETURN {
name: image.attributes.alt,
url: "https://www.materiel.net" + url.attributes.href,
price: price,
price_currency: "EUR",
available: available,
}
)
RETURN products
)
// combine all arrays to a single one
RETURN FLATTEN(results)
`

	return beforeURL + url + afterURL
}
// createQueryForTopachat builds the FQL query scraping a topachat.com listing.
//
// The query opens url with the cdp driver and reads that single page; no
// pagination is followed. Every "article .grille-produit" element yields one
// product: its name comes from the "h3" text, its link from the first anchor
// (prefixed with the site root), its price from the content attribute of
// "div .prod_px_euro" and it is reported available unless the href of its
// "link" element contains "http://schema.org/OutOfStock". Prices are in EUR.
//
// url is inserted verbatim between single quotes; it is not escaped.
func createQueryForTopachat(url string) string {
	// FQL text located before the injected URL.
	const beforeURL = `
LET page = '`

	// FQL text located after the injected URL.
	const afterURL = `'
LET doc = DOCUMENT(page, {driver: "cdp"})
FOR el IN ELEMENTS(doc, "article .grille-produit")
LET url = ELEMENT(el, "a")
LET name = INNER_TEXT(ELEMENT(el, "h3"))
LET price = TO_FLOAT(ELEMENT(el, "div .prod_px_euro").attributes.content)
LET available = !CONTAINS(ELEMENT(el, "link").attributes.href, "http://schema.org/OutOfStock")
RETURN {
url: "https://www.topachat.com" + url.attributes.href,
name: name,
price: price,
price_currency: "EUR",
available: available,
}
`

	return beforeURL + url + afterURL
}
// createQueryForCybertek builds the FQL query scraping a cybertek.fr listing.
//
// The query opens url with the cdp driver, collects the links found in
// "div .pagination-div" (used as they are, minus the "#" entries), adds url
// itself and visits each distinct page. On every page two lists are built and
// merged: the "div .listing_dispo" elements become available products and the
// "div .listing_nodispo" elements unavailable ones. For both, the name is the
// trimmed part of the "div .height-txt-cat" text preceding the first "-", the
// link comes from the first anchor and the price from the
// "div .price_prod_resp" text with the euro sign turned into the decimal
// point. Prices are in EUR.
//
// url is inserted verbatim between single quotes; it is not escaped.
func createQueryForCybertek(url string) string {
	// FQL text located before the injected URL.
	const beforeURL = `
// gather first page
LET first_page = '`

	// FQL text located after the injected URL.
	const afterURL = `'
LET doc = DOCUMENT(first_page, {driver: "cdp"})
// discover next pages
LET pagination = ELEMENT(doc, "div .pagination-div")
LET next_pages = (
FOR url in ELEMENTS(pagination, "a")
RETURN url.attributes.href
)
// append first page to pagination, remove "#" link and remove duplicates
LET pages = SORTED_UNIQUE(APPEND(MINUS(next_pages, ["#"]), first_page))
// create a result array containing an one array of products per page
LET results = (
FOR page in pages
NAVIGATE(doc, page)
LET products_available = (
FOR el IN ELEMENTS(doc, "div .listing_dispo")
LET url = ELEMENT(el, "a")
LET name = TRIM(FIRST(SPLIT(INNER_TEXT(ELEMENT(el, "div .height-txt-cat")), "-")))
LET price = TO_FLOAT(SUBSTITUTE(INNER_TEXT(ELEMENT(el, "div .price_prod_resp")), "€", "."))
RETURN {
name: name,
url: url.attributes.href,
available: true,
price: price,
price_currency: "EUR",
}
)
LET products_not_available = (
FOR el IN ELEMENTS(doc, "div .listing_nodispo")
LET url = ELEMENT(el, "a")
LET name = TRIM(FIRST(SPLIT(INNER_TEXT(ELEMENT(el, "div .height-txt-cat")), "-")))
LET price = TO_FLOAT(SUBSTITUTE(INNER_TEXT(ELEMENT(el, "div .price_prod_resp")), "€", "."))
RETURN {
name: name,
url: url.attributes.href,
available: false,
price: price,
price_currency: "EUR",
}
)
// combine available and not available list of products into a single array of products
RETURN FLATTEN([products_available, products_not_available])
)
// combine all arrays to a single one
RETURN FLATTEN(results)
`

	return beforeURL + url + afterURL
}
// createQueryForMediamarktCh builds the FQL query scraping a mediamarkt.ch
// listing.
//
// The query opens url with the cdp driver and visits every distinct link found
// under "div .pagination-wrapper" (prefixed with the site root); url itself is
// not added to that list. Every "div .product-wrapper" element yields one
// product: its name is the trimmed part of the "h2" text preceding the first
// "-", its link comes from the first anchor, its price is built from the
// "div .price span" elements and it is reported available unless the
// "div .availability li" text matches "^Non disponible(.*)". Prices are in
// CHF.
//
// url is inserted verbatim between single quotes; it is not escaped.
func createQueryForMediamarktCh(url string) string {
	// FQL text located before the injected URL.
	const beforeURL = `
LET page = '`

	// FQL text located after the injected URL.
	const afterURL = `'
LET doc = DOCUMENT(page, {driver: "cdp"})
LET pagination = (
FOR el IN ELEMENTS(doc, "div .pagination-wrapper a")
RETURN "https://www.mediamarkt.ch" + el.attributes.href
)
LET pages = SORTED_UNIQUE(pagination)
LET results = (
FOR page IN pages
NAVIGATE(doc, page)
LET products = (
FOR el IN ELEMENTS(doc, "div .product-wrapper")
LET name = TRIM(FIRST(SPLIT(INNER_TEXT(ELEMENT(el, "h2")), "-")))
LET url = ELEMENT(el, "a").attributes.href
LET price = TO_FLOAT(CONCAT(POP(ELEMENTS(el, "div .price span"))))
LET available = !REGEX_TEST(INNER_TEXT(ELEMENT(el, "div .availability li")), "^Non disponible(.*)")
RETURN {
name: name,
url: "https://www.mediamarkt.ch" + url,
price: price,
price_currency: "CHF",
available: available,
}
)
RETURN products
)
RETURN FLATTEN(results)
`

	return beforeURL + url + afterURL
}
// createQueryForMicroCenter builds the FQL query scraping a microcenter.com
// listing.
//
// The query opens url with the cdp driver, collects the links matching
// "div .pagination .pages a" (prefixed with the site root), adds url itself
// and visits each distinct page. Every "div .products .product_wrapper"
// element yields one product: its name is the text of its "h2", its link and
// its price (data-price attribute) come from the anchor inside that "h2", and
// it is reported available when at least one
// "div .price_wrapper form .STBTN" element is found in it. Prices are in USD.
//
// url is inserted verbatim between single quotes; it is not escaped.
func createQueryForMicroCenter(url string) string {
	// FQL text located before the injected URL.
	const beforeURL = `
LET first_page = '`

	// FQL text located after the injected URL.
	const afterURL = `'
LET doc = DOCUMENT(first_page, {driver: "cdp"})
LET base_url = 'https://www.microcenter.com'
LET next_pages = (
FOR a IN ELEMENTS(doc, "div .pagination .pages a")
RETURN base_url + a.attributes.href
)
LET pages = SORTED_UNIQUE(APPEND(next_pages, first_page))
LET results = (
FOR page IN pages
NAVIGATE(doc, page)
LET products = (
FOR el IN ELEMENTS(doc, "div .products .product_wrapper")
LET details = ELEMENT(el, "h2")
LET name = INNER_TEXT(details)
LET url = ELEMENT(details, "a").attributes.href
LET price = TO_FLOAT(ELEMENT(details, "a").attributes."data-price")
LET available = LENGTH(ELEMENTS(el, "div .price_wrapper form .STBTN"))>0
RETURN {
name: name,
url: base_url + url,
price: price,
price_currency: "USD",
available: available,
}
)
RETURN products
)
RETURN FLATTEN(results)
`

	return beforeURL + url + afterURL
}