Julien Riou
2afd36584b
Local stores are set with an "storeid" param in the query string of the URL and by a "storeSelected" cookie to avoid garbage in the query strings for further requests. As product URL is a unique key in the database, Micro Center is able to handle an URL with the store ID for every product. We can add this storeid in the list of URLs to parse and job done. Every single Micro Center local store are parsable. Signed-off-by: Julien Riou <julien@riou.xyz>
381 lines
11 KiB
Go
381 lines
11 KiB
Go
package main
|
||
|
||
import (
|
||
"context"
|
||
"encoding/json"
|
||
"fmt"
|
||
"regexp"
|
||
|
||
log "github.com/sirupsen/logrus"
|
||
|
||
"github.com/MontFerret/ferret/pkg/compiler"
|
||
"github.com/MontFerret/ferret/pkg/drivers"
|
||
"github.com/MontFerret/ferret/pkg/drivers/cdp"
|
||
"github.com/MontFerret/ferret/pkg/drivers/http"
|
||
)
|
||
|
||
// Parser structure to handle websites parsing logic
|
||
type Parser struct {
|
||
includeRegex *regexp.Regexp
|
||
excludeRegex *regexp.Regexp
|
||
ctx context.Context
|
||
}
|
||
|
||
// NewParser to create a new Parser instance
|
||
func NewParser(includeRegex string, excludeRegex string) (*Parser, error) {
|
||
|
||
log.Debugf("compiling include name regex")
|
||
includeRegexCompiled, err := regexp.Compile(includeRegex)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
log.Debugf("compiling exclude name regex")
|
||
excludeRegexCompiled, err := regexp.Compile(excludeRegex)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
log.Debugf("creating context with headless browser drivers")
|
||
ctx := context.Background()
|
||
ctx = drivers.WithContext(ctx, cdp.NewDriver())
|
||
ctx = drivers.WithContext(ctx, http.NewDriver(), drivers.AsDefault())
|
||
|
||
return &Parser{
|
||
includeRegex: includeRegexCompiled,
|
||
excludeRegex: excludeRegexCompiled,
|
||
ctx: ctx,
|
||
}, nil
|
||
}
|
||
|
||
// Parse a website to return list of products
|
||
// TODO: redirect output to logger
|
||
func (p *Parser) Parse(url string) ([]*Product, error) {
|
||
shopName, err := ExtractShopName(url)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
query, err := createQuery(shopName, url)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
comp := compiler.New()
|
||
program, err := comp.Compile(string(query))
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
out, err := program.Run(p.ctx)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
var products []*Product
|
||
err = json.Unmarshal(out, &products)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
// apply filters
|
||
products = p.filterInclusive(products)
|
||
products = p.filterExclusive(products)
|
||
|
||
return products, nil
|
||
}
|
||
|
||
// filterInclusive returns a list of products matching the include regex
|
||
func (p *Parser) filterInclusive(products []*Product) []*Product {
|
||
var filtered []*Product
|
||
if p.includeRegex != nil {
|
||
for _, product := range products {
|
||
if p.includeRegex.MatchString(product.Name) {
|
||
log.Debugf("product %s included because it matches the include regex", product.Name)
|
||
filtered = append(filtered, product)
|
||
} else {
|
||
log.Debugf("product %s excluded because it does not match the include regex", product.Name)
|
||
}
|
||
}
|
||
return filtered
|
||
}
|
||
return products
|
||
}
|
||
|
||
// filterExclusive returns a list of products that don't match the exclude regex
|
||
func (p *Parser) filterExclusive(products []*Product) []*Product {
|
||
var filtered []*Product
|
||
if p.excludeRegex != nil {
|
||
for _, product := range products {
|
||
if !p.excludeRegex.MatchString(product.Name) {
|
||
log.Debugf("product %s included because it matches does not match the exclude regex", product.Name)
|
||
filtered = append(filtered, product)
|
||
} else {
|
||
log.Debugf("product %s excluded because it matches the exclude regex", product.Name)
|
||
}
|
||
}
|
||
}
|
||
return products
|
||
}
|
||
|
||
func createQuery(shopName string, url string) (string, error) {
|
||
switch shopName {
|
||
case "cybertek.fr":
|
||
return createQueryForCybertek(url), nil
|
||
case "ldlc.com":
|
||
return createQueryForLDLC(url), nil
|
||
case "materiel.net":
|
||
return createQueryForMaterielNet(url), nil
|
||
case "microcenter.com":
|
||
return createQueryForMicroCenter(url), nil
|
||
case "mediamarkt.ch":
|
||
return createQueryForMediamarktCh(url), nil
|
||
case "topachat.com":
|
||
return createQueryForTopachat(url), nil
|
||
default:
|
||
return "", fmt.Errorf("shop %s not supported", shopName)
|
||
}
|
||
}
|
||
|
||
func createQueryForLDLC(url string) string {
|
||
q := `
|
||
// gather first page
|
||
LET first_page = '` + url + `'
|
||
LET doc = DOCUMENT(first_page, {driver: "cdp"})
|
||
|
||
// discover next pages
|
||
LET pagination = ELEMENT(doc, ".pagination")
|
||
LET next_pages = (
|
||
FOR url in ELEMENTS(pagination, "a")
|
||
RETURN "https://www.ldlc.com" + url.attributes.href
|
||
)
|
||
|
||
// append first page to pagination and remove duplicates
|
||
LET pages = SORTED_UNIQUE(APPEND(next_pages, first_page))
|
||
|
||
// create a result array containing an one array of products per page
|
||
LET results = (
|
||
FOR page IN pages
|
||
NAVIGATE(doc, page)
|
||
LET products = (
|
||
FOR el IN ELEMENTS(doc, ".pdt-item")
|
||
LET url = ELEMENT(el, "a")
|
||
LET name = INNER_TEXT(ELEMENT(el, "h3"))
|
||
LET price = TO_FLOAT(SUBSTITUTE(SUBSTITUTE(INNER_TEXT(ELEMENT(el, ".price")), "€", "."), " ", ""))
|
||
LET available = !CONTAINS(INNER_TEXT(ELEMENT(el, ".stock-web"), 'span'), "RUPTURE")
|
||
RETURN {
|
||
name: name,
|
||
url: "https://www.ldlc.com" + url.attributes.href,
|
||
price: price,
|
||
price_currency: "EUR",
|
||
available: available,
|
||
}
|
||
)
|
||
RETURN products
|
||
)
|
||
|
||
// combine all arrays to a single one
|
||
RETURN FLATTEN(results)
|
||
`
|
||
return q
|
||
}
|
||
|
||
func createQueryForMaterielNet(url string) string {
|
||
q := `
|
||
// gather first page
|
||
LET first_page = '` + url + `'
|
||
LET doc = DOCUMENT(first_page, {driver: "cdp"})
|
||
|
||
// discover next pages
|
||
LET pagination = ELEMENT(doc, ".pagination")
|
||
LET next_pages = (
|
||
FOR url in ELEMENTS(pagination, "a")
|
||
RETURN "https://www.materiel.net" + url.attributes.href
|
||
)
|
||
|
||
// append first page to pagination and remove duplicates
|
||
LET pages = SORTED_UNIQUE(APPEND(next_pages, first_page))
|
||
|
||
// create a result array containing an one array of products per page
|
||
LET results = (
|
||
FOR page IN pages
|
||
NAVIGATE(doc, page)
|
||
WAIT_ELEMENT(doc, "div .o-product__price")
|
||
LET products = (
|
||
FOR el IN ELEMENTS(doc, "div .ajax-product-item")
|
||
LET image = ELEMENT(el, "img")
|
||
LET url = ELEMENT(el, "a")
|
||
LET price = TO_FLOAT(SUBSTITUTE(SUBSTITUTE(INNER_TEXT(ELEMENT(el, "div .o-product__price")), "€", "."), " ", ""))
|
||
LET available = !CONTAINS(ELEMENT(el, "div .o-availability__value"), "Rupture")
|
||
RETURN {
|
||
name: image.attributes.alt,
|
||
url: "https://www.materiel.net" + url.attributes.href,
|
||
price: price,
|
||
price_currency: "EUR",
|
||
available: available,
|
||
}
|
||
)
|
||
RETURN products
|
||
)
|
||
|
||
// combine all arrays to a single one
|
||
RETURN FLATTEN(results)
|
||
`
|
||
return q
|
||
}
|
||
|
||
func createQueryForTopachat(url string) string {
|
||
q := `
|
||
LET page = '` + url + `'
|
||
LET doc = DOCUMENT(page, {driver: "cdp"})
|
||
|
||
FOR el IN ELEMENTS(doc, "article .grille-produit")
|
||
LET url = ELEMENT(el, "a")
|
||
LET name = INNER_TEXT(ELEMENT(el, "h3"))
|
||
LET price = TO_FLOAT(ELEMENT(el, "div .prod_px_euro").attributes.content)
|
||
LET available = !CONTAINS(ELEMENT(el, "link").attributes.href, "http://schema.org/OutOfStock")
|
||
RETURN {
|
||
url: "https://www.topachat.com" + url.attributes.href,
|
||
name: name,
|
||
price: price,
|
||
price_currency: "EUR",
|
||
available: available,
|
||
}
|
||
`
|
||
return q
|
||
}
|
||
|
||
func createQueryForCybertek(url string) string {
|
||
q := `
|
||
// gather first page
|
||
LET first_page = '` + url + `'
|
||
LET doc = DOCUMENT(first_page, {driver: "cdp"})
|
||
|
||
// discover next pages
|
||
LET pagination = ELEMENT(doc, "div .pagination-div")
|
||
LET next_pages = (
|
||
FOR url in ELEMENTS(pagination, "a")
|
||
RETURN url.attributes.href
|
||
)
|
||
|
||
// append first page to pagination, remove "#" link and remove duplicates
|
||
LET pages = SORTED_UNIQUE(APPEND(MINUS(next_pages, ["#"]), first_page))
|
||
|
||
// create a result array containing an one array of products per page
|
||
LET results = (
|
||
FOR page in pages
|
||
NAVIGATE(doc, page)
|
||
LET products_available = (
|
||
FOR el IN ELEMENTS(doc, "div .listing_dispo")
|
||
LET url = ELEMENT(el, "a")
|
||
LET name = TRIM(FIRST(SPLIT(INNER_TEXT(ELEMENT(el, "div .height-txt-cat")), "-")))
|
||
LET price = TO_FLOAT(SUBSTITUTE(INNER_TEXT(ELEMENT(el, "div .price_prod_resp")), "€", "."))
|
||
RETURN {
|
||
name: name,
|
||
url: url.attributes.href,
|
||
available: true,
|
||
price: price,
|
||
price_currency: "EUR",
|
||
}
|
||
)
|
||
LET products_not_available = (
|
||
FOR el IN ELEMENTS(doc, "div .listing_nodispo")
|
||
LET url = ELEMENT(el, "a")
|
||
LET name = TRIM(FIRST(SPLIT(INNER_TEXT(ELEMENT(el, "div .height-txt-cat")), "-")))
|
||
LET price = TO_FLOAT(SUBSTITUTE(INNER_TEXT(ELEMENT(el, "div .price_prod_resp")), "€", "."))
|
||
RETURN {
|
||
name: name,
|
||
url: url.attributes.href,
|
||
available: false,
|
||
price: price,
|
||
price_currency: "EUR",
|
||
}
|
||
)
|
||
// combine available and not available list of products into a single array of products
|
||
RETURN FLATTEN([products_available, products_not_available])
|
||
)
|
||
|
||
// combine all arrays to a single one
|
||
RETURN FLATTEN(results)
|
||
`
|
||
return q
|
||
}
|
||
|
||
func createQueryForMediamarktCh(url string) string {
|
||
q := `
|
||
LET page = '` + url + `'
|
||
LET doc = DOCUMENT(page, {driver: "cdp"})
|
||
|
||
LET pagination = (
|
||
FOR el IN ELEMENTS(doc, "div .pagination-wrapper a")
|
||
RETURN "https://www.mediamarkt.ch" + el.attributes.href
|
||
)
|
||
|
||
LET pages = SORTED_UNIQUE(pagination)
|
||
|
||
LET results = (
|
||
FOR page IN pages
|
||
NAVIGATE(doc, page)
|
||
LET products = (
|
||
FOR el IN ELEMENTS(doc, "div .product-wrapper")
|
||
LET name = TRIM(FIRST(SPLIT(INNER_TEXT(ELEMENT(el, "h2")), "-")))
|
||
LET url = ELEMENT(el, "a").attributes.href
|
||
LET price = TO_FLOAT(CONCAT(POP(ELEMENTS(el, "div .price span"))))
|
||
LET available = !REGEX_TEST(INNER_TEXT(ELEMENT(el, "div .availability li")), "^Non disponible(.*)")
|
||
RETURN {
|
||
name: name,
|
||
url: "https://www.mediamarkt.ch" + url,
|
||
price: price,
|
||
price_currency: "CHF",
|
||
available: available,
|
||
}
|
||
)
|
||
RETURN products
|
||
)
|
||
|
||
RETURN FLATTEN(results)
|
||
`
|
||
return q
|
||
}
|
||
|
||
func createQueryForMicroCenter(url string) string {
|
||
q := `
|
||
LET first_page = '` + url + `'
|
||
LET doc = DOCUMENT(first_page, {driver: "cdp"})
|
||
LET base_url = 'https://www.microcenter.com'
|
||
|
||
// determine store id or default to "shippable items" (online store)
|
||
LET detected_store_id = LAST(REGEX_MATCH(first_page, '&storeid=([0-9]*)', true))
|
||
LET store_id = detected_store_id ? detected_store_id : "029" // id of the online store if not found
|
||
COOKIE_SET(doc, {"name": "storeSelected", "value": store_id})
|
||
|
||
LET next_pages = (
|
||
FOR a IN ELEMENTS(doc, "div .pagination .pages a")
|
||
RETURN base_url + a.attributes.href
|
||
)
|
||
|
||
LET pages = SORTED_UNIQUE(APPEND(next_pages, first_page))
|
||
|
||
LET results = (
|
||
FOR page IN pages
|
||
NAVIGATE(doc, page)
|
||
LET products = (
|
||
FOR el IN ELEMENTS(doc, "div .products .product_wrapper")
|
||
LET details = ELEMENT(el, "h2")
|
||
LET name = INNER_TEXT(details)
|
||
LET url = ELEMENT(details, "a").attributes.href
|
||
LET price = TO_FLOAT(ELEMENT(details, "a").attributes."data-price")
|
||
LET available = LENGTH(ELEMENTS(el, "div .price_wrapper form .STBTN"))>0
|
||
RETURN {
|
||
name: name,
|
||
url: base_url + url + "?storeid=" + store_id,
|
||
price: price,
|
||
price_currency: "USD",
|
||
available: available,
|
||
}
|
||
)
|
||
RETURN products
|
||
)
|
||
|
||
RETURN FLATTEN(results)
|
||
`
|
||
return q
|
||
}
|