Archived
1
0
Fork 0
This repository has been archived on 2024-12-18. You can view files and clone it, but cannot push or open issues or pull requests.
restockbot/parser_url.go

486 lines
14 KiB
Go
Raw Permalink Normal View History

package main
import (
"context"
"encoding/json"
"fmt"
"regexp"
log "github.com/sirupsen/logrus"
"github.com/MontFerret/ferret/pkg/compiler"
"github.com/MontFerret/ferret/pkg/drivers"
"github.com/MontFerret/ferret/pkg/drivers/cdp"
"github.com/MontFerret/ferret/pkg/drivers/http"
)
// URLParser structure to handle websites parsing logic
type URLParser struct {
url string
includeRegex *regexp.Regexp
excludeRegex *regexp.Regexp
ctx context.Context
}
// String to print URLParser
// Implements the Parser interface
func (p *URLParser) String() string {
return fmt.Sprintf("URLParser<%s>", p.url)
}
// ShopName returns shop name from URL
func (p *URLParser) ShopName() (string, error) {
return ExtractShopName(p.url)
}
// NewURLParser to create a new URLParser instance
func NewURLParser(url string, browserAddress string, includeRegex string, excludeRegex string) (*URLParser, error) {
var err error
var includeRegexCompiled, excludeRegexCompiled *regexp.Regexp
log.Debugf("compiling include name regex")
if includeRegex != "" {
includeRegexCompiled, err = regexp.Compile(includeRegex)
if err != nil {
return nil, err
}
}
log.Debugf("compiling exclude name regex")
if excludeRegex != "" {
excludeRegexCompiled, err = regexp.Compile(excludeRegex)
if err != nil {
return nil, err
}
}
log.Debugf("creating context with headless browser drivers")
ctx := context.Background()
ctx = drivers.WithContext(ctx, cdp.NewDriver(cdp.WithAddress(browserAddress)))
ctx = drivers.WithContext(ctx, http.NewDriver(), drivers.AsDefault())
return &URLParser{
url: url,
includeRegex: includeRegexCompiled,
excludeRegex: excludeRegexCompiled,
ctx: ctx,
}, nil
}
// Parse a website to return list of products
// Implements Parser interface
// TODO: redirect output to logger
func (p *URLParser) Parse() ([]*Product, error) {
shopName, err := ExtractShopName(p.url)
if err != nil {
return nil, err
}
query, err := createQuery(shopName, p.url)
if err != nil {
return nil, err
}
comp := compiler.New()
program, err := comp.Compile(string(query))
if err != nil {
return nil, err
}
out, err := program.Run(p.ctx)
if err != nil {
return nil, err
}
var products []*Product
err = json.Unmarshal(out, &products)
if err != nil {
return nil, err
}
// apply filters
products = filterInclusive(p.includeRegex, products)
products = filterExclusive(p.excludeRegex, products)
return products, nil
}
func createQuery(shopName string, url string) (string, error) {
switch shopName {
case "cybertek.fr":
return createQueryForCybertek(url), nil
case "ldlc.com":
return createQueryForLDLC(url), nil
case "materiel.net":
return createQueryForMaterielNet(url), nil
case "microcenter.com":
return createQueryForMicroCenter(url), nil
case "mediamarkt.ch":
return createQueryForMediamarktCh(url), nil
case "newegg.com":
return createQueryForNewegg(url), nil
case "steg-electronics.ch":
return createQueryForStegElectronics(url), nil
case "topachat.com":
return createQueryForTopachat(url), nil
case "vsgamers.es":
return createQueryForVersusGamers(url), nil
default:
return "", fmt.Errorf("shop %s not supported", shopName)
}
}
func createQueryForLDLC(url string) string {
q := `
// gather first page
LET first_page = '` + url + `'
LET doc = DOCUMENT(first_page, {driver: "cdp"})
// discover next pages
LET pagination = ELEMENT(doc, ".pagination")
LET next_pages = (
FOR url in ELEMENTS(pagination, "a")
RETURN "https://www.ldlc.com" + url.attributes.href
)
// append first page to pagination and remove duplicates
LET pages = SORTED_UNIQUE(APPEND(next_pages, first_page))
// create a result array containing an one array of products per page
LET results = (
FOR page IN pages
NAVIGATE(doc, page)
LET products = (
FOR el IN ELEMENTS(doc, ".pdt-item")
LET url = ELEMENT(el, "a")
LET name = INNER_TEXT(ELEMENT(el, "h3"))
LET price = TO_FLOAT(SUBSTITUTE(SUBSTITUTE(INNER_TEXT(ELEMENT(el, ".price")), "€", "."), " ", ""))
LET available = !CONTAINS(INNER_TEXT(ELEMENT(el, ".stock-web"), 'span'), "RUPTURE")
RETURN {
name: name,
url: "https://www.ldlc.com" + url.attributes.href,
price: price,
price_currency: "EUR",
available: available,
}
)
RETURN products
)
// combine all arrays to a single one
RETURN FLATTEN(results)
`
return q
}
func createQueryForMaterielNet(url string) string {
q := `
// gather first page
LET first_page = '` + url + `'
LET doc = DOCUMENT(first_page, {driver: "cdp"})
// discover next pages
LET pagination = ELEMENT(doc, ".pagination")
LET next_pages = (
FOR url in ELEMENTS(pagination, "a")
RETURN "https://www.materiel.net" + url.attributes.href
)
// append first page to pagination and remove duplicates
LET pages = SORTED_UNIQUE(APPEND(next_pages, first_page))
// create a result array containing an one array of products per page
LET results = (
FOR page IN pages
NAVIGATE(doc, page)
WAIT_ELEMENT(doc, "div .o-product__price")
LET products = (
FOR el IN ELEMENTS(doc, "div .ajax-product-item")
LET image = ELEMENT(el, "img")
LET url = ELEMENT(el, "a")
LET price = TO_FLOAT(SUBSTITUTE(SUBSTITUTE(INNER_TEXT(ELEMENT(el, "div .o-product__price")), "€", "."), " ", ""))
LET available = !CONTAINS(ELEMENT(el, "div .o-availability__value"), "Rupture")
RETURN {
name: image.attributes.alt,
url: "https://www.materiel.net" + url.attributes.href,
price: price,
price_currency: "EUR",
available: available,
}
)
RETURN products
)
// combine all arrays to a single one
RETURN FLATTEN(results)
`
return q
}
func createQueryForTopachat(url string) string {
q := `
LET page = '` + url + `'
LET doc = DOCUMENT(page, {driver: "cdp"})
FOR el IN ELEMENTS(doc, "article .grille-produit")
LET url = ELEMENT(el, "a")
LET name = INNER_TEXT(ELEMENT(el, "h3"))
LET price = TO_FLOAT(ELEMENT(el, "div .prod_px_euro").attributes.content)
LET available = !CONTAINS(ELEMENT(el, "link").attributes.href, "http://schema.org/OutOfStock")
RETURN {
url: "https://www.topachat.com" + url.attributes.href,
name: name,
price: price,
price_currency: "EUR",
available: available,
}
`
return q
}
func createQueryForCybertek(url string) string {
q := `
// gather first page
LET first_page = '` + url + `'
LET doc = DOCUMENT(first_page, {driver: "cdp"})
// discover next pages
LET pagination = ELEMENT(doc, "div .pagination-div")
LET next_pages = (
FOR url in ELEMENTS(pagination, "a")
RETURN url.attributes.href
)
// append first page to pagination, remove "#" link and remove duplicates
LET pages = SORTED_UNIQUE(APPEND(MINUS(next_pages, ["#"]), first_page))
// create a result array containing an one array of products per page
LET results = (
FOR page in pages
NAVIGATE(doc, page)
LET products_available = (
FOR el IN ELEMENTS(doc, "div .listing_dispo")
LET url = ELEMENT(el, "a")
LET name = TRIM(FIRST(SPLIT(INNER_TEXT(ELEMENT(el, "div .height-txt-cat")), "-")))
LET price = TO_FLOAT(SUBSTITUTE(INNER_TEXT(ELEMENT(el, "div .price_prod_resp")), "€", "."))
RETURN {
name: name,
url: url.attributes.href,
available: true,
price: price,
price_currency: "EUR",
}
)
LET products_not_available = (
FOR el IN ELEMENTS(doc, "div .listing_nodispo")
LET url = ELEMENT(el, "a")
LET name = TRIM(FIRST(SPLIT(INNER_TEXT(ELEMENT(el, "div .height-txt-cat")), "-")))
LET price = TO_FLOAT(SUBSTITUTE(INNER_TEXT(ELEMENT(el, "div .price_prod_resp")), "€", "."))
RETURN {
name: name,
url: url.attributes.href,
available: false,
price: price,
price_currency: "EUR",
}
)
// combine available and not available list of products into a single array of products
RETURN FLATTEN([products_available, products_not_available])
)
// combine all arrays to a single one
RETURN FLATTEN(results)
`
return q
}
func createQueryForMediamarktCh(url string) string {
q := `
LET page = '` + url + `'
LET doc = DOCUMENT(page, {driver: "cdp"})
LET pagination = (
FOR el IN ELEMENTS(doc, "div .pagination-wrapper a")
RETURN "https://www.mediamarkt.ch" + el.attributes.href
)
LET pages = SORTED_UNIQUE(pagination)
LET results = (
FOR page IN pages
NAVIGATE(doc, page)
LET products = (
FOR el IN ELEMENTS(doc, "div .product-wrapper")
LET name = TRIM(FIRST(SPLIT(INNER_TEXT(ELEMENT(el, "h2")), "-")))
LET url = ELEMENT(el, "a").attributes.href
LET price = TO_FLOAT(CONCAT(POP(ELEMENTS(el, "div .price span"))))
LET available = !REGEX_TEST(INNER_TEXT(ELEMENT(el, "div .availability li")), "^Non disponible(.*)")
RETURN {
name: name,
url: "https://www.mediamarkt.ch" + url,
price: price,
price_currency: "CHF",
available: available,
}
)
RETURN products
)
RETURN FLATTEN(results)
`
return q
}
func createQueryForMicroCenter(url string) string {
q := `
LET first_page = '` + url + `'
LET doc = DOCUMENT(first_page, {driver: "cdp"})
LET base_url = 'https://www.microcenter.com'
// determine store id or default to "shippable items" (online store)
LET detected_store_id = LAST(REGEX_MATCH(first_page, '&storeid=([0-9]*)', true))
LET store_id = detected_store_id ? detected_store_id : "029" // id of the online store if not found
COOKIE_SET(doc, {"name": "storeSelected", "value": store_id})
LET next_pages = (
FOR a IN ELEMENTS(doc, "div .pagination .pages a")
RETURN base_url + a.attributes.href
)
LET pages = SORTED_UNIQUE(APPEND(next_pages, first_page))
LET results = (
FOR page IN pages
NAVIGATE(doc, page)
LET products = (
FOR el IN ELEMENTS(doc, "div .products .product_wrapper")
LET details = ELEMENT(el, "h2")
LET name = INNER_TEXT(details)
LET url = ELEMENT(details, "a").attributes.href
LET price = TO_FLOAT(ELEMENT(details, "a").attributes."data-price")
LET available = LENGTH(ELEMENTS(el, "div .price_wrapper form .STBTN"))>0
RETURN {
name: name,
url: base_url + url + "?storeid=" + store_id,
price: price,
price_currency: "USD",
available: available,
}
)
RETURN products
)
RETURN FLATTEN(results)
`
return q
}
func createQueryForNewegg(url string) string {
q := `
LET first_page = '` + url + `'
LET doc = DOCUMENT(first_page, {driver: "cdp"})
LET pagination = LAST(ELEMENTS(doc, 'div .list-tool-pagination'))
LET pages = (
FOR button IN ELEMENTS(pagination, 'button')
RETURN TO_INT(INNER_TEXT(button))
)
LET pages_count = MAX(pages)
LET results = (
FOR page IN 1..pages_count
LET current_page = first_page + '&page=' + page
NAVIGATE(doc, current_page)
LET products = (
FOR el IN ELEMENTS(doc, "div .item-cell")
LET a = ELEMENT(el, "div .item-title")
LET price = TO_FLOAT(SUBSTITUTE(SUBSTITUTE(TRIM(INNER_TEXT(ELEMENT(el, "div .price-current"))), '$', ''), ',', ''))
LET available = !ELEMENT_EXISTS(el, "div .item-promo")
RETURN {
name: INNER_TEXT(a),
url: a.attributes.href,
price: price,
price_currency: "USD",
available: available,
}
)
RETURN products
)
RETURN FLATTEN(results)
`
return q
}
func createQueryForStegElectronics(url string) string {
q := `
LET first_page = '` + url + `'
LET doc = DOCUMENT(first_page, { driver: "cdp" })
LET next_pages = (
FOR next_page IN ELEMENTS(doc, "div#pagination a.current-pages")
RETURN next_page.attributes.href
)
LET pages = SORTED_UNIQUE(APPEND(REMOVE_VALUE(next_pages, '#'), first_page))
LET results = (
FOR page IN pages
NAVIGATE(doc, page)
LET products = (
FOR el IN ELEMENTS(doc, "article.product-list-container div.productGridElement")
LET link = ELEMENT(el, "h2 a")
LET shipment = ELEMENT(el, "div.availabilityInfo a .iconShipment").attributes.style.color != "#BFBFBF"
LET pickup = ELEMENT(el, "div.availabilityInfo a .iconPickup").attributes.style.color != "#BFBFBF"
LET price = TO_FLOAT(SUBSTITUTE(INNER_TEXT(ELEMENT(el, "div.generalPrice")), "'", ""))
RETURN {
name: link.attributes.title,
url: link.attributes.href,
available: shipment OR pickup,
price: price,
price_currency: "CHF",
}
)
RETURN products
)
RETURN FLATTEN(results)
`
return q
}
/*
* TODO:
* - pagination (relies on scroll down move to feed the "div.vs-product-list" element with new items)
* - remove products with "name: null"
*/
func createQueryForVersusGamers(url string) string {
q := `
LET first_page = '` + url + `'
LET doc = DOCUMENT(first_page, { driver: "cdp" })
LET base_url = 'https://www.vsgamers.es'
FOR el IN ELEMENTS(doc, "div.vs-product-list div.vs-product-list-item")
LET link = ELEMENT(el, "a")
LET hasLink = link != null ? true : false
LET title = ELEMENT(el, "div.vs-product-card-title")
LET name = title != null ? INNER_TEXT(title) : null
LET url = hasLink ? base_url + link.attributes.href : null
LET available = ELEMENT(el, "div.vs-product-card-action button") != null
LET price_element = ELEMENT(el, "div.vs-product-card-prices span")
LET price = price_element != null ? TO_FLOAT(price_element.attributes."data-price") : 0
LET price_currency = "EUR"
RETURN {
name: name,
url: url,
available: available,
price: price,
price_currency: price_currency,
}
`
return q
}