Julien Riou
A shop map was created to group URLs by shop and process them in order. Now that we have Amazon and each URL can be parsed independently, there is no need to group them anymore. Moreover, shops were passed as an argument to the handleProducts function. The shop name can be deduced by the parser itself. The parser has a reference to the database. The parser now selects or creates the shop before parsing products. Signed-off-by: Julien Riou <julien@riou.xyz>
485 lines
14 KiB
485 lines
14 KiB
package main
import (
log "github.com/sirupsen/logrus"
// URLParser structure to handle websites parsing logic
type URLParser struct {
	// url is the address of the listing to scrape; the shop name is
	// extracted from it.
	url string
	// includeRegex is handed to filterInclusive by Parse; NewURLParser leaves
	// it nil when no include pattern is given.
	includeRegex *regexp.Regexp
	// excludeRegex is handed to filterExclusive by Parse; NewURLParser leaves
	// it nil when no exclude pattern is given.
	excludeRegex *regexp.Regexp
	// ctx carries the browser drivers registered by NewURLParser and is the
	// context the compiled query is run with.
	ctx context.Context
}
// String to print URLParser
// Implements the Parser interface
func (p *URLParser) String() string {
return fmt.Sprintf("URLParser<%s>", p.url)
// ShopName returns shop name from URL
func (p *URLParser) ShopName() (string, error) {
return ExtractShopName(p.url)
// NewURLParser to create a new URLParser instance
func NewURLParser(url string, browserAddress string, includeRegex string, excludeRegex string) (*URLParser, error) {
var err error
var includeRegexCompiled, excludeRegexCompiled *regexp.Regexp
log.Debugf("compiling include name regex")
if includeRegex != "" {
includeRegexCompiled, err = regexp.Compile(includeRegex)
if err != nil {
return nil, err
log.Debugf("compiling exclude name regex")
if excludeRegex != "" {
excludeRegexCompiled, err = regexp.Compile(excludeRegex)
if err != nil {
return nil, err
log.Debugf("creating context with headless browser drivers")
ctx := context.Background()
ctx = drivers.WithContext(ctx, cdp.NewDriver(cdp.WithAddress(browserAddress)))
ctx = drivers.WithContext(ctx, http.NewDriver(), drivers.AsDefault())
return &URLParser{
url: url,
includeRegex: includeRegexCompiled,
excludeRegex: excludeRegexCompiled,
ctx: ctx,
}, nil
// Parse a website to return list of products
// Implements Parser interface
// TODO: redirect output to logger
func (p *URLParser) Parse() ([]*Product, error) {
shopName, err := ExtractShopName(p.url)
if err != nil {
return nil, err
query, err := createQuery(shopName, p.url)
if err != nil {
return nil, err
comp := compiler.New()
program, err := comp.Compile(string(query))
if err != nil {
return nil, err
out, err := program.Run(p.ctx)
if err != nil {
return nil, err
var products []*Product
err = json.Unmarshal(out, &products)
if err != nil {
return nil, err
// apply filters
products = filterInclusive(p.includeRegex, products)
products = filterExclusive(p.excludeRegex, products)
return products, nil
func createQuery(shopName string, url string) (string, error) {
switch shopName {
case "cybertek.fr":
return createQueryForCybertek(url), nil
case "ldlc.com":
return createQueryForLDLC(url), nil
case "materiel.net":
return createQueryForMaterielNet(url), nil
case "microcenter.com":
return createQueryForMicroCenter(url), nil
case "mediamarkt.ch":
return createQueryForMediamarktCh(url), nil
case "newegg.com":
return createQueryForNewegg(url), nil
case "steg-electronics.ch":
return createQueryForStegElectronics(url), nil
case "topachat.com":
return createQueryForTopachat(url), nil
case "vsgamers.es":
return createQueryForVersusGamers(url), nil
return "", fmt.Errorf("shop %s not supported", shopName)
// createQueryForLDLC builds the query text used to scrape an ldlc.com listing
// starting at url.
//
// The query opens url with the "cdp" driver, collects the links of the
// ".pagination" element, visits every page and, for each ".pdt-item" element,
// emits an object with the keys name, url, price, price_currency ("EUR") and
// available. The price text has its euro sign replaced by a dot and its spaces
// removed before conversion; a product is unavailable when its stock text
// contains "RUPTURE". Per-page arrays are flattened into a single array.
//
// url is spliced between single quotes as-is, without escaping.
func createQueryForLDLC(url string) string {
	q := `
// gather first page
LET first_page = '` + url + `'
LET doc = DOCUMENT(first_page, {driver: "cdp"})

// discover next pages
LET pagination = ELEMENT(doc, ".pagination")
LET next_pages = (
    FOR url in ELEMENTS(pagination, "a")
        RETURN "https://www.ldlc.com" + url.attributes.href
)

// append first page to pagination and remove duplicates
LET pages = SORTED_UNIQUE(APPEND(next_pages, first_page))

// create a result array containing an one array of products per page
LET results = (
    FOR page IN pages
        NAVIGATE(doc, page)
        LET products = (
            FOR el IN ELEMENTS(doc, ".pdt-item")
                LET url = ELEMENT(el, "a")
                LET name = INNER_TEXT(ELEMENT(el, "h3"))
                LET price = TO_FLOAT(SUBSTITUTE(SUBSTITUTE(INNER_TEXT(ELEMENT(el, ".price")), "€", "."), " ", ""))
                LET available = !CONTAINS(INNER_TEXT(ELEMENT(el, ".stock-web"), 'span'), "RUPTURE")
                RETURN {
                    name: name,
                    url: "https://www.ldlc.com" + url.attributes.href,
                    price: price,
                    price_currency: "EUR",
                    available: available,
                }
        )
        RETURN products
)

// combine all arrays to a single one
RETURN FLATTEN(results)
`
	return q
}
// createQueryForMaterielNet builds the query text used to scrape a
// materiel.net listing starting at url.
//
// The query opens url with the "cdp" driver, collects the links of the
// ".pagination" element and visits every page, waiting for a
// "div .o-product__price" element before reading "div .ajax-product-item"
// entries. Each entry becomes an object with the keys name (taken from the
// image alt attribute), url, price, price_currency ("EUR") and available.
// Per-page arrays are flattened into a single array.
//
// url is spliced between single quotes as-is, without escaping.
func createQueryForMaterielNet(url string) string {
	q := `
// gather first page
LET first_page = '` + url + `'
LET doc = DOCUMENT(first_page, {driver: "cdp"})

// discover next pages
LET pagination = ELEMENT(doc, ".pagination")
LET next_pages = (
    FOR url in ELEMENTS(pagination, "a")
        RETURN "https://www.materiel.net" + url.attributes.href
)

// append first page to pagination and remove duplicates
LET pages = SORTED_UNIQUE(APPEND(next_pages, first_page))

// create a result array containing an one array of products per page
LET results = (
    FOR page IN pages
        NAVIGATE(doc, page)
        WAIT_ELEMENT(doc, "div .o-product__price")
        LET products = (
            FOR el IN ELEMENTS(doc, "div .ajax-product-item")
                LET image = ELEMENT(el, "img")
                LET url = ELEMENT(el, "a")
                LET price = TO_FLOAT(SUBSTITUTE(SUBSTITUTE(INNER_TEXT(ELEMENT(el, "div .o-product__price")), "€", "."), " ", ""))
                LET available = !CONTAINS(ELEMENT(el, "div .o-availability__value"), "Rupture")
                RETURN {
                    name: image.attributes.alt,
                    url: "https://www.materiel.net" + url.attributes.href,
                    price: price,
                    price_currency: "EUR",
                    available: available,
                }
        )
        RETURN products
)

// combine all arrays to a single one
RETURN FLATTEN(results)
`
	return q
}
// createQueryForTopachat builds the query text used to scrape a topachat.com
// listing at url.
//
// The query opens url with the "cdp" driver and reads only that page: every
// "article .grille-produit" element becomes an object with the keys url, name,
// price, price_currency ("EUR") and available. A product is unavailable when
// the href of its "link" element contains "http://schema.org/OutOfStock".
//
// url is spliced between single quotes as-is, without escaping.
func createQueryForTopachat(url string) string {
	q := `
LET page = '` + url + `'
LET doc = DOCUMENT(page, {driver: "cdp"})

FOR el IN ELEMENTS(doc, "article .grille-produit")
    LET url = ELEMENT(el, "a")
    LET name = INNER_TEXT(ELEMENT(el, "h3"))
    LET price = TO_FLOAT(ELEMENT(el, "div .prod_px_euro").attributes.content)
    LET available = !CONTAINS(ELEMENT(el, "link").attributes.href, "http://schema.org/OutOfStock")
    RETURN {
        url: "https://www.topachat.com" + url.attributes.href,
        name: name,
        price: price,
        price_currency: "EUR",
        available: available,
    }
`
	return q
}
// createQueryForCybertek builds the query text used to scrape a cybertek.fr
// listing starting at url.
//
// The query opens url with the "cdp" driver, collects the links of the
// "div .pagination-div" element (dropping the "#" placeholder) and visits
// every page. Availability is not read from the product itself: entries under
// "div .listing_dispo" are emitted with available set to true and entries
// under "div .listing_nodispo" with available set to false. The name is the
// trimmed text that precedes the first "-" of the product title. Per-page
// arrays are flattened into a single array.
//
// url is spliced between single quotes as-is, without escaping.
func createQueryForCybertek(url string) string {
	q := `
// gather first page
LET first_page = '` + url + `'
LET doc = DOCUMENT(first_page, {driver: "cdp"})

// discover next pages
LET pagination = ELEMENT(doc, "div .pagination-div")
LET next_pages = (
    FOR url in ELEMENTS(pagination, "a")
        RETURN url.attributes.href
)

// append first page to pagination, remove "#" link and remove duplicates
LET pages = SORTED_UNIQUE(APPEND(MINUS(next_pages, ["#"]), first_page))

// create a result array containing an one array of products per page
LET results = (
    FOR page in pages
        NAVIGATE(doc, page)
        LET products_available = (
            FOR el IN ELEMENTS(doc, "div .listing_dispo")
                LET url = ELEMENT(el, "a")
                LET name = TRIM(FIRST(SPLIT(INNER_TEXT(ELEMENT(el, "div .height-txt-cat")), "-")))
                LET price = TO_FLOAT(SUBSTITUTE(INNER_TEXT(ELEMENT(el, "div .price_prod_resp")), "€", "."))
                RETURN {
                    name: name,
                    url: url.attributes.href,
                    available: true,
                    price: price,
                    price_currency: "EUR",
                }
        )
        LET products_not_available = (
            FOR el IN ELEMENTS(doc, "div .listing_nodispo")
                LET url = ELEMENT(el, "a")
                LET name = TRIM(FIRST(SPLIT(INNER_TEXT(ELEMENT(el, "div .height-txt-cat")), "-")))
                LET price = TO_FLOAT(SUBSTITUTE(INNER_TEXT(ELEMENT(el, "div .price_prod_resp")), "€", "."))
                RETURN {
                    name: name,
                    url: url.attributes.href,
                    available: false,
                    price: price,
                    price_currency: "EUR",
                }
        )
        // combine available and not available list of products into a single array of products
        RETURN FLATTEN([products_available, products_not_available])
)

// combine all arrays to a single one
RETURN FLATTEN(results)
`
	return q
}
// createQueryForMediamarktCh builds the query text for a mediamarkt.ch
// listing, splicing url between single quotes (without escaping) as the page
// to open with the "cdp" driver. The pages to visit are the de-duplicated
// "div .pagination-wrapper a" links; the start page itself is not appended to
// that list. Products are read from "div .product-wrapper" elements, priced
// in "CHF", and considered unavailable when the availability text starts with
// "Non disponible".
//
// NOTE(review): the text below is incomplete as it stands. The result object
// references `name` but no `LET name = ...` binding is present, the sub-query
// parentheses and the `RETURN { ... }` wrapper are not closed, nothing is
// returned after `results`, and the raw string and the function body are not
// terminated. The name selector cannot be deduced from this file — restore
// the block from version control rather than guessing.
func createQueryForMediamarktCh(url string) string {
q := `
LET page = '` + url + `'
LET doc = DOCUMENT(page, {driver: "cdp"})
LET pagination = (
FOR el IN ELEMENTS(doc, "div .pagination-wrapper a")
RETURN "https://www.mediamarkt.ch" + el.attributes.href
LET pages = SORTED_UNIQUE(pagination)
LET results = (
FOR page IN pages
NAVIGATE(doc, page)
LET products = (
FOR el IN ELEMENTS(doc, "div .product-wrapper")
LET url = ELEMENT(el, "a").attributes.href
LET price = TO_FLOAT(CONCAT(POP(ELEMENTS(el, "div .price span"))))
LET available = !REGEX_TEST(INNER_TEXT(ELEMENT(el, "div .availability li")), "^Non disponible(.*)")
name: name,
url: "https://www.mediamarkt.ch" + url,
price: price,
price_currency: "CHF",
available: available,
RETURN products
return q
// createQueryForMicroCenter builds the query text used to scrape a
// microcenter.com listing starting at url.
//
// The store is taken from the "&storeid=" parameter of url and falls back to
// "029" when absent; it is written to the "storeSelected" cookie and appended
// as "?storeid=" to every product URL. The query then visits the links found
// under "div .pagination .pages a" plus the first page, and turns each
// "div .products .product_wrapper" element into an object with the keys name,
// url, price, price_currency ("USD") and available. A product is available
// when at least one "div .price_wrapper form .STBTN" element exists.
// Per-page arrays are flattened into a single array.
//
// url is spliced between single quotes as-is, without escaping.
func createQueryForMicroCenter(url string) string {
	q := `
LET first_page = '` + url + `'
LET doc = DOCUMENT(first_page, {driver: "cdp"})
LET base_url = 'https://www.microcenter.com'

// determine store id or default to "shippable items" (online store)
LET detected_store_id = LAST(REGEX_MATCH(first_page, '&storeid=([0-9]*)', true))
LET store_id = detected_store_id ? detected_store_id : "029" // id of the online store if not found
COOKIE_SET(doc, {"name": "storeSelected", "value": store_id})

LET next_pages = (
    FOR a IN ELEMENTS(doc, "div .pagination .pages a")
        RETURN base_url + a.attributes.href
)

LET pages = SORTED_UNIQUE(APPEND(next_pages, first_page))

LET results = (
    FOR page IN pages
        NAVIGATE(doc, page)
        LET products = (
            FOR el IN ELEMENTS(doc, "div .products .product_wrapper")
                LET details = ELEMENT(el, "h2")
                LET name = INNER_TEXT(details)
                LET url = ELEMENT(details, "a").attributes.href
                LET price = TO_FLOAT(ELEMENT(details, "a").attributes."data-price")
                LET available = LENGTH(ELEMENTS(el, "div .price_wrapper form .STBTN"))>0
                RETURN {
                    name: name,
                    url: base_url + url + "?storeid=" + store_id,
                    price: price,
                    price_currency: "USD",
                    available: available,
                }
        )
        RETURN products
)

RETURN FLATTEN(results)
`
	return q
}
// createQueryForNewegg builds the query text for a newegg.com listing,
// splicing url between single quotes (without escaping) as the page to open
// with the "cdp" driver. Page addresses are built by appending "&page=" and a
// number from 1 to pages_count to url, pages_count being the maximum of the
// values produced from the buttons of the last 'div .list-tool-pagination'
// element. Products are read from "div .item-cell" elements, priced in "USD"
// after removing "$" and "," from the price text, and considered unavailable
// when a "div .item-promo" element exists.
//
// NOTE(review): the text below is incomplete as it stands. The `pages`
// sub-query has a FOR clause but no RETURN expression, the sub-query
// parentheses and the `RETURN { ... }` wrapper are not closed, nothing is
// returned after `results`, and the raw string and the function body are not
// terminated. The per-button expression cannot be deduced from this file
// (presumably a numeric conversion of the button text, since MAX is applied
// to it — TODO confirm); restore the block from version control.
func createQueryForNewegg(url string) string {
q := `
LET first_page = '` + url + `'
LET doc = DOCUMENT(first_page, {driver: "cdp"})
LET pagination = LAST(ELEMENTS(doc, 'div .list-tool-pagination'))
LET pages = (
FOR button IN ELEMENTS(pagination, 'button')
LET pages_count = MAX(pages)
LET results = (
FOR page IN 1..pages_count
LET current_page = first_page + '&page=' + page
NAVIGATE(doc, current_page)
LET products = (
FOR el IN ELEMENTS(doc, "div .item-cell")
LET a = ELEMENT(el, "div .item-title")
LET price = TO_FLOAT(SUBSTITUTE(SUBSTITUTE(TRIM(INNER_TEXT(ELEMENT(el, "div .price-current"))), '$', ''), ',', ''))
LET available = !ELEMENT_EXISTS(el, "div .item-promo")
name: INNER_TEXT(a),
url: a.attributes.href,
price: price,
price_currency: "USD",
available: available,
RETURN products
return q
// createQueryForStegElectronics builds the query text used to scrape a
// steg-electronics.ch listing starting at url.
//
// The query opens url with the "cdp" driver, collects the
// "div#pagination a.current-pages" links (dropping '#') plus the first page,
// and turns each "article.product-list-container div.productGridElement"
// element into an object with the keys name, url, available, price and
// price_currency ("CHF"). A product is available when its shipment icon or
// its pickup icon has a color other than "#BFBFBF". Apostrophes are removed
// from the price text before conversion. Per-page arrays are flattened into
// a single array.
//
// url is spliced between single quotes as-is, without escaping.
func createQueryForStegElectronics(url string) string {
	q := `
LET first_page = '` + url + `'
LET doc = DOCUMENT(first_page, { driver: "cdp" })

LET next_pages = (
    FOR next_page IN ELEMENTS(doc, "div#pagination a.current-pages")
        RETURN next_page.attributes.href
)

LET pages = SORTED_UNIQUE(APPEND(REMOVE_VALUE(next_pages, '#'), first_page))

LET results = (
    FOR page IN pages
        NAVIGATE(doc, page)
        LET products = (
            FOR el IN ELEMENTS(doc, "article.product-list-container div.productGridElement")
                LET link = ELEMENT(el, "h2 a")
                LET shipment = ELEMENT(el, "div.availabilityInfo a .iconShipment").attributes.style.color != "#BFBFBF"
                LET pickup = ELEMENT(el, "div.availabilityInfo a .iconPickup").attributes.style.color != "#BFBFBF"
                LET price = TO_FLOAT(SUBSTITUTE(INNER_TEXT(ELEMENT(el, "div.generalPrice")), "'", ""))
                RETURN {
                    name: link.attributes.title,
                    url: link.attributes.href,
                    available: shipment OR pickup,
                    price: price,
                    price_currency: "CHF",
                }
        )
        RETURN products
)

RETURN FLATTEN(results)
`
	return q
}
// createQueryForVersusGamers builds the query text used to scrape a
// vsgamers.es listing at url.
//
// The query opens url with the "cdp" driver and reads only the items present
// under "div.vs-product-list div.vs-product-list-item". Missing elements are
// tolerated: name and url are null when the title or the link is absent, and
// price is 0 when no price element is found. A product is available when a
// "div.vs-product-card-action button" element exists. Prices are reported in
// "EUR".
//
// Not handled yet:
//   - pagination (relies on scroll down move to feed the "div.vs-product-list" element with new items)
//   - remove products with "name: null"
//
// url is spliced between single quotes as-is, without escaping.
func createQueryForVersusGamers(url string) string {
	q := `
LET first_page = '` + url + `'
LET doc = DOCUMENT(first_page, { driver: "cdp" })
LET base_url = 'https://www.vsgamers.es'

FOR el IN ELEMENTS(doc, "div.vs-product-list div.vs-product-list-item")
    LET link = ELEMENT(el, "a")
    LET hasLink = link != null ? true : false
    LET title = ELEMENT(el, "div.vs-product-card-title")
    LET name = title != null ? INNER_TEXT(title) : null
    LET url = hasLink ? base_url + link.attributes.href : null
    LET available = ELEMENT(el, "div.vs-product-card-action button") != null
    LET price_element = ELEMENT(el, "div.vs-product-card-prices span")
    LET price = price_element != null ? TO_FLOAT(price_element.attributes."data-price") : 0
    LET price_currency = "EUR"
    RETURN {
        name: name,
        url: url,
        available: available,
        price: price,
        price_currency: price_currency,
    }
`
	return q
}