Archived
1
0
Fork 0

Add mineshop support

This commit is contained in:
Julien Riou 2021-01-13 09:10:30 +01:00
parent 29c26167dc
commit 2c7189d7fc
No known key found for this signature in database
GPG key ID: FF42D23B580C89F7
4 changed files with 85 additions and 4 deletions

View file

@ -7,7 +7,7 @@ new generation (AMD RX 6000 series, NVIDIA GeForce RTX 3000 series). Even older
Based on Debian 10:
```
apt install python3-selenium python3-sqlalchemy python3-tweepy firefox-esr
apt install python3-selenium python3-sqlalchemy python3-tweepy python3-bs4 firefox-esr
curl -L -s https://github.com/mozilla/geckodriver/releases/download/v0.28.0/geckodriver-v0.28.0-linux64.tar.gz | tar xvpzf - -C /usr/local/bin/
chown root:root /usr/local/bin/geckodriver
chmod +x /usr/local/bin/geckodriver

View file

@ -1,7 +1,7 @@
import logging
from parsers import (AlternateParser, LDLCParser, MaterielNetParser,
TopAchatParser)
MineShopParser, TopAchatParser)
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
@ -94,9 +94,20 @@ class AlternateCrawler(ProductCrawler):
self.products += self.add_shop(parser.products)
class MineShopCrawler(ProductCrawler):
    """Crawler for mineshop.eu: fetches each URL and feeds it to MineShopParser."""

    def __init__(self, shop, urls):
        super().__init__(shop)
        parser = MineShopParser()
        # Fetch every listing page and accumulate parsed products in the parser.
        for page_url in urls:
            parser.feed(self.fetch(url=page_url))
        # Attach the shop to each parsed product and record them.
        self.products += self.add_shop(parser.products)
# Registry mapping a supported shop domain to its crawler implementation.
# The flattened diff left both the pre- and post-change 'alternate.be' lines
# (missing trailing comma, duplicate key); this is the corrected literal.
CRAWLERS = {
    'topachat.com': TopAchatCrawler,
    'ldlc.com': LDLCCrawler,
    'materiel.net': MaterielNetCrawler,
    'alternate.be': AlternateCrawler,
    'mineshop.eu': MineShopCrawler,
}

View file

@ -59,7 +59,6 @@ def main():
for shop in list_shops():
urls = shops.get(shop.name)
if not urls:
logger.warning(f'cannot find urls for shop {shop} in the configuration file')
continue
all_futures.append(executor.submit(crawl_shop, shop, urls))
for future in futures.as_completed(all_futures):

View file

@ -1,6 +1,8 @@
import logging
from html.parser import HTMLParser
from bs4 import BeautifulSoup
from bs4.element import Tag
from db import Product
from utils import parse_base_url
@ -401,3 +403,72 @@ class AlternateParser(ProductParser):
def parse_url(self, string):
    """Return the absolute product URL, with any query string removed."""
    path, _, _ = string.partition('?')  # keep only the part before '?'
    return f'{self._base_url}{path}'
class MineShopParser:
    """Parse mineshop.eu product listing pages (HTML) into Product objects.

    Unlike the HTMLParser-based parsers in this module, this one uses
    BeautifulSoup to walk the document.
    """

    def __init__(self, url=None):
        # `url` is accepted for interface parity with the other parsers
        # but is not used by this implementation.
        self.products = []
        self._product = Product()

    def feed(self, webpage):
        """Extract products from a listing page and append them to self.products.

        :param webpage: raw HTML of a mineshop.eu product listing page
        """
        tags = self._find_products(webpage)
        for tag in tags:
            # A product must at least have a name; skip non-product children
            # (whitespace nodes, decorative items).
            name = self._parse_name(tag)
            if not name:
                continue
            self._product.name = name
            # Parse all other attributes (each helper tolerates missing tags).
            price, currency = self._parse_price(tag)
            self._product.price = price
            self._product.price_currency = currency
            self._product.url = self._parse_url(tag)
            self._product.available = self._parse_availability(tag)
            # Then add the product to the list and start a fresh one.
            self.products.append(self._product)
            self._product = Product()

    @staticmethod
    def _find_products(webpage):
        """Return the children of every <ul class="products"> element."""
        soup = BeautifulSoup(webpage, features='lxml')
        products = []
        for tag in soup.find_all('ul'):
            if 'products' in tag.get('class', []):
                products.extend(tag.children)
        return products

    @staticmethod
    def _parse_name(product):
        """Return the product name from its <h2> tag, or None if absent."""
        title = product.find('h2')
        if type(title) is Tag:
            return title.text

    @staticmethod
    def _parse_price(product):
        """Return (price, currency) parsed from the <bdi> tag.

        Always returns a 2-tuple so callers can unpack safely;
        (None, None) when the price cannot be determined.
        """
        tag = product.find('bdi')
        if type(tag) is Tag:
            string = tag.text
            # The page formats prices with a euro sign, e.g. "123.45 €".
            if '€' in string:
                string = string.replace('€', '').strip()
                try:
                    return float(string), 'EUR'
                except ValueError:
                    # Malformed number: treat as "no price" rather than crash.
                    return None, None
        return None, None

    @staticmethod
    def _parse_url(product):
        """Return the product link (href of the first <a>), or None."""
        tag = product.find('a')
        if type(tag) is Tag and tag.get('href'):
            return tag['href']

    @staticmethod
    def _parse_availability(product):
        """Return False only when the stock <p> tag is marked out-of-stock.

        Availability is encoded in the class list of a <p class="stock ...">
        tag; anything other than an explicit 'out-of-stock' marker is treated
        as available.
        """
        tag = product.find('p')
        if type(tag) is Tag:
            attributes = tag.get('class', [])
            if 'stock' in attributes:
                attributes.remove('stock')
                # Guard: the class list may have contained only 'stock'.
                if attributes:
                    return attributes[0] != 'out-of-stock'
        return True