
Add mineshop support

Julien Riou 2021-01-13 09:10:30 +01:00
parent 29c26167dc
commit 2c7189d7fc
4 changed files with 85 additions and 4 deletions


@@ -7,7 +7,7 @@ new generation (AMD RX 6000 series, NVIDIA GeForce RTX 3000 series). Even older
 Based on Debian 10:
 ```
-apt install python3-selenium python3-sqlalchemy python3-tweepy firefox-esr
+apt install python3-selenium python3-sqlalchemy python3-tweepy python3-bs4 firefox-esr
 curl -L -s https://github.com/mozilla/geckodriver/releases/download/v0.28.0/geckodriver-v0.28.0-linux64.tar.gz | tar xvpzf - -C /usr/local/bin/
 chown root:root /usr/local/bin/geckodriver
 chmod +x /usr/local/bin/geckodriver


@@ -1,7 +1,7 @@
 import logging
-from parsers import (AlternateParser, LDLCParser, MaterielNetParser,
-                     TopAchatParser)
+from parsers import (AlternateParser, LDLCParser, MaterielNetParser,
+                     MineShopParser, TopAchatParser)
 from selenium import webdriver
 from selenium.common.exceptions import TimeoutException
 from selenium.webdriver.common.by import By
@@ -94,9 +94,20 @@ class AlternateCrawler(ProductCrawler):
         self.products += self.add_shop(parser.products)
 
+
+class MineShopCrawler(ProductCrawler):
+    def __init__(self, shop, urls):
+        super().__init__(shop)
+        parser = MineShopParser()
+        for url in urls:
+            webpage = self.fetch(url=url)
+            parser.feed(webpage)
+        self.products += self.add_shop(parser.products)
+
+
 CRAWLERS = {
     'topachat.com': TopAchatCrawler,
     'ldlc.com': LDLCCrawler,
     'materiel.net': MaterielNetCrawler,
-    'alternate.be': AlternateCrawler
+    'alternate.be': AlternateCrawler,
+    'mineshop.eu': MineShopCrawler
 }
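
For context, the CRAWLERS mapping above keys each crawler class by shop hostname, so adding a shop is one registry entry plus a class. A minimal sketch of how a configured shop could be dispatched through it (the get_crawler helper is hypothetical, not part of this commit):

```
def get_crawler(shop, urls):
    # Look up the crawler class registered for this shop's hostname
    # and instantiate it; crawling happens in the constructor.
    crawler_class = CRAWLERS.get(shop.name)
    if crawler_class is None:
        raise ValueError(f'no crawler registered for {shop.name}')
    return crawler_class(shop, urls)
```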


@@ -59,7 +59,6 @@ def main():
     for shop in list_shops():
         urls = shops.get(shop.name)
         if not urls:
-            logger.warning(f'cannot find urls for shop {shop} in the configuration file')
             continue
         all_futures.append(executor.submit(crawl_shop, shop, urls))
     for future in futures.as_completed(all_futures):


@@ -1,6 +1,8 @@
 import logging
 from html.parser import HTMLParser
+from bs4 import BeautifulSoup
+from bs4.element import Tag
 from db import Product
 from utils import parse_base_url
@@ -401,3 +403,72 @@ class AlternateParser(ProductParser):
     def parse_url(self, string):
         string = string.split('?')[0]  # remove query string
         return f'{self._base_url}{string}'
+
+
+class MineShopParser:
+
+    def __init__(self, url=None):
+        self.products = []
+        self._product = Product()
+
+    def feed(self, webpage):
+        tags = self._find_products(webpage)
+        for tag in tags:
+            # product has at least a name
+            name = self._parse_name(tag)
+            if not name:
+                continue
+            self._product.name = name
+            # parse all other attributes
+            price, currency = self._parse_price(tag)
+            self._product.price = price
+            self._product.price_currency = currency
+            self._product.url = self._parse_url(tag)
+            self._product.available = self._parse_availability(tag)
+            # then add product to list
+            self.products.append(self._product)
+            self._product = Product()
+
+    @staticmethod
+    def _find_products(webpage):
+        soup = BeautifulSoup(webpage, features='lxml')
+        products = []
+        tags = soup.find_all('ul')
+        for tag in tags:
+            if 'products' in tag.get('class', []):
+                for child in tag.children:
+                    products.append(child)
+        return products
+
+    @staticmethod
+    def _parse_name(product):
+        title = product.find('h2')
+        if type(title) is Tag:
+            return title.text
+
+    @staticmethod
+    def _parse_price(product):
+        tag = product.find('bdi')
+        if type(tag) is Tag:
+            string = tag.text
+            if '€' in string:
+                currency = 'EUR'
+                string = string.replace('€', '').strip()
+                price = float(string)
+                return price, currency
+
+    @staticmethod
+    def _parse_url(product):
+        tag = product.find('a')
+        if type(tag) is Tag and tag.get('href'):
+            return tag['href']
+
+    @staticmethod
+    def _parse_availability(product):
+        tag = product.find('p')
+        if type(tag) is Tag:
+            attributes = tag.get('class', [])
+            if 'stock' in attributes:
+                attributes.remove('stock')
+                availability = attributes[0]
+                return availability != 'out-of-stock'
+        return True
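
To illustrate the markup the new parser expects, here is a quick way to exercise MineShopParser against a hand-written snippet; the HTML below is an assumption built from the selectors the parser uses (a ul with a products class, h2 titles, bdi prices, a links, p stock tags), not captured from the real site:

```
from parsers import MineShopParser

# Hand-written sample mimicking the structure MineShopParser looks for;
# this markup is an assumption for illustration, not real mineshop.eu HTML.
webpage = """
<ul class="products">
  <li class="product">
    <h2>Mining rig frame</h2>
    <span class="price"><bdi>149.00 €</bdi></span>
    <a href="https://mineshop.eu/product/mining-rig-frame/">Details</a>
    <p class="stock in-stock">In stock</p>
  </li>
</ul>
"""

parser = MineShopParser()
parser.feed(webpage)
for product in parser.products:
    # Expected output: name, price and currency, url, availability flag
    print(product.name, product.price, product.price_currency,
          product.url, product.available)
```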