Add mineshop support
parent 29c26167dc
commit 2c7189d7fc

4 changed files with 85 additions and 4 deletions
@@ -7,7 +7,7 @@ new generation (AMD RX 6000 series, NVIDIA GeForce RTX 3000 series). Even older
 Based on Debian 10:
 
 ```
-apt install python3-selenium python3-sqlalchemy python3-tweepy firefox-esr
+apt install python3-selenium python3-sqlalchemy python3-tweepy python3-bs4 firefox-esr
 curl -L -s https://github.com/mozilla/geckodriver/releases/download/v0.28.0/geckodriver-v0.28.0-linux64.tar.gz | tar xvpzf - -C /usr/local/bin/
 chown root:root /usr/local/bin/geckodriver
 chmod +x /usr/local/bin/geckodriver
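The only change here is the new python3-bs4 package, which provides the BeautifulSoup library used by the parser added below. As a quick sanity check that the Selenium/geckodriver half of this setup works, a smoke test along the following lines could be run; this is a minimal sketch assuming the Debian 10 python3-selenium package (Selenium 3.x), and the URL is arbitrary.

```
# Hypothetical smoke test: confirm python3-selenium, firefox-esr and the
# geckodriver binary installed above can talk to each other.
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

options = Options()
options.add_argument('--headless')  # no display needed on a server

driver = webdriver.Firefox(options=options)  # picks up geckodriver from PATH
try:
    driver.get('https://www.mozilla.org/')  # any reachable page will do
    print(driver.title)
finally:
    driver.quit()
```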
crawlers.py (15 changed lines)
@@ -1,7 +1,7 @@
 import logging
 
 from parsers import (AlternateParser, LDLCParser, MaterielNetParser,
-                     TopAchatParser)
+                     MineShopParser, TopAchatParser)
 from selenium import webdriver
 from selenium.common.exceptions import TimeoutException
 from selenium.webdriver.common.by import By
@@ -94,9 +94,20 @@ class AlternateCrawler(ProductCrawler):
             self.products += self.add_shop(parser.products)
 
 
+class MineShopCrawler(ProductCrawler):
+
+    def __init__(self, shop, urls):
+        super().__init__(shop)
+        parser = MineShopParser()
+        for url in urls:
+            webpage = self.fetch(url=url)
+            parser.feed(webpage)
+            self.products += self.add_shop(parser.products)
+
+
 CRAWLERS = {
     'topachat.com': TopAchatCrawler,
     'ldlc.com': LDLCCrawler,
     'materiel.net': MaterielNetCrawler,
-    'alternate.be': AlternateCrawler
+    'alternate.be': AlternateCrawler,
+    'mineshop.eu': MineShopCrawler
 }
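With the new 'mineshop.eu' entry, the shop is picked up wherever the CRAWLERS mapping is consulted. The helper below is a hypothetical sketch of that dispatch (its name and error handling are not from this commit); it only relies on the shop.name attribute and the products list that the crawlers in this diff expose.

```
# Hypothetical dispatch helper (not part of this commit): resolve the crawler
# class registered for a shop name and return its scraped products.
from crawlers import CRAWLERS

def crawl_one(shop, urls):
    crawler_cls = CRAWLERS.get(shop.name)  # e.g. 'mineshop.eu' -> MineShopCrawler
    if crawler_cls is None:
        raise ValueError(f'no crawler registered for {shop.name}')
    return crawler_cls(shop, urls).products
```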
main.py (1 changed line)
@@ -59,7 +59,6 @@ def main():
         for shop in list_shops():
             urls = shops.get(shop.name)
             if not urls:
-                logger.warning(f'cannot find urls for shop {shop} in the configuration file')
                 continue
             all_futures.append(executor.submit(crawl_shop, shop, urls))
         for future in futures.as_completed(all_futures):
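The change itself only drops the warning; the surrounding context shows the concurrent.futures submit/as_completed pattern the crawlers run under. The sketch below mimics that shape in isolation, with a stand-in for crawl_shop and an invented shops mapping, since list_shops and the real configuration belong to the project.

```
# Self-contained sketch of the submit/as_completed pattern seen above;
# fake_crawl_shop and the shops dict are stand-ins for the real project code.
from concurrent import futures

def fake_crawl_shop(shop, urls):
    return f'{shop}: fetched {len(urls)} url(s)'

shops = {'mineshop.eu': ['https://mineshop.eu/gpu-mining/']}

with futures.ThreadPoolExecutor(max_workers=4) as executor:
    all_futures = [executor.submit(fake_crawl_shop, shop, urls)
                   for shop, urls in shops.items() if urls]
    for future in futures.as_completed(all_futures):
        print(future.result())
```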
parsers.py (71 changed lines)
@@ -1,6 +1,8 @@
 import logging
 from html.parser import HTMLParser
 
+from bs4 import BeautifulSoup
+from bs4.element import Tag
 from db import Product
 from utils import parse_base_url
 
@@ -401,3 +403,72 @@ class AlternateParser(ProductParser):
     def parse_url(self, string):
         string = string.split('?')[0]  # remove query string
         return f'{self._base_url}{string}'
+
+
+class MineShopParser:
+
+    def __init__(self, url=None):
+        self.products = []
+        self._product = Product()
+
+    def feed(self, webpage):
+        tags = self._find_products(webpage)
+        for tag in tags:
+            # product has at least a name
+            name = self._parse_name(tag)
+            if not name:
+                continue
+            self._product.name = name
+            # parse all other attributes
+            price, currency = self._parse_price(tag)
+            self._product.price = price
+            self._product.price_currency = currency
+            self._product.url = self._parse_url(tag)
+            self._product.available = self._parse_availability(tag)
+            # then add product to list
+            self.products.append(self._product)
+            self._product = Product()
+
+    @staticmethod
+    def _find_products(webpage):
+        soup = BeautifulSoup(webpage, features='lxml')
+        products = []
+        tags = soup.find_all('ul')
+        for tag in tags:
+            if 'products' in tag.get('class', []):
+                for child in tag.children:
+                    products.append(child)
+        return products
+
+    @staticmethod
+    def _parse_name(product):
+        title = product.find('h2')
+        if type(title) is Tag:
+            return title.text
+
+    @staticmethod
+    def _parse_price(product):
+        tag = product.find('bdi')
+        if type(tag) is Tag:
+            string = tag.text
+            if '€' in string:
+                currency = 'EUR'
+                string = string.replace('€', '').strip()
+            price = float(string)
+            return price, currency
+
+    @staticmethod
+    def _parse_url(product):
+        tag = product.find('a')
+        if type(tag) is Tag and tag.get('href'):
+            return tag['href']
+
+    @staticmethod
+    def _parse_availability(product):
+        tag = product.find('p')
+        if type(tag) is Tag:
+            attributes = tag.get('class', [])
+            if 'stock' in attributes:
+                attributes.remove('stock')
+                availability = attributes[0]
+                return availability != 'out-of-stock'
+        return True
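One way to exercise the new parser on its own is to feed it a snippet shaped like the product list it targets: a ul with class "products" whose items carry an h2 name, a bdi price in euros, an a href link and a p tag with a stock/availability class. The HTML below is invented for illustration; the sketch assumes the project's parsers module (and its db.Product dependency) imports cleanly and that lxml, which MineShopParser requests from BeautifulSoup, is installed.

```
# Illustrative check of MineShopParser against a hand-written product snippet.
from parsers import MineShopParser

SAMPLE = """
<ul class="products">
  <li>
    <a href="https://mineshop.eu/product/some-gpu/"><h2>Some GPU 8GB</h2></a>
    <span class="price"><bdi>1199.00 €</bdi></span>
    <p class="stock in-stock">In stock</p>
  </li>
</ul>
"""

parser = MineShopParser()
parser.feed(SAMPLE)
for product in parser.products:
    print(product.name, product.price, product.price_currency,
          product.url, product.available)
```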