Add mineshop support
This commit is contained in:
parent
29c26167dc
commit
2c7189d7fc
4 changed files with 85 additions and 4 deletions
|
@ -7,7 +7,7 @@ new generation (AMD RX 6000 series, NVIDIA GeForce RTX 3000 series). Even older
|
|||
Based on Debian 10:
|
||||
|
||||
```
|
||||
apt install python3-selenium python3-sqlalchemy python3-tweepy firefox-esr
|
||||
apt install python3-selenium python3-sqlalchemy python3-tweepy python3-bs4 firefox-esr
|
||||
curl -L -s https://github.com/mozilla/geckodriver/releases/download/v0.28.0/geckodriver-v0.28.0-linux64.tar.gz | tar xvpzf - -C /usr/local/bin/
|
||||
chown root:root /usr/local/bin/geckodriver
|
||||
chmod +x /usr/local/bin/geckodriver
|
||||
|
|
15
crawlers.py
15
crawlers.py
|
@ -1,7 +1,7 @@
|
|||
import logging
|
||||
|
||||
from parsers import (AlternateParser, LDLCParser, MaterielNetParser,
|
||||
TopAchatParser)
|
||||
MineShopParser, TopAchatParser)
|
||||
from selenium import webdriver
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from selenium.webdriver.common.by import By
|
||||
|
@ -94,9 +94,20 @@ class AlternateCrawler(ProductCrawler):
|
|||
self.products += self.add_shop(parser.products)
|
||||
|
||||
|
||||
class MineShopCrawler(ProductCrawler):
    """Crawler for mineshop.eu listing pages, parsed with MineShopParser."""

    def __init__(self, shop, urls):
        """Fetch every URL in *urls* and collect the products found.

        Args:
            shop: shop object forwarded to ``ProductCrawler.__init__``.
            urls: iterable of listing-page URLs to fetch and parse.
        """
        super().__init__(shop)
        parser = MineShopParser()
        for url in urls:
            webpage = self.fetch(url=url)
            if not webpage:
                # Nothing usable came back; feeding None or '' to the parser
                # would fail inside BeautifulSoup, so skip this page.
                logging.getLogger(__name__).warning(
                    'no content fetched from %s, skipping', url)
                continue
            parser.feed(webpage)
        # The parser accumulates products across feed() calls, so they are
        # registered once, after every page has been parsed.
        # NOTE(review): placement after the loop is reconstructed from a diff
        # view that lost indentation -- confirm against the repository.
        self.products += self.add_shop(parser.products)
|
||||
|
||||
|
||||
# Maps a shop's domain name to the crawler class that handles it.
CRAWLERS = {
    'topachat.com': TopAchatCrawler,
    'ldlc.com': LDLCCrawler,
    'materiel.net': MaterielNetCrawler,
    'alternate.be': AlternateCrawler,
    'mineshop.eu': MineShopCrawler,
}
|
||||
|
|
1
main.py
1
main.py
|
@ -59,7 +59,6 @@ def main():
|
|||
for shop in list_shops():
|
||||
urls = shops.get(shop.name)
|
||||
if not urls:
|
||||
logger.warning(f'cannot find urls for shop {shop} in the configuration file')
|
||||
continue
|
||||
all_futures.append(executor.submit(crawl_shop, shop, urls))
|
||||
for future in futures.as_completed(all_futures):
|
||||
|
|
71
parsers.py
71
parsers.py
|
@ -1,6 +1,8 @@
|
|||
import logging
|
||||
from html.parser import HTMLParser
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import Tag
|
||||
from db import Product
|
||||
from utils import parse_base_url
|
||||
|
||||
|
@ -401,3 +403,72 @@ class AlternateParser(ProductParser):
|
|||
def parse_url(self, string):
|
||||
string = string.split('?')[0] # remove query string
|
||||
return f'{self._base_url}{string}'
|
||||
|
||||
|
||||
class MineShopParser:
    """Extract products from mineshop.eu listing pages with BeautifulSoup.

    Products accumulate in ``self.products`` across successive calls to
    :meth:`feed`.
    """

    def __init__(self, url=None):
        # ``url`` is not used by this parser; presumably it is accepted so
        # the constructor matches the other parsers -- TODO confirm.
        self.products = []

    def feed(self, webpage):
        """Parse *webpage* (HTML markup) and append every product found.

        Entries without a name are skipped. Empty or missing markup is
        ignored instead of being handed to BeautifulSoup.
        """
        if not webpage:
            return
        for tag in self._find_products(webpage):
            # a product has at least a name
            name = self._parse_name(tag)
            if not name:
                continue
            # Build each product locally so that no half-filled instance is
            # shared between iterations.
            product = Product()
            product.name = name
            price, currency = self._parse_price(tag)
            product.price = price
            product.price_currency = currency
            product.url = self._parse_url(tag)
            product.available = self._parse_availability(tag)
            self.products.append(product)

    @staticmethod
    def _find_products(webpage):
        """Return the element children of every ``<ul class="products">``."""
        soup = BeautifulSoup(webpage, features='lxml')
        products = []
        for listing in soup.find_all('ul', class_='products'):
            # ``children`` also yields whitespace text nodes between the
            # items; only real elements can describe a product.
            products.extend(
                child for child in listing.children if isinstance(child, Tag)
            )
        return products

    @staticmethod
    def _parse_name(product):
        """Return the text of the first ``<h2>``, or None if there is none."""
        title = product.find('h2')
        if title is None:
            return None
        return title.text

    @staticmethod
    def _parse_price(product):
        """Return ``(price, currency)`` read from the first ``<bdi>``.

        ``currency`` is ``'EUR'`` when the text contains a euro sign and
        None otherwise. ``price`` is a float, or None when the element is
        missing or its text is not a number. Always returns a 2-tuple so
        callers can unpack the result unconditionally.
        """
        tag = product.find('bdi')
        if tag is None:
            return None, None
        text = tag.text
        currency = None
        if '€' in text:
            currency = 'EUR'
            text = text.replace('€', '')
        # Drop every whitespace character, including non-breaking spaces.
        number = ''.join(text.split())
        last_comma = number.rfind(',')
        last_dot = number.rfind('.')
        if last_comma != -1 and last_dot != -1:
            # With both separators present the rightmost one is the decimal
            # mark and the other one only groups thousands.
            if last_comma < last_dot:
                number = number.replace(',', '')
            else:
                number = number.replace('.', '').replace(',', '.')
        try:
            return float(number), currency
        except ValueError:
            logging.getLogger(__name__).warning(
                'cannot parse price from %r', tag.text)
            return None, currency

    @staticmethod
    def _parse_url(product):
        """Return the ``href`` of the first ``<a>``, or None if unavailable."""
        tag = product.find('a')
        if tag is None:
            return None
        return tag.get('href') or None

    @staticmethod
    def _parse_availability(product):
        """Return False only when the first ``<p>`` is marked out of stock.

        The element must carry the ``stock`` class to be considered; in any
        other case the product is assumed to be available.
        """
        tag = product.find('p')
        if tag is None:
            return True
        # Read-only membership tests: the tag's class list is left untouched.
        classes = tag.get('class') or []
        if 'stock' in classes:
            return 'out-of-stock' not in classes
        return True
|
||||
|
|
Reference in a new issue