Add mineshop support
parent 29c26167dc
commit 2c7189d7fc

4 changed files with 85 additions and 4 deletions
@@ -7,7 +7,7 @@ new generation (AMD RX 6000 series, NVIDIA GeForce RTX 3000 series). Even older
 Based on Debian 10:
 
 ```
-apt install python3-selenium python3-sqlalchemy python3-tweepy firefox-esr
+apt install python3-selenium python3-sqlalchemy python3-tweepy python3-bs4 firefox-esr
 curl -L -s https://github.com/mozilla/geckodriver/releases/download/v0.28.0/geckodriver-v0.28.0-linux64.tar.gz | tar xvpzf - -C /usr/local/bin/
 chown root:root /usr/local/bin/geckodriver
 chmod +x /usr/local/bin/geckodriver
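The only change here is the new python3-bs4 package, which provides the BeautifulSoup library used by the parser added below. As a quick sanity check that the Selenium/geckodriver half of this setup works, a smoke test along the following lines could be run; this is a minimal sketch assuming the Debian 10 python3-selenium package (Selenium 3.x), and the URL is arbitrary.

```
# Hypothetical smoke test: confirm python3-selenium, firefox-esr and the
# geckodriver binary installed above can talk to each other.
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

options = Options()
options.add_argument('--headless')  # no display needed on a server

driver = webdriver.Firefox(options=options)  # picks up geckodriver from PATH
try:
    driver.get('https://www.mozilla.org/')  # any reachable page will do
    print(driver.title)
finally:
    driver.quit()
```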
crawlers.py (15 changed lines)
@@ -1,7 +1,7 @@
 import logging
 
 from parsers import (AlternateParser, LDLCParser, MaterielNetParser,
-                     TopAchatParser)
+                     MineShopParser, TopAchatParser)
 from selenium import webdriver
 from selenium.common.exceptions import TimeoutException
 from selenium.webdriver.common.by import By
@@ -94,9 +94,20 @@ class AlternateCrawler(ProductCrawler):
             self.products += self.add_shop(parser.products)
 
 
+class MineShopCrawler(ProductCrawler):
+
+    def __init__(self, shop, urls):
+        super().__init__(shop)
+        parser = MineShopParser()
+        for url in urls:
+            webpage = self.fetch(url=url)
+            parser.feed(webpage)
+            self.products += self.add_shop(parser.products)
+
+
 CRAWLERS = {
     'topachat.com': TopAchatCrawler,
     'ldlc.com': LDLCCrawler,
     'materiel.net': MaterielNetCrawler,
-    'alternate.be': AlternateCrawler
+    'alternate.be': AlternateCrawler,
+    'mineshop.eu': MineShopCrawler
 }
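With the new 'mineshop.eu' entry, the shop is picked up wherever the CRAWLERS mapping is consulted. The helper below is a hypothetical sketch of that dispatch (its name and error handling are not from this commit); it only relies on the shop.name attribute and the products list that the crawlers in this diff expose.

```
# Hypothetical dispatch helper (not part of this commit): resolve the crawler
# class registered for a shop name and return its scraped products.
from crawlers import CRAWLERS

def crawl_one(shop, urls):
    crawler_cls = CRAWLERS.get(shop.name)  # e.g. 'mineshop.eu' -> MineShopCrawler
    if crawler_cls is None:
        raise ValueError(f'no crawler registered for {shop.name}')
    return crawler_cls(shop, urls).products
```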
main.py (1 changed line)
@@ -59,7 +59,6 @@ def main():
         for shop in list_shops():
             urls = shops.get(shop.name)
             if not urls:
-                logger.warning(f'cannot find urls for shop {shop} in the configuration file')
                 continue
             all_futures.append(executor.submit(crawl_shop, shop, urls))
         for future in futures.as_completed(all_futures):
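The change itself only drops the warning; the surrounding context shows the concurrent.futures submit/as_completed pattern the crawlers run under. The sketch below mimics that shape in isolation, with a stand-in for crawl_shop and an invented shops mapping, since list_shops and the real configuration belong to the project.

```
# Self-contained sketch of the submit/as_completed pattern seen above;
# fake_crawl_shop and the shops dict are stand-ins for the real project code.
from concurrent import futures

def fake_crawl_shop(shop, urls):
    return f'{shop}: fetched {len(urls)} url(s)'

shops = {'mineshop.eu': ['https://mineshop.eu/gpu-mining/']}

with futures.ThreadPoolExecutor(max_workers=4) as executor:
    all_futures = [executor.submit(fake_crawl_shop, shop, urls)
                   for shop, urls in shops.items() if urls]
    for future in futures.as_completed(all_futures):
        print(future.result())
```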
parsers.py (71 changed lines)
@@ -1,6 +1,8 @@
 import logging
 from html.parser import HTMLParser
 
+from bs4 import BeautifulSoup
+from bs4.element import Tag
 from db import Product
 from utils import parse_base_url
 
@@ -401,3 +403,72 @@ class AlternateParser(ProductParser):
     def parse_url(self, string):
         string = string.split('?')[0]  # remove query string
         return f'{self._base_url}{string}'
+
+
+class MineShopParser:
+
+    def __init__(self, url=None):
+        self.products = []
+        self._product = Product()
+
+    def feed(self, webpage):
+        tags = self._find_products(webpage)
+        for tag in tags:
+            # product has at least a name
+            name = self._parse_name(tag)
+            if not name:
+                continue
+            self._product.name = name
+            # parse all other attributes
+            price, currency = self._parse_price(tag)
+            self._product.price = price
+            self._product.price_currency = currency
+            self._product.url = self._parse_url(tag)
+            self._product.available = self._parse_availability(tag)
+            # then add product to list
+            self.products.append(self._product)
+            self._product = Product()
+
+    @staticmethod
+    def _find_products(webpage):
+        soup = BeautifulSoup(webpage, features='lxml')
+        products = []
+        tags = soup.find_all('ul')
+        for tag in tags:
+            if 'products' in tag.get('class', []):
+                for child in tag.children:
+                    products.append(child)
+        return products
+
+    @staticmethod
+    def _parse_name(product):
+        title = product.find('h2')
+        if type(title) is Tag:
+            return title.text
+
+    @staticmethod
+    def _parse_price(product):
+        tag = product.find('bdi')
+        if type(tag) is Tag:
+            string = tag.text
+            if '€' in string:
+                currency = 'EUR'
+                string = string.replace('€', '').strip()
+            price = float(string)
+            return price, currency
+
+    @staticmethod
+    def _parse_url(product):
+        tag = product.find('a')
+        if type(tag) is Tag and tag.get('href'):
+            return tag['href']
+
+    @staticmethod
+    def _parse_availability(product):
+        tag = product.find('p')
+        if type(tag) is Tag:
+            attributes = tag.get('class', [])
+            if 'stock' in attributes:
+                attributes.remove('stock')
+                availability = attributes[0]
+                return availability != 'out-of-stock'
+        return True
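One way to exercise the new parser on its own is to feed it a snippet shaped like the product list it targets: a ul with class "products" whose items carry an h2 name, a bdi price in euros, an a href link and a p tag with a stock/availability class. The HTML below is invented for illustration; the sketch assumes the project's parsers module (and its db.Product dependency) imports cleanly and that lxml, which MineShopParser requests from BeautifulSoup, is installed.

```
# Illustrative check of MineShopParser against a hand-written product snippet.
from parsers import MineShopParser

SAMPLE = """
<ul class="products">
  <li>
    <a href="https://mineshop.eu/product/some-gpu/"><h2>Some GPU 8GB</h2></a>
    <span class="price"><bdi>1199.00 €</bdi></span>
    <p class="stock in-stock">In stock</p>
  </li>
</ul>
"""

parser = MineShopParser()
parser.feed(SAMPLE)
for product in parser.products:
    print(product.name, product.price, product.price_currency,
          product.url, product.available)
```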