Archived
1
0
Fork 0

Add mineshop support

This commit is contained in:
Julien Riou 2021-01-13 09:10:30 +01:00
parent 29c26167dc
commit 2c7189d7fc
No known key found for this signature in database
GPG key ID: FF42D23B580C89F7
4 changed files with 85 additions and 4 deletions

View file

@ -7,7 +7,7 @@ new generation (AMD RX 6000 series, NVIDIA GeForce RTX 3000 series). Even older
Based on Debian 10:
```
apt install python3-selenium python3-sqlalchemy python3-tweepy firefox-esr
apt install python3-selenium python3-sqlalchemy python3-tweepy python3-bs4 firefox-esr
curl -L -s https://github.com/mozilla/geckodriver/releases/download/v0.28.0/geckodriver-v0.28.0-linux64.tar.gz | tar xvpzf - -C /usr/local/bin/
chown root:root /usr/local/bin/geckodriver
chmod +x /usr/local/bin/geckodriver

View file

@ -1,7 +1,7 @@
import logging
from parsers import (AlternateParser, LDLCParser, MaterielNetParser,
TopAchatParser)
MineShopParser, TopAchatParser)
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
@ -94,9 +94,20 @@ class AlternateCrawler(ProductCrawler):
self.products += self.add_shop(parser.products)
class MineShopCrawler(ProductCrawler):
    """Crawler for mineshop.eu: fetches each URL and feeds it to MineShopParser."""

    def __init__(self, shop, urls):
        super().__init__(shop)
        parser = MineShopParser()
        # Fetch every listing page and accumulate parsed products in the parser.
        for page_url in urls:
            parser.feed(self.fetch(url=page_url))
        # Attach the shop to each parsed product and record them.
        self.products += self.add_shop(parser.products)
# Registry mapping a supported shop domain to its crawler implementation.
# The flattened diff left both the pre- and post-change 'alternate.be' lines
# (missing trailing comma, duplicate key); this is the corrected literal.
CRAWLERS = {
    'topachat.com': TopAchatCrawler,
    'ldlc.com': LDLCCrawler,
    'materiel.net': MaterielNetCrawler,
    'alternate.be': AlternateCrawler,
    'mineshop.eu': MineShopCrawler,
}

View file

@ -59,7 +59,6 @@ def main():
for shop in list_shops():
urls = shops.get(shop.name)
if not urls:
logger.warning(f'cannot find urls for shop {shop} in the configuration file')
continue
all_futures.append(executor.submit(crawl_shop, shop, urls))
for future in futures.as_completed(all_futures):

View file

@ -1,6 +1,8 @@
import logging
from html.parser import HTMLParser
from bs4 import BeautifulSoup
from bs4.element import Tag
from db import Product
from utils import parse_base_url
@ -401,3 +403,72 @@ class AlternateParser(ProductParser):
def parse_url(self, string):
    """Return the absolute product URL, with any query string removed."""
    path, _, _ = string.partition('?')  # keep only the part before '?'
    return f'{self._base_url}{path}'
class MineShopParser:
    """Parse mineshop.eu product listing pages (HTML) into Product objects.

    Unlike the HTMLParser-based parsers in this module, this one uses
    BeautifulSoup to walk the document.
    """

    def __init__(self, url=None):
        # `url` is accepted for interface parity with the other parsers
        # but is not used by this implementation.
        self.products = []
        self._product = Product()

    def feed(self, webpage):
        """Extract products from a listing page and append them to self.products.

        :param webpage: raw HTML of a mineshop.eu product listing page
        """
        tags = self._find_products(webpage)
        for tag in tags:
            # A product must at least have a name; skip non-product children
            # (whitespace nodes, decorative items).
            name = self._parse_name(tag)
            if not name:
                continue
            self._product.name = name
            # Parse all other attributes (each helper tolerates missing tags).
            price, currency = self._parse_price(tag)
            self._product.price = price
            self._product.price_currency = currency
            self._product.url = self._parse_url(tag)
            self._product.available = self._parse_availability(tag)
            # Then add the product to the list and start a fresh one.
            self.products.append(self._product)
            self._product = Product()

    @staticmethod
    def _find_products(webpage):
        """Return the children of every <ul class="products"> element."""
        soup = BeautifulSoup(webpage, features='lxml')
        products = []
        for tag in soup.find_all('ul'):
            if 'products' in tag.get('class', []):
                products.extend(tag.children)
        return products

    @staticmethod
    def _parse_name(product):
        """Return the product name from its <h2> tag, or None if absent."""
        title = product.find('h2')
        if type(title) is Tag:
            return title.text

    @staticmethod
    def _parse_price(product):
        """Return (price, currency) parsed from the <bdi> tag.

        Always returns a 2-tuple so callers can unpack safely;
        (None, None) when the price cannot be determined.
        """
        tag = product.find('bdi')
        if type(tag) is Tag:
            string = tag.text
            # The page formats prices with a euro sign, e.g. "123.45 €".
            if '€' in string:
                string = string.replace('€', '').strip()
                try:
                    return float(string), 'EUR'
                except ValueError:
                    # Malformed number: treat as "no price" rather than crash.
                    return None, None
        return None, None

    @staticmethod
    def _parse_url(product):
        """Return the product link (href of the first <a>), or None."""
        tag = product.find('a')
        if type(tag) is Tag and tag.get('href'):
            return tag['href']

    @staticmethod
    def _parse_availability(product):
        """Return False only when the stock <p> tag is marked out-of-stock.

        Availability is encoded in the class list of a <p class="stock ...">
        tag; anything other than an explicit 'out-of-stock' marker is treated
        as available.
        """
        tag = product.find('p')
        if type(tag) is Tag:
            attributes = tag.get('class', [])
            if 'stock' in attributes:
                attributes.remove('stock')
                # Guard: the class list may have contained only 'stock'.
                if attributes:
                    return attributes[0] != 'out-of-stock'
        return True