From 2c7189d7fc9c1b6625e10a213ec578a2479c8725 Mon Sep 17 00:00:00 2001 From: Julien Riou Date: Wed, 13 Jan 2021 09:10:30 +0100 Subject: [PATCH] Add mineshop support --- README.md | 2 +- crawlers.py | 15 +++++++++-- main.py | 1 - parsers.py | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 85 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 8135950..f6fac2a 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ new generation (AMD RX 6000 series, NVIDIA GeForce RTX 3000 series). Even older Based on Debian 10: ``` -apt install python3-selenium python3-sqlalchemy python3-tweepy firefox-esr +apt install python3-selenium python3-sqlalchemy python3-tweepy python3-bs4 firefox-esr curl -L -s https://github.com/mozilla/geckodriver/releases/download/v0.28.0/geckodriver-v0.28.0-linux64.tar.gz | tar xvpzf - -C /usr/local/bin/ chown root:root /usr/local/bin/geckodriver chmod +x /usr/local/bin/geckodriver diff --git a/crawlers.py b/crawlers.py index 72f1c74..84cd0be 100644 --- a/crawlers.py +++ b/crawlers.py @@ -1,7 +1,7 @@ import logging from parsers import (AlternateParser, LDLCParser, MaterielNetParser, - TopAchatParser) + MineShopParser, TopAchatParser) from selenium import webdriver from selenium.common.exceptions import TimeoutException from selenium.webdriver.common.by import By @@ -94,9 +94,20 @@ class AlternateCrawler(ProductCrawler): self.products += self.add_shop(parser.products) +class MineShopCrawler(ProductCrawler): + def __init__(self, shop, urls): + super().__init__(shop) + parser = MineShopParser() + for url in urls: + webpage = self.fetch(url=url) + parser.feed(webpage) + self.products += self.add_shop(parser.products) + + CRAWLERS = { 'topachat.com': TopAchatCrawler, 'ldlc.com': LDLCCrawler, 'materiel.net': MaterielNetCrawler, - 'alternate.be': AlternateCrawler + 'alternate.be': AlternateCrawler, + 'mineshop.eu': MineShopCrawler } diff --git a/main.py b/main.py index 3bd8d97..c9b040b 100644 --- a/main.py +++ b/main.py @@ -59,7 +59,6 @@ def main(): for shop in list_shops(): urls = shops.get(shop.name) if not urls: - logger.warning(f'cannot find urls for shop {shop} in the configuration file') continue all_futures.append(executor.submit(crawl_shop, shop, urls)) for future in futures.as_completed(all_futures): diff --git a/parsers.py b/parsers.py index e6fa5b4..0713005 100644 --- a/parsers.py +++ b/parsers.py @@ -1,6 +1,8 @@ import logging from html.parser import HTMLParser +from bs4 import BeautifulSoup +from bs4.element import Tag from db import Product from utils import parse_base_url @@ -401,3 +403,72 @@ class AlternateParser(ProductParser): def parse_url(self, string): string = string.split('?')[0] # remove query string return f'{self._base_url}{string}' + + +class MineShopParser: + def __init__(self, url=None): + self.products = [] + self._product = Product() + + def feed(self, webpage): + tags = self._find_products(webpage) + for tag in tags: + # product has at least a name + name = self._parse_name(tag) + if not name: + continue + self._product.name = name + # parse all other attributes + price, currency = self._parse_price(tag) + self._product.price = price + self._product.price_currency = currency + self._product.url = self._parse_url(tag) + self._product.available = self._parse_availability(tag) + # then add product to list + self.products.append(self._product) + self._product = Product() + + @staticmethod + def _find_products(webpage): + soup = BeautifulSoup(webpage, features='lxml') + products = [] + tags = soup.find_all('ul') + for tag in tags: + if 'products' in tag.get('class', []): + for child in tag.children: + products.append(child) + return products + + @staticmethod + def _parse_name(product): + title = product.find('h2') + if type(title) is Tag: + return title.text + + @staticmethod + def _parse_price(product): + tag = product.find('bdi') + if type(tag) is Tag: + string = tag.text + if '€' in string: + currency = 'EUR' + string = string.replace('€', '').strip() + price = float(string) + return price, currency + + @staticmethod + def _parse_url(product): + tag = product.find('a') + if type(tag) is Tag and tag.get('href'): + return tag['href'] + + @staticmethod + def _parse_availability(product): + tag = product.find('p') + if type(tag) is Tag: + attributes = tag.get('class', []) + if 'stock' in attributes: + attributes.remove('stock') + availability = attributes[0] + return availability != 'out-of-stock' + return True