diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4cb714b --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +__pycache__ +config.json +*.html +TODO.txt +restock.db +geckodriver.log diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..8e71337 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,29 @@ +--- +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: master + hooks: + - id: check-executables-have-shebangs + - id: check-merge-conflict + - id: double-quote-string-fixer + - id: end-of-file-fixer + - id: fix-encoding-pragma + args: ['--remove'] + - id: requirements-txt-fixer + - id: trailing-whitespace + - id: check-json + - repo: https://gitlab.com/pycqa/flake8 + rev: master + hooks: + - id: flake8 + args: ['--max-line-length=120'] + - repo: https://github.com/FalconSocial/pre-commit-python-sorter + rev: master + hooks: + - id: python-import-sorter + args: ['--silent-overwrite'] + - repo: https://github.com/chewse/pre-commit-mirrors-pydocstyle + rev: master + hooks: + - id: pydocstyle + args: ['--config=.pydocstyle', '--match="(?!test_).*\.py"'] diff --git a/.pydocstyle b/.pydocstyle new file mode 100644 index 0000000..aef2483 --- /dev/null +++ b/.pydocstyle @@ -0,0 +1,2 @@ +[pydocstyle] +ignore = D100,D104,D400,D203,D204,D101,D213,D202 diff --git a/README.md b/README.md new file mode 100644 index 0000000..950f47d --- /dev/null +++ b/README.md @@ -0,0 +1,89 @@ +Year 2020 has been quite hard for hardware supply. Graphics cards are out of stock everywhere. Nobody can grab the +new generation (AMD RX 6000 series, NVIDIA GeForce RTX 3000 series). Even older generations are hard to find. +**GraphicRestock** is a bot that crawls retailers' websites and notifies you when a product is available. 
+ +# Setup + +Based on Debian 10: + +``` +apt install python3-selenium python3-sqlalchemy python3-tweepy firefox-esr +curl -L -s https://github.com/mozilla/geckodriver/releases/download/v0.28.0/geckodriver-v0.28.0-linux64.tar.gz | tar xvpzf - -C /usr/local/bin/ +chown root:root /usr/local/bin/geckodriver +chmod +x /usr/local/bin/geckodriver +``` + +# Configure + +Configuration file example can be found [here](config.json.example). + +Options: +* **twitter.consumer_key**: key of your Twitter application +* **twitter.consumer_secret**: secret of your Twitter application +* **twitter.access_token**: authentication token generated by [twitter_auth.py](twitter_auth.py) +* **twitter.access_token_secret**: authentication token secret generated by [twitter_auth.py](twitter_auth.py) +* **urls**: list of retailers web pages (they need to respect crawlers' format) +* **executable_path** (optional): path to selenium driver (firefox/gecko browser) + + +# Twitter authentication + +Create a configuration file with **twitter.consumer_key** and **twitter.consumer_secret** parameters. + +Then authenticate: + +``` +python3 twitter_auth.py +``` + +You will have to open the URL and authenticate: + +``` +Please go to https://api.twitter.com/oauth/authorize?oauth_token=**** + +``` +Click on **Authorize app**. A verifier code will be shown. Go back to your console and enter the code. + +``` +Verifier:******* +``` + +Tokens will be created: + +``` +access_token = ***** +access_token_secret = **** +``` + +Finally, write them to configuration file in **twitter.access_token** and **twitter.access_token_secret** parameters. + + +# Usage + +``` +python3 main.py --help +``` + +# How to contribute + +First things first, check issues to ensure the feature or bug you are facing is not already declared. + +Pull requests are highly appreciated. 
def read_config(filename):
    """Load the JSON configuration file and return its decoded content.

    :param filename: path to the JSON configuration file
    :return: the decoded JSON document
    :raises OSError: if the file cannot be opened
    :raises json.JSONDecodeError: if the file does not contain valid JSON
    """
    # Read as UTF-8 explicitly instead of relying on the locale's default
    # encoding, so non-ASCII values decode identically on every host.
    with open(filename, 'r', encoding='utf-8') as fd:
        return json.load(fd)
class ProductCrawler(object):
    """Base crawler driving a headless Firefox instance through selenium.

    Subclasses fetch the pages of one shop, feed them to a parser and
    accumulate the resulting products in ``self.products``.
    """

    # timeout handed to WebDriverWait when waiting for an element to appear
    TIMEOUT = 3
    # geckodriver location used when no explicit path is given
    EXECUTABLE_PATH = '/usr/local/bin/geckodriver'

    def __init__(self, shop, executable_path=None):
        """Start the headless browser.

        :param shop: shop object attached to every product by ``add_shop``
        :param executable_path: optional path to the geckodriver binary;
            defaults to ``EXECUTABLE_PATH``
        """
        # Assign attributes before starting the browser so that __del__ stays
        # safe even if webdriver.Firefox raises.
        self._driver = None
        self._shop = shop
        self.products = []
        options = Options()
        options.headless = True
        self._driver = webdriver.Firefox(executable_path=executable_path or self.EXECUTABLE_PATH,
                                         options=options)

    def __del__(self):
        """Quit the browser, if it was started."""
        driver = getattr(self, '_driver', None)
        if driver is not None:
            driver.quit()

    def fetch(self, url, wait_for=None):
        """Load ``url`` and return the inner HTML of its ``<html>`` element.

        :param url: address of the page to load
        :param wait_for: optional CSS class name; when given, wait up to
            ``TIMEOUT`` for an element with that class to be present. A
            timeout is only logged, the page content is returned anyway.
        :return: the page markup as returned by the browser
        """
        self._driver.get(url)
        if wait_for:
            try:
                condition = expected_conditions.presence_of_element_located((By.CLASS_NAME, wait_for))
                WebDriverWait(self._driver, self.TIMEOUT).until(condition)
            except TimeoutException:
                logger.warning(f'timeout waiting for element "{wait_for}" at {url}')
        logger.info(f'url {url} fetched')
        webpage = self._driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
        return webpage

    def add_shop(self, products):
        """Attach this crawler's shop to every product and return the same list."""
        for product in products:
            product.shop = self._shop
        return products
class Shop(Base):
    """A retailer, identified by its unique name (table ``shop``)."""

    __tablename__ = 'shop'
    id = Column(Integer, primary_key=True)
    name = Column(String, unique=True, nullable=False)

    def __repr__(self):
        return f'Shop<{self.name}>'

    def __ne__(self, shop):
        """Return True when the two shops have different names.

        Only ``!=`` is overridden; ``__eq__`` is not redefined in this class.
        """
        # Comparing with something that is not a Shop (e.g. None) must not
        # raise AttributeError: let Python fall back to its default handling.
        if not isinstance(shop, Shop):
            return NotImplemented
        return self.name != shop.name
def upsert_products(products, notifier=None):
    """Insert new products, update changed ones and send stock notifications.

    A product is matched against the database on its name and shop. Unknown
    products are added; known products that differ (see ``Product.__ne__``)
    are updated. All changes are committed in a single transaction; a
    SQLAlchemy error is logged and the session is closed without committing.

    :param products: iterable of ``Product`` objects produced by a crawler
    :param notifier: optional notifier exposing ``create_thread(product)`` and
        ``close_thread(tweet_id, duration)``; when None no notification is sent
    """
    session = Session()
    try:
        for product in products:
            query = session.query(Product).filter(Product.name == product.name, Product.shop == product.shop)
            product_database = query.first()
            now = datetime.utcnow()
            if not product_database:
                # product is new and available so we need to create an initial thread
                if notifier and product.available:
                    tweet = notifier.create_thread(product)
                    # create_thread returns None when the tweet could not be sent
                    if tweet:
                        product.tweet_id = tweet.id
                product.updated_at = now
                session.add(product)
                logger.info(f'{product} added')
            elif product != product_database:
                # Keep the currently open thread by default: a change that is not
                # an availability flip (e.g. a new price) must not lose the tweet
                # id, otherwise the thread could never be closed later.
                tweet_id = product_database.tweet_id
                if product.available != product_database.available:
                    if product.available:
                        if notifier and not tweet_id:
                            # product is now available so we need to create an initial tweet (or thread)
                            tweet = notifier.create_thread(product)
                            if tweet:
                                tweet_id = tweet.id
                    else:
                        if notifier and tweet_id:
                            # product is out of stock so we need to reply to previous tweet to close the thread
                            notifier.close_thread(tweet_id=tweet_id,
                                                  duration=now-product_database.updated_at)
                        # out of stock: no thread is open for this product anymore
                        tweet_id = None
                query.update({Product.price: product.price, Product.price_currency: product.price_currency,
                              Product.available: product.available, Product.url: product.url,
                              Product.tweet_id: tweet_id, Product.updated_at: now})
                logger.info(f'{product} updated')
        session.commit()
        logger.debug('transaction committed')
    except exc.SQLAlchemyError:
        logger.exception('cannot commit transaction')
    finally:
        session.close()
class TwitterNotifier(object):
    """Announce product availability on Twitter.

    A thread is opened with ``create_thread`` when a product becomes
    available and closed with ``close_thread`` when it is out of stock.
    """

    # keywords looked up in the product name -> hashtags appended to the tweet;
    # the first matching entry wins, so more specific patterns come first
    _hashtags_map = {
        'rtx 3060 ti': ['#nvidia', '#rtx3060ti'],
        'rtx 3070': ['#nvidia', '#rtx3070'],
        'rtx 3080': ['#nvidia', '#rtx3080'],
        'rtx 3090': ['#nvidia', '#rtx3090'],
        'rx 6800 xt': ['#amd', '#rx6800xt'],
        'rx 6800': ['#amd', '#rx6800'],
    }

    # currency code -> sign displayed after the price
    _currency_map = {
        'EUR': '€'
    }

    def __init__(self, consumer_key, consumer_secret, access_token, access_token_secret):
        """Build the tweepy API client from the application and user credentials."""
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        self._api = tweepy.API(auth)

    def create_thread(self, product):
        """Tweet that ``product`` is available.

        :param product: object exposing ``name``, ``price``, ``price_currency``,
            ``url`` and ``shop.name``
        :return: the created tweet, or None if it could not be sent
        """
        # fall back to the raw currency code instead of failing on a currency
        # that has no sign registered
        currency_sign = self._currency_map.get(product.price_currency, product.price_currency)
        shop_name = product.shop.name
        price = f'{product.price}{currency_sign}'
        message = f'{shop_name}: {product.name} for {price} is available at {product.url}'
        hashtags = self._parse_hashtags(product)
        if hashtags:
            message += f' {hashtags}'
        return self._create_tweet(message=message)

    def close_thread(self, tweet_id, duration):
        """Reply to the tweet ``tweet_id`` to tell the product is gone.

        :param tweet_id: identifier of the tweet that opened the thread
        :param duration: timedelta rendered in the reply through ``format_timedelta``
        :return: the reply tweet, or None if it could not be sent
        """
        thread = self._api.get_status(id=tweet_id)
        duration = format_timedelta(duration, '{hours_total}h{minutes2}m')
        message = f'''@{thread.user.screen_name} And it's over ({duration})'''
        return self._create_tweet(message=message, tweet_id=tweet_id)

    def _create_tweet(self, message, tweet_id=None):
        """Send ``message``, optionally as a reply to ``tweet_id``.

        :return: the created tweet, or None when tweepy reports an error
            (the error is logged, not raised)
        """
        try:
            tweet = self._api.update_status(status=message, in_reply_to_status_id=tweet_id)
            logger.info(f'tweet {tweet.id} sent with message "{message}"')
            return tweet
        except tweepy.error.TweepError as err:
            logger.warning(f'cannot send tweet with message "{message}"')
            logger.warning(str(err))
            return None

    def _parse_hashtags(self, product):
        """Return the space-joined hashtags of the first pattern found in the product name.

        A pattern matches when each of its words is one of the lowercased,
        space-separated words of ``product.name``. Returns None if nothing matches.
        """
        words = product.name.lower().split(' ')
        for patterns, hashtags in self._hashtags_map.items():
            if all(elem in words for elem in patterns.split(' ')):
                return ' '.join(hashtags)
        return None
class ProductParser(HTMLParser):
    """Common base of the shop parsers.

    Exposes the two attributes every parser shares: the list of parsed
    products and the address of the next page (None until one is set).
    """

    def __init__(self):
        super(ProductParser, self).__init__()
        self.products, self.next_page = [], None