82 lines
3.2 KiB
Python
82 lines
3.2 KiB
Python
#!/usr/bin/env python3
|
|
import argparse
|
|
import logging
|
|
from concurrent import futures
|
|
|
|
from config import extract_shops, read_config
|
|
from crawlers import (AlternateCrawler, LDLCCrawler, MaterielNetCrawler,
|
|
TopAchatCrawler)
|
|
from db import create_tables, list_shops, upsert_products, upsert_shops
|
|
from notifiers import TwitterNotifier
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def parse_arguments():
|
|
parser = argparse.ArgumentParser()
|
|
parser.add_argument('-v', '--verbose', dest='loglevel', action='store_const', const=logging.INFO,
|
|
help='print more output')
|
|
parser.add_argument('-d', '--debug', dest='loglevel', action='store_const', const=logging.DEBUG,
|
|
default=logging.WARNING, help='print even more output')
|
|
parser.add_argument('-o', '--logfile', help='logging file location')
|
|
parser.add_argument('-c', '--config', default='config.json', help='configuration file location')
|
|
parser.add_argument('-N', '--disable-notifications', dest='disable_notifications', action='store_true',
|
|
help='Do not send notifications')
|
|
parser.add_argument('-t', '--workers', type=int, help='number of workers for crawling')
|
|
args = parser.parse_args()
|
|
return args
|
|
|
|
|
|
def setup_logging(args):
|
|
log_format = '%(asctime)s %(levelname)s: %(message)s' if args.logfile else '%(levelname)s: %(message)s'
|
|
logging.basicConfig(format=log_format, level=args.loglevel, filename=args.logfile)
|
|
|
|
|
|
def crawl_shop(shop, urls):
|
|
logger.debug(f'processing {shop}')
|
|
if shop.name == 'topachat.com':
|
|
crawler = TopAchatCrawler(shop=shop, urls=urls)
|
|
elif shop.name == 'ldlc.com':
|
|
crawler = LDLCCrawler(shop=shop, urls=urls)
|
|
elif shop.name == 'materiel.net':
|
|
crawler = MaterielNetCrawler(shop=shop, urls=urls)
|
|
elif shop.name == 'alternate.be':
|
|
crawler = AlternateCrawler(shop=shop, urls=urls)
|
|
else:
|
|
logger.warning(f'shop {shop} not supported')
|
|
return []
|
|
return crawler.products
|
|
|
|
|
|
def main():
|
|
args = parse_arguments()
|
|
setup_logging(args)
|
|
config = read_config(args.config)
|
|
create_tables()
|
|
|
|
shops = extract_shops(config['urls'])
|
|
upsert_shops(shops.keys())
|
|
|
|
if args.disable_notifications:
|
|
notifier = None
|
|
else:
|
|
notifier = TwitterNotifier(consumer_key=config['twitter']['consumer_key'],
|
|
consumer_secret=config['twitter']['consumer_secret'],
|
|
access_token=config['twitter']['access_token'],
|
|
access_token_secret=config['twitter']['access_token_secret'])
|
|
|
|
with futures.ThreadPoolExecutor(max_workers=args.workers) as executor:
|
|
all_futures = []
|
|
for shop in list_shops():
|
|
urls = shops.get(shop.name)
|
|
if not urls:
|
|
logger.warning(f'cannot find urls for shop {shop} in the configuration file')
|
|
continue
|
|
all_futures.append(executor.submit(crawl_shop, shop, urls))
|
|
for future in futures.as_completed(all_futures):
|
|
products = future.result()
|
|
upsert_products(products=products, notifier=notifier)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|