Crawl websites concurrently
parent 0b0d2727e8
commit 39eaf21c0a
1 changed file with 29 additions and 18 deletions
main.py | 47 +++++++++++++++++++++++++++++------------------
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import argparse
 import logging
+from concurrent import futures
 
 from config import extract_shops, read_config
 from crawlers import (AlternateCrawler, LDLCCrawler, MaterielNetCrawler,
@@ -21,6 +22,7 @@ def parse_arguments():
     parser.add_argument('-c', '--config', default='config.json', help='configuration file location')
     parser.add_argument('-N', '--disable-notifications', dest='disable_notifications', action='store_true',
                         help='Do not send notifications')
+    parser.add_argument('-t', '--workers', type=int, help='number of workers for crawling')
     args = parser.parse_args()
     return args
 
@@ -30,6 +32,22 @@ def setup_logging(args):
     logging.basicConfig(format=log_format, level=args.loglevel, filename=args.logfile)
 
 
+def crawl_shop(shop, urls):
+    logger.debug(f'processing {shop}')
+    if shop.name == 'topachat.com':
+        crawler = TopAchatCrawler(shop=shop, urls=urls)
+    elif shop.name == 'ldlc.com':
+        crawler = LDLCCrawler(shop=shop, urls=urls)
+    elif shop.name == 'materiel.net':
+        crawler = MaterielNetCrawler(shop=shop, urls=urls)
+    elif shop.name == 'alternate.be':
+        crawler = AlternateCrawler(shop=shop, urls=urls)
+    else:
+        logger.warning(f'shop {shop} not supported')
+        return []
+    return crawler.products
+
+
 def main():
     args = parse_arguments()
     setup_logging(args)
@@ -47,24 +65,17 @@ def main():
         access_token=config['twitter']['access_token'],
         access_token_secret=config['twitter']['access_token_secret'])
 
-    for shop in list_shops():
-        logger.debug(f'processing {shop}')
-        urls = shops.get(shop.name)
-        if not urls:
-            logger.warning(f'cannot find urls for shop {shop} in the configuration file')
-            continue
-        if shop.name == 'topachat.com':
-            crawler = TopAchatCrawler(shop=shop, urls=urls)
-        elif shop.name == 'ldlc.com':
-            crawler = LDLCCrawler(shop=shop, urls=urls)
-        elif shop.name == 'materiel.net':
-            crawler = MaterielNetCrawler(shop=shop, urls=urls)
-        elif shop.name == 'alternate.be':
-            crawler = AlternateCrawler(shop=shop, urls=urls)
-        else:
-            logger.warning(f'shop {shop} not supported')
-            continue
-        upsert_products(products=crawler.products, notifier=notifier)
+    with futures.ThreadPoolExecutor(max_workers=args.workers) as executor:
+        all_futures = []
+        for shop in list_shops():
+            urls = shops.get(shop.name)
+            if not urls:
+                logger.warning(f'cannot find urls for shop {shop} in the configuration file')
+                continue
+            all_futures.append(executor.submit(crawl_shop, shop, urls))
+        for future in futures.as_completed(all_futures):
+            products = future.result()
+            upsert_products(products=products, notifier=notifier)
 
 
 if __name__ == '__main__':
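
For reference, the pattern this commit adopts is the standard concurrent.futures submit/as_completed idiom: each shop is crawled in its own worker thread, and results are consumed in completion order rather than submission order. A minimal, self-contained sketch of that idiom follows; fetch_products and the hard-coded shop list are illustrative stand-ins, not code from this repository.

#!/usr/bin/env python3
# Minimal sketch of the submit/as_completed idiom used in this commit.
# fetch_products and the shop list below are illustrative stand-ins,
# not code from this repository.
import random
import time
from concurrent import futures


def fetch_products(shop):
    # Stand-in for crawl_shop(): simulate a network-bound crawl.
    time.sleep(random.uniform(0.1, 0.5))
    return [f'{shop}: product-{i}' for i in range(2)]


def main():
    shops = ['topachat.com', 'ldlc.com', 'materiel.net', 'alternate.be']
    # max_workers=None lets ThreadPoolExecutor pick a default pool size.
    with futures.ThreadPoolExecutor(max_workers=None) as executor:
        all_futures = [executor.submit(fetch_products, shop) for shop in shops]
        # as_completed() yields each future as soon as it finishes,
        # so a slow shop never blocks the processing of a fast one.
        for future in futures.as_completed(all_futures):
            print(future.result())  # re-raises any exception from the worker


if __name__ == '__main__':
    main()

Two properties of the idiom are worth noting here: when the -t flag is omitted, args.workers is None and ThreadPoolExecutor falls back to its default pool size; and future.result() re-raises any exception raised inside the worker, so a failing crawler still surfaces in the main thread. Threads suit this workload because the crawls are I/O-bound.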