Crawl websites concurrently

Julien Riou 2020-12-30 15:05:28 +01:00
parent 0b0d2727e8
commit 39eaf21c0a

main.py (37 changed lines)

@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import argparse
 import logging
+from concurrent import futures
 from config import extract_shops, read_config
 from crawlers import (AlternateCrawler, LDLCCrawler, MaterielNetCrawler,
@@ -21,6 +22,7 @@ def parse_arguments():
     parser.add_argument('-c', '--config', default='config.json', help='configuration file location')
     parser.add_argument('-N', '--disable-notifications', dest='disable_notifications', action='store_true',
                         help='Do not send notifications')
+    parser.add_argument('-t', '--workers', type=int, help='number of workers for crawling')
     args = parser.parse_args()
     return args
@@ -30,6 +32,22 @@ def setup_logging(args):
     logging.basicConfig(format=log_format, level=args.loglevel, filename=args.logfile)
 
 
+def crawl_shop(shop, urls):
+    logger.debug(f'processing {shop}')
+    if shop.name == 'topachat.com':
+        crawler = TopAchatCrawler(shop=shop, urls=urls)
+    elif shop.name == 'ldlc.com':
+        crawler = LDLCCrawler(shop=shop, urls=urls)
+    elif shop.name == 'materiel.net':
+        crawler = MaterielNetCrawler(shop=shop, urls=urls)
+    elif shop.name == 'alternate.be':
+        crawler = AlternateCrawler(shop=shop, urls=urls)
+    else:
+        logger.warning(f'shop {shop} not supported')
+        return []
+    return crawler.products
+
+
 def main():
     args = parse_arguments()
     setup_logging(args)
@@ -47,24 +65,17 @@ def main():
         access_token=config['twitter']['access_token'],
         access_token_secret=config['twitter']['access_token_secret'])
 
-    for shop in list_shops():
-        logger.debug(f'processing {shop}')
-        urls = shops.get(shop.name)
-        if not urls:
-            logger.warning(f'cannot find urls for shop {shop} in the configuration file')
-            continue
-        if shop.name == 'topachat.com':
-            crawler = TopAchatCrawler(shop=shop, urls=urls)
-        elif shop.name == 'ldlc.com':
-            crawler = LDLCCrawler(shop=shop, urls=urls)
-        elif shop.name == 'materiel.net':
-            crawler = MaterielNetCrawler(shop=shop, urls=urls)
-        elif shop.name == 'alternate.be':
-            crawler = AlternateCrawler(shop=shop, urls=urls)
-        else:
-            logger.warning(f'shop {shop} not supported')
-            continue
-        upsert_products(products=crawler.products, notifier=notifier)
+    with futures.ThreadPoolExecutor(max_workers=args.workers) as executor:
+        all_futures = []
+        for shop in list_shops():
+            urls = shops.get(shop.name)
+            if not urls:
+                logger.warning(f'cannot find urls for shop {shop} in the configuration file')
+                continue
+            all_futures.append(executor.submit(crawl_shop, shop, urls))
+        for future in futures.as_completed(all_futures):
+            products = future.result()
+            upsert_products(products=products, notifier=notifier)
 
 
 if __name__ == '__main__':
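
Below is a minimal, self-contained sketch of the concurrency pattern this commit adopts: submit one crawl job per shop to a concurrent.futures.ThreadPoolExecutor, then consume results with as_completed so fast shops are not blocked behind slow ones. The shop names and the fake_crawl helper are stand-ins for illustration, not code from this repository.

#!/usr/bin/env python3
# Sketch of the submit/as_completed pattern used in the commit above.
# fake_crawl is a hypothetical stand-in for crawler.products.
from concurrent import futures
import time


def fake_crawl(shop):
    # Pretend each shop takes some time to crawl and returns a product list.
    time.sleep(0.1)
    return [f'{shop}-product-1', f'{shop}-product-2']


def main():
    shops = ['topachat.com', 'ldlc.com', 'materiel.net', 'alternate.be']
    # max_workers=None lets ThreadPoolExecutor choose a default pool size,
    # which is what happens in main.py when --workers is not passed.
    with futures.ThreadPoolExecutor(max_workers=None) as executor:
        all_futures = [executor.submit(fake_crawl, shop) for shop in shops]
        # as_completed yields futures in completion order, not submission
        # order, so each result is handled as soon as its crawl finishes.
        for future in futures.as_completed(all_futures):
            print(future.result())


if __name__ == '__main__':
    main()

Two details worth noting from the diff itself: the new -t/--workers flag declares no default, so args.workers is None unless given on the command line (e.g. ./main.py --workers 4), letting the executor pick its own pool size; and future.result() re-raises any exception raised inside crawl_shop, so a failure in one crawler surfaces in the main thread rather than being silently dropped.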