Crawl websites concurrently
This commit is contained in:
parent
0b0d2727e8
commit
39eaf21c0a
1 changed file with 29 additions and 18 deletions
37
main.py
37
main.py
|
@ -1,6 +1,7 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
import argparse
|
import argparse
|
||||||
import logging
|
import logging
|
||||||
|
from concurrent import futures
|
||||||
|
|
||||||
from config import extract_shops, read_config
|
from config import extract_shops, read_config
|
||||||
from crawlers import (AlternateCrawler, LDLCCrawler, MaterielNetCrawler,
|
from crawlers import (AlternateCrawler, LDLCCrawler, MaterielNetCrawler,
|
||||||
|
@ -21,6 +22,7 @@ def parse_arguments():
|
||||||
parser.add_argument('-c', '--config', default='config.json', help='configuration file location')
|
parser.add_argument('-c', '--config', default='config.json', help='configuration file location')
|
||||||
parser.add_argument('-N', '--disable-notifications', dest='disable_notifications', action='store_true',
|
parser.add_argument('-N', '--disable-notifications', dest='disable_notifications', action='store_true',
|
||||||
help='Do not send notifications')
|
help='Do not send notifications')
|
||||||
|
parser.add_argument('-t', '--workers', type=int, help='number of workers for crawling')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
return args
|
return args
|
||||||
|
|
||||||
|
@ -30,6 +32,22 @@ def setup_logging(args):
|
||||||
logging.basicConfig(format=log_format, level=args.loglevel, filename=args.logfile)
|
logging.basicConfig(format=log_format, level=args.loglevel, filename=args.logfile)
|
||||||
|
|
||||||
|
|
||||||
|
def crawl_shop(shop, urls):
    """Crawl a single shop and return the products found by its crawler.

    The crawler class is selected from ``shop.name`` and instantiated with
    the shop and its URLs; the crawler's ``products`` attribute is returned.
    When no crawler exists for the shop name, a warning is logged and an
    empty list is returned instead.
    """
    logger.debug(f'processing {shop}')

    # Dispatch table: shop name -> crawler class handling that site.
    crawler_by_shop = {
        'topachat.com': TopAchatCrawler,
        'ldlc.com': LDLCCrawler,
        'materiel.net': MaterielNetCrawler,
        'alternate.be': AlternateCrawler,
    }

    crawler_cls = crawler_by_shop.get(shop.name)
    if crawler_cls is None:
        logger.warning(f'shop {shop} not supported')
        return []

    return crawler_cls(shop=shop, urls=urls).products
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
args = parse_arguments()
|
args = parse_arguments()
|
||||||
setup_logging(args)
|
setup_logging(args)
|
||||||
|
@ -47,24 +65,17 @@ def main():
|
||||||
access_token=config['twitter']['access_token'],
|
access_token=config['twitter']['access_token'],
|
||||||
access_token_secret=config['twitter']['access_token_secret'])
|
access_token_secret=config['twitter']['access_token_secret'])
|
||||||
|
|
||||||
|
with futures.ThreadPoolExecutor(max_workers=args.workers) as executor:
|
||||||
|
all_futures = []
|
||||||
for shop in list_shops():
|
for shop in list_shops():
|
||||||
logger.debug(f'processing {shop}')
|
|
||||||
urls = shops.get(shop.name)
|
urls = shops.get(shop.name)
|
||||||
if not urls:
|
if not urls:
|
||||||
logger.warning(f'cannot find urls for shop {shop} in the configuration file')
|
logger.warning(f'cannot find urls for shop {shop} in the configuration file')
|
||||||
continue
|
continue
|
||||||
if shop.name == 'topachat.com':
|
all_futures.append(executor.submit(crawl_shop, shop, urls))
|
||||||
crawler = TopAchatCrawler(shop=shop, urls=urls)
|
for future in futures.as_completed(all_futures):
|
||||||
elif shop.name == 'ldlc.com':
|
products = future.result()
|
||||||
crawler = LDLCCrawler(shop=shop, urls=urls)
|
upsert_products(products=products, notifier=notifier)
|
||||||
elif shop.name == 'materiel.net':
|
|
||||||
crawler = MaterielNetCrawler(shop=shop, urls=urls)
|
|
||||||
elif shop.name == 'alternate.be':
|
|
||||||
crawler = AlternateCrawler(shop=shop, urls=urls)
|
|
||||||
else:
|
|
||||||
logger.warning(f'shop {shop} not supported')
|
|
||||||
continue
|
|
||||||
upsert_products(products=crawler.products, notifier=notifier)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
Reference in a new issue