Archived
1
0
Fork 0
This repository has been archived on 2024-12-18. You can view files and clone it, but cannot push or open issues or pull requests.
restockbot/crawlers.py
2021-01-13 16:48:03 +01:00

113 lines
3.6 KiB
Python

import logging
from parsers import (AlternateParser, LDLCParser, MaterielNetParser,
MineShopParser, TopAchatParser)
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import WebDriverWait
logger = logging.getLogger(__name__)
class ProductCrawler(object):
TIMEOUT = 3
def __init__(self, shop):
options = Options()
options.headless = True
self._driver = webdriver.Firefox(executable_path='/usr/local/bin/geckodriver', options=options)
self._shop = shop
self.products = []
def __del__(self):
self._driver.quit()
def fetch(self, url, wait_for=None):
self._driver.get(url)
if wait_for:
try:
condition = expected_conditions.presence_of_element_located((By.CLASS_NAME, wait_for))
WebDriverWait(self._driver, self.TIMEOUT).until(condition)
except TimeoutException:
logger.warning(f'timeout waiting for element "{wait_for}" at {url}')
logger.info(f'url {url} fetched')
webpage = self._driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
return webpage
def add_shop(self, products):
for product in products:
product.shop = self._shop
return products
class TopAchatCrawler(ProductCrawler):
def __init__(self, shop, urls):
super().__init__(shop)
parser = TopAchatParser()
for url in urls:
webpage = self.fetch(url=url)
parser.feed(webpage)
self.products += self.add_shop(parser.products)
class LDLCCrawler(ProductCrawler):
def __init__(self, shop, urls):
super().__init__(shop)
parser = LDLCParser()
for url in urls:
next_page = url
previous_page = None
while next_page != previous_page:
webpage = self.fetch(url=next_page)
parser.feed(webpage)
previous_page = next_page
next_page = parser.next_page
self.products += self.add_shop(parser.products)
class MaterielNetCrawler(ProductCrawler):
def __init__(self, shop, urls):
super().__init__(shop)
parser = MaterielNetParser()
for url in urls:
next_page = url
previous_page = None
while next_page != previous_page:
webpage = self.fetch(url=next_page, wait_for='o-product__price')
parser.feed(webpage)
previous_page = next_page
next_page = parser.next_page
self.products += self.add_shop(parser.products)
class AlternateCrawler(ProductCrawler):
def __init__(self, shop, urls):
super().__init__(shop)
parser = AlternateParser()
for url in urls:
webpage = self.fetch(url=url)
parser.feed(webpage)
self.products += self.add_shop(parser.products)
class MineShopCrawler(ProductCrawler):
def __init__(self, shop, urls):
super().__init__(shop)
parser = MineShopParser()
for url in urls:
webpage = self.fetch(url=url)
parser.feed(webpage)
self.products += self.add_shop(parser.products)
CRAWLERS = {
'topachat.com': TopAchatCrawler,
'ldlc.com': LDLCCrawler,
'materiel.net': MaterielNetCrawler,
'alternate.be': AlternateCrawler,
'mineshop.eu': MineShopCrawler
}