102 lines
3.2 KiB
Python
102 lines
3.2 KiB
Python
import logging
|
|
|
|
from parsers import (AlternateParser, LDLCParser, MaterielNetParser,
|
|
TopAchatParser)
|
|
from selenium import webdriver
|
|
from selenium.common.exceptions import TimeoutException
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.firefox.options import Options
|
|
from selenium.webdriver.support import expected_conditions
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class ProductCrawler:
    """Base class driving a headless Firefox to scrape shop pages.

    Subclasses fetch one or more catalogue URLs, feed the returned HTML to a
    shop-specific parser and accumulate the parsed products in
    ``self.products`` (each product tagged with the crawler's shop).
    """

    # Maximum number of seconds to wait for a dynamically-loaded element.
    TIMEOUT = 3

    def __init__(self, shop):
        options = Options()
        options.headless = True
        self._driver = webdriver.Firefox(executable_path='/usr/local/bin/geckodriver', options=options)
        self._shop = shop
        self.products = []

    def __del__(self):
        # __init__ may have failed before ``_driver`` was assigned (e.g. the
        # geckodriver binary is missing); guard so we don't raise a spurious
        # AttributeError that masks the original startup error.
        driver = getattr(self, '_driver', None)
        if driver is not None:
            driver.quit()

    def fetch(self, url, wait_for=None):
        """Load *url* in the browser and return the page's inner HTML.

        :param url: page to load.
        :param wait_for: optional CSS class name; when given, wait up to
            ``TIMEOUT`` seconds for an element with that class to appear
            (needed for JavaScript-rendered prices). A timeout is logged as
            a warning but is not fatal — the page is returned as-is.
        :return: the ``<html>`` element's ``innerHTML`` as a string.
        """
        self._driver.get(url)
        if wait_for:
            try:
                condition = expected_conditions.presence_of_element_located((By.CLASS_NAME, wait_for))
                WebDriverWait(self._driver, self.TIMEOUT).until(condition)
            except TimeoutException:
                # Lazy %-style args: only formatted if the record is emitted.
                logger.warning('timeout waiting for element "%s" at %s', wait_for, url)
        logger.info('url %s fetched', url)
        webpage = self._driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
        return webpage

    def add_shop(self, products):
        """Tag every product in *products* with this crawler's shop.

        Mutates the product objects in place and returns the same list for
        convenient chaining into ``self.products +=``.
        """
        for product in products:
            product.shop = self._shop
        return products
|
|
|
|
|
|
class TopAchatCrawler(ProductCrawler):
    """Crawler for topachat.com — each configured URL is a single page."""

    def __init__(self, shop, urls):
        super().__init__(shop)
        parser = TopAchatParser()
        for page_url in urls:
            parser.feed(self.fetch(url=page_url))
        self.products += self.add_shop(parser.products)
|
|
|
|
|
|
class LDLCCrawler(ProductCrawler):
    """Crawler for ldlc.com — follows the parser's pagination links.

    The parser reports the last page as its own "next" page, so the walk
    stops as soon as the next URL equals the one just visited.
    """

    def __init__(self, shop, urls):
        super().__init__(shop)
        parser = LDLCParser()
        for start_url in urls:
            current, previous = start_url, None
            while current != previous:
                parser.feed(self.fetch(url=current))
                previous, current = current, parser.next_page
        self.products += self.add_shop(parser.products)
|
|
|
|
|
|
class MaterielNetCrawler(ProductCrawler):
    """Crawler for materiel.net — paginated, with JavaScript-rendered prices.

    Prices are injected client-side, so each fetch waits for the
    ``o-product__price`` element before scraping. Pagination stops when the
    parser's "next" page equals the page just visited.
    """

    def __init__(self, shop, urls):
        super().__init__(shop)
        parser = MaterielNetParser()
        for start_url in urls:
            current, previous = start_url, None
            while current != previous:
                parser.feed(self.fetch(url=current, wait_for='o-product__price'))
                previous, current = current, parser.next_page
        self.products += self.add_shop(parser.products)
|
|
|
|
|
|
class AlternateCrawler(ProductCrawler):
    """Crawler for alternate.be — each configured URL is a single page."""

    def __init__(self, shop, urls):
        super().__init__(shop)
        parser = AlternateParser()
        for page_url in urls:
            parser.feed(self.fetch(url=page_url))
        self.products += self.add_shop(parser.products)
|
|
|
|
|
|
# Registry mapping a shop's domain name to the crawler class that scrapes it.
CRAWLERS = {
    'topachat.com': TopAchatCrawler,
    'ldlc.com': LDLCCrawler,
    'materiel.net': MaterielNetCrawler,
    'alternate.be': AlternateCrawler
}
|