Add mineshop support
This commit is contained in:
		
					parent
					
						
							
								29c26167dc
							
						
					
				
			
			
				commit
				
					
						2c7189d7fc
					
				
			
		
					 4 changed files with 85 additions and 4 deletions
				
			
		| 
						 | 
				
			
			@ -7,7 +7,7 @@ new generation (AMD RX 6000 series, NVIDIA GeForce RTX 3000 series). Even older
 | 
			
		|||
Based on Debian 10:
 | 
			
		||||
 | 
			
		||||
```
 | 
			
		||||
apt install python3-selenium python3-sqlalchemy python3-tweepy firefox-esr
 | 
			
		||||
apt install python3-selenium python3-sqlalchemy python3-tweepy python3-bs4 firefox-esr
 | 
			
		||||
curl -L -s https://github.com/mozilla/geckodriver/releases/download/v0.28.0/geckodriver-v0.28.0-linux64.tar.gz | tar xvpzf - -C /usr/local/bin/
 | 
			
		||||
chown root:root /usr/local/bin/geckodriver
 | 
			
		||||
chmod +x /usr/local/bin/geckodriver
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										15
									
								
								crawlers.py
									
										
									
									
									
								
							
							
						
						
									
										15
									
								
								crawlers.py
									
										
									
									
									
								
							| 
						 | 
				
			
			@ -1,7 +1,7 @@
 | 
			
		|||
import logging
 | 
			
		||||
 | 
			
		||||
from parsers import (AlternateParser, LDLCParser, MaterielNetParser,
 | 
			
		||||
                     TopAchatParser)
 | 
			
		||||
                     MineShopParser, TopAchatParser)
 | 
			
		||||
from selenium import webdriver
 | 
			
		||||
from selenium.common.exceptions import TimeoutException
 | 
			
		||||
from selenium.webdriver.common.by import By
 | 
			
		||||
| 
						 | 
				
			
			@ -94,9 +94,20 @@ class AlternateCrawler(ProductCrawler):
 | 
			
		|||
        self.products += self.add_shop(parser.products)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class MineShopCrawler(ProductCrawler):
    """Crawler for mineshop.eu product listing pages.

    Fetches each configured listing URL, parses it with MineShopParser
    and appends the shop-tagged products to ``self.products``.
    """

    def __init__(self, shop, urls):
        super().__init__(shop)
        for url in urls:
            # Use a fresh parser per page: MineShopParser.products is
            # never reset by feed(), so reusing one instance across URLs
            # would re-add the products of previously crawled pages on
            # every iteration.
            parser = MineShopParser()
            webpage = self.fetch(url=url)
            parser.feed(webpage)
            self.products += self.add_shop(parser.products)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Maps a shop's domain name (as it appears in the configuration file)
# to the crawler class responsible for it.
CRAWLERS = {
    'topachat.com': TopAchatCrawler,
    'ldlc.com': LDLCCrawler,
    'materiel.net': MaterielNetCrawler,
    'alternate.be': AlternateCrawler,
    'mineshop.eu': MineShopCrawler,
}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										1
									
								
								main.py
									
										
									
									
									
								
							
							
						
						
									
										1
									
								
								main.py
									
										
									
									
									
								
							| 
						 | 
				
			
			@ -59,7 +59,6 @@ def main():
 | 
			
		|||
        for shop in list_shops():
 | 
			
		||||
            urls = shops.get(shop.name)
 | 
			
		||||
            if not urls:
 | 
			
		||||
                logger.warning(f'cannot find urls for shop {shop} in the configuration file')
 | 
			
		||||
                continue
 | 
			
		||||
            all_futures.append(executor.submit(crawl_shop, shop, urls))
 | 
			
		||||
        for future in futures.as_completed(all_futures):
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										71
									
								
								parsers.py
									
										
									
									
									
								
							
							
						
						
									
										71
									
								
								parsers.py
									
										
									
									
									
								
							| 
						 | 
				
			
			@ -1,6 +1,8 @@
 | 
			
		|||
import logging
 | 
			
		||||
from html.parser import HTMLParser
 | 
			
		||||
 | 
			
		||||
from bs4 import BeautifulSoup
 | 
			
		||||
from bs4.element import Tag
 | 
			
		||||
from db import Product
 | 
			
		||||
from utils import parse_base_url
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -401,3 +403,72 @@ class AlternateParser(ProductParser):
 | 
			
		|||
    def parse_url(self, string):
        """Return *string* stripped of its query string and prefixed
        with the shop's base URL."""
        path, _, _ = string.partition('?')  # drop query string, keep path
        return f'{self._base_url}{path}'
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class MineShopParser:
    """Extract products from a mineshop.eu listing page.

    Unlike the HTMLParser-based parsers in this module, this parser is
    built on BeautifulSoup: feed() takes the full HTML of a listing
    page and appends one Product per entry to ``self.products``.
    """

    def __init__(self, url=None):
        # ``url`` is accepted for signature parity with the other
        # parsers but is unused here.
        self.products = []          # fully-parsed Product instances
        self._product = Product()   # product currently being built

    def feed(self, webpage):
        """Parse *webpage* (an HTML string) and collect its products."""
        tags = self._find_products(webpage)
        for tag in tags:
            # product has at least a name
            name = self._parse_name(tag)
            if not name:
                continue
            self._product.name = name
            # parse all other attributes
            price, currency = self._parse_price(tag)
            self._product.price = price
            self._product.price_currency = currency
            self._product.url = self._parse_url(tag)
            self._product.available = self._parse_availability(tag)
            # then add product to list
            self.products.append(self._product)
            self._product = Product()

    @staticmethod
    def _find_products(webpage):
        """Return the children of the <ul class="products"> element(s)."""
        soup = BeautifulSoup(webpage, features='lxml')
        products = []
        tags = soup.find_all('ul')
        for tag in tags:
            if 'products' in tag.get('class', []):
                for child in tag.children:
                    products.append(child)
        return products

    @staticmethod
    def _parse_name(product):
        """Return the product name from its <h2> tag, or None."""
        title = product.find('h2')
        if type(title) is Tag:
            return title.text

    @staticmethod
    def _parse_price(product):
        """Return a (price, currency) tuple, (None, None) when absent.

        Always returns a 2-tuple so feed() can unpack the result even
        when the product has no <bdi> price tag or its text cannot be
        parsed as a number.
        """
        tag = product.find('bdi')
        if type(tag) is not Tag:
            # was: implicit None return, which made the tuple-unpacking
            # in feed() raise TypeError
            return None, None
        string = tag.text
        currency = None  # was unbound when no currency symbol matched
        if '€' in string:
            currency = 'EUR'
            string = string.replace('€', '').strip()
        try:
            return float(string), currency
        except ValueError:
            # unparsable price text — treat the same as a missing price
            return None, None

    @staticmethod
    def _parse_url(product):
        """Return the product page URL from its first link, or None."""
        tag = product.find('a')
        if type(tag) is Tag and tag.get('href'):
            return tag['href']

    @staticmethod
    def _parse_availability(product):
        """Return False only for an explicit out-of-stock marker.

        Availability is carried by a <p class="stock ..."> tag whose
        remaining class names the stock state; default to available
        when the markup is missing or unexpected.
        """
        tag = product.find('p')
        if type(tag) is Tag:
            attributes = tag.get('class', [])
            if 'stock' in attributes:
                # NOTE: remove() mutates the tag's class list in place;
                # harmless here since the tag is not reused afterwards.
                attributes.remove('stock')
                if attributes:  # guard: 'stock' may be the only class
                    return attributes[0] != 'out-of-stock'
        return True
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Reference in a new issue