You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
67 lines
1.9 KiB
67 lines
1.9 KiB
from urllib.request import urlopen, Request
|
|
import re
|
|
from datetime import datetime
|
|
|
|
from strategies.base_scraper import ScraperStrategy
|
|
from models import ScrapedItem, ScrapedData
|
|
from exceptions import NetworkException, ParseException
|
|
|
|
|
|
class BooksScraperStrategy(ScraperStrategy):
    """Scraper strategy that collects book titles and prices from one page.

    The page at ``source`` is downloaded with ``urllib`` and parsed with two
    regular expressions; each (title, price) pair becomes a ``ScrapedItem``
    inside the returned ``ScrapedData``.
    """

    # Upper bound on the number of books converted into items per scrape().
    # Subclasses may override it.
    MAX_ITEMS = 20

    # Compiled once at class creation instead of on every call.
    # The title is read from the ``title`` attribute of the anchor that
    # directly follows an ``<h3>``; the price is the text that follows
    # ``price_color">`` up to the next tag.
    _TITLE_RE = re.compile(r'<h3><a href="[^"]*" title="([^"]+)"')
    _PRICE_RE = re.compile(r'price_color">([^<]+)<')

    def __init__(self):
        """Set the fixed strategy name and the URL that is scraped."""
        self._name = "books_scraper"
        self._source = "https://books.toscrape.com"

    @property
    def name(self) -> str:
        """Return the identifier of this strategy."""
        return self._name

    @property
    def source(self) -> str:
        """Return the URL this strategy downloads."""
        return self._source

    def scrape(self) -> ScrapedData:
        """Download the source page and return its books as scraped items.

        Returns:
            A ``ScrapedData`` holding at most ``MAX_ITEMS`` items. Each item
            has the book title as ``title``, ``"Price: <price>"`` as
            ``content`` and the source URL as ``url``.

        Raises:
            NetworkException: if requesting, reading or UTF-8 decoding the
                page raises any exception.
            ParseException: if extracting the books or building the items
                raises any exception.
        """
        data = ScrapedData(source=self.source, strategy_name=self.name)

        try:
            request = Request(self.source, headers={'User-Agent': 'Mozilla/5.0'})
            # The context manager closes the HTTP response (and its socket)
            # even when read() or decode() raises.
            with urlopen(request, timeout=10) as response:
                page_html = response.read().decode('utf-8')
        except Exception as e:
            # "from e" records the original error as the explicit cause.
            raise NetworkException(
                f"Failed to fetch books from {self.source}",
                original_exception=e
            ) from e

        try:
            for title, price in self._extract_books(page_html)[:self.MAX_ITEMS]:
                data.add_item(
                    ScrapedItem(
                        title=title,
                        content=f"Price: {price}",
                        url=self.source
                    )
                )
        except Exception as e:
            raise ParseException(
                "Failed to parse book content",
                selector="article.product_pod",
                original_exception=e
            ) from e

        return data

    def _extract_books(self, html):
        """Return ``(title, price)`` string tuples found in *html*.

        Titles and prices are matched independently and paired by position;
        if the two match counts differ, the surplus matches are dropped.
        HTML character references (for example ``&#39;`` or ``&amp;``) are
        decoded, and surrounding whitespace is stripped from both values.

        Args:
            html: Markup of the page as a ``str``.

        Returns:
            A list of ``(title, price)`` tuples in document order.
        """
        # Imported locally under its own name: the ``html`` parameter shadows
        # the standard-library module of the same name inside this method.
        from html import unescape

        titles = self._TITLE_RE.findall(html)
        prices = self._PRICE_RE.findall(html)

        # zip() stops at the shorter sequence, matching the previous
        # min(len(titles), len(prices)) bound.
        return [
            (unescape(title).strip(), unescape(price).strip())
            for title, price in zip(titles, prices)
        ]
|
|
|