You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

67 lines
1.9 KiB

from urllib.request import urlopen, Request
import re
from datetime import datetime
from strategies.base_scraper import ScraperStrategy
from models import ScrapedItem, ScrapedData
from exceptions import NetworkException, ParseException
class BooksScraperStrategy(ScraperStrategy):
    """Scraper strategy that collects book titles and prices from one page.

    The page at ``source`` is downloaded with ``urllib`` and scanned with
    regular expressions; each (title, price) pair becomes one ``ScrapedItem``.
    """

    # Upper bound on the number of items added to a single ScrapedData.
    _MAX_ITEMS = 20

    # Captures the ``title`` attribute of the link inside each ``<h3>``.
    _TITLE_PATTERN = re.compile(r'<h3><a href="[^"]*" title="([^"]+)"')
    # Captures the text that follows a ``price_color">`` marker.
    _PRICE_PATTERN = re.compile(r'price_color">([^<]+)<')

    def __init__(self):
        self._name = "books_scraper"
        self._source = "https://books.toscrape.com"

    @property
    def name(self) -> str:
        """Identifier of this strategy."""
        return self._name

    @property
    def source(self) -> str:
        """URL of the page this strategy scrapes."""
        return self._source

    def scrape(self) -> ScrapedData:
        """Fetch the source page and return the books found on it.

        Returns:
            A ``ScrapedData`` holding at most ``_MAX_ITEMS`` items. Each item
            has the book title as ``title``, ``"Price: <price>"`` as
            ``content`` and the source URL as ``url``.

        Raises:
            NetworkException: If the page cannot be fetched or decoded.
            ParseException: If extracting books or building items fails.
        """
        data = ScrapedData(source=self.source, strategy_name=self.name)

        try:
            request = Request(self.source, headers={'User-Agent': 'Mozilla/5.0'})
            # The context manager closes the HTTP response even when
            # read() or decode() raises.
            with urlopen(request, timeout=10) as response:
                page = response.read().decode('utf-8')
        except Exception as e:
            raise NetworkException(
                f"Failed to fetch books from {self.source}",
                original_exception=e
            ) from e

        try:
            books = self._extract_books(page)
            for title, price in books[:self._MAX_ITEMS]:
                item = ScrapedItem(
                    title=title,
                    content=f"Price: {price}",
                    url=self.source
                )
                data.add_item(item)
        except Exception as e:
            raise ParseException(
                "Failed to parse book content",
                selector="article.product_pod",
                original_exception=e
            ) from e

        return data

    def _extract_books(self, html: str) -> list:
        """Return ``(title, price)`` tuples found in *html*.

        Titles and prices are matched independently and paired by position;
        if the two lists differ in length, the surplus of the longer one is
        dropped. HTML character references (``&amp;``, ``&#39;`` ...) are
        decoded and surrounding whitespace is stripped.

        NOTE(review): positional pairing assumes every listing carries both
        a title and a price, in the same order -- confirm against the page.
        """
        # Imported by name because the parameter ``html`` shadows the module
        # inside this method.
        from html import unescape

        titles = self._TITLE_PATTERN.findall(html)
        prices = self._PRICE_PATTERN.findall(html)

        # zip() stops at the shorter sequence, matching the previous
        # min(len(titles), len(prices)) bound.
        return [
            (unescape(title).strip(), unescape(price).strip())
            for title, price in zip(titles, prices)
        ]