from datetime import datetime
import re
from typing import List, Tuple
from urllib.request import Request, urlopen

from strategies.base_scraper import ScraperStrategy
from models import ScrapedItem, ScrapedData
from exceptions import NetworkException, ParseException


class NewsScraperStrategy(ScraperStrategy):
    """Scraper strategy that collects quotes from a fixed source page.

    The page is downloaded with ``urllib`` and parsed with regular
    expressions; each (quote, author) pair becomes one ``ScrapedItem``.
    """

    # Seconds to wait for the HTTP response before giving up.
    _TIMEOUT_SECONDS = 10
    # Upper bound on the number of items returned by a single scrape.
    _MAX_ITEMS = 10

    def __init__(self):
        self._name = "news_scraper"
        self._source = "http://quotes.toscrape.com"

    @property
    def name(self) -> str:
        """Return the identifier of this strategy."""
        return self._name

    @property
    def source(self) -> str:
        """Return the URL this strategy scrapes."""
        return self._source

    def scrape(self) -> ScrapedData:
        """Fetch the source page and return its quotes as scraped items.

        At most ``_MAX_ITEMS`` items are added. A page that yields no
        quotes is not treated as an error: the returned container simply
        has no items added to it.

        Returns:
            A ``ScrapedData`` created for this source and strategy name,
            with one ``ScrapedItem`` added per extracted quote.

        Raises:
            NetworkException: If the page cannot be fetched, read, or
                decoded as UTF-8.
            ParseException: If extracting the quotes or building the
                items fails.
        """
        data = ScrapedData(source=self.source, strategy_name=self.name)

        try:
            request = Request(self.source, headers={'User-Agent': 'Mozilla/5.0'})
            # The context manager closes the connection even if reading
            # or decoding raises.
            with urlopen(request, timeout=self._TIMEOUT_SECONDS) as response:
                html = response.read().decode('utf-8')
        except Exception as e:
            # Deliberately broad: any failure while fetching is reported
            # to callers as a single domain-level network error.
            raise NetworkException(
                f"Failed to fetch news from {self.source}",
                original_exception=e
            ) from e

        try:
            quotes = self._extract_quotes(html)
            for quote_text, author in quotes[:self._MAX_ITEMS]:
                item = ScrapedItem(
                    title=f"Quote by {author}",
                    content=quote_text,
                    url=self.source
                )
                data.add_item(item)
        except Exception as e:
            raise ParseException(
                "Failed to parse news content",
                selector="div.quote",
                original_exception=e
            ) from e

        return data

    def _extract_quotes(self, html: str) -> List[Tuple[str, str]]:
        """Extract ``(quote_text, author)`` pairs from the page HTML.

        A structured pattern anchored on the quote container is tried
        first. If it finds nothing, looser patterns collect texts and
        authors independently and pair them up by position.

        Args:
            html: The decoded HTML of the page.

        Returns:
            A list of ``(quote_text, author)`` tuples with surrounding
            whitespace stripped; empty when nothing matches.
        """
        # NOTE(review): the original literal for this pattern was corrupted
        # (its markup had been stripped and the string was split across two
        # lines). It is reconstructed here from the surviving fragments, the
        # "div.quote" selector reported by scrape(), and the fallback
        # patterns below - confirm against the intended pattern.
        quote_pattern = (
            r'<div class="quote"[^>]*>.*?'
            r'<span class="text"[^>]*>([^<]+).*?'
            r'<small class="author"[^>]*>([^<]+)'
        )
        # DOTALL lets ".*?" cross the newlines between the nested tags.
        matches = re.findall(quote_pattern, html, re.DOTALL)
        quotes = [(text.strip(), author.strip()) for text, author in matches]

        if not quotes:
            text_pattern = r'"text">([^<]+)<'
            author_pattern = r'author">([^<]+)<'

            texts = re.findall(text_pattern, html)
            authors = re.findall(author_pattern, html)

            # zip stops at the shorter list, so unpaired texts or authors
            # are dropped.
            quotes = [
                (text.strip(), author.strip())
                for text, author in zip(texts, authors)
            ]

        return quotes