from urllib.request import urlopen, Request import re from strategies.base_scraper import ScraperStrategy from models import ScrapedItem, ScrapedData from exceptions import NetworkException, ParseException class TechNewsScraperStrategy(ScraperStrategy): def __init__(self): self._name = "tech_news_scraper" self._source = "https://www.bbc.com/news" @property def name(self) -> str: return self._name @property def source(self) -> str: return self._source def scrape(self) -> ScrapedData: data = ScrapedData(source=self.source, strategy_name=self.name) try: request = Request(self.source, headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' }) response = urlopen(request, timeout=10) html = response.read().decode('utf-8') except Exception as e: raise NetworkException( f"Failed to fetch tech news from {self.source}", original_exception=e ) try: headlines = self._extract_headlines(html) for headline in headlines[:15]: item = ScrapedItem( title=headline, content="", url=self.source ) data.add_item(item) except Exception as e: raise ParseException( "Failed to parse tech news content", selector="h1, h2, h3", original_exception=e ) return data def _extract_headlines(self, html): headlines = [] h_patterns = [ r'