from urllib.request import urlopen, Request
import re

from strategies.base_scraper import ScraperStrategy
from models import ScrapedItem, ScrapedData
from exceptions import NetworkException, ParseException

# Browser-like User-Agent sent with the page request.
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'

# Seconds to wait for the HTTP request before giving up.
_FETCH_TIMEOUT_SECONDS = 10

# Upper bound on the number of headlines turned into scraped items.
_MAX_ITEMS = 15

# A headline is kept only when its stripped text is strictly longer than this.
_MIN_HEADLINE_LENGTH = 10

# Each pattern matches a ``class="..."`` attribute whose value contains the
# keyword, skips to the closing ``>`` of that tag, and captures the text up to
# the next ``<``. "headline" matches are collected before "title" matches.
#
# NOTE(review): the leading ``]*`` (zero or more literal ``]``) looks like the
# residue of a stripped tag prefix such as ``<h1[^>]*``; as written the
# patterns are not restricted to any particular tag. Confirm the intended
# tags before tightening them.
_HEADLINE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r']*class="[^"]*headline[^"]*"[^>]*>([^<]+)<',
        r']*class="[^"]*title[^"]*"[^>]*>([^<]+)<',
    )
)


class TechNewsScraperStrategy(ScraperStrategy):
    """Scraper strategy that collects headline text from a single web page.

    The page at ``source`` is downloaded, headline-like text is extracted
    with regular expressions, and each headline becomes a ``ScrapedItem``.
    """

    def __init__(self):
        self._name = "tech_news_scraper"
        self._source = "https://www.bbc.com/news"

    @property
    def name(self) -> str:
        """Identifier of this strategy."""
        return self._name

    @property
    def source(self) -> str:
        """URL of the page this strategy scrapes."""
        return self._source

    def scrape(self) -> ScrapedData:
        """Fetch the source page and return its headlines as scraped data.

        At most ``_MAX_ITEMS`` headlines are added. Every item has the
        headline as its title, empty content, and the source page as its URL.

        Returns:
            A ``ScrapedData`` for this source and strategy holding the items.

        Raises:
            NetworkException: If requesting, reading, or decoding the page
                fails for any reason.
            ParseException: If extracting headlines or building the items
                fails for any reason.
        """
        data = ScrapedData(source=self.source, strategy_name=self.name)

        try:
            request = Request(self.source, headers={
                'User-Agent': _USER_AGENT
            })
            # The context manager closes the HTTP response even when reading
            # or decoding raises.
            with urlopen(request, timeout=_FETCH_TIMEOUT_SECONDS) as response:
                html = response.read().decode('utf-8')
        except Exception as e:
            # Deliberately broad: every fetch failure is reported to callers
            # as a NetworkException with the cause attached.
            raise NetworkException(
                f"Failed to fetch tech news from {self.source}",
                original_exception=e
            ) from e

        try:
            for headline in self._extract_headlines(html)[:_MAX_ITEMS]:
                data.add_item(
                    ScrapedItem(title=headline, content="", url=self.source)
                )
        except Exception as e:
            raise ParseException(
                "Failed to parse tech news content",
                selector="h1, h2, h3",
                original_exception=e
            ) from e

        return data

    def _extract_headlines(self, html: str) -> "list[str]":
        """Return the unique headline strings found in ``html``.

        Text captured by each pattern in ``_HEADLINE_PATTERNS`` is stripped
        of surrounding whitespace and kept only when longer than
        ``_MIN_HEADLINE_LENGTH`` characters. Duplicates are dropped, keeping
        the first occurrence, so the result preserves discovery order.

        Args:
            html: Markup of the page to search.

        Returns:
            De-duplicated headline strings in order of first appearance.
        """
        headlines = []
        for pattern in _HEADLINE_PATTERNS:
            for raw in pattern.findall(html):
                text = raw.strip()
                if len(text) > _MIN_HEADLINE_LENGTH:
                    headlines.append(text)

        # dict keys keep insertion order, giving an order-preserving de-dup.
        return list(dict.fromkeys(headlines))