java/strategies/tech_news_scraper.py


								from urllib.request import urlopen, Request

								import re


								from strategies.base_scraper import ScraperStrategy

								from models import ScrapedItem, ScrapedData

								from exceptions import NetworkException, ParseException


								class TechNewsScraperStrategy(ScraperStrategy):
								    """Scraper strategy that collects headline text from a news page.

								    The page at ``source`` is downloaded with ``urllib`` and headline text
								    is pulled out of ``<h1>``/``<h2>``/``<h3>`` tags whose ``class``
								    attribute contains ``headline`` or ``title``.

								    NOTE(review): the strategy is named "tech news" but the configured
								    source URL is a general news page -- confirm this is intended.
								    """

								    # Upper bound on the number of items added to one scrape result.
								    _MAX_ITEMS = 15

								    # A headline is kept only if it is strictly longer than this many
								    # characters after surrounding whitespace is stripped.
								    _MIN_HEADLINE_LENGTH = 10

								    # Compiled once at class creation instead of on every call. The order
								    # is significant: matches are collected pattern by pattern, and only
								    # the first ``_MAX_ITEMS`` unique headlines are kept by ``scrape``.
								    _HEADLINE_PATTERNS = tuple(
								        re.compile(pattern, re.IGNORECASE)
								        for pattern in (
								            r'<h1[^>]*class="[^"]*headline[^"]*"[^>]*>([^<]+)<',
								            r'<h2[^>]*class="[^"]*headline[^"]*"[^>]*>([^<]+)<',
								            r'<h3[^>]*class="[^"]*headline[^"]*"[^>]*>([^<]+)<',
								            r'<h1[^>]*class="[^"]*title[^"]*"[^>]*>([^<]+)<',
								            r'<h2[^>]*class="[^"]*title[^"]*"[^>]*>([^<]+)<',
								        )
								    )

								    def __init__(self):
								        """Set the strategy name and the URL of the page to scrape."""
								        self._name = "tech_news_scraper"
								        self._source = "https://www.bbc.com/news"

								    @property
								    def name(self) -> str:
								        """Identifier of this strategy."""
								        return self._name

								    @property
								    def source(self) -> str:
								        """URL of the page this strategy downloads."""
								        return self._source

								    def scrape(self) -> ScrapedData:
								        """Download the source page and return its headlines as items.

								        Returns:
								            A ``ScrapedData`` holding at most ``_MAX_ITEMS`` items, one per
								            unique headline in extraction order. Each item has the headline
								            as its title, empty content, and the source page URL as its url.

								        Raises:
								            NetworkException: If requesting, reading or UTF-8 decoding the
								                page fails for any reason.
								            ParseException: If extracting headlines or building the items
								                fails for any reason.
								        """
								        data = ScrapedData(source=self.source, strategy_name=self.name)

								        try:
								            request = Request(self.source, headers={
								                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
								            })
								            # The context manager closes the HTTP response even when
								            # read() or decode() raises, so the connection is not leaked.
								            with urlopen(request, timeout=10) as response:
								                html = response.read().decode('utf-8')
								        except Exception as e:
								            # Any failure in this phase is reported as a network error;
								            # the original exception is kept as the explicit cause.
								            raise NetworkException(
								                f"Failed to fetch tech news from {self.source}",
								                original_exception=e
								            ) from e

								        try:
								            for headline in self._extract_headlines(html)[:self._MAX_ITEMS]:
								                data.add_item(ScrapedItem(
								                    title=headline,
								                    content="",
								                    # Headlines are not linked individually; every item
								                    # points back at the page it was scraped from.
								                    url=self.source
								                ))
								        except Exception as e:
								            raise ParseException(
								                "Failed to parse tech news content",
								                selector="h1, h2, h3",
								                original_exception=e
								            ) from e

								        return data

								    def _extract_headlines(self, html: str) -> list:
								        """Return the unique headline strings found in *html*.

								        Args:
								            html: Markup of the page to search.

								        Returns:
								            A list of whitespace-stripped headline strings longer than
								            ``_MIN_HEADLINE_LENGTH`` characters, without duplicates, ordered
								            by pattern (in ``_HEADLINE_PATTERNS`` order) and then by position
								            in the markup. Only text directly following the opening tag is
								            captured; headings whose first child is another tag do not match.
								        """
								        stripped = (
								            match.strip()
								            for pattern in self._HEADLINE_PATTERNS
								            for match in pattern.findall(html)
								        )
								        long_enough = (
								            text for text in stripped
								            if len(text) > self._MIN_HEADLINE_LENGTH
								        )
								        # dict keys keep insertion order, so this drops later duplicates
								        # while preserving the first occurrence of each headline.
								        return list(dict.fromkeys(long_enough))