from urllib.request import urlopen, Request
import re

from strategies.base_scraper import ScraperStrategy
from models import ScrapedItem, ScrapedData
from exceptions import NetworkException, ParseException


class TechNewsScraperStrategy(ScraperStrategy):
    """Scraper strategy that collects headline text from a news front page.

    The page at ``source`` is downloaded with ``urllib`` and headlines are
    pulled out of ``<h1>``/``<h2>``/``<h3>`` tags whose ``class`` attribute
    contains ``headline`` or ``title``. Note that, despite the strategy
    name, the configured source is the general BBC News page.
    """

    # Seconds to wait for the HTTP response before giving up.
    _TIMEOUT_SECONDS = 10

    # Upper bound on the number of headlines converted into items per scrape.
    _MAX_ITEMS = 15

    # Browser-like User-Agent sent with the request.
    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'

    # Compiled once at class creation. The order matters: results are
    # collected pattern by pattern, so it determines the headline order.
    _HEADLINE_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'<h1[^>]*class="[^"]*headline[^"]*"[^>]*>([^<]+)<',
            r'<h2[^>]*class="[^"]*headline[^"]*"[^>]*>([^<]+)<',
            r'<h3[^>]*class="[^"]*headline[^"]*"[^>]*>([^<]+)<',
            r'<h1[^>]*class="[^"]*title[^"]*"[^>]*>([^<]+)<',
            r'<h2[^>]*class="[^"]*title[^"]*"[^>]*>([^<]+)<',
        )
    )

    def __init__(self):
        self._name = "tech_news_scraper"
        self._source = "https://www.bbc.com/news"

    @property
    def name(self) -> str:
        """Identifier of this strategy."""
        return self._name

    @property
    def source(self) -> str:
        """URL of the page this strategy scrapes."""
        return self._source

    def scrape(self) -> ScrapedData:
        """Download the source page and return its headlines as scraped items.

        Returns:
            A ``ScrapedData`` holding at most ``_MAX_ITEMS`` items. Each item
            has the headline as its title, empty content, and the source
            page URL as its URL.

        Raises:
            NetworkException: If the page cannot be fetched or decoded.
            ParseException: If extracting headlines or building items fails.
        """
        data = ScrapedData(source=self.source, strategy_name=self.name)
        page = self._fetch_html()

        try:
            for headline in self._extract_headlines(page)[:self._MAX_ITEMS]:
                data.add_item(
                    ScrapedItem(title=headline, content="", url=self.source)
                )
        except Exception as e:
            raise ParseException(
                "Failed to parse tech news content",
                selector="h1, h2, h3",
                original_exception=e
            ) from e

        return data

    def _fetch_html(self) -> str:
        """Fetch the source page and return its body as text.

        Raises:
            NetworkException: On any failure while requesting, reading or
                decoding the page; the original error is attached both as
                ``original_exception`` and as ``__cause__``.
        """
        try:
            request = Request(
                self.source, headers={'User-Agent': self._USER_AGENT}
            )
            # The context manager guarantees the connection is closed even
            # when read() fails part-way through.
            with urlopen(request, timeout=self._TIMEOUT_SECONDS) as response:
                raw = response.read()
                charset = response.headers.get_content_charset() or "utf-8"
            return self._decode(raw, charset)
        except Exception as e:
            raise NetworkException(
                f"Failed to fetch tech news from {self.source}",
                original_exception=e
            ) from e

    @staticmethod
    def _decode(raw: bytes, charset: str) -> str:
        """Decode *raw* with *charset*, falling back to UTF-8.

        Undecodable bytes are replaced rather than raising, so one bad byte
        does not discard an otherwise usable page.
        """
        try:
            return raw.decode(charset, errors="replace")
        except LookupError:
            # The server advertised a codec name Python does not know.
            return raw.decode("utf-8", errors="replace")

    def _extract_headlines(self, html):
        """Return the unique headlines found in *html*, in first-seen order.

        Matches are gathered pattern by pattern (see ``_HEADLINE_PATTERNS``),
        HTML entities are unescaped, surrounding whitespace is stripped, and
        texts of 10 characters or fewer are dropped.

        Args:
            html: Markup of the page as a string.

        Returns:
            A list of headline strings without duplicates.
        """
        # Imported locally by name because the ``html`` parameter shadows the
        # standard-library module of the same name inside this method.
        from html import unescape

        candidates = []
        for pattern in self._HEADLINE_PATTERNS:
            for raw_text in pattern.findall(html):
                # Unescape first so "&amp;" / "&#x27;" variants of the same
                # headline compare equal and the length check sees real text.
                text = unescape(raw_text).strip()
                if len(text) > 10:
                    candidates.append(text)

        # dict preserves insertion order, giving an order-stable de-duplication.
        return list(dict.fromkeys(candidates))