You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

81 lines
2.5 KiB

import re
from html import unescape
from urllib.request import urlopen, Request

from exceptions import NetworkException, ParseException
from models import ScrapedItem, ScrapedData
from strategies.base_scraper import ScraperStrategy
class TechNewsScraperStrategy(ScraperStrategy):
    """Scraper strategy that collects heading text from a single news page.

    The page at ``source`` is downloaded with ``urllib`` and headline text is
    pulled out of ``<h1>``/``<h2>``/``<h3>`` tags with regular expressions (no
    HTML parser is used). Each headline becomes a ``ScrapedItem`` whose
    ``content`` is empty and whose ``url`` is the page URL itself.

    NOTE(review): the class is named for tech news, but the source URL is the
    general ``/news`` page -- confirm that this is intended.
    """

    # Maximum number of headlines converted into items per scrape.
    _MAX_HEADLINES = 15
    # A headline is kept only if it is strictly longer than this many chars.
    _MIN_HEADLINE_LENGTH = 10
    # Seconds to wait for the HTTP request before giving up.
    _FETCH_TIMEOUT_SECONDS = 10
    _USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    )

    # Compiled once at class creation. Each pattern captures the text that
    # immediately follows the opening tag, up to the first '<', so headings
    # whose text is wrapped in a nested element are not matched.
    _HEADLINE_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'<h1[^>]*class="[^"]*headline[^"]*"[^>]*>([^<]+)<',
            r'<h2[^>]*class="[^"]*headline[^"]*"[^>]*>([^<]+)<',
            r'<h3[^>]*class="[^"]*headline[^"]*"[^>]*>([^<]+)<',
            r'<h1[^>]*class="[^"]*title[^"]*"[^>]*>([^<]+)<',
            r'<h2[^>]*class="[^"]*title[^"]*"[^>]*>([^<]+)<',
        )
    )

    def __init__(self):
        self._name = "tech_news_scraper"
        self._source = "https://www.bbc.com/news"

    @property
    def name(self) -> str:
        """Identifier of this strategy."""
        return self._name

    @property
    def source(self) -> str:
        """URL of the page this strategy scrapes."""
        return self._source

    def scrape(self) -> ScrapedData:
        """Download the source page and return its headlines as scraped items.

        Returns:
            A ``ScrapedData`` holding at most ``_MAX_HEADLINES`` items.

        Raises:
            NetworkException: If building the request, fetching the page or
                decoding the response fails.
            ParseException: If extracting headlines or building items fails.
        """
        data = ScrapedData(source=self.source, strategy_name=self.name)
        html = self._fetch_html()

        try:
            headlines = self._extract_headlines(html)
            for headline in headlines[:self._MAX_HEADLINES]:
                data.add_item(
                    ScrapedItem(title=headline, content="", url=self.source)
                )
        except Exception as e:
            raise ParseException(
                "Failed to parse tech news content",
                selector="h1, h2, h3",
                original_exception=e
            ) from e

        return data

    def _fetch_html(self) -> str:
        """Fetch ``source`` and return the response body as text.

        Raises:
            NetworkException: On any failure while requesting or reading.
        """
        try:
            request = Request(
                self.source, headers={'User-Agent': self._USER_AGENT}
            )
            # The context manager closes the connection even if read() fails.
            with urlopen(
                request, timeout=self._FETCH_TIMEOUT_SECONDS
            ) as response:
                # Honour the charset declared in Content-Type; fall back to
                # UTF-8 and replace undecodable bytes so that one bad byte
                # does not discard the whole page.
                charset = response.headers.get_content_charset() or 'utf-8'
                return response.read().decode(charset, errors='replace')
        except Exception as e:
            raise NetworkException(
                f"Failed to fetch tech news from {self.source}",
                original_exception=e
            ) from e

    def _extract_headlines(self, html: str) -> list:
        """Return the unique headline strings found in ``html``.

        Results are grouped by pattern (in ``_HEADLINE_PATTERNS`` order) rather
        than by position in the document. Entities such as ``&amp;`` are
        decoded and surrounding whitespace is removed before the length
        filter is applied. Duplicates are dropped, keeping the first
        occurrence.
        """
        headlines = []
        for pattern in self._HEADLINE_PATTERNS:
            for raw_text in pattern.findall(html):
                text = unescape(raw_text).strip()
                if len(text) > self._MIN_HEADLINE_LENGTH:
                    headlines.append(text)

        # dict keys preserve insertion order, giving a stable de-duplication.
        return list(dict.fromkeys(headlines))