You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
81 lines
2.5 KiB
81 lines
2.5 KiB
from urllib.request import urlopen, Request
|
|
import re
|
|
|
|
from strategies.base_scraper import ScraperStrategy
|
|
from models import ScrapedItem, ScrapedData
|
|
from exceptions import NetworkException, ParseException
|
|
|
|
|
|
class TechNewsScraperStrategy(ScraperStrategy):
    """Scraper strategy that collects headline text from a news front page.

    The page at ``source`` is downloaded with ``urllib`` and headline text is
    pulled out of ``<h1>``-``<h3>`` tags with regular expressions; no HTML
    parser is involved, so only headings whose text directly follows the
    opening tag are recognised.
    """

    # Seconds to wait for the HTTP response before giving up.
    _TIMEOUT_SECONDS = 10
    # Upper bound on the number of headlines turned into items per scrape().
    _MAX_HEADLINES = 15
    # A match must be strictly longer than this to be kept as a headline.
    _MIN_HEADLINE_LENGTH = 10
    # Sent with every request to the source page.
    _REQUEST_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    # Compiled once at class creation. Order matters: results are collected
    # pattern by pattern, so "headline" classes come before "title" classes.
    _HEADLINE_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'<h1[^>]*class="[^"]*headline[^"]*"[^>]*>([^<]+)<',
            r'<h2[^>]*class="[^"]*headline[^"]*"[^>]*>([^<]+)<',
            r'<h3[^>]*class="[^"]*headline[^"]*"[^>]*>([^<]+)<',
            r'<h1[^>]*class="[^"]*title[^"]*"[^>]*>([^<]+)<',
            r'<h2[^>]*class="[^"]*title[^"]*"[^>]*>([^<]+)<',
        )
    )

    def __init__(self):
        self._name = "tech_news_scraper"
        self._source = "https://www.bbc.com/news"

    @property
    def name(self) -> str:
        """Identifier of this strategy."""
        return self._name

    @property
    def source(self) -> str:
        """URL of the page this strategy downloads."""
        return self._source

    def scrape(self) -> ScrapedData:
        """Download the source page and return its headlines as scraped items.

        At most ``_MAX_HEADLINES`` items are added; each has the headline as
        its title, empty content, and the source URL as its url.

        Returns:
            A ``ScrapedData`` populated with one ``ScrapedItem`` per headline.

        Raises:
            NetworkException: If the page cannot be fetched or decoded.
            ParseException: If extracting headlines or building items fails.
        """
        data = ScrapedData(source=self.source, strategy_name=self.name)

        try:
            request = Request(self.source, headers=self._REQUEST_HEADERS)
            # The context manager closes the connection even when read() or
            # decode() raises.
            with urlopen(request, timeout=self._TIMEOUT_SECONDS) as response:
                # Honour the charset declared by the server; fall back to
                # UTF-8 when the Content-Type header does not name one.
                charset = response.headers.get_content_charset() or 'utf-8'
                html = response.read().decode(charset)
        except Exception as e:
            raise NetworkException(
                f"Failed to fetch tech news from {self.source}",
                original_exception=e
            ) from e

        try:
            headlines = self._extract_headlines(html)
            for headline in headlines[:self._MAX_HEADLINES]:
                data.add_item(
                    ScrapedItem(title=headline, content="", url=self.source)
                )
        except Exception as e:
            raise ParseException(
                "Failed to parse tech news content",
                selector="h1, h2, h3",
                original_exception=e
            ) from e

        return data

    def _extract_headlines(self, html: str) -> list:
        """Return the unique headline strings found in *html*.

        Each pattern in ``_HEADLINE_PATTERNS`` is applied in turn. Matches
        have HTML entities decoded and surrounding whitespace removed, and
        are kept only when longer than ``_MIN_HEADLINE_LENGTH`` characters.
        Duplicates are dropped, keeping the first occurrence.

        Args:
            html: Markup of the page to search.

        Returns:
            Headline strings in order of first appearance.
        """
        # Imported locally because the ``html`` parameter shadows the module
        # name inside this method.
        from html import unescape

        candidates = (
            unescape(match).strip()
            for pattern in self._HEADLINE_PATTERNS
            for match in pattern.findall(html)
        )
        # dict.fromkeys de-duplicates while preserving first-seen order.
        unique = dict.fromkeys(
            text for text in candidates
            if len(text) > self._MIN_HEADLINE_LENGTH
        )
        return list(unique)
|
|
|