You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

72 lines
2.2 KiB

import re
from datetime import datetime
from html import unescape
from urllib.request import Request, urlopen

from exceptions import NetworkException, ParseException
from models import ScrapedData, ScrapedItem
from strategies.base_scraper import ScraperStrategy
class NewsScraperStrategy(ScraperStrategy):
    """Scraper strategy that turns the quotes on the source page into items.

    The page at ``source`` is downloaded, quote/author pairs are pulled out of
    the HTML with regular expressions, and each pair becomes a ``ScrapedItem``
    titled ``"Quote by <author>"``.
    """

    # Upper bound on how many quotes a single scrape() turns into items.
    _MAX_ITEMS = 10

    # Primary pattern: captures the text and the author that appear after the
    # opening tag of a quote <div>. DOTALL lets ``.*?`` cross line breaks.
    _QUOTE_RE = re.compile(
        r'<div class="quote"[^>]*>.*?<span class="text"[^>]*>([^<]+)</span>'
        r'.*?<small class="author">([^<]+)</small>',
        re.DOTALL,
    )
    # Fallback patterns, used only when the primary pattern finds nothing.
    _TEXT_RE = re.compile(r'"text">([^<]+)<')
    _AUTHOR_RE = re.compile(r'author">([^<]+)<')

    def __init__(self):
        self._name = "news_scraper"
        self._source = "http://quotes.toscrape.com"

    @property
    def name(self) -> str:
        """Identifier of this strategy."""
        return self._name

    @property
    def source(self) -> str:
        """URL of the page this strategy downloads."""
        return self._source

    def scrape(self) -> ScrapedData:
        """Download the source page and return its quotes as scraped items.

        At most ``_MAX_ITEMS`` quotes are converted, in page order.

        Returns:
            A ``ScrapedData`` holding one ``ScrapedItem`` per extracted quote.

        Raises:
            NetworkException: If the page cannot be fetched or decoded.
            ParseException: If extracting quotes or building items fails.
        """
        data = ScrapedData(source=self.source, strategy_name=self.name)

        try:
            request = Request(self.source, headers={'User-Agent': 'Mozilla/5.0'})
            # The context manager closes the connection even if read() or
            # decode() raises.
            with urlopen(request, timeout=10) as response:
                # Honour the charset declared by the server; fall back to
                # UTF-8 when the Content-Type header names none.
                charset = response.headers.get_content_charset() or 'utf-8'
                html = response.read().decode(charset)
        except Exception as e:
            raise NetworkException(
                f"Failed to fetch news from {self.source}",
                original_exception=e
            ) from e

        try:
            for quote_text, author in self._extract_quotes(html)[:self._MAX_ITEMS]:
                data.add_item(
                    ScrapedItem(
                        title=f"Quote by {author}",
                        content=quote_text,
                        url=self.source
                    )
                )
        except Exception as e:
            raise ParseException(
                "Failed to parse news content",
                selector="div.quote",
                original_exception=e
            ) from e

        return data

    def _extract_quotes(self, html: str) -> list:
        """Return ``(quote_text, author)`` tuples found in *html*.

        The structured quote pattern is tried first. When it matches nothing,
        texts and authors are collected independently and paired by position,
        stopping at the shorter of the two lists. HTML entities in the
        captured text are decoded and surrounding whitespace is removed.

        Args:
            html: Markup of the page to search.

        Returns:
            A list of ``(text, author)`` string tuples; empty when nothing
            matches.
        """
        pairs = self._QUOTE_RE.findall(html)
        if not pairs:
            # zip() stops at the shorter list, so unmatched extras are dropped.
            pairs = zip(self._TEXT_RE.findall(html), self._AUTHOR_RE.findall(html))

        # Captures are raw markup text, so entities such as &#39; must be
        # decoded to obtain the actual characters.
        return [
            (unescape(text).strip(), unescape(author).strip())
            for text, author in pairs
        ]