You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
72 lines
2.2 KiB
72 lines
2.2 KiB
import re
from datetime import datetime
from html import unescape
from urllib.request import urlopen, Request

from strategies.base_scraper import ScraperStrategy
from models import ScrapedItem, ScrapedData
from exceptions import NetworkException, ParseException
|
|
|
|
|
|
class NewsScraperStrategy(ScraperStrategy):
    """Scraper strategy that collects quote/author pairs from a web page.

    Despite the "news" naming, the configured source is a quotes listing:
    each quote found on the page becomes one ``ScrapedItem`` whose title
    names the author and whose content is the quote text.
    """

    # Upper bound on the number of quotes converted into items per scrape.
    MAX_ITEMS = 10
    # Seconds to wait on the HTTP request before giving up.
    TIMEOUT_SECONDS = 10

    # Structured match: a quote container, then its text span, then its
    # author tag. Both inner tags tolerate extra attributes (e.g.
    # ``itemprop="author"``) so real-world markup is matched here instead of
    # silently dropping through to the looser fallback patterns below.
    _QUOTE_RE = re.compile(
        r'<div class="quote"[^>]*>.*?'
        r'<span class="text"[^>]*>([^<]+)</span>.*?'
        r'<small class="author"[^>]*>([^<]+)</small>',
        re.DOTALL,
    )
    # Fallback patterns, used only when the structured match finds nothing.
    _TEXT_RE = re.compile(r'"text">([^<]+)<')
    _AUTHOR_RE = re.compile(r'author">([^<]+)<')

    def __init__(self):
        self._name = "news_scraper"
        self._source = "http://quotes.toscrape.com"

    @property
    def name(self) -> str:
        """Identifier of this strategy."""
        return self._name

    @property
    def source(self) -> str:
        """URL of the page this strategy scrapes."""
        return self._source

    def scrape(self) -> ScrapedData:
        """Fetch the source page and return its quotes as scraped items.

        Returns:
            A ``ScrapedData`` holding at most ``MAX_ITEMS`` items, one per
            quote found on the page.

        Raises:
            NetworkException: If the page cannot be fetched or decoded.
            ParseException: If extracting quotes or building items fails.
        """
        data = ScrapedData(source=self.source, strategy_name=self.name)

        try:
            request = Request(self.source, headers={'User-Agent': 'Mozilla/5.0'})
            # The context manager closes the connection even when reading or
            # decoding fails.
            with urlopen(request, timeout=self.TIMEOUT_SECONDS) as response:
                html = response.read().decode('utf-8')
        except Exception as e:
            raise NetworkException(
                f"Failed to fetch news from {self.source}",
                original_exception=e
            ) from e

        try:
            quotes = self._extract_quotes(html)
            for quote_text, author in quotes[:self.MAX_ITEMS]:
                item = ScrapedItem(
                    title=f"Quote by {author}",
                    content=quote_text,
                    url=self.source
                )
                data.add_item(item)
        except Exception as e:
            raise ParseException(
                "Failed to parse news content",
                selector="div.quote",
                original_exception=e
            ) from e

        return data

    def _extract_quotes(self, html):
        """Return ``(text, author)`` tuples found in *html*, in page order.

        The structured pattern is tried first. If it matches nothing, the
        text and author fragments are collected independently and paired by
        position (surplus entries on either side are dropped). HTML entities
        such as ``&#39;`` are decoded and surrounding whitespace is stripped.

        Args:
            html: Page markup as a string.

        Returns:
            A list of ``(text, author)`` string tuples; empty if nothing
            matched.
        """
        pairs = self._QUOTE_RE.findall(html)

        if not pairs:
            # Looser fallback: pair the n-th text with the n-th author.
            pairs = zip(self._TEXT_RE.findall(html), self._AUTHOR_RE.findall(html))

        return [
            (unescape(text).strip(), unescape(author).strip())
            for text, author in pairs
        ]
|
|
|