commit
0b572260a8
81 changed files with 2264 additions and 0 deletions
Binary file not shown.
|
After Width: | Height: | Size: 68 KiB |
@ -0,0 +1,5 @@ |
|||
from .base_command import Command |
|||
from .scrape_command import ScrapeCommand |
|||
from .list_command import ListCommand |
|||
|
|||
__all__ = ['Command', 'ScrapeCommand', 'ListCommand'] |
|||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,11 @@ |
|||
from abc import ABC, abstractmethod |
|||
|
|||
|
|||
class Command(ABC):
    """Abstract base class for Command-pattern actions.

    Both ``execute`` and ``undo`` are abstract, so a subclass can only be
    instantiated once it overrides the two of them.
    """

    @abstractmethod
    def execute(self):
        """Run the command; the base implementation does nothing."""
        return None

    @abstractmethod
    def undo(self):
        """Undo hook; the base implementation does nothing."""
        return None
|||
@ -0,0 +1,13 @@ |
|||
from commands.base_command import Command |
|||
from controllers import ScraperController |
|||
|
|||
|
|||
class ListCommand(Command):
    """Command that asks the controller for its registered strategies."""

    def __init__(self, controller: ScraperController):
        """Remember the controller that ``execute`` will query."""
        self.controller = controller

    def execute(self):
        """Return the result of ``controller.list_strategies()`` unchanged."""
        listing = self.controller.list_strategies()
        return listing

    def undo(self):
        """No-op: this command performs no action in ``undo``."""
        return None
|||
@ -0,0 +1,25 @@ |
|||
from commands.base_command import Command |
|||
from controllers import ScraperController |
|||
from exceptions import ScraperException |
|||
|
|||
|
|||
class ScrapeCommand(Command):
    """Command that runs one scraping strategy and saves its result.

    After a successful ``execute`` the instance exposes:
      * ``scrape_result`` - the value returned by ``controller.execute_scrape``
      * ``saved_path``    - the value returned by ``controller.save_data``
    Both are ``None`` until then.
    """

    def __init__(self, controller: ScraperController, strategy_name: str):
        """Bind the command to a controller and the strategy name to run."""
        self.controller = controller
        self.strategy_name = strategy_name
        self.scrape_result = None
        self.saved_path = None

    def execute(self):
        """Scrape with the configured strategy, then persist the result.

        Returns:
            A ``(scrape_result, saved_path)`` tuple.

        Raises:
            Any exception raised by the controller. Nothing is caught here:
            a handler that only re-raises would add a traceback frame
            without changing what the caller sees.
        """
        self.scrape_result = self.controller.execute_scrape(self.strategy_name)
        self.saved_path = self.controller.save_data(
            self.scrape_result, self.strategy_name
        )
        return self.scrape_result, self.saved_path

    def undo(self):
        """Delete the file written by ``execute``, if there is one.

        Returns:
            ``True`` when a path was recorded and ``controller.delete_data``
            reported success; ``False`` otherwise.
        """
        if not self.saved_path:
            # Nothing was saved (execute never ran or failed before saving).
            return False
        if not self.controller.delete_data(self.saved_path):
            return False
        print(f"Successfully undone: deleted {self.saved_path}")
        return True
|||
@ -0,0 +1,3 @@ |
|||
from .scraper_controller import ScraperController |
|||
|
|||
__all__ = ['ScraperController'] |
|||
Binary file not shown.
Binary file not shown.
@ -0,0 +1,112 @@ |
|||
import json |
|||
import os |
|||
from datetime import datetime |
|||
from typing import Dict, List |
|||
|
|||
from strategies import ( |
|||
ScraperStrategy, |
|||
NewsScraperStrategy, |
|||
BooksScraperStrategy, |
|||
TechNewsScraperStrategy |
|||
) |
|||
from models import ScrapedData |
|||
from exceptions import StrategyException, StorageException, ValidationException |
|||
|
|||
|
|||
class ScraperController:
    """Registry of scraping strategies plus JSON persistence of their output.

    Strategies are keyed by their ``name`` attribute. Scraped data is written
    to ``<output_dir>/<strategy_name>/scraped_data_<timestamp>.json``.
    """

    def __init__(self, output_dir: str = "data"):
        """Create the controller and register the built-in strategies.

        Args:
            output_dir: Root directory under which results are stored.
        """
        self.output_dir = output_dir
        self.strategies: Dict[str, ScraperStrategy] = {}
        self._register_default_strategies()

    def _register_default_strategies(self):
        """Register one instance of each built-in strategy."""
        self.register_strategy(NewsScraperStrategy())
        self.register_strategy(BooksScraperStrategy())
        self.register_strategy(TechNewsScraperStrategy())

    def register_strategy(self, strategy: ScraperStrategy):
        """Add ``strategy`` under ``strategy.name``, replacing any previous entry."""
        self.strategies[strategy.name] = strategy

    def get_strategy(self, name: str) -> ScraperStrategy:
        """Return the strategy registered as ``name``.

        Raises:
            StrategyException: if no strategy with that name is registered;
                the message lists the available names.
        """
        if name not in self.strategies:
            available = ', '.join(self.strategies)
            raise StrategyException(
                f"Strategy '{name}' not found. Available: {available}",
                strategy_name=name
            )
        return self.strategies[name]

    def list_strategies(self) -> List[Dict[str, str]]:
        """Return one ``{"name": ..., "source": ...}`` dict per registered strategy."""
        return [
            {"name": s.name, "source": s.source}
            for s in self.strategies.values()
        ]

    def execute_scrape(self, strategy_name: str) -> ScrapedData:
        """Look up ``strategy_name`` and return the result of its ``scrape()``."""
        return self.get_strategy(strategy_name).scrape()

    def save_data(self, data: ScrapedData, strategy_name: str) -> str:
        """Write ``data.to_dict()`` as pretty-printed UTF-8 JSON.

        Args:
            data: Object exposing ``to_dict()``.
            strategy_name: Sub-folder name under ``output_dir``.

        Returns:
            Path of the file that was written.

        Raises:
            StorageException: wrapping (and chained to) any failure while
                creating the folder or writing the file.
        """
        # Bound before the try so the error report below never hits an
        # unbound name, even when building the path itself fails.
        folder_path = self.output_dir
        try:
            folder_path = os.path.join(self.output_dir, strategy_name)
            os.makedirs(folder_path, exist_ok=True)

            # NOTE(review): one-second resolution - two saves for the same
            # strategy within the same second write to the same file.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            file_path = os.path.join(folder_path, f"scraped_data_{timestamp}.json")

            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(data.to_dict(), f, ensure_ascii=False, indent=2)

            return file_path
        except Exception as e:
            raise StorageException(
                f"Failed to save data to {folder_path}",
                file_path=folder_path,
                original_exception=e
            ) from e

    def delete_data(self, file_path: str) -> bool:
        """Remove ``file_path``.

        Returns:
            ``True`` if the file was removed, ``False`` if it did not exist.

        Raises:
            StorageException: wrapping (and chained to) any other failure.
        """
        # Try the removal directly instead of checking existence first, so a
        # file vanishing between check and removal is not reported as an error.
        try:
            os.remove(file_path)
        except FileNotFoundError:
            return False
        except Exception as e:
            raise StorageException(
                f"Failed to delete file {file_path}",
                file_path=file_path,
                original_exception=e
            ) from e
        return True

    def load_data(self, strategy_name: str, filename: str = None) -> Dict:
        """Load a saved JSON file for ``strategy_name``.

        Args:
            strategy_name: Sub-folder of ``output_dir`` to read from.
            filename: File inside that folder. When omitted, the ``.json``
                file whose name sorts last is used.

        Returns:
            The parsed JSON content as returned by ``json.load``.
            NOTE(review): this is the raw parsed value, not a ``ScrapedData``
            instance - confirm whether callers expect a conversion.

        Raises:
            StorageException: if the folder is missing, contains no ``.json``
                files, or the file cannot be read or parsed.
        """
        file_path = None  # known to the error handler even on early failure
        try:
            folder_path = os.path.join(self.output_dir, strategy_name)
            if not os.path.exists(folder_path):
                raise StorageException(
                    f"No data found for strategy '{strategy_name}'",
                    file_path=folder_path
                )

            if filename:
                file_path = os.path.join(folder_path, filename)
            else:
                files = sorted(f for f in os.listdir(folder_path) if f.endswith('.json'))
                if not files:
                    raise StorageException(f"No data files found in {folder_path}")
                file_path = os.path.join(folder_path, files[-1])

            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except StorageException:
            # Already a domain error raised above; pass it through untouched.
            raise
        except Exception as e:
            raise StorageException(
                "Failed to load data",
                file_path=file_path,
                original_exception=e
            ) from e
|||
@ -0,0 +1,21 @@ |
|||
from urllib.request import urlopen, Request |
|||
import re |
|||
|
|||
# Ad-hoc probe: fetch the books.toscrape.com front page and report which of
# two candidate regexes matches the first price element.
# The timeout keeps the probe from hanging forever on a stalled connection,
# and the context manager closes the HTTP response once it has been read.
with urlopen(
    Request('https://books.toscrape.com', headers={'User-Agent': 'Mozilla/5.0'}),
    timeout=10,
) as r:
    html = r.read().decode('utf-8')

# Pattern 1 tolerates extra class names and attributes on the element.
price_search = re.search(r'class="price_color[^"]*"[^>]*>([^<]+)<', html)
if price_search:
    print('Found price pattern 1:', price_search.group(1))
else:
    print('Pattern 1 not found')

# Pattern 2 requires the class attribute to end exactly with price_color.
price_search2 = re.search(r'price_color">([^<]+)<', html)
if price_search2:
    print('Found price pattern 2:', price_search2.group(1))
else:
    print('Pattern 2 not found')

idx = html.find('price_color')
# str.find returns -1 when absent; index 0 is a legitimate hit.
if idx != -1:
    # Clamp the start so a match near the beginning does not produce a
    # negative index, which would slice from the end of the string.
    print('Context around price_color:', html[max(0, idx - 20):idx + 50])
|||
@ -0,0 +1,17 @@ |
|||
from .scraper_exceptions import ( |
|||
ScraperException, |
|||
NetworkException, |
|||
ParseException, |
|||
ValidationException, |
|||
StorageException, |
|||
StrategyException |
|||
) |
|||
|
|||
__all__ = [ |
|||
'ScraperException', |
|||
'NetworkException', |
|||
'ParseException', |
|||
'ValidationException', |
|||
'StorageException', |
|||
'StrategyException' |
|||
] |
|||
Binary file not shown.
Binary file not shown.
@ -0,0 +1,34 @@ |
|||
class ScraperException(Exception):
    """Root of the scraper's exception hierarchy.

    Args:
        message: Human-readable description, forwarded to ``Exception``.
        original_exception: Optional lower-level exception that triggered
            this one. It is kept on ``original_exception`` and, when it is
            an actual exception instance, also installed as ``__cause__`` so
            tracebacks show the root cause even if the raising code did not
            use ``raise ... from``.
    """

    def __init__(self, message, original_exception=None):
        super().__init__(message)
        self.original_exception = original_exception
        # __cause__ only accepts exception instances (or None); skip anything
        # else rather than failing while constructing the error itself.
        if isinstance(original_exception, BaseException):
            self.__cause__ = original_exception


class NetworkException(ScraperException):
    """Scraper error that additionally carries an optional ``status_code``."""

    def __init__(self, message, status_code=None, original_exception=None):
        super().__init__(message, original_exception)
        self.status_code = status_code


class ParseException(ScraperException):
    """Scraper error that additionally carries an optional ``selector``."""

    def __init__(self, message, selector=None, original_exception=None):
        super().__init__(message, original_exception)
        self.selector = selector


class ValidationException(ScraperException):
    """Scraper error that additionally carries an optional ``field``."""

    def __init__(self, message, field=None, original_exception=None):
        super().__init__(message, original_exception)
        self.field = field


class StorageException(ScraperException):
    """Scraper error that additionally carries an optional ``file_path``."""

    def __init__(self, message, file_path=None, original_exception=None):
        super().__init__(message, original_exception)
        self.file_path = file_path


class StrategyException(ScraperException):
    """Scraper error that additionally carries an optional ``strategy_name``."""

    def __init__(self, message, strategy_name=None, original_exception=None):
        super().__init__(message, original_exception)
        self.strategy_name = strategy_name
|||
@ -0,0 +1,83 @@ |
|||
# Java Web Scraper |
|||
|
|||
A complete web scraping application demonstrating: |
|||
- **CLI Interface** |
|||
- **MVC Architecture** |
|||
- **Command Pattern** |
|||
- **Strategy Pattern** |
|||
- **Custom Exception Hierarchy** |
|||
|
|||
## Features |
|||
|
|||
- 3 different scraping strategies: |
|||
- `news_scraper` - Scrapes quotes from http://quotes.toscrape.com |
|||
- `books_scraper` - Scrapes books from https://books.toscrape.com |
|||
- `tech_news_scraper` - Scrapes news from https://www.bbc.com/news |
|||
- Saves data to JSON files |
|||
- Command-line interface |
|||
- Extensible architecture |
|||
|
|||
## Building |
|||
|
|||
```bash |
|||
cd java-scraper |
|||
mvn clean package |
|||
``` |
|||
|
|||
## Usage |
|||
|
|||
### List available scrapers: |
|||
```bash |
|||
mvn exec:java -Dexec.mainClass="com.scraper.Main" -Dexec.args="list" |
|||
``` |
|||
|
|||
### Scrape using a specific strategy: |
|||
```bash |
|||
mvn exec:java -Dexec.mainClass="com.scraper.Main" -Dexec.args="scrape news_scraper" |
|||
``` |
|||
|
|||
### Scrape all: |
|||
```bash |
|||
mvn exec:java -Dexec.mainClass="com.scraper.Main" -Dexec.args="scrape all" |
|||
``` |
|||
|
|||
### Custom output directory: |
|||
```bash |
|||
mvn exec:java -Dexec.mainClass="com.scraper.Main" -Dexec.args="scrape news_scraper --output my_data" |
|||
``` |
|||
|
|||
### Using the built JAR: |
|||
```bash |
|||
java -jar target/java-scraper-1.0-SNAPSHOT.jar list |
|||
java -jar target/java-scraper-1.0-SNAPSHOT.jar scrape news_scraper |
|||
``` |
|||
|
|||
## Architecture |
|||
|
|||
### MVC |
|||
- **Model**: `ScrapedItem`, `ScrapedData` |
|||
- **View**: `ConsoleView` |
|||
- **Controller**: `ScraperController` |
|||
|
|||
### Command Pattern |
|||
- `Command` interface |
|||
- `ScrapeCommand` |
|||
- `ListCommand` |
|||
|
|||
### Strategy Pattern |
|||
- `ScraperStrategy` interface |
|||
- `NewsScraperStrategy` |
|||
- `BooksScraperStrategy` |
|||
- `TechNewsScraperStrategy` |
|||
|
|||
### Exception Hierarchy |
|||
- `ScraperException` (base) |
|||
- `NetworkException` |
|||
- `ParseException` |
|||
- `StorageException` |
|||
- `StrategyException` |
|||
|
|||
## Requirements |
|||
|
|||
- Java 11 or higher |
|||
- Maven |
|||
@ -0,0 +1,7 @@ |
|||
{ |
|||
"source": "https://books.toscrape.com", |
|||
"strategy_name": "books_scraper", |
|||
"items": [], |
|||
"scraped_at": "2026-05-31T10:46:46.169175", |
|||
"total_items": 0 |
|||
} |
|||
@ -0,0 +1,128 @@ |
|||
{ |
|||
"source": "https://books.toscrape.com", |
|||
"strategy_name": "books_scraper", |
|||
"items": [ |
|||
{ |
|||
"title": "A Light in the Attic", |
|||
"content": "Price: £51.77", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:48:56.489985" |
|||
}, |
|||
{ |
|||
"title": "Tipping the Velvet", |
|||
"content": "Price: £53.74", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:48:56.489997" |
|||
}, |
|||
{ |
|||
"title": "Soumission", |
|||
"content": "Price: £50.10", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:48:56.490001" |
|||
}, |
|||
{ |
|||
"title": "Sharp Objects", |
|||
"content": "Price: £47.82", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:48:56.490004" |
|||
}, |
|||
{ |
|||
"title": "Sapiens: A Brief History of Humankind", |
|||
"content": "Price: £54.23", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:48:56.490005" |
|||
}, |
|||
{ |
|||
"title": "The Requiem Red", |
|||
"content": "Price: £22.65", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:48:56.490007" |
|||
}, |
|||
{ |
|||
"title": "The Dirty Little Secrets of Getting Your Dream Job", |
|||
"content": "Price: £33.34", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:48:56.490009" |
|||
}, |
|||
{ |
|||
"title": "The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull", |
|||
"content": "Price: £17.93", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:48:56.490011" |
|||
}, |
|||
{ |
|||
"title": "The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics", |
|||
"content": "Price: £22.60", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:48:56.490012" |
|||
}, |
|||
{ |
|||
"title": "The Black Maria", |
|||
"content": "Price: £52.15", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:48:56.490014" |
|||
}, |
|||
{ |
|||
"title": "Starving Hearts (Triangular Trade Trilogy, #1)", |
|||
"content": "Price: £13.99", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:48:56.490015" |
|||
}, |
|||
{ |
|||
"title": "Shakespeare's Sonnets", |
|||
"content": "Price: £20.66", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:48:56.490017" |
|||
}, |
|||
{ |
|||
"title": "Set Me Free", |
|||
"content": "Price: £17.46", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:48:56.490019" |
|||
}, |
|||
{ |
|||
"title": "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)", |
|||
"content": "Price: £52.29", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:48:56.490020" |
|||
}, |
|||
{ |
|||
"title": "Rip it Up and Start Again", |
|||
"content": "Price: £35.02", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:48:56.490022" |
|||
}, |
|||
{ |
|||
"title": "Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991", |
|||
"content": "Price: £57.25", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:48:56.490023" |
|||
}, |
|||
{ |
|||
"title": "Olio", |
|||
"content": "Price: £23.88", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:48:56.490025" |
|||
}, |
|||
{ |
|||
"title": "Mesaerion: The Best Science Fiction Stories 1800-1849", |
|||
"content": "Price: £37.59", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:48:56.490035" |
|||
}, |
|||
{ |
|||
"title": "Libertarianism for Beginners", |
|||
"content": "Price: £51.33", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:48:56.490037" |
|||
}, |
|||
{ |
|||
"title": "It's Only the Himalayas", |
|||
"content": "Price: £45.17", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:48:56.490038" |
|||
} |
|||
], |
|||
"scraped_at": "2026-05-31T10:48:54.348792", |
|||
"total_items": 20 |
|||
} |
|||
@ -0,0 +1,128 @@ |
|||
{ |
|||
"source": "https://books.toscrape.com", |
|||
"strategy_name": "books_scraper", |
|||
"items": [ |
|||
{ |
|||
"title": "A Light in the Attic", |
|||
"content": "Price: £51.77", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:50:31.674011" |
|||
}, |
|||
{ |
|||
"title": "Tipping the Velvet", |
|||
"content": "Price: £53.74", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:50:31.674021" |
|||
}, |
|||
{ |
|||
"title": "Soumission", |
|||
"content": "Price: £50.10", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:50:31.674024" |
|||
}, |
|||
{ |
|||
"title": "Sharp Objects", |
|||
"content": "Price: £47.82", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:50:31.674026" |
|||
}, |
|||
{ |
|||
"title": "Sapiens: A Brief History of Humankind", |
|||
"content": "Price: £54.23", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:50:31.674028" |
|||
}, |
|||
{ |
|||
"title": "The Requiem Red", |
|||
"content": "Price: £22.65", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:50:31.674029" |
|||
}, |
|||
{ |
|||
"title": "The Dirty Little Secrets of Getting Your Dream Job", |
|||
"content": "Price: £33.34", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:50:31.674031" |
|||
}, |
|||
{ |
|||
"title": "The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull", |
|||
"content": "Price: £17.93", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:50:31.674032" |
|||
}, |
|||
{ |
|||
"title": "The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics", |
|||
"content": "Price: £22.60", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:50:31.674033" |
|||
}, |
|||
{ |
|||
"title": "The Black Maria", |
|||
"content": "Price: £52.15", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:50:31.674034" |
|||
}, |
|||
{ |
|||
"title": "Starving Hearts (Triangular Trade Trilogy, #1)", |
|||
"content": "Price: £13.99", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:50:31.674035" |
|||
}, |
|||
{ |
|||
"title": "Shakespeare's Sonnets", |
|||
"content": "Price: £20.66", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:50:31.674041" |
|||
}, |
|||
{ |
|||
"title": "Set Me Free", |
|||
"content": "Price: £17.46", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:50:31.674043" |
|||
}, |
|||
{ |
|||
"title": "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)", |
|||
"content": "Price: £52.29", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:50:31.674044" |
|||
}, |
|||
{ |
|||
"title": "Rip it Up and Start Again", |
|||
"content": "Price: £35.02", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:50:31.674045" |
|||
}, |
|||
{ |
|||
"title": "Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991", |
|||
"content": "Price: £57.25", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:50:31.674046" |
|||
}, |
|||
{ |
|||
"title": "Olio", |
|||
"content": "Price: £23.88", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:50:31.674046" |
|||
}, |
|||
{ |
|||
"title": "Mesaerion: The Best Science Fiction Stories 1800-1849", |
|||
"content": "Price: £37.59", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:50:31.674048" |
|||
}, |
|||
{ |
|||
"title": "Libertarianism for Beginners", |
|||
"content": "Price: £51.33", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:50:31.674049" |
|||
}, |
|||
{ |
|||
"title": "It's Only the Himalayas", |
|||
"content": "Price: £45.17", |
|||
"url": "https://books.toscrape.com", |
|||
"timestamp": "2026-05-31T10:50:31.674050" |
|||
} |
|||
], |
|||
"scraped_at": "2026-05-31T10:50:29.355948", |
|||
"total_items": 20 |
|||
} |
|||
@ -0,0 +1,68 @@ |
|||
{ |
|||
"source": "http://quotes.toscrape.com", |
|||
"strategy_name": "news_scraper", |
|||
"items": [ |
|||
{ |
|||
"title": "temprop=", |
|||
"content": "temprop=", |
|||
"url": "http://quotes.toscrape.com", |
|||
"timestamp": "2026-05-31T10:43:48.501000" |
|||
}, |
|||
{ |
|||
"title": "temprop=", |
|||
"content": "temprop=", |
|||
"url": "http://quotes.toscrape.com", |
|||
"timestamp": "2026-05-31T10:43:48.501016" |
|||
}, |
|||
{ |
|||
"title": "temprop=", |
|||
"content": "temprop=", |
|||
"url": "http://quotes.toscrape.com", |
|||
"timestamp": "2026-05-31T10:43:48.501021" |
|||
}, |
|||
{ |
|||
"title": "temprop=", |
|||
"content": "temprop=", |
|||
"url": "http://quotes.toscrape.com", |
|||
"timestamp": "2026-05-31T10:43:48.501024" |
|||
}, |
|||
{ |
|||
"title": "temprop=", |
|||
"content": "temprop=", |
|||
"url": "http://quotes.toscrape.com", |
|||
"timestamp": "2026-05-31T10:43:48.501026" |
|||
}, |
|||
{ |
|||
"title": "temprop=", |
|||
"content": "temprop=", |
|||
"url": "http://quotes.toscrape.com", |
|||
"timestamp": "2026-05-31T10:43:48.501028" |
|||
}, |
|||
{ |
|||
"title": "temprop=", |
|||
"content": "temprop=", |
|||
"url": "http://quotes.toscrape.com", |
|||
"timestamp": "2026-05-31T10:43:48.501030" |
|||
}, |
|||
{ |
|||
"title": "temprop=", |
|||
"content": "temprop=", |
|||
"url": "http://quotes.toscrape.com", |
|||
"timestamp": "2026-05-31T10:43:48.501032" |
|||
}, |
|||
{ |
|||
"title": "temprop=", |
|||
"content": "temprop=", |
|||
"url": "http://quotes.toscrape.com", |
|||
"timestamp": "2026-05-31T10:43:48.501034" |
|||
}, |
|||
{ |
|||
"title": "temprop=", |
|||
"content": "temprop=", |
|||
"url": "http://quotes.toscrape.com", |
|||
"timestamp": "2026-05-31T10:43:48.501036" |
|||
} |
|||
], |
|||
"scraped_at": "2026-05-31T10:43:45.907587", |
|||
"total_items": 10 |
|||
} |
|||
@ -0,0 +1,7 @@ |
|||
{ |
|||
"source": "http://quotes.toscrape.com", |
|||
"strategy_name": "news_scraper", |
|||
"items": [], |
|||
"scraped_at": "2026-05-31T10:45:10.355276", |
|||
"total_items": 0 |
|||
} |
|||
@ -0,0 +1,68 @@ |
|||
{ |
|||
"source": "http://quotes.toscrape.com", |
|||
"strategy_name": "news_scraper", |
|||
"items": [ |
|||
{ |
|||
"title": "Quote by Albert Einstein", |
|||
"content": "“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”", |
|||
"url": "http://quotes.toscrape.com", |
|||
"timestamp": "2026-05-31T10:46:20.434224" |
|||
}, |
|||
{ |
|||
"title": "Quote by J.K. Rowling", |
|||
"content": "“It is our choices, Harry, that show what we truly are, far more than our abilities.”", |
|||
"url": "http://quotes.toscrape.com", |
|||
"timestamp": "2026-05-31T10:46:20.434236" |
|||
}, |
|||
{ |
|||
"title": "Quote by Albert Einstein", |
|||
"content": "“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”", |
|||
"url": "http://quotes.toscrape.com", |
|||
"timestamp": "2026-05-31T10:46:20.434250" |
|||
}, |
|||
{ |
|||
"title": "Quote by Jane Austen", |
|||
"content": "“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”", |
|||
"url": "http://quotes.toscrape.com", |
|||
"timestamp": "2026-05-31T10:46:20.434253" |
|||
}, |
|||
{ |
|||
"title": "Quote by Marilyn Monroe", |
|||
"content": "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”", |
|||
"url": "http://quotes.toscrape.com", |
|||
"timestamp": "2026-05-31T10:46:20.434255" |
|||
}, |
|||
{ |
|||
"title": "Quote by Albert Einstein", |
|||
"content": "“Try not to become a man of success. Rather become a man of value.”", |
|||
"url": "http://quotes.toscrape.com", |
|||
"timestamp": "2026-05-31T10:46:20.434257" |
|||
}, |
|||
{ |
|||
"title": "Quote by André Gide", |
|||
"content": "“It is better to be hated for what you are than to be loved for what you are not.”", |
|||
"url": "http://quotes.toscrape.com", |
|||
"timestamp": "2026-05-31T10:46:20.434259" |
|||
}, |
|||
{ |
|||
"title": "Quote by Thomas A. Edison", |
|||
"content": "“I have not failed. I've just found 10,000 ways that won't work.”", |
|||
"url": "http://quotes.toscrape.com", |
|||
"timestamp": "2026-05-31T10:46:20.434261" |
|||
}, |
|||
{ |
|||
"title": "Quote by Eleanor Roosevelt", |
|||
"content": "“A woman is like a tea bag; you never know how strong it is until it's in hot water.”", |
|||
"url": "http://quotes.toscrape.com", |
|||
"timestamp": "2026-05-31T10:46:20.434262" |
|||
}, |
|||
{ |
|||
"title": "Quote by Steve Martin", |
|||
"content": "“A day without sunshine is like, you know, night.”", |
|||
"url": "http://quotes.toscrape.com", |
|||
"timestamp": "2026-05-31T10:46:20.434264" |
|||
} |
|||
], |
|||
"scraped_at": "2026-05-31T10:46:18.193675", |
|||
"total_items": 10 |
|||
} |
|||
@ -0,0 +1,50 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project xmlns="http://maven.apache.org/POM/4.0.0" |
|||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
|||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> |
|||
<modelVersion>4.0.0</modelVersion> |
|||
|
|||
<groupId>com.scraper</groupId> |
|||
<artifactId>java-scraper</artifactId> |
|||
<version>1.0-SNAPSHOT</version> |
|||
|
|||
<properties> |
|||
<maven.compiler.source>11</maven.compiler.source> |
|||
<maven.compiler.target>11</maven.compiler.target> |
|||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> |
|||
</properties> |
|||
|
|||
<dependencies> |
|||
<!-- JSON serialization (Gson) --> |
|||
<dependency> |
|||
<groupId>com.google.code.gson</groupId> |
|||
<artifactId>gson</artifactId> |
|||
<version>2.10.1</version> |
|||
</dependency> |
|||
</dependencies> |
|||
|
|||
<build> |
|||
<plugins> |
|||
<plugin> |
|||
<groupId>org.apache.maven.plugins</groupId> |
|||
<artifactId>maven-shade-plugin</artifactId> |
|||
<version>3.5.0</version> |
|||
<executions> |
|||
<execution> |
|||
<phase>package</phase> |
|||
<goals> |
|||
<goal>shade</goal> |
|||
</goals> |
|||
<configuration> |
|||
<transformers> |
|||
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> |
|||
<mainClass>com.scraper.Main</mainClass> |
|||
</transformer> |
|||
</transformers> |
|||
</configuration> |
|||
</execution> |
|||
</executions> |
|||
</plugin> |
|||
</plugins> |
|||
</build> |
|||
</project> |
|||
@ -0,0 +1,108 @@ |
|||
package com.scraper; |
|||
|
|||
import com.scraper.command.ListCommand; |
|||
import com.scraper.command.ScrapeCommand; |
|||
import com.scraper.controller.ScraperController; |
|||
import com.scraper.exception.ScraperException; |
|||
import com.scraper.view.ConsoleView; |
|||
|
|||
/** |
|||
* Main CLI application - Entry point for the scraper |
|||
*/ |
|||
public class Main { |
|||
|
|||
private ScraperController controller; |
|||
private ConsoleView view; |
|||
|
|||
public Main() { |
|||
this.controller = new ScraperController(); |
|||
this.view = new ConsoleView(); |
|||
} |
|||
|
|||
public static void main(String[] args) { |
|||
Main app = new Main(); |
|||
app.run(args); |
|||
} |
|||
|
|||
public void run(String[] args) { |
|||
if (args.length == 0) { |
|||
printHelp(); |
|||
return; |
|||
} |
|||
|
|||
try { |
|||
switch (args[0]) { |
|||
case "list": |
|||
handleList(); |
|||
break; |
|||
case "scrape": |
|||
handleScrape(args); |
|||
break; |
|||
case "help": |
|||
default: |
|||
printHelp(); |
|||
break; |
|||
} |
|||
} catch (ScraperException e) { |
|||
view.displayError(e.getMessage()); |
|||
if (e.getCause() != null) { |
|||
view.displayError("Cause: " + e.getCause().getMessage()); |
|||
} |
|||
System.exit(1); |
|||
} catch (Exception e) { |
|||
view.displayError("Unexpected error: " + e.getMessage()); |
|||
System.exit(1); |
|||
} |
|||
} |
|||
|
|||
private void handleList() throws ScraperException { |
|||
ListCommand cmd = new ListCommand(controller); |
|||
cmd.execute(); |
|||
view.displayStrategies(cmd.getStrategies()); |
|||
} |
|||
|
|||
private void handleScrape(String[] args) throws ScraperException { |
|||
if (args.length < 2) { |
|||
view.displayError("Please specify a scraper to use."); |
|||
printHelp(); |
|||
return; |
|||
} |
|||
|
|||
String strategyName = args[1]; |
|||
String outputDir = "data"; |
|||
if (args.length >= 4 && "--output".equals(args[2])) { |
|||
outputDir = args[3]; |
|||
controller.setOutputDir(outputDir); |
|||
} |
|||
|
|||
if ("all".equals(strategyName)) { |
|||
ListCommand listCmd = new ListCommand(controller); |
|||
listCmd.execute(); |
|||
for (java.util.Map<String, String> strategy : listCmd.getStrategies()) { |
|||
scrapeSingle(strategy.get("name")); |
|||
} |
|||
} else { |
|||
scrapeSingle(strategyName); |
|||
} |
|||
} |
|||
|
|||
private void scrapeSingle(String strategyName) throws ScraperException { |
|||
ScrapeCommand cmd = new ScrapeCommand(controller, strategyName); |
|||
cmd.execute(); |
|||
view.displaySuccess("Scraped " + cmd.getScrapedData().getTotalItems() + " items using " + strategyName); |
|||
view.displayScrapedData(cmd.getScrapedData(), cmd.getSavedPath()); |
|||
} |
|||
|
|||
private void printHelp() { |
|||
System.out.println("=== Web Scraper CLI - MVC + Command Pattern + Strategy Pattern ==="); |
|||
System.out.println(); |
|||
System.out.println("Usage:"); |
|||
System.out.println(" java -jar java-scraper.jar list - List all available scrapers"); |
|||
System.out.println(" java -jar java-scraper.jar scrape <scraper> - Scrape data using specific scraper"); |
|||
System.out.println(" java -jar java-scraper.jar scrape all - Scrape data from all scrapers"); |
|||
System.out.println(" java -jar java-scraper.jar scrape <scraper> --output <dir> - Specify output directory"); |
|||
System.out.println(" java -jar java-scraper.jar help - Show this help message"); |
|||
System.out.println(); |
|||
System.out.println("Available scrapers: news_scraper, books_scraper, tech_news_scraper"); |
|||
} |
|||
} |
|||
@ -0,0 +1,11 @@ |
|||
package com.scraper.command; |
|||
|
|||
import com.scraper.exception.ScraperException; |
|||
|
|||
/** |
|||
* Command interface for Command pattern |
|||
*/ |
|||
public interface Command { |
|||
void execute() throws ScraperException; |
|||
void undo() throws ScraperException; |
|||
} |
|||
@ -0,0 +1,33 @@ |
|||
package com.scraper.command; |
|||
|
|||
import com.scraper.controller.ScraperController; |
|||
|
|||
import java.util.List; |
|||
import java.util.Map; |
|||
|
|||
/** |
|||
* Command to list all available scraping strategies |
|||
*/ |
|||
public class ListCommand implements Command { |
|||
|
|||
private ScraperController controller; |
|||
private List<Map<String, String>> strategies; |
|||
|
|||
public ListCommand(ScraperController controller) { |
|||
this.controller = controller; |
|||
} |
|||
|
|||
@Override |
|||
public void execute() { |
|||
strategies = controller.listStrategies(); |
|||
} |
|||
|
|||
@Override |
|||
public void undo() { |
|||
// List command doesn't support undo
|
|||
} |
|||
|
|||
public List<Map<String, String>> getStrategies() { |
|||
return strategies; |
|||
} |
|||
} |
|||
@ -0,0 +1,42 @@ |
|||
package com.scraper.command; |
|||
|
|||
import com.scraper.controller.ScraperController; |
|||
import com.scraper.exception.ScraperException; |
|||
import com.scraper.model.ScrapedData; |
|||
|
|||
/** |
|||
* Command to scrape data from a specific strategy |
|||
*/ |
|||
public class ScrapeCommand implements Command { |
|||
|
|||
private ScraperController controller; |
|||
private String strategyName; |
|||
private ScrapedData scrapedData; |
|||
private String savedPath; |
|||
|
|||
public ScrapeCommand(ScraperController controller, String strategyName) { |
|||
this.controller = controller; |
|||
this.strategyName = strategyName; |
|||
} |
|||
|
|||
@Override |
|||
public void execute() throws ScraperException { |
|||
scrapedData = controller.executeScrape(strategyName); |
|||
savedPath = controller.saveData(scrapedData, strategyName); |
|||
} |
|||
|
|||
@Override |
|||
public void undo() throws ScraperException { |
|||
if (savedPath != null) { |
|||
controller.deleteData(savedPath); |
|||
} |
|||
} |
|||
|
|||
public ScrapedData getScrapedData() { |
|||
return scrapedData; |
|||
} |
|||
|
|||
public String getSavedPath() { |
|||
return savedPath; |
|||
} |
|||
} |
|||
@ -0,0 +1,138 @@ |
|||
package com.scraper.controller; |
|||
|
|||
import com.google.gson.Gson; |
|||
import com.google.gson.GsonBuilder; |
|||
import com.scraper.exception.StorageException; |
|||
import com.scraper.exception.StrategyException; |
|||
import com.scraper.model.ScrapedData; |
|||
import com.scraper.strategy.BooksScraperStrategy; |
|||
import com.scraper.strategy.NewsScraperStrategy; |
|||
import com.scraper.strategy.ScraperStrategy; |
|||
import com.scraper.strategy.TechNewsScraperStrategy; |
|||
|
|||
import java.io.File; |
|||
import java.io.FileWriter; |
|||
import java.io.IOException; |
|||
import java.nio.file.Files; |
|||
import java.nio.file.Path; |
|||
import java.nio.file.Paths; |
|||
import java.time.LocalDateTime; |
|||
import java.time.format.DateTimeFormatter; |
|||
import java.util.ArrayList; |
|||
import java.util.HashMap; |
|||
import java.util.List; |
|||
import java.util.Map; |
|||
|
|||
/** |
|||
* MVC Controller for the scraper application |
|||
*/ |
|||
public class ScraperController { |
|||
|
|||
private String outputDir; |
|||
private Map<String, ScraperStrategy> strategies; |
|||
private Gson gson; |
|||
|
|||
public ScraperController() { |
|||
this("data"); |
|||
} |
|||
|
|||
public ScraperController(String outputDir) { |
|||
this.outputDir = outputDir; |
|||
this.strategies = new HashMap<>(); |
|||
this.gson = new GsonBuilder().setPrettyPrinting().create(); |
|||
registerDefaultStrategies(); |
|||
} |
|||
|
|||
private void registerDefaultStrategies() { |
|||
registerStrategy(new NewsScraperStrategy()); |
|||
registerStrategy(new BooksScraperStrategy()); |
|||
registerStrategy(new TechNewsScraperStrategy()); |
|||
} |
|||
|
|||
public void registerStrategy(ScraperStrategy strategy) { |
|||
strategies.put(strategy.getName(), strategy); |
|||
} |
|||
|
|||
public List<Map<String, String>> listStrategies() { |
|||
List<Map<String, String>> result = new ArrayList<>(); |
|||
for (ScraperStrategy strategy : strategies.values()) { |
|||
Map<String, String> info = new HashMap<>(); |
|||
info.put("name", strategy.getName()); |
|||
info.put("source", strategy.getSource()); |
|||
result.add(info); |
|||
} |
|||
return result; |
|||
} |
|||
|
|||
public ScrapedData executeScrape(String strategyName) throws StrategyException { |
|||
ScraperStrategy strategy = strategies.get(strategyName); |
|||
if (strategy == null) { |
|||
String available = String.join(", ", strategies.keySet()); |
|||
throw new StrategyException( |
|||
"Strategy '" + strategyName + "' not found. Available: " + available, |
|||
strategyName, |
|||
null |
|||
); |
|||
} |
|||
|
|||
try { |
|||
return strategy.scrape(); |
|||
} catch (Exception e) { |
|||
if (e instanceof StrategyException) { |
|||
throw (StrategyException) e; |
|||
} |
|||
throw new StrategyException( |
|||
"Error executing strategy: " + strategyName, |
|||
strategyName, |
|||
e |
|||
); |
|||
} |
|||
} |
|||
|
|||
public String saveData(ScrapedData data, String strategyName) throws StorageException { |
|||
try { |
|||
String folderPath = outputDir + File.separator + strategyName; |
|||
Path folder = Paths.get(folderPath); |
|||
Files.createDirectories(folder); |
|||
|
|||
String timestamp = LocalDateTime.now().format( |
|||
DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss") |
|||
); |
|||
String filename = "scraped_data_" + timestamp + ".json"; |
|||
String filePath = folderPath + File.separator + filename; |
|||
|
|||
try (FileWriter writer = new FileWriter(filePath)) { |
|||
gson.toJson(data, writer); |
|||
} |
|||
|
|||
return filePath; |
|||
} catch (IOException e) { |
|||
throw new StorageException( |
|||
"Failed to save data to: " + outputDir, |
|||
outputDir, |
|||
e |
|||
); |
|||
} |
|||
} |
|||
|
|||
public boolean deleteData(String filePath) throws StorageException { |
|||
try { |
|||
Path path = Paths.get(filePath); |
|||
if (Files.exists(path)) { |
|||
Files.delete(path); |
|||
return true; |
|||
} |
|||
return false; |
|||
} catch (IOException e) { |
|||
throw new StorageException( |
|||
"Failed to delete file: " + filePath, |
|||
filePath, |
|||
e |
|||
); |
|||
} |
|||
} |
|||
|
|||
public void setOutputDir(String outputDir) { |
|||
this.outputDir = outputDir; |
|||
} |
|||
} |
|||
@ -0,0 +1,27 @@ |
|||
package com.scraper.exception; |
|||
|
|||
/** |
|||
* Exception thrown when there is a network related error |
|||
*/ |
|||
public class NetworkException extends ScraperException { |
|||
|
|||
private static final long serialVersionUID = 1L; |
|||
private Integer statusCode; |
|||
|
|||
public NetworkException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public NetworkException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
|
|||
public NetworkException(String message, Integer statusCode, Throwable cause) { |
|||
super(message, cause); |
|||
this.statusCode = statusCode; |
|||
} |
|||
|
|||
public Integer getStatusCode() { |
|||
return statusCode; |
|||
} |
|||
} |
|||
@ -0,0 +1,27 @@ |
|||
package com.scraper.exception; |
|||
|
|||
/** |
|||
* Exception thrown when there is a parsing error |
|||
*/ |
|||
public class ParseException extends ScraperException { |
|||
|
|||
private static final long serialVersionUID = 1L; |
|||
private String selector; |
|||
|
|||
public ParseException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public ParseException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
|
|||
public ParseException(String message, String selector, Throwable cause) { |
|||
super(message, cause); |
|||
this.selector = selector; |
|||
} |
|||
|
|||
public String getSelector() { |
|||
return selector; |
|||
} |
|||
} |
|||
@ -0,0 +1,17 @@ |
|||
package com.scraper.exception; |
|||
|
|||
/**
 * Root of the scraper application's checked-exception hierarchy.
 */
public class ScraperException extends Exception {

    private static final long serialVersionUID = 1L;

    /**
     * Creates an exception with a message and no cause.
     *
     * @param message description of the failure
     */
    public ScraperException(String message) {
        super(message);
    }

    /**
     * Creates an exception with a message and an underlying cause.
     *
     * @param message description of the failure
     * @param cause   underlying exception
     */
    public ScraperException(String message, Throwable cause) {
        super(message, cause);
    }
}
|||
@ -0,0 +1,27 @@ |
|||
package com.scraper.exception; |
|||
|
|||
/** |
|||
* Exception thrown when there is a storage related error |
|||
*/ |
|||
public class StorageException extends ScraperException { |
|||
|
|||
private static final long serialVersionUID = 1L; |
|||
private String filePath; |
|||
|
|||
public StorageException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public StorageException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
|
|||
public StorageException(String message, String filePath, Throwable cause) { |
|||
super(message, cause); |
|||
this.filePath = filePath; |
|||
} |
|||
|
|||
public String getFilePath() { |
|||
return filePath; |
|||
} |
|||
} |
|||
@ -0,0 +1,27 @@ |
|||
package com.scraper.exception; |
|||
|
|||
/** |
|||
* Exception thrown when there is a strategy related error |
|||
*/ |
|||
public class StrategyException extends ScraperException { |
|||
|
|||
private static final long serialVersionUID = 1L; |
|||
private String strategyName; |
|||
|
|||
public StrategyException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public StrategyException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
|
|||
public StrategyException(String message, String strategyName, Throwable cause) { |
|||
super(message, cause); |
|||
this.strategyName = strategyName; |
|||
} |
|||
|
|||
public String getStrategyName() { |
|||
return strategyName; |
|||
} |
|||
} |
|||
@ -0,0 +1,77 @@ |
|||
package com.scraper.model; |
|||
|
|||
import java.time.LocalDateTime; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
/** |
|||
* Model class representing the complete scraped data container |
|||
*/ |
|||
public class ScrapedData { |
|||
private String source; |
|||
private String strategyName; |
|||
private List<ScrapedItem> items; |
|||
private LocalDateTime scrapedAt; |
|||
private int totalItems; |
|||
|
|||
public ScrapedData() { |
|||
this.items = new ArrayList<>(); |
|||
this.scrapedAt = LocalDateTime.now(); |
|||
this.totalItems = 0; |
|||
} |
|||
|
|||
public ScrapedData(String source, String strategyName) { |
|||
this.source = source; |
|||
this.strategyName = strategyName; |
|||
this.items = new ArrayList<>(); |
|||
this.scrapedAt = LocalDateTime.now(); |
|||
this.totalItems = 0; |
|||
} |
|||
|
|||
public void addItem(ScrapedItem item) { |
|||
this.items.add(item); |
|||
this.totalItems = this.items.size(); |
|||
} |
|||
|
|||
// Getters and Setters
|
|||
public String getSource() { |
|||
return source; |
|||
} |
|||
|
|||
public void setSource(String source) { |
|||
this.source = source; |
|||
} |
|||
|
|||
public String getStrategyName() { |
|||
return strategyName; |
|||
} |
|||
|
|||
public void setStrategyName(String strategyName) { |
|||
this.strategyName = strategyName; |
|||
} |
|||
|
|||
public List<ScrapedItem> getItems() { |
|||
return items; |
|||
} |
|||
|
|||
public void setItems(List<ScrapedItem> items) { |
|||
this.items = items; |
|||
this.totalItems = items.size(); |
|||
} |
|||
|
|||
public LocalDateTime getScrapedAt() { |
|||
return scrapedAt; |
|||
} |
|||
|
|||
public void setScrapedAt(LocalDateTime scrapedAt) { |
|||
this.scrapedAt = scrapedAt; |
|||
} |
|||
|
|||
public int getTotalItems() { |
|||
return totalItems; |
|||
} |
|||
|
|||
public void setTotalItems(int totalItems) { |
|||
this.totalItems = totalItems; |
|||
} |
|||
} |
|||
@ -0,0 +1,57 @@ |
|||
package com.scraper.model; |
|||
|
|||
import java.time.LocalDateTime; |
|||
|
|||
/** |
|||
* Model class representing a single scraped item |
|||
*/ |
|||
public class ScrapedItem { |
|||
private String title; |
|||
private String content; |
|||
private String url; |
|||
private LocalDateTime timestamp; |
|||
|
|||
public ScrapedItem() { |
|||
this.timestamp = LocalDateTime.now(); |
|||
} |
|||
|
|||
public ScrapedItem(String title, String content, String url) { |
|||
this.title = title; |
|||
this.content = content; |
|||
this.url = url; |
|||
this.timestamp = LocalDateTime.now(); |
|||
} |
|||
|
|||
// Getters and Setters
|
|||
public String getTitle() { |
|||
return title; |
|||
} |
|||
|
|||
public void setTitle(String title) { |
|||
this.title = title; |
|||
} |
|||
|
|||
public String getContent() { |
|||
return content; |
|||
} |
|||
|
|||
public void setContent(String content) { |
|||
this.content = content; |
|||
} |
|||
|
|||
public String getUrl() { |
|||
return url; |
|||
} |
|||
|
|||
public void setUrl(String url) { |
|||
this.url = url; |
|||
} |
|||
|
|||
public LocalDateTime getTimestamp() { |
|||
return timestamp; |
|||
} |
|||
|
|||
public void setTimestamp(LocalDateTime timestamp) { |
|||
this.timestamp = timestamp; |
|||
} |
|||
} |
|||
@ -0,0 +1,102 @@ |
|||
package com.scraper.strategy; |
|||
|
|||
import com.scraper.exception.NetworkException; |
|||
import com.scraper.exception.ParseException; |
|||
import com.scraper.model.ScrapedData; |
|||
import com.scraper.model.ScrapedItem; |
|||
|
|||
import java.io.BufferedReader; |
|||
import java.io.InputStreamReader; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.URL; |
|||
import java.nio.charset.StandardCharsets; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
/** |
|||
* Strategy for scraping books from https://books.toscrape.com
|
|||
*/ |
|||
public class BooksScraperStrategy implements ScraperStrategy { |
|||
|
|||
private static final String NAME = "books_scraper"; |
|||
private static final String SOURCE = "https://books.toscrape.com"; |
|||
|
|||
@Override |
|||
public ScrapedData scrape() throws NetworkException, ParseException { |
|||
ScrapedData data = new ScrapedData(SOURCE, NAME); |
|||
|
|||
try { |
|||
String html = fetchHTML(SOURCE); |
|||
parseBooks(html, data); |
|||
} catch (NetworkException e) { |
|||
throw e; |
|||
} catch (Exception e) { |
|||
throw new ParseException("Failed to parse books", null, e); |
|||
} |
|||
|
|||
return data; |
|||
} |
|||
|
|||
private String fetchHTML(String urlString) throws NetworkException { |
|||
try { |
|||
URL url = new URL(urlString); |
|||
HttpURLConnection connection = (HttpURLConnection) url.openConnection(); |
|||
connection.setRequestMethod("GET"); |
|||
connection.setRequestProperty("User-Agent", "Mozilla/5.0"); |
|||
connection.setConnectTimeout(10000); |
|||
connection.setReadTimeout(10000); |
|||
|
|||
int statusCode = connection.getResponseCode(); |
|||
if (statusCode != 200) { |
|||
throw new NetworkException("Failed to fetch URL: " + urlString, statusCode, null); |
|||
} |
|||
|
|||
StringBuilder response = new StringBuilder(); |
|||
try (BufferedReader in = new BufferedReader( |
|||
new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) { |
|||
String inputLine; |
|||
while ((inputLine = in.readLine()) != null) { |
|||
response.append(inputLine); |
|||
response.append("\n"); |
|||
} |
|||
} |
|||
|
|||
connection.disconnect(); |
|||
return response.toString(); |
|||
} catch (Exception e) { |
|||
throw new NetworkException("Network error while fetching: " + urlString, e); |
|||
} |
|||
} |
|||
|
|||
private void parseBooks(String html, ScrapedData data) { |
|||
Pattern titlePattern = Pattern.compile("<h3><a href=\"[^\"]*\" title=\"([^\"]+)\""); |
|||
Pattern pricePattern = Pattern.compile("price_color\">([^<]+)<"); |
|||
|
|||
Matcher titleMatcher = titlePattern.matcher(html); |
|||
Matcher priceMatcher = pricePattern.matcher(html); |
|||
|
|||
int count = 0; |
|||
while (titleMatcher.find() && priceMatcher.find() && count < 20) { |
|||
String title = titleMatcher.group(1).trim(); |
|||
String price = priceMatcher.group(1).trim(); |
|||
|
|||
ScrapedItem item = new ScrapedItem( |
|||
title, |
|||
"Price: " + price, |
|||
SOURCE |
|||
); |
|||
data.addItem(item); |
|||
count++; |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return NAME; |
|||
} |
|||
|
|||
@Override |
|||
public String getSource() { |
|||
return SOURCE; |
|||
} |
|||
} |
|||
@ -0,0 +1,121 @@ |
|||
package com.scraper.strategy; |
|||
|
|||
import com.scraper.exception.NetworkException; |
|||
import com.scraper.exception.ParseException; |
|||
import com.scraper.model.ScrapedData; |
|||
import com.scraper.model.ScrapedItem; |
|||
|
|||
import java.io.BufferedReader; |
|||
import java.io.InputStreamReader; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.URL; |
|||
import java.nio.charset.StandardCharsets; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
/** |
|||
* Strategy for scraping quotes from http://quotes.toscrape.com
|
|||
*/ |
|||
public class NewsScraperStrategy implements ScraperStrategy { |
|||
|
|||
private static final String NAME = "news_scraper"; |
|||
private static final String SOURCE = "http://quotes.toscrape.com"; |
|||
|
|||
@Override |
|||
public ScrapedData scrape() throws NetworkException, ParseException { |
|||
ScrapedData data = new ScrapedData(SOURCE, NAME); |
|||
|
|||
try { |
|||
String html = fetchHTML(SOURCE); |
|||
parseQuotes(html, data); |
|||
} catch (NetworkException e) { |
|||
throw e; |
|||
} catch (Exception e) { |
|||
throw new ParseException("Failed to parse quotes", null, e); |
|||
} |
|||
|
|||
return data; |
|||
} |
|||
|
|||
private String fetchHTML(String urlString) throws NetworkException { |
|||
try { |
|||
URL url = new URL(urlString); |
|||
HttpURLConnection connection = (HttpURLConnection) url.openConnection(); |
|||
connection.setRequestMethod("GET"); |
|||
connection.setRequestProperty("User-Agent", "Mozilla/5.0"); |
|||
connection.setConnectTimeout(10000); |
|||
connection.setReadTimeout(10000); |
|||
|
|||
int statusCode = connection.getResponseCode(); |
|||
if (statusCode != 200) { |
|||
throw new NetworkException("Failed to fetch URL: " + urlString, statusCode, null); |
|||
} |
|||
|
|||
StringBuilder response = new StringBuilder(); |
|||
try (BufferedReader in = new BufferedReader( |
|||
new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) { |
|||
String inputLine; |
|||
while ((inputLine = in.readLine()) != null) { |
|||
response.append(inputLine); |
|||
response.append("\n"); |
|||
} |
|||
} |
|||
|
|||
connection.disconnect(); |
|||
return response.toString(); |
|||
} catch (Exception e) { |
|||
throw new NetworkException("Network error while fetching: " + urlString, e); |
|||
} |
|||
} |
|||
|
|||
private void parseQuotes(String html, ScrapedData data) { |
|||
Pattern quotePattern = Pattern.compile( |
|||
"<span class=\"text\">([^<]+)</span>.*?<small class=\"author\">([^<]+)</small>", |
|||
Pattern.DOTALL |
|||
); |
|||
|
|||
Matcher matcher = quotePattern.matcher(html); |
|||
int count = 0; |
|||
while (matcher.find() && count < 10) { |
|||
String text = matcher.group(1).trim(); |
|||
String author = matcher.group(2).trim(); |
|||
|
|||
ScrapedItem item = new ScrapedItem( |
|||
"Quote by " + author, |
|||
text, |
|||
SOURCE |
|||
); |
|||
data.addItem(item); |
|||
count++; |
|||
} |
|||
|
|||
if (count == 0) { |
|||
// Fallback to simpler pattern
|
|||
Pattern simpleTextPattern = Pattern.compile("\"text\">([^<]+)<"); |
|||
Pattern simpleAuthorPattern = Pattern.compile("author\">([^<]+)<"); |
|||
Matcher textMatcher = simpleTextPattern.matcher(html); |
|||
Matcher authorMatcher = simpleAuthorPattern.matcher(html); |
|||
|
|||
int itemCount = 0; |
|||
while (textMatcher.find() && authorMatcher.find() && itemCount < 10) { |
|||
ScrapedItem item = new ScrapedItem( |
|||
"Quote by " + authorMatcher.group(1).trim(), |
|||
textMatcher.group(1).trim(), |
|||
SOURCE |
|||
); |
|||
data.addItem(item); |
|||
itemCount++; |
|||
} |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return NAME; |
|||
} |
|||
|
|||
@Override |
|||
public String getSource() { |
|||
return SOURCE; |
|||
} |
|||
} |
|||
@ -0,0 +1,13 @@ |
|||
package com.scraper.strategy; |
|||
|
|||
import com.scraper.exception.ScraperException; |
|||
import com.scraper.model.ScrapedData; |
|||
|
|||
/** |
|||
* Strategy interface for web scrapers |
|||
*/ |
|||
public interface ScraperStrategy { |
|||
ScrapedData scrape() throws ScraperException; |
|||
String getName(); |
|||
String getSource(); |
|||
} |
|||
@ -0,0 +1,114 @@ |
|||
package com.scraper.strategy; |
|||
|
|||
import com.scraper.exception.NetworkException; |
|||
import com.scraper.exception.ParseException; |
|||
import com.scraper.model.ScrapedData; |
|||
import com.scraper.model.ScrapedItem; |
|||
|
|||
import java.io.BufferedReader; |
|||
import java.io.InputStreamReader; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.URL; |
|||
import java.nio.charset.StandardCharsets; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
/** |
|||
* Strategy for scraping news from https://www.bbc.com/news
|
|||
*/ |
|||
public class TechNewsScraperStrategy implements ScraperStrategy { |
|||
|
|||
private static final String NAME = "tech_news_scraper"; |
|||
private static final String SOURCE = "https://www.bbc.com/news"; |
|||
|
|||
@Override |
|||
public ScrapedData scrape() throws NetworkException, ParseException { |
|||
ScrapedData data = new ScrapedData(SOURCE, NAME); |
|||
|
|||
try { |
|||
String html = fetchHTML(SOURCE); |
|||
parseHeadlines(html, data); |
|||
} catch (NetworkException e) { |
|||
throw e; |
|||
} catch (Exception e) { |
|||
throw new ParseException("Failed to parse tech news", null, e); |
|||
} |
|||
|
|||
return data; |
|||
} |
|||
|
|||
private String fetchHTML(String urlString) throws NetworkException { |
|||
try { |
|||
URL url = new URL(urlString); |
|||
HttpURLConnection connection = (HttpURLConnection) url.openConnection(); |
|||
connection.setRequestMethod("GET"); |
|||
connection.setRequestProperty("User-Agent", "Mozilla/5.0"); |
|||
connection.setConnectTimeout(10000); |
|||
connection.setReadTimeout(10000); |
|||
|
|||
int statusCode = connection.getResponseCode(); |
|||
if (statusCode != 200) { |
|||
throw new NetworkException("Failed to fetch URL: " + urlString, statusCode, null); |
|||
} |
|||
|
|||
StringBuilder response = new StringBuilder(); |
|||
try (BufferedReader in = new BufferedReader( |
|||
new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) { |
|||
String inputLine; |
|||
while ((inputLine = in.readLine()) != null) { |
|||
response.append(inputLine); |
|||
response.append("\n"); |
|||
} |
|||
} |
|||
|
|||
connection.disconnect(); |
|||
return response.toString(); |
|||
} catch (Exception e) { |
|||
throw new NetworkException("Network error while fetching: " + urlString, e); |
|||
} |
|||
} |
|||
|
|||
private void parseHeadlines(String html, ScrapedData data) { |
|||
List<String> headlines = new ArrayList<>(); |
|||
String[] patterns = { |
|||
"<h1[^>]*class=\"[^\"]*headline[^\"]*\"[^>]*>([^<]+)<", |
|||
"<h2[^>]*class=\"[^\"]*headline[^\"]*\"[^>]*>([^<]+)<", |
|||
"<h3[^>]*class=\"[^\"]*headline[^\"]*\"[^>]*>([^<]+)<", |
|||
"<h1[^>]*class=\"[^\"]*title[^\"]*\"[^>]*>([^<]+)<", |
|||
"<h2[^>]*class=\"[^\"]*title[^\"]*\"[^>]*>([^<]+)<" |
|||
}; |
|||
|
|||
for (String patternStr : patterns) { |
|||
Pattern pattern = Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE); |
|||
Matcher matcher = pattern.matcher(html); |
|||
|
|||
while (matcher.find()) { |
|||
String headline = matcher.group(1).trim(); |
|||
if (!headline.isEmpty() && headline.length() > 10 && !headlines.contains(headline)) { |
|||
headlines.add(headline); |
|||
} |
|||
} |
|||
} |
|||
|
|||
for (int i = 0; i < Math.min(headlines.size(), 15); i++) { |
|||
ScrapedItem item = new ScrapedItem( |
|||
headlines.get(i), |
|||
"", |
|||
SOURCE |
|||
); |
|||
data.addItem(item); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return NAME; |
|||
} |
|||
|
|||
@Override |
|||
public String getSource() { |
|||
return SOURCE; |
|||
} |
|||
} |
|||
@ -0,0 +1,72 @@ |
|||
package com.scraper.view; |
|||
|
|||
import com.scraper.model.ScrapedData; |
|||
import com.scraper.model.ScrapedItem; |
|||
|
|||
import java.util.List; |
|||
import java.util.Map; |
|||
|
|||
/** |
|||
* MVC View class for console output |
|||
*/ |
|||
public class ConsoleView { |
|||
|
|||
public void displayMessage(String message) { |
|||
System.out.println(message); |
|||
} |
|||
|
|||
public void displayError(String error) { |
|||
System.err.println("[ERROR] " + error); |
|||
} |
|||
|
|||
public void displaySuccess(String message) { |
|||
System.out.println("[SUCCESS] " + message); |
|||
} |
|||
|
|||
public void displayStrategies(List<Map<String, String>> strategies) { |
|||
System.out.println("\n=== Available Scrapers ==="); |
|||
for (int i = 0; i < strategies.size(); i++) { |
|||
Map<String, String> strategy = strategies.get(i); |
|||
System.out.println((i + 1) + ". " + strategy.get("name")); |
|||
System.out.println(" Source: " + strategy.get("source")); |
|||
} |
|||
System.out.println(); |
|||
} |
|||
|
|||
public void displayScrapedData(ScrapedData data, String savedPath) { |
|||
System.out.println("\n=== Scraping Results ==="); |
|||
System.out.println("Source: " + data.getSource()); |
|||
System.out.println("Strategy: " + data.getStrategyName()); |
|||
System.out.println("Total Items: " + data.getTotalItems()); |
|||
System.out.println("Scraped At: " + data.getScrapedAt()); |
|||
|
|||
if (savedPath != null) { |
|||
System.out.println("Saved To: " + savedPath); |
|||
} |
|||
|
|||
System.out.println("\n--- Items Preview ---"); |
|||
List<ScrapedItem> items = data.getItems(); |
|||
int displayCount = Math.min(items.size(), 5); |
|||
for (int i = 0; i < displayCount; i++) { |
|||
ScrapedItem item = items.get(i); |
|||
System.out.println((i + 1) + ". " + safeString(item.getTitle())); |
|||
if (item.getContent() != null && !item.getContent().isEmpty()) { |
|||
String content = safeString(item.getContent()); |
|||
String truncated = content.length() > 80 ? content.substring(0, 80) + "..." : content; |
|||
System.out.println(" " + truncated); |
|||
} |
|||
System.out.println(); |
|||
} |
|||
|
|||
if (items.size() > 5) { |
|||
System.out.println("... and " + (items.size() - 5) + " more items"); |
|||
} |
|||
} |
|||
|
|||
private String safeString(String str) { |
|||
if (str == null) { |
|||
return ""; |
|||
} |
|||
return str.replaceAll("[^\\x20-\\x7E]", "?"); |
|||
} |
|||
} |
|||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,102 @@ |
|||
import argparse |
|||
import sys |
|||
import os |
|||
|
|||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) |
|||
|
|||
from controllers import ScraperController |
|||
from commands import ScrapeCommand, ListCommand |
|||
from views import ConsoleView |
|||
from exceptions import ScraperException |
|||
|
|||
|
|||
class CLIApplication:
    """Command-line front end wiring argparse sub-commands to the controller and view."""

    def __init__(self):
        self.controller = ScraperController()
        self.view = ConsoleView()

    def run(self, args=None):
        """Parse *args* and dispatch to the chosen sub-command handler.

        Prints the help text when no sub-command was selected. Any exception
        raised by a handler is reported through the view and the process
        exits with status 1.
        """
        parser = self._create_parser()
        namespace = parser.parse_args(args)

        if not hasattr(namespace, 'func'):
            parser.print_help()
            return

        try:
            namespace.func(namespace)
        except ScraperException as e:
            self.view.display_error(str(e))
            if e.original_exception:
                self.view.display_error(f"Original error: {e.original_exception}")
            sys.exit(1)
        except Exception as e:
            self.view.display_error(f"Unexpected error: {str(e)}")
            sys.exit(1)

    def _create_parser(self) -> argparse.ArgumentParser:
        """Build the parser with the ``scrape``, ``list`` and ``info`` sub-commands."""
        root = argparse.ArgumentParser(
            description='Web Scraper CLI - MVC + Command Pattern + Strategy Pattern',
            formatter_class=argparse.RawDescriptionHelpFormatter,
        )
        commands = root.add_subparsers(dest='command', help='Available commands')

        # scrape <strategy> [--output DIR]
        scrape = commands.add_parser('scrape', help='Scrape data from a website')
        scrape.add_argument(
            'strategy',
            choices=['news_scraper', 'books_scraper', 'tech_news_scraper', 'all'],
            help='Scraper strategy to use',
        )
        scrape.add_argument(
            '--output', '-o',
            default='data',
            help='Output directory for scraped data',
        )
        scrape.set_defaults(func=self._handle_scrape)

        # list
        listing = commands.add_parser('list', help='List all available scrapers')
        listing.set_defaults(func=self._handle_list)

        # info <strategy>
        info = commands.add_parser('info', help='Show detailed info about a scraper')
        info.add_argument('strategy', help='Strategy name')
        info.set_defaults(func=self._handle_info)

        return root

    def _handle_scrape(self, args):
        """Scrape the requested strategy, or each built-in one in turn for ``all``."""
        if args.strategy == 'all':
            targets = ['news_scraper', 'books_scraper', 'tech_news_scraper']
        else:
            targets = [args.strategy]
        for target in targets:
            self._scrape_single(target, args.output)

    def _scrape_single(self, strategy_name: str, output_dir: str):
        """Run one scrape command and show its result through the view."""
        self.controller.output_dir = output_dir
        data, saved_path = ScrapeCommand(self.controller, strategy_name).execute()
        self.view.display_success(f"Scraped {data.total_items} items using {strategy_name}")
        self.view.display_scraped_data(data, saved_path)

    def _handle_list(self, args):
        """Show every strategy reported by the list command."""
        self.view.display_strategies(ListCommand(self.controller).execute())

    def _handle_info(self, args):
        """Show name and source of the strategy named on the command line."""
        for strategy in self.controller.list_strategies():
            if strategy['name'] == args.strategy:
                self.view.display_message(f"\n=== {strategy['name']} ===")
                self.view.display_message(f"Source: {strategy['source']}")
                return
        self.view.display_error(f"Strategy '{args.strategy}' not found")
|||
|
|||
|
|||
def main():
    """Entry point: build the CLI application and run it on the process arguments."""
    CLIApplication().run()
|||
|
|||
|
|||
if __name__ == '__main__': |
|||
main() |
|||
@ -0,0 +1,3 @@ |
|||
from .scraped_data import ScrapedItem, ScrapedData |
|||
|
|||
__all__ = ['ScrapedItem', 'ScrapedData'] |
|||
Binary file not shown.
Binary file not shown.
@ -0,0 +1,41 @@ |
|||
from dataclasses import dataclass, field |
|||
from datetime import datetime |
|||
from typing import List, Dict, Any |
|||
|
|||
|
|||
@dataclass
class ScrapedItem:
    """One scraped entry; ``timestamp`` defaults to the creation time."""

    title: str
    content: str
    url: str
    timestamp: datetime = field(default_factory=datetime.now)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain dict of the item with the timestamp in ISO format."""
        return dict(
            title=self.title,
            content=self.content,
            url=self.url,
            timestamp=self.timestamp.isoformat(),
        )
|||
|
|||
|
|||
@dataclass
class ScrapedData:
    """Result of one scrape: origin, producing strategy, items and creation time.

    ``total_items`` is stored (not computed) and is refreshed by ``add_item``.
    """

    source: str
    strategy_name: str
    items: List["ScrapedItem"] = field(default_factory=list)
    scraped_at: datetime = field(default_factory=datetime.now)
    total_items: int = 0

    def __post_init__(self) -> None:
        # When items are supplied at construction and the count was left at
        # its default, derive it so the two fields do not start out
        # inconsistent. An explicitly passed non-zero count is respected.
        if self.total_items == 0 and self.items:
            self.total_items = len(self.items)

    def add_item(self, item: "ScrapedItem") -> None:
        """Append *item* and refresh ``total_items``."""
        self.items.append(item)
        self.total_items = len(self.items)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain dict; items via their ``to_dict``, time in ISO format."""
        return {
            'source': self.source,
            'strategy_name': self.strategy_name,
            'items': [item.to_dict() for item in self.items],
            'scraped_at': self.scraped_at.isoformat(),
            'total_items': self.total_items,
        }
|||
@ -0,0 +1,5 @@ |
|||
requests>=2.28.0 |
|||
beautifulsoup4>=4.11.0 |
|||
|
|||
# requests is optional; the scraper uses urllib by default. |
|||
# beautifulsoup4 is optional; the scraper uses html.parser by default. |
|||
@ -0,0 +1,11 @@ |
|||
from .base_scraper import ScraperStrategy |
|||
from .news_scraper import NewsScraperStrategy |
|||
from .quotes_scraper import BooksScraperStrategy |
|||
from .tech_news_scraper import TechNewsScraperStrategy |
|||
|
|||
__all__ = [ |
|||
'ScraperStrategy', |
|||
'NewsScraperStrategy', |
|||
'BooksScraperStrategy', |
|||
'TechNewsScraperStrategy' |
|||
] |
|||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,18 @@ |
|||
from abc import ABC, abstractmethod |
|||
from models import ScrapedData |
|||
|
|||
|
|||
class ScraperStrategy(ABC):
    """Abstract interface every scraper strategy must implement."""

    @abstractmethod
    def scrape(self) -> ScrapedData:
        """Perform the scrape and return the collected data."""

    @property
    @abstractmethod
    def name(self) -> str:
        """Name identifying the strategy."""

    @property
    @abstractmethod
    def source(self) -> str:
        """Source the strategy reports for itself."""
|||
@ -0,0 +1,72 @@ |
|||
from urllib.request import urlopen, Request |
|||
import re |
|||
from datetime import datetime |
|||
|
|||
from strategies.base_scraper import ScraperStrategy |
|||
from models import ScrapedItem, ScrapedData |
|||
from exceptions import NetworkException, ParseException |
|||
|
|||
|
|||
class NewsScraperStrategy(ScraperStrategy):
    """Scrape quote/author pairs from http://quotes.toscrape.com."""

    def __init__(self):
        self._name = "news_scraper"
        self._source = "http://quotes.toscrape.com"

    @property
    def name(self) -> str:
        """Name identifying this strategy."""
        return self._name

    @property
    def source(self) -> str:
        """URL this strategy downloads."""
        return self._source

    def scrape(self) -> ScrapedData:
        """Download the source page and return up to ten quotes.

        Raises:
            NetworkException: if the page cannot be fetched or decoded.
            ParseException: if extracting the quotes fails.
        """
        data = ScrapedData(source=self.source, strategy_name=self.name)
        try:
            request = Request(self.source, headers={'User-Agent': 'Mozilla/5.0'})
            # Context manager so the HTTP response is closed on every path.
            with urlopen(request, timeout=10) as response:
                html = response.read().decode('utf-8')
        except Exception as e:
            raise NetworkException(
                f"Failed to fetch news from {self.source}",
                original_exception=e
            ) from e

        try:
            for quote_text, author in self._extract_quotes(html)[:10]:
                data.add_item(
                    ScrapedItem(
                        title=f"Quote by {author}",
                        content=quote_text,
                        url=self.source
                    )
                )
        except Exception as e:
            raise ParseException(
                "Failed to parse news content",
                selector="div.quote",
                original_exception=e
            ) from e

        return data

    def _extract_quotes(self, html):
        """Return ``(text, author)`` tuples found in *html*.

        A combined pattern anchored on the quote container is tried first.
        If it finds nothing, texts and authors are matched independently and
        paired by position.
        """
        quote_pattern = r'<div class="quote"[^>]*>.*?<span class="text"[^>]*>([^<]+)</span>.*?<small class="author">([^<]+)</small>'
        quotes = [
            (text.strip(), author.strip())
            for text, author in re.findall(quote_pattern, html, re.DOTALL)
        ]

        if not quotes:
            texts = re.findall(r'"text">([^<]+)<', html)
            authors = re.findall(r'author">([^<]+)<', html)
            # zip stops at the shorter list, matching min(len, len).
            quotes = [(text.strip(), author.strip()) for text, author in zip(texts, authors)]

        return quotes
|||
@ -0,0 +1,67 @@ |
|||
from urllib.request import urlopen, Request |
|||
import re |
|||
from datetime import datetime |
|||
|
|||
from strategies.base_scraper import ScraperStrategy |
|||
from models import ScrapedItem, ScrapedData |
|||
from exceptions import NetworkException, ParseException |
|||
|
|||
|
|||
class BooksScraperStrategy(ScraperStrategy):
    """Strategy registered as ``books_scraper``.

    It downloads the page at ``https://books.toscrape.com`` and turns the
    title/price pairs found in the HTML into ``ScrapedItem`` objects.
    """

    def __init__(self):
        self._name = "books_scraper"
        self._source = "https://books.toscrape.com"

    @property
    def name(self) -> str:
        """Return the strategy name."""
        return self._name

    @property
    def source(self) -> str:
        """Return the URL that is scraped."""
        return self._source

    def scrape(self) -> ScrapedData:
        """Fetch the source page and collect at most 20 books.

        Returns:
            A ``ScrapedData`` holding one item per extracted book.

        Raises:
            NetworkException: if fetching or decoding the page fails.
            ParseException: if extracting books or building items fails.
        """
        data = ScrapedData(source=self.source, strategy_name=self.name)
        try:
            request = Request(self.source, headers={'User-Agent': 'Mozilla/5.0'})
            # The context manager closes the HTTP response (and its socket)
            # even when read() or decode() raises.
            with urlopen(request, timeout=10) as response:
                html = response.read().decode('utf-8')
        except Exception as e:
            raise NetworkException(
                f"Failed to fetch books from {self.source}",
                original_exception=e
            ) from e

        try:
            for title, price in self._extract_books(html)[:20]:
                data.add_item(ScrapedItem(
                    title=title,
                    content=f"Price: {price}",
                    url=self.source
                ))
        except Exception as e:
            raise ParseException(
                "Failed to parse book content",
                selector="article.product_pod",
                original_exception=e
            ) from e

        return data

    def _extract_books(self, html):
        """Return a list of ``(title, price)`` tuples found in *html*.

        Titles and prices are matched independently and paired by position;
        surplus entries of the longer list are dropped.
        """
        # Local import: the name ``html`` is already taken by the parameter,
        # and only ``unescape`` is needed.
        from html import unescape as decode_entities

        title_pattern = r'<h3><a href="[^"]*" title="([^"]+)"'
        price_pattern = r'price_color">([^<]+)<'

        titles = re.findall(title_pattern, html)
        prices = re.findall(price_pattern, html)

        # Attribute values and element text are raw markup, so entities such
        # as ``&#39;`` or ``&amp;`` have to be decoded before being stored.
        return [
            (decode_entities(title).strip(), decode_entities(price).strip())
            for title, price in zip(titles, prices)
        ]
|||
@ -0,0 +1,81 @@ |
|||
from urllib.request import urlopen, Request |
|||
import re |
|||
|
|||
from strategies.base_scraper import ScraperStrategy |
|||
from models import ScrapedItem, ScrapedData |
|||
from exceptions import NetworkException, ParseException |
|||
|
|||
|
|||
class TechNewsScraperStrategy(ScraperStrategy):
    """Strategy registered as ``tech_news_scraper``.

    It downloads the page at ``https://www.bbc.com/news`` and turns the
    headline texts found in the HTML into ``ScrapedItem`` objects.
    """

    def __init__(self):
        self._name = "tech_news_scraper"
        self._source = "https://www.bbc.com/news"

    @property
    def name(self) -> str:
        """Return the strategy name."""
        return self._name

    @property
    def source(self) -> str:
        """Return the URL that is scraped."""
        return self._source

    def scrape(self) -> ScrapedData:
        """Fetch the source page and collect at most 15 headlines.

        Returns:
            A ``ScrapedData`` holding one item (with empty content) per
            extracted headline.

        Raises:
            NetworkException: if fetching or decoding the page fails.
            ParseException: if extracting headlines or building items fails.
        """
        data = ScrapedData(source=self.source, strategy_name=self.name)
        try:
            request = Request(self.source, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            # The context manager closes the HTTP response (and its socket)
            # even when read() or decode() raises.
            with urlopen(request, timeout=10) as response:
                html = response.read().decode('utf-8')
        except Exception as e:
            raise NetworkException(
                f"Failed to fetch tech news from {self.source}",
                original_exception=e
            ) from e

        try:
            for headline in self._extract_headlines(html)[:15]:
                data.add_item(ScrapedItem(
                    title=headline,
                    content="",
                    url=self.source
                ))
        except Exception as e:
            raise ParseException(
                "Failed to parse tech news content",
                selector="h1, h2, h3",
                original_exception=e
            ) from e

        return data

    def _extract_headlines(self, html):
        """Return the distinct headline strings found in *html*.

        Heading tags whose ``class`` attribute contains ``headline`` (h1-h3)
        or ``title`` (h1-h2) are matched case-insensitively. Only texts
        longer than 10 characters are kept, and duplicates are removed while
        preserving first-seen order.
        """
        # Local import: the name ``html`` is already taken by the parameter,
        # and only ``unescape`` is needed.
        from html import unescape as decode_entities

        h_patterns = [
            r'<h1[^>]*class="[^"]*headline[^"]*"[^>]*>([^<]+)<',
            r'<h2[^>]*class="[^"]*headline[^"]*"[^>]*>([^<]+)<',
            r'<h3[^>]*class="[^"]*headline[^"]*"[^>]*>([^<]+)<',
            r'<h1[^>]*class="[^"]*title[^"]*"[^>]*>([^<]+)<',
            r'<h2[^>]*class="[^"]*title[^"]*"[^>]*>([^<]+)<',
        ]

        headlines = []
        for pattern in h_patterns:
            for match in re.findall(pattern, html, re.IGNORECASE):
                # Captures are raw markup; decode entities such as ``&amp;``
                # before measuring and storing the text.
                headline = decode_entities(match).strip()
                if len(headline) > 10:
                    headlines.append(headline)

        # dict keys keep insertion order, giving an order-preserving dedupe.
        return list(dict.fromkeys(headlines))
|||
@ -0,0 +1,3 @@ |
|||
from .console_view import ConsoleView |
|||
|
|||
__all__ = ['ConsoleView'] |
|||
Binary file not shown.
Binary file not shown.
@ -0,0 +1,68 @@ |
|||
import sys |
|||
from typing import List, Dict, Any |
|||
|
|||
|
|||
class ConsoleView:
    """Console presentation layer that writes everything to standard output.

    All output goes through ``_emit``, which degrades gracefully when the
    console encoding cannot represent a character.
    """

    @staticmethod
    def _emit(text: str = "") -> None:
        """Print *text*, replacing characters the console cannot encode.

        A UTF-8 encode/decode round trip returns the very same string, so it
        cannot serve as a fallback. The text is instead re-encoded with the
        encoding of ``sys.stdout`` using ``errors='replace'``, which turns
        every unencodable character into ``?``.
        """
        try:
            print(text)
        except UnicodeEncodeError:
            # Some stream replacements have no (or a None) encoding attribute.
            encoding = getattr(sys.stdout, 'encoding', None) or 'ascii'
            print(text.encode(encoding, errors='replace').decode(encoding))

    def display_message(self, message: str):
        """Print *message* unchanged."""
        self._emit(message)

    def display_error(self, error: str):
        """Print *error* prefixed with ``[ERROR]``."""
        self._emit(f"[ERROR] {error}")

    def display_success(self, message: str):
        """Print *message* prefixed with ``[SUCCESS]``."""
        self._emit(f"[SUCCESS] {message}")

    def display_strategies(self, strategies: List[Dict[str, str]]):
        """Print a numbered list of strategies.

        Each entry of *strategies* must provide the keys ``'name'`` and
        ``'source'``.
        """
        self._emit("\n=== Available Scrapers ===")
        for idx, strategy in enumerate(strategies, 1):
            self._emit(f"{idx}. {strategy['name']}")
            self._emit(f" Source: {strategy['source']}")
        self._emit()

    def display_scraped_data(self, data: Any, saved_path: str = None):
        """Print a summary of a scrape result and preview its first 5 items.

        Args:
            data: Either a mapping or an object exposing ``to_dict()`` that
                returns one. The keys read are ``source``,
                ``strategy_name``, ``total_items``, ``scraped_at`` and
                ``items``; each item is read through ``title`` and
                ``content``.
            saved_path: When truthy, printed as the location the data was
                saved to.
        """
        if hasattr(data, 'to_dict'):
            data = data.to_dict()

        items = data.get('items', [])

        self._emit("\n=== Scraping Results ===")
        self._emit(f"Source: {data.get('source', 'N/A')}")
        self._emit(f"Strategy: {data.get('strategy_name', 'N/A')}")
        self._emit(f"Total Items: {data.get('total_items', len(items))}")
        self._emit(f"Scraped At: {data.get('scraped_at', 'N/A')}")

        if saved_path:
            self._emit(f"Saved To: {saved_path}")

        self._emit("\n--- Items Preview ---")
        for idx, item in enumerate(items[:5], 1):
            self._emit(f"{idx}. {item.get('title', 'N/A')}")

            content = item.get('content')
            if content:
                # Long content is cut to 80 characters and marked with "...".
                truncated = content[:80] + "..." if len(content) > 80 else content
                self._emit(f" {truncated}")
            self._emit()

        if len(items) > 5:
            self._emit(f"... and {len(items) - 5} more items")

    def display_list(self, items: List[Any]):
        """Print every element of *items* on its own line."""
        for item in items:
            self._emit(str(item))
|||
Loading…
Reference in new issue