commit 0b572260a8f5906b36d3ee8a3e489716f3ca408d Author: ZhangJinxuan <2194936226@qq.com> Date: Sun May 31 11:50:30 2026 +0800 期末爬虫项目+实验报告 diff --git a/202506050228-张金炫-期末实验报告.docx b/202506050228-张金炫-期末实验报告.docx new file mode 100644 index 0000000..65ba545 Binary files /dev/null and b/202506050228-张金炫-期末实验报告.docx differ diff --git a/9919d4711bf7a75e286295928b7eb5f0.png b/9919d4711bf7a75e286295928b7eb5f0.png new file mode 100644 index 0000000..8acf462 Binary files /dev/null and b/9919d4711bf7a75e286295928b7eb5f0.png differ diff --git a/commands/__init__.py b/commands/__init__.py new file mode 100644 index 0000000..a251377 --- /dev/null +++ b/commands/__init__.py @@ -0,0 +1,5 @@ +from .base_command import Command +from .scrape_command import ScrapeCommand +from .list_command import ListCommand + +__all__ = ['Command', 'ScrapeCommand', 'ListCommand'] diff --git a/commands/__pycache__/__init__.cpython-314.pyc b/commands/__pycache__/__init__.cpython-314.pyc new file mode 100644 index 0000000..1a51c4b Binary files /dev/null and b/commands/__pycache__/__init__.cpython-314.pyc differ diff --git a/commands/__pycache__/base_command.cpython-314.pyc b/commands/__pycache__/base_command.cpython-314.pyc new file mode 100644 index 0000000..7cf80b5 Binary files /dev/null and b/commands/__pycache__/base_command.cpython-314.pyc differ diff --git a/commands/__pycache__/list_command.cpython-314.pyc b/commands/__pycache__/list_command.cpython-314.pyc new file mode 100644 index 0000000..b9e37f8 Binary files /dev/null and b/commands/__pycache__/list_command.cpython-314.pyc differ diff --git a/commands/__pycache__/scrape_command.cpython-314.pyc b/commands/__pycache__/scrape_command.cpython-314.pyc new file mode 100644 index 0000000..026b471 Binary files /dev/null and b/commands/__pycache__/scrape_command.cpython-314.pyc differ diff --git a/commands/base_command.py b/commands/base_command.py new file mode 100644 index 0000000..742007a --- /dev/null +++ b/commands/base_command.py @@ -0,0 +1,11 @@ +from abc import ABC, abstractmethod + + +class Command(ABC): + @abstractmethod + def execute(self): + pass + + @abstractmethod + def undo(self): + pass diff --git a/commands/list_command.py b/commands/list_command.py new file mode 100644 index 0000000..321205e --- /dev/null +++ b/commands/list_command.py @@ -0,0 +1,13 @@ +from commands.base_command import Command +from controllers import ScraperController + + +class ListCommand(Command): + def __init__(self, controller: ScraperController): + self.controller = controller + + def execute(self): + return self.controller.list_strategies() + + def undo(self): + pass diff --git a/commands/scrape_command.py b/commands/scrape_command.py new file mode 100644 index 0000000..b927cfd --- /dev/null +++ b/commands/scrape_command.py @@ -0,0 +1,25 @@ +from commands.base_command import Command +from controllers import ScraperController +from exceptions import ScraperException + + +class ScrapeCommand(Command): + def __init__(self, controller: ScraperController, strategy_name: str): + self.controller = controller + self.strategy_name = strategy_name + self.scrape_result = None + self.saved_path = None + + def execute(self): + try: + self.scrape_result = self.controller.execute_scrape(self.strategy_name) + self.saved_path = self.controller.save_data(self.scrape_result, self.strategy_name) + return self.scrape_result, self.saved_path + except ScraperException as e: + raise e + + def undo(self): + if self.saved_path and self.controller.delete_data(self.saved_path): + print(f"Successfully undone: deleted {self.saved_path}") + return True + return False diff --git a/controllers/__init__.py b/controllers/__init__.py new file mode 100644 index 0000000..1849940 --- /dev/null +++ b/controllers/__init__.py @@ -0,0 +1,3 @@ +from .scraper_controller import ScraperController + +__all__ = ['ScraperController'] diff --git a/controllers/__pycache__/__init__.cpython-314.pyc b/controllers/__pycache__/__init__.cpython-314.pyc new file mode 100644 index 0000000..fe13969 Binary files /dev/null and b/controllers/__pycache__/__init__.cpython-314.pyc differ diff --git a/controllers/__pycache__/scraper_controller.cpython-314.pyc b/controllers/__pycache__/scraper_controller.cpython-314.pyc new file mode 100644 index 0000000..6b75467 Binary files /dev/null and b/controllers/__pycache__/scraper_controller.cpython-314.pyc differ diff --git a/controllers/scraper_controller.py b/controllers/scraper_controller.py new file mode 100644 index 0000000..2906d4d --- /dev/null +++ b/controllers/scraper_controller.py @@ -0,0 +1,112 @@ +import json +import os +from datetime import datetime +from typing import Dict, List + +from strategies import ( + ScraperStrategy, + NewsScraperStrategy, + BooksScraperStrategy, + TechNewsScraperStrategy +) +from models import ScrapedData +from exceptions import StrategyException, StorageException, ValidationException + + +class ScraperController: + def __init__(self, output_dir: str = "data"): + self.output_dir = output_dir + self.strategies: Dict[str, ScraperStrategy] = {} + self._register_default_strategies() + + def _register_default_strategies(self): + self.register_strategy(NewsScraperStrategy()) + self.register_strategy(BooksScraperStrategy()) + self.register_strategy(TechNewsScraperStrategy()) + + def register_strategy(self, strategy: ScraperStrategy): + self.strategies[strategy.name] = strategy + + def get_strategy(self, name: str) -> ScraperStrategy: + if name not in self.strategies: + available = ', '.join(self.strategies.keys()) + raise StrategyException( + f"Strategy '{name}' not found. Available: {available}", + strategy_name=name + ) + return self.strategies[name] + + def list_strategies(self) -> List[Dict[str, str]]: + return [ + {"name": s.name, "source": s.source} + for s in self.strategies.values() + ] + + def execute_scrape(self, strategy_name: str) -> ScrapedData: + strategy = self.get_strategy(strategy_name) + return strategy.scrape() + + def save_data(self, data: ScrapedData, strategy_name: str) -> str: + try: + folder_path = os.path.join(self.output_dir, strategy_name) + os.makedirs(folder_path, exist_ok=True) + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"scraped_data_{timestamp}.json" + file_path = os.path.join(folder_path, filename) + + with open(file_path, 'w', encoding='utf-8') as f: + json.dump(data.to_dict(), f, ensure_ascii=False, indent=2) + + return file_path + + except Exception as e: + raise StorageException( + f"Failed to save data to {folder_path}", + file_path=folder_path, + original_exception=e + ) + + def delete_data(self, file_path: str) -> bool: + try: + if os.path.exists(file_path): + os.remove(file_path) + return True + return False + except Exception as e: + raise StorageException( + f"Failed to delete file {file_path}", + file_path=file_path, + original_exception=e + ) + + def load_data(self, strategy_name: str, filename: str = None) -> ScrapedData: + try: + folder_path = os.path.join(self.output_dir, strategy_name) + if not os.path.exists(folder_path): + raise StorageException( + f"No data found for strategy '{strategy_name}'", + file_path=folder_path + ) + + if filename: + file_path = os.path.join(folder_path, filename) + else: + files = sorted([f for f in os.listdir(folder_path) if f.endswith('.json')]) + if not files: + raise StorageException(f"No data files found in {folder_path}") + file_path = os.path.join(folder_path, files[-1]) + + with open(file_path, 'r', encoding='utf-8') as f: + data_dict = json.load(f) + + return data_dict + + except Exception as e: + if isinstance(e, StorageException): + raise e + raise StorageException( + f"Failed to load data", + file_path=file_path if 'file_path' in locals() else None, + original_exception=e + ) diff --git a/debug_books.py b/debug_books.py new file mode 100644 index 0000000..afb6dc5 --- /dev/null +++ b/debug_books.py @@ -0,0 +1,21 @@ +from urllib.request import urlopen, Request +import re + +r = urlopen(Request('https://books.toscrape.com', headers={'User-Agent': 'Mozilla/5.0'})) +html = r.read().decode('utf-8') + +price_search = re.search(r'class="price_color[^"]*"[^>]*>([^<]+)<', html) +if price_search: + print('Found price pattern 1:', price_search.group(1)) +else: + print('Pattern 1 not found') + +price_search2 = re.search(r'price_color">([^<]+)<', html) +if price_search2: + print('Found price pattern 2:', price_search2.group(1)) +else: + print('Pattern 2 not found') + +idx = html.find('price_color') +if idx > 0: + print('Context around price_color:', html[idx-20:idx+50]) diff --git a/exceptions/__init__.py b/exceptions/__init__.py new file mode 100644 index 0000000..8d1a9c5 --- /dev/null +++ b/exceptions/__init__.py @@ -0,0 +1,17 @@ +from .scraper_exceptions import ( + ScraperException, + NetworkException, + ParseException, + ValidationException, + StorageException, + StrategyException +) + +__all__ = [ + 'ScraperException', + 'NetworkException', + 'ParseException', + 'ValidationException', + 'StorageException', + 'StrategyException' +] diff --git a/exceptions/__pycache__/__init__.cpython-314.pyc b/exceptions/__pycache__/__init__.cpython-314.pyc new file mode 100644 index 0000000..18325d5 Binary files /dev/null and b/exceptions/__pycache__/__init__.cpython-314.pyc differ diff --git a/exceptions/__pycache__/scraper_exceptions.cpython-314.pyc b/exceptions/__pycache__/scraper_exceptions.cpython-314.pyc new file mode 100644 index 0000000..a76f2b5 Binary files /dev/null and b/exceptions/__pycache__/scraper_exceptions.cpython-314.pyc differ diff --git a/exceptions/scraper_exceptions.py b/exceptions/scraper_exceptions.py new file mode 100644 index 0000000..422fb6b --- /dev/null +++ b/exceptions/scraper_exceptions.py @@ -0,0 +1,34 @@ +class ScraperException(Exception): + def __init__(self, message, original_exception=None): + super().__init__(message) + self.original_exception = original_exception + + +class NetworkException(ScraperException): + def __init__(self, message, status_code=None, original_exception=None): + super().__init__(message, original_exception) + self.status_code = status_code + + +class ParseException(ScraperException): + def __init__(self, message, selector=None, original_exception=None): + super().__init__(message, original_exception) + self.selector = selector + + +class ValidationException(ScraperException): + def __init__(self, message, field=None, original_exception=None): + super().__init__(message, original_exception) + self.field = field + + +class StorageException(ScraperException): + def __init__(self, message, file_path=None, original_exception=None): + super().__init__(message, original_exception) + self.file_path = file_path + + +class StrategyException(ScraperException): + def __init__(self, message, strategy_name=None, original_exception=None): + super().__init__(message, original_exception) + self.strategy_name = strategy_name diff --git a/java-scraper/README.md b/java-scraper/README.md new file mode 100644 index 0000000..c7d7dca --- /dev/null +++ b/java-scraper/README.md @@ -0,0 +1,83 @@ +# Java Web Scraper + +A complete web scraping application demonstrating: +- **CLI Interface** +- **MVC Architecture** +- **Command Pattern** +- **Strategy Pattern** +- **Custom Exception Hierarchy** + +## Features + +- 3 different scraping strategies: + - `news_scraper` - Scrapes quotes from http://quotes.toscrape.com + - `books_scraper` - Scrapes books from https://books.toscrape.com + - `tech_news_scraper` - Scrapes news from https://www.bbc.com/news +- Saves data to JSON files +- Command-line interface +- Extensible architecture + +## Building + +```bash +cd java-scraper +mvn clean package +``` + +## Usage + +### List available scrapers: +```bash +mvn exec:java -Dexec.mainClass="com.scraper.Main" -Dexec.args="list" +``` + +### Scrape using a specific strategy: +```bash +mvn exec:java -Dexec.mainClass="com.scraper.Main" -Dexec.args="scrape news_scraper" +``` + +### Scrape all: +```bash +mvn exec:java -Dexec.mainClass="com.scraper.Main" -Dexec.args="scrape all" +``` + +### Custom output directory: +```bash +mvn exec:java -Dexec.mainClass="com.scraper.Main" -Dexec.args="scrape news_scraper --output my_data" +``` + +### Using the built JAR: +```bash +java -jar target/java-scraper-1.0-SNAPSHOT.jar list +java -jar target/java-scraper-1.0-SNAPSHOT.jar scrape news_scraper +``` + +## Architecture + +### MVC +- **Model**: `ScrapedItem`, `ScrapedData` +- **View**: `ConsoleView` +- **Controller**: `ScraperController` + +### Command Pattern +- `Command` interface +- `ScrapeCommand` +- `ListCommand` + +### Strategy Pattern +- `ScraperStrategy` interface +- `NewsScraperStrategy` +- `BooksScraperStrategy` +- `TechNewsScraperStrategy` + +### Exception Hierarchy +- `ScraperException` (base) +- `NetworkException` +- `ParseException` +- `StorageException` +- `StrategyException` + +## Requirements + +- Java 11 or higher +- Maven diff --git a/java-scraper/data/books_scraper/scraped_data_20260531_104648.json b/java-scraper/data/books_scraper/scraped_data_20260531_104648.json new file mode 100644 index 0000000..d3a501f --- /dev/null +++ b/java-scraper/data/books_scraper/scraped_data_20260531_104648.json @@ -0,0 +1,7 @@ +{ + "source": "https://books.toscrape.com", + "strategy_name": "books_scraper", + "items": [], + "scraped_at": "2026-05-31T10:46:46.169175", + "total_items": 0 +} \ No newline at end of file diff --git a/java-scraper/data/books_scraper/scraped_data_20260531_104856.json b/java-scraper/data/books_scraper/scraped_data_20260531_104856.json new file mode 100644 index 0000000..4a5702b --- /dev/null +++ b/java-scraper/data/books_scraper/scraped_data_20260531_104856.json @@ -0,0 +1,128 @@ +{ + "source": "https://books.toscrape.com", + "strategy_name": "books_scraper", + "items": [ + { + "title": "A Light in the Attic", + "content": "Price: £51.77", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:48:56.489985" + }, + { + "title": "Tipping the Velvet", + "content": "Price: £53.74", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:48:56.489997" + }, + { + "title": "Soumission", + "content": "Price: £50.10", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:48:56.490001" + }, + { + "title": "Sharp Objects", + "content": "Price: £47.82", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:48:56.490004" + }, + { + "title": "Sapiens: A Brief History of Humankind", + "content": "Price: £54.23", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:48:56.490005" + }, + { + "title": "The Requiem Red", + "content": "Price: £22.65", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:48:56.490007" + }, + { + "title": "The Dirty Little Secrets of Getting Your Dream Job", + "content": "Price: £33.34", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:48:56.490009" + }, + { + "title": "The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull", + "content": "Price: £17.93", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:48:56.490011" + }, + { + "title": "The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics", + "content": "Price: £22.60", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:48:56.490012" + }, + { + "title": "The Black Maria", + "content": "Price: £52.15", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:48:56.490014" + }, + { + "title": "Starving Hearts (Triangular Trade Trilogy, #1)", + "content": "Price: £13.99", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:48:56.490015" + }, + { + "title": "Shakespeare's Sonnets", + "content": "Price: £20.66", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:48:56.490017" + }, + { + "title": "Set Me Free", + "content": "Price: £17.46", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:48:56.490019" + }, + { + "title": "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)", + "content": "Price: £52.29", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:48:56.490020" + }, + { + "title": "Rip it Up and Start Again", + "content": "Price: £35.02", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:48:56.490022" + }, + { + "title": "Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991", + "content": "Price: £57.25", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:48:56.490023" + }, + { + "title": "Olio", + "content": "Price: £23.88", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:48:56.490025" + }, + { + "title": "Mesaerion: The Best Science Fiction Stories 1800-1849", + "content": "Price: £37.59", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:48:56.490035" + }, + { + "title": "Libertarianism for Beginners", + "content": "Price: £51.33", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:48:56.490037" + }, + { + "title": "It's Only the Himalayas", + "content": "Price: £45.17", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:48:56.490038" + } + ], + "scraped_at": "2026-05-31T10:48:54.348792", + "total_items": 20 +} \ No newline at end of file diff --git a/java-scraper/data/books_scraper/scraped_data_20260531_105031.json b/java-scraper/data/books_scraper/scraped_data_20260531_105031.json new file mode 100644 index 0000000..3ce3946 --- /dev/null +++ b/java-scraper/data/books_scraper/scraped_data_20260531_105031.json @@ -0,0 +1,128 @@ +{ + "source": "https://books.toscrape.com", + "strategy_name": "books_scraper", + "items": [ + { + "title": "A Light in the Attic", + "content": "Price: £51.77", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:50:31.674011" + }, + { + "title": "Tipping the Velvet", + "content": "Price: £53.74", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:50:31.674021" + }, + { + "title": "Soumission", + "content": "Price: £50.10", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:50:31.674024" + }, + { + "title": "Sharp Objects", + "content": "Price: £47.82", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:50:31.674026" + }, + { + "title": "Sapiens: A Brief History of Humankind", + "content": "Price: £54.23", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:50:31.674028" + }, + { + "title": "The Requiem Red", + "content": "Price: £22.65", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:50:31.674029" + }, + { + "title": "The Dirty Little Secrets of Getting Your Dream Job", + "content": "Price: £33.34", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:50:31.674031" + }, + { + "title": "The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull", + "content": "Price: £17.93", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:50:31.674032" + }, + { + "title": "The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics", + "content": "Price: £22.60", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:50:31.674033" + }, + { + "title": "The Black Maria", + "content": "Price: £52.15", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:50:31.674034" + }, + { + "title": "Starving Hearts (Triangular Trade Trilogy, #1)", + "content": "Price: £13.99", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:50:31.674035" + }, + { + "title": "Shakespeare's Sonnets", + "content": "Price: £20.66", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:50:31.674041" + }, + { + "title": "Set Me Free", + "content": "Price: £17.46", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:50:31.674043" + }, + { + "title": "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)", + "content": "Price: £52.29", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:50:31.674044" + }, + { + "title": "Rip it Up and Start Again", + "content": "Price: £35.02", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:50:31.674045" + }, + { + "title": "Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991", + "content": "Price: £57.25", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:50:31.674046" + }, + { + "title": "Olio", + "content": "Price: £23.88", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:50:31.674046" + }, + { + "title": "Mesaerion: The Best Science Fiction Stories 1800-1849", + "content": "Price: £37.59", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:50:31.674048" + }, + { + "title": "Libertarianism for Beginners", + "content": "Price: £51.33", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:50:31.674049" + }, + { + "title": "It's Only the Himalayas", + "content": "Price: £45.17", + "url": "https://books.toscrape.com", + "timestamp": "2026-05-31T10:50:31.674050" + } + ], + "scraped_at": "2026-05-31T10:50:29.355948", + "total_items": 20 +} \ No newline at end of file diff --git a/java-scraper/data/news_scraper/scraped_data_20260531_104348.json b/java-scraper/data/news_scraper/scraped_data_20260531_104348.json new file mode 100644 index 0000000..85577c4 --- /dev/null +++ b/java-scraper/data/news_scraper/scraped_data_20260531_104348.json @@ -0,0 +1,68 @@ +{ + "source": "http://quotes.toscrape.com", + "strategy_name": "news_scraper", + "items": [ + { + "title": "temprop=", + "content": "temprop=", + "url": "http://quotes.toscrape.com", + "timestamp": "2026-05-31T10:43:48.501000" + }, + { + "title": "temprop=", + "content": "temprop=", + "url": "http://quotes.toscrape.com", + "timestamp": "2026-05-31T10:43:48.501016" + }, + { + "title": "temprop=", + "content": "temprop=", + "url": "http://quotes.toscrape.com", + "timestamp": "2026-05-31T10:43:48.501021" + }, + { + "title": "temprop=", + "content": "temprop=", + "url": "http://quotes.toscrape.com", + "timestamp": "2026-05-31T10:43:48.501024" + }, + { + "title": "temprop=", + "content": "temprop=", + "url": "http://quotes.toscrape.com", + "timestamp": "2026-05-31T10:43:48.501026" + }, + { + "title": "temprop=", + "content": "temprop=", + "url": "http://quotes.toscrape.com", + "timestamp": "2026-05-31T10:43:48.501028" + }, + { + "title": "temprop=", + "content": "temprop=", + "url": "http://quotes.toscrape.com", + "timestamp": "2026-05-31T10:43:48.501030" + }, + { + "title": "temprop=", + "content": "temprop=", + "url": "http://quotes.toscrape.com", + "timestamp": "2026-05-31T10:43:48.501032" + }, + { + "title": "temprop=", + "content": "temprop=", + "url": "http://quotes.toscrape.com", + "timestamp": "2026-05-31T10:43:48.501034" + }, + { + "title": "temprop=", + "content": "temprop=", + "url": "http://quotes.toscrape.com", + "timestamp": "2026-05-31T10:43:48.501036" + } + ], + "scraped_at": "2026-05-31T10:43:45.907587", + "total_items": 10 +} \ No newline at end of file diff --git a/java-scraper/data/news_scraper/scraped_data_20260531_104511.json b/java-scraper/data/news_scraper/scraped_data_20260531_104511.json new file mode 100644 index 0000000..2803469 --- /dev/null +++ b/java-scraper/data/news_scraper/scraped_data_20260531_104511.json @@ -0,0 +1,7 @@ +{ + "source": "http://quotes.toscrape.com", + "strategy_name": "news_scraper", + "items": [], + "scraped_at": "2026-05-31T10:45:10.355276", + "total_items": 0 +} \ No newline at end of file diff --git a/java-scraper/data/news_scraper/scraped_data_20260531_104620.json b/java-scraper/data/news_scraper/scraped_data_20260531_104620.json new file mode 100644 index 0000000..2e6b33e --- /dev/null +++ b/java-scraper/data/news_scraper/scraped_data_20260531_104620.json @@ -0,0 +1,68 @@ +{ + "source": "http://quotes.toscrape.com", + "strategy_name": "news_scraper", + "items": [ + { + "title": "Quote by Albert Einstein", + "content": "“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”", + "url": "http://quotes.toscrape.com", + "timestamp": "2026-05-31T10:46:20.434224" + }, + { + "title": "Quote by J.K. Rowling", + "content": "“It is our choices, Harry, that show what we truly are, far more than our abilities.”", + "url": "http://quotes.toscrape.com", + "timestamp": "2026-05-31T10:46:20.434236" + }, + { + "title": "Quote by Albert Einstein", + "content": "“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”", + "url": "http://quotes.toscrape.com", + "timestamp": "2026-05-31T10:46:20.434250" + }, + { + "title": "Quote by Jane Austen", + "content": "“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”", + "url": "http://quotes.toscrape.com", + "timestamp": "2026-05-31T10:46:20.434253" + }, + { + "title": "Quote by Marilyn Monroe", + "content": "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”", + "url": "http://quotes.toscrape.com", + "timestamp": "2026-05-31T10:46:20.434255" + }, + { + "title": "Quote by Albert Einstein", + "content": "“Try not to become a man of success. Rather become a man of value.”", + "url": "http://quotes.toscrape.com", + "timestamp": "2026-05-31T10:46:20.434257" + }, + { + "title": "Quote by André Gide", + "content": "“It is better to be hated for what you are than to be loved for what you are not.”", + "url": "http://quotes.toscrape.com", + "timestamp": "2026-05-31T10:46:20.434259" + }, + { + "title": "Quote by Thomas A. Edison", + "content": "“I have not failed. I've just found 10,000 ways that won't work.”", + "url": "http://quotes.toscrape.com", + "timestamp": "2026-05-31T10:46:20.434261" + }, + { + "title": "Quote by Eleanor Roosevelt", + "content": "“A woman is like a tea bag; you never know how strong it is until it's in hot water.”", + "url": "http://quotes.toscrape.com", + "timestamp": "2026-05-31T10:46:20.434262" + }, + { + "title": "Quote by Steve Martin", + "content": "“A day without sunshine is like, you know, night.”", + "url": "http://quotes.toscrape.com", + "timestamp": "2026-05-31T10:46:20.434264" + } + ], + "scraped_at": "2026-05-31T10:46:18.193675", + "total_items": 10 +} \ No newline at end of file diff --git a/java-scraper/pom.xml b/java-scraper/pom.xml new file mode 100644 index 0000000..da5c6d7 --- /dev/null +++ b/java-scraper/pom.xml @@ -0,0 +1,50 @@ + + + 4.0.0 + + com.scraper + java-scraper + 1.0-SNAPSHOT + + + 11 + 11 + UTF-8 + + + + + + com.google.code.gson + gson + 2.10.1 + + + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.5.0 + + + package + + shade + + + + + com.scraper.Main + + + + + + + + + diff --git a/java-scraper/src/main/java/com/scraper/Main.java b/java-scraper/src/main/java/com/scraper/Main.java new file mode 100644 index 0000000..0c0464c --- /dev/null +++ b/java-scraper/src/main/java/com/scraper/Main.java @@ -0,0 +1,108 @@ +package com.scraper; + +import com.scraper.command.ListCommand; +import com.scraper.command.ScrapeCommand; +import com.scraper.controller.ScraperController; +import com.scraper.exception.ScraperException; +import com.scraper.view.ConsoleView; + +/** + * Main CLI application - Entry point for the scraper + */ +public class Main { + + private ScraperController controller; + private ConsoleView view; + + public Main() { + this.controller = new ScraperController(); + this.view = new ConsoleView(); + } + + public static void main(String[] args) { + Main app = new Main(); + app.run(args); + } + + public void run(String[] args) { + if (args.length == 0) { + printHelp(); + return; + } + + try { + switch (args[0]) { + case "list": + handleList(); + break; + case "scrape": + handleScrape(args); + break; + case "help": + default: + printHelp(); + break; + } + } catch (ScraperException e) { + view.displayError(e.getMessage()); + if (e.getCause() != null) { + view.displayError("Cause: " + e.getCause().getMessage()); + } + System.exit(1); + } catch (Exception e) { + view.displayError("Unexpected error: " + e.getMessage()); + System.exit(1); + } + } + + private void handleList() throws ScraperException { + ListCommand cmd = new ListCommand(controller); + cmd.execute(); + view.displayStrategies(cmd.getStrategies()); + } + + private void handleScrape(String[] args) throws ScraperException { + if (args.length < 2) { + view.displayError("Please specify a scraper to use."); + printHelp(); + return; + } + + String strategyName = args[1]; + String outputDir = "data"; + if (args.length >= 4 && "--output".equals(args[2])) { + outputDir = args[3]; + controller.setOutputDir(outputDir); + } + + if ("all".equals(strategyName)) { + ListCommand listCmd = new ListCommand(controller); + listCmd.execute(); + for (java.util.Map strategy : listCmd.getStrategies()) { + scrapeSingle(strategy.get("name")); + } + } else { + scrapeSingle(strategyName); + } + } + + private void scrapeSingle(String strategyName) throws ScraperException { + ScrapeCommand cmd = new ScrapeCommand(controller, strategyName); + cmd.execute(); + view.displaySuccess("Scraped " + cmd.getScrapedData().getTotalItems() + " items using " + strategyName); + view.displayScrapedData(cmd.getScrapedData(), cmd.getSavedPath()); + } + + private void printHelp() { + System.out.println("=== Web Scraper CLI - MVC + Command Pattern + Strategy Pattern ==="); + System.out.println(); + System.out.println("Usage:"); + System.out.println(" java -jar java-scraper.jar list - List all available scrapers"); + System.out.println(" java -jar java-scraper.jar scrape - Scrape data using specific scraper"); + System.out.println(" java -jar java-scraper.jar scrape all - Scrape data from all scrapers"); + System.out.println(" java -jar java-scraper.jar scrape --output - Specify output directory"); + System.out.println(" java -jar java-scraper.jar help - Show this help message"); + System.out.println(); + System.out.println("Available scrapers: news_scraper, books_scraper, tech_news_scraper"); + } +} diff --git a/java-scraper/src/main/java/com/scraper/command/Command.java b/java-scraper/src/main/java/com/scraper/command/Command.java new file mode 100644 index 0000000..2d83a7f --- /dev/null +++ b/java-scraper/src/main/java/com/scraper/command/Command.java @@ -0,0 +1,11 @@ +package com.scraper.command; + +import com.scraper.exception.ScraperException; + +/** + * Command interface for Command pattern + */ +public interface Command { + void execute() throws ScraperException; + void undo() throws ScraperException; +} diff --git a/java-scraper/src/main/java/com/scraper/command/ListCommand.java b/java-scraper/src/main/java/com/scraper/command/ListCommand.java new file mode 100644 index 0000000..82038ad --- /dev/null +++ b/java-scraper/src/main/java/com/scraper/command/ListCommand.java @@ -0,0 +1,33 @@ +package com.scraper.command; + +import com.scraper.controller.ScraperController; + +import java.util.List; +import java.util.Map; + +/** + * Command to list all available scraping strategies + */ +public class ListCommand implements Command { + + private ScraperController controller; + private List> strategies; + + public ListCommand(ScraperController controller) { + this.controller = controller; + } + + @Override + public void execute() { + strategies = controller.listStrategies(); + } + + @Override + public void undo() { + // List command doesn't support undo + } + + public List> getStrategies() { + return strategies; + } +} diff --git a/java-scraper/src/main/java/com/scraper/command/ScrapeCommand.java b/java-scraper/src/main/java/com/scraper/command/ScrapeCommand.java new file mode 100644 index 0000000..f2cce1f --- /dev/null +++ b/java-scraper/src/main/java/com/scraper/command/ScrapeCommand.java @@ -0,0 +1,42 @@ +package com.scraper.command; + +import com.scraper.controller.ScraperController; +import com.scraper.exception.ScraperException; +import com.scraper.model.ScrapedData; + +/** + * Command to scrape data from a specific strategy + */ +public class ScrapeCommand implements Command { + + private ScraperController controller; + private String strategyName; + private ScrapedData scrapedData; + private String savedPath; + + public ScrapeCommand(ScraperController controller, String strategyName) { + this.controller = controller; + this.strategyName = strategyName; + } + + @Override + public void execute() throws ScraperException { + scrapedData = controller.executeScrape(strategyName); + savedPath = controller.saveData(scrapedData, strategyName); + } + + @Override + public void undo() throws ScraperException { + if (savedPath != null) { + controller.deleteData(savedPath); + } + } + + public ScrapedData getScrapedData() { + return scrapedData; + } + + public String getSavedPath() { + return savedPath; + } +} diff --git a/java-scraper/src/main/java/com/scraper/controller/ScraperController.java b/java-scraper/src/main/java/com/scraper/controller/ScraperController.java new file mode 100644 index 0000000..88b40ef --- /dev/null +++ b/java-scraper/src/main/java/com/scraper/controller/ScraperController.java @@ -0,0 +1,138 @@ +package com.scraper.controller; + +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; +import com.scraper.exception.StorageException; +import com.scraper.exception.StrategyException; +import com.scraper.model.ScrapedData; +import com.scraper.strategy.BooksScraperStrategy; +import com.scraper.strategy.NewsScraperStrategy; +import com.scraper.strategy.ScraperStrategy; +import com.scraper.strategy.TechNewsScraperStrategy; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.time.LocalDateTime; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * MVC Controller for the scraper application + */ +public class ScraperController { + + private String outputDir; + private Map strategies; + private Gson gson; + + public ScraperController() { + this("data"); + } + + public ScraperController(String outputDir) { + this.outputDir = outputDir; + this.strategies = new HashMap<>(); + this.gson = new GsonBuilder().setPrettyPrinting().create(); + registerDefaultStrategies(); + } + + private void registerDefaultStrategies() { + registerStrategy(new NewsScraperStrategy()); + registerStrategy(new BooksScraperStrategy()); + registerStrategy(new TechNewsScraperStrategy()); + } + + public void registerStrategy(ScraperStrategy strategy) { + strategies.put(strategy.getName(), strategy); + } + + public List> listStrategies() { + List> result = new ArrayList<>(); + for (ScraperStrategy strategy : strategies.values()) { + Map info = new HashMap<>(); + info.put("name", strategy.getName()); + info.put("source", strategy.getSource()); + result.add(info); + } + return result; + } + + public ScrapedData executeScrape(String strategyName) throws StrategyException { + ScraperStrategy strategy = strategies.get(strategyName); + if (strategy == null) { + String available = String.join(", ", strategies.keySet()); + throw new StrategyException( + "Strategy '" + strategyName + "' not found. Available: " + available, + strategyName, + null + ); + } + + try { + return strategy.scrape(); + } catch (Exception e) { + if (e instanceof StrategyException) { + throw (StrategyException) e; + } + throw new StrategyException( + "Error executing strategy: " + strategyName, + strategyName, + e + ); + } + } + + public String saveData(ScrapedData data, String strategyName) throws StorageException { + try { + String folderPath = outputDir + File.separator + strategyName; + Path folder = Paths.get(folderPath); + Files.createDirectories(folder); + + String timestamp = LocalDateTime.now().format( + DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss") + ); + String filename = "scraped_data_" + timestamp + ".json"; + String filePath = folderPath + File.separator + filename; + + try (FileWriter writer = new FileWriter(filePath)) { + gson.toJson(data, writer); + } + + return filePath; + } catch (IOException e) { + throw new StorageException( + "Failed to save data to: " + outputDir, + outputDir, + e + ); + } + } + + public boolean deleteData(String filePath) throws StorageException { + try { + Path path = Paths.get(filePath); + if (Files.exists(path)) { + Files.delete(path); + return true; + } + return false; + } catch (IOException e) { + throw new StorageException( + "Failed to delete file: " + filePath, + filePath, + e + ); + } + } + + public void setOutputDir(String outputDir) { + this.outputDir = outputDir; + } +} diff --git a/java-scraper/src/main/java/com/scraper/exception/NetworkException.java b/java-scraper/src/main/java/com/scraper/exception/NetworkException.java new file mode 100644 index 0000000..241bae3 --- /dev/null +++ b/java-scraper/src/main/java/com/scraper/exception/NetworkException.java @@ -0,0 +1,27 @@ +package com.scraper.exception; + +/** + * Exception thrown when there is a network related error + */ +public class NetworkException extends ScraperException { + + private static final long serialVersionUID = 1L; + private Integer statusCode; + + public NetworkException(String message) { + super(message); + } + + public NetworkException(String message, Throwable cause) { + super(message, cause); + } + + public NetworkException(String message, Integer statusCode, Throwable cause) { + super(message, cause); + this.statusCode = statusCode; + } + + public Integer getStatusCode() { + return statusCode; + } +} diff --git a/java-scraper/src/main/java/com/scraper/exception/ParseException.java b/java-scraper/src/main/java/com/scraper/exception/ParseException.java new file mode 100644 index 0000000..6d2344d --- /dev/null +++ b/java-scraper/src/main/java/com/scraper/exception/ParseException.java @@ -0,0 +1,27 @@ +package com.scraper.exception; + +/** + * Exception thrown when there is a parsing error + */ +public class ParseException extends ScraperException { + + private static final long serialVersionUID = 1L; + private String selector; + + public ParseException(String message) { + super(message); + } + + public ParseException(String message, Throwable cause) { + super(message, cause); + } + + public ParseException(String message, String selector, Throwable cause) { + super(message, cause); + this.selector = selector; + } + + public String getSelector() { + return selector; + } +} diff --git a/java-scraper/src/main/java/com/scraper/exception/ScraperException.java b/java-scraper/src/main/java/com/scraper/exception/ScraperException.java new file mode 100644 index 0000000..bd54876 --- /dev/null +++ b/java-scraper/src/main/java/com/scraper/exception/ScraperException.java @@ -0,0 +1,17 @@ +package com.scraper.exception; + +/** + * Base exception class for the scraper application + */ +public class ScraperException extends Exception { + + private static final long serialVersionUID = 1L; + + public ScraperException(String message) { + super(message); + } + + public ScraperException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/java-scraper/src/main/java/com/scraper/exception/StorageException.java b/java-scraper/src/main/java/com/scraper/exception/StorageException.java new file mode 100644 index 0000000..0c0be93 --- /dev/null +++ b/java-scraper/src/main/java/com/scraper/exception/StorageException.java @@ -0,0 +1,27 @@ +package com.scraper.exception; + +/** + * Exception thrown when there is a storage related error + */ +public class StorageException extends ScraperException { + + private static final long serialVersionUID = 1L; + private String filePath; + + public StorageException(String message) { + super(message); + } + + public StorageException(String message, Throwable cause) { + super(message, cause); + } + + public StorageException(String message, String filePath, Throwable cause) { + super(message, cause); + this.filePath = filePath; + } + + public String getFilePath() { + return filePath; + } +} diff --git a/java-scraper/src/main/java/com/scraper/exception/StrategyException.java b/java-scraper/src/main/java/com/scraper/exception/StrategyException.java new file mode 100644 index 0000000..fe42e52 --- /dev/null +++ b/java-scraper/src/main/java/com/scraper/exception/StrategyException.java @@ -0,0 +1,27 @@ +package com.scraper.exception; + +/** + * Exception thrown when there is a strategy related error + */ +public class StrategyException extends ScraperException { + + private static final long serialVersionUID = 1L; + private String strategyName; + + public StrategyException(String message) { + super(message); + } + + public StrategyException(String message, Throwable cause) { + super(message, cause); + } + + public StrategyException(String message, String strategyName, Throwable cause) { + super(message, cause); + this.strategyName = strategyName; + } + + public String getStrategyName() { + return strategyName; + } +} diff --git a/java-scraper/src/main/java/com/scraper/model/ScrapedData.java b/java-scraper/src/main/java/com/scraper/model/ScrapedData.java new file mode 100644 index 0000000..5b4a021 --- /dev/null +++ b/java-scraper/src/main/java/com/scraper/model/ScrapedData.java @@ -0,0 +1,77 @@ +package com.scraper.model; + +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.List; + +/** + * Model class representing the complete scraped data container + */ +public class ScrapedData { + private String source; + private String strategyName; + private List items; + private LocalDateTime scrapedAt; + private int totalItems; + + public ScrapedData() { + this.items = new ArrayList<>(); + this.scrapedAt = LocalDateTime.now(); + this.totalItems = 0; + } + + public ScrapedData(String source, String strategyName) { + this.source = source; + this.strategyName = strategyName; + this.items = new ArrayList<>(); + this.scrapedAt = LocalDateTime.now(); + this.totalItems = 0; + } + + public void addItem(ScrapedItem item) { + this.items.add(item); + this.totalItems = this.items.size(); + } + + // Getters and Setters + public String getSource() { + return source; + } + + public void setSource(String source) { + this.source = source; + } + + public String getStrategyName() { + return strategyName; + } + + public void setStrategyName(String strategyName) { + this.strategyName = strategyName; + } + + public List getItems() { + return items; + } + + public void setItems(List items) { + this.items = items; + this.totalItems = items.size(); + } + + public LocalDateTime getScrapedAt() { + return scrapedAt; + } + + public void setScrapedAt(LocalDateTime scrapedAt) { + this.scrapedAt = scrapedAt; + } + + public int getTotalItems() { + return totalItems; + } + + public void setTotalItems(int totalItems) { + this.totalItems = totalItems; + } +} diff --git a/java-scraper/src/main/java/com/scraper/model/ScrapedItem.java b/java-scraper/src/main/java/com/scraper/model/ScrapedItem.java new file mode 100644 index 0000000..5170c6b --- /dev/null +++ b/java-scraper/src/main/java/com/scraper/model/ScrapedItem.java @@ -0,0 +1,57 @@ +package com.scraper.model; + +import java.time.LocalDateTime; + +/** + * Model class representing a single scraped item + */ +public class ScrapedItem { + private String title; + private String content; + private String url; + private LocalDateTime timestamp; + + public ScrapedItem() { + this.timestamp = LocalDateTime.now(); + } + + public ScrapedItem(String title, String content, String url) { + this.title = title; + this.content = content; + this.url = url; + this.timestamp = LocalDateTime.now(); + } + + // Getters and Setters + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getContent() { + return content; + } + + public void setContent(String content) { + this.content = content; + } + + public String getUrl() { + return url; + } + + public void setUrl(String url) { + this.url = url; + } + + public LocalDateTime getTimestamp() { + return timestamp; + } + + public void setTimestamp(LocalDateTime timestamp) { + this.timestamp = timestamp; + } +} diff --git a/java-scraper/src/main/java/com/scraper/strategy/BooksScraperStrategy.java b/java-scraper/src/main/java/com/scraper/strategy/BooksScraperStrategy.java new file mode 100644 index 0000000..6d95ed7 --- /dev/null +++ b/java-scraper/src/main/java/com/scraper/strategy/BooksScraperStrategy.java @@ -0,0 +1,102 @@ +package com.scraper.strategy; + +import com.scraper.exception.NetworkException; +import com.scraper.exception.ParseException; +import com.scraper.model.ScrapedData; +import com.scraper.model.ScrapedItem; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.net.HttpURLConnection; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Strategy for scraping books from https://books.toscrape.com + */ +public class BooksScraperStrategy implements ScraperStrategy { + + private static final String NAME = "books_scraper"; + private static final String SOURCE = "https://books.toscrape.com"; + + @Override + public ScrapedData scrape() throws NetworkException, ParseException { + ScrapedData data = new ScrapedData(SOURCE, NAME); + + try { + String html = fetchHTML(SOURCE); + parseBooks(html, data); + } catch (NetworkException e) { + throw e; + } catch (Exception e) { + throw new ParseException("Failed to parse books", null, e); + } + + return data; + } + + private String fetchHTML(String urlString) throws NetworkException { + try { + URL url = new URL(urlString); + HttpURLConnection connection = (HttpURLConnection) url.openConnection(); + connection.setRequestMethod("GET"); + connection.setRequestProperty("User-Agent", "Mozilla/5.0"); + connection.setConnectTimeout(10000); + connection.setReadTimeout(10000); + + int statusCode = connection.getResponseCode(); + if (statusCode != 200) { + throw new NetworkException("Failed to fetch URL: " + urlString, statusCode, null); + } + + StringBuilder response = new StringBuilder(); + try (BufferedReader in = new BufferedReader( + new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) { + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + response.append("\n"); + } + } + + connection.disconnect(); + return response.toString(); + } catch (Exception e) { + throw new NetworkException("Network error while fetching: " + urlString, e); + } + } + + private void parseBooks(String html, ScrapedData data) { + Pattern titlePattern = Pattern.compile("

([^<]+)<"); + + Matcher titleMatcher = titlePattern.matcher(html); + Matcher priceMatcher = pricePattern.matcher(html); + + int count = 0; + while (titleMatcher.find() && priceMatcher.find() && count < 20) { + String title = titleMatcher.group(1).trim(); + String price = priceMatcher.group(1).trim(); + + ScrapedItem item = new ScrapedItem( + title, + "Price: " + price, + SOURCE + ); + data.addItem(item); + count++; + } + } + + @Override + public String getName() { + return NAME; + } + + @Override + public String getSource() { + return SOURCE; + } +} diff --git a/java-scraper/src/main/java/com/scraper/strategy/NewsScraperStrategy.java b/java-scraper/src/main/java/com/scraper/strategy/NewsScraperStrategy.java new file mode 100644 index 0000000..149d2b9 --- /dev/null +++ b/java-scraper/src/main/java/com/scraper/strategy/NewsScraperStrategy.java @@ -0,0 +1,121 @@ +package com.scraper.strategy; + +import com.scraper.exception.NetworkException; +import com.scraper.exception.ParseException; +import com.scraper.model.ScrapedData; +import com.scraper.model.ScrapedItem; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.net.HttpURLConnection; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Strategy for scraping quotes from http://quotes.toscrape.com + */ +public class NewsScraperStrategy implements ScraperStrategy { + + private static final String NAME = "news_scraper"; + private static final String SOURCE = "http://quotes.toscrape.com"; + + @Override + public ScrapedData scrape() throws NetworkException, ParseException { + ScrapedData data = new ScrapedData(SOURCE, NAME); + + try { + String html = fetchHTML(SOURCE); + parseQuotes(html, data); + } catch (NetworkException e) { + throw e; + } catch (Exception e) { + throw new ParseException("Failed to parse quotes", null, e); + } + + return data; + } + + private String fetchHTML(String urlString) throws NetworkException { + try { + URL url = new URL(urlString); + HttpURLConnection connection = (HttpURLConnection) url.openConnection(); + connection.setRequestMethod("GET"); + connection.setRequestProperty("User-Agent", "Mozilla/5.0"); + connection.setConnectTimeout(10000); + connection.setReadTimeout(10000); + + int statusCode = connection.getResponseCode(); + if (statusCode != 200) { + throw new NetworkException("Failed to fetch URL: " + urlString, statusCode, null); + } + + StringBuilder response = new StringBuilder(); + try (BufferedReader in = new BufferedReader( + new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) { + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + response.append("\n"); + } + } + + connection.disconnect(); + return response.toString(); + } catch (Exception e) { + throw new NetworkException("Network error while fetching: " + urlString, e); + } + } + + private void parseQuotes(String html, ScrapedData data) { + Pattern quotePattern = Pattern.compile( + "([^<]+).*?([^<]+)", + Pattern.DOTALL + ); + + Matcher matcher = quotePattern.matcher(html); + int count = 0; + while (matcher.find() && count < 10) { + String text = matcher.group(1).trim(); + String author = matcher.group(2).trim(); + + ScrapedItem item = new ScrapedItem( + "Quote by " + author, + text, + SOURCE + ); + data.addItem(item); + count++; + } + + if (count == 0) { + // Fallback to simpler pattern + Pattern simpleTextPattern = Pattern.compile("\"text\">([^<]+)<"); + Pattern simpleAuthorPattern = Pattern.compile("author\">([^<]+)<"); + Matcher textMatcher = simpleTextPattern.matcher(html); + Matcher authorMatcher = simpleAuthorPattern.matcher(html); + + int itemCount = 0; + while (textMatcher.find() && authorMatcher.find() && itemCount < 10) { + ScrapedItem item = new ScrapedItem( + "Quote by " + authorMatcher.group(1).trim(), + textMatcher.group(1).trim(), + SOURCE + ); + data.addItem(item); + itemCount++; + } + } + } + + @Override + public String getName() { + return NAME; + } + + @Override + public String getSource() { + return SOURCE; + } +} diff --git a/java-scraper/src/main/java/com/scraper/strategy/ScraperStrategy.java b/java-scraper/src/main/java/com/scraper/strategy/ScraperStrategy.java new file mode 100644 index 0000000..06e0362 --- /dev/null +++ b/java-scraper/src/main/java/com/scraper/strategy/ScraperStrategy.java @@ -0,0 +1,13 @@ +package com.scraper.strategy; + +import com.scraper.exception.ScraperException; +import com.scraper.model.ScrapedData; + +/** + * Strategy interface for web scrapers + */ +public interface ScraperStrategy { + ScrapedData scrape() throws ScraperException; + String getName(); + String getSource(); +} diff --git a/java-scraper/src/main/java/com/scraper/strategy/TechNewsScraperStrategy.java b/java-scraper/src/main/java/com/scraper/strategy/TechNewsScraperStrategy.java new file mode 100644 index 0000000..5526b1e --- /dev/null +++ b/java-scraper/src/main/java/com/scraper/strategy/TechNewsScraperStrategy.java @@ -0,0 +1,114 @@ +package com.scraper.strategy; + +import com.scraper.exception.NetworkException; +import com.scraper.exception.ParseException; +import com.scraper.model.ScrapedData; +import com.scraper.model.ScrapedItem; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.net.HttpURLConnection; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Strategy for scraping news from https://www.bbc.com/news + */ +public class TechNewsScraperStrategy implements ScraperStrategy { + + private static final String NAME = "tech_news_scraper"; + private static final String SOURCE = "https://www.bbc.com/news"; + + @Override + public ScrapedData scrape() throws NetworkException, ParseException { + ScrapedData data = new ScrapedData(SOURCE, NAME); + + try { + String html = fetchHTML(SOURCE); + parseHeadlines(html, data); + } catch (NetworkException e) { + throw e; + } catch (Exception e) { + throw new ParseException("Failed to parse tech news", null, e); + } + + return data; + } + + private String fetchHTML(String urlString) throws NetworkException { + try { + URL url = new URL(urlString); + HttpURLConnection connection = (HttpURLConnection) url.openConnection(); + connection.setRequestMethod("GET"); + connection.setRequestProperty("User-Agent", "Mozilla/5.0"); + connection.setConnectTimeout(10000); + connection.setReadTimeout(10000); + + int statusCode = connection.getResponseCode(); + if (statusCode != 200) { + throw new NetworkException("Failed to fetch URL: " + urlString, statusCode, null); + } + + StringBuilder response = new StringBuilder(); + try (BufferedReader in = new BufferedReader( + new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) { + String inputLine; + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + response.append("\n"); + } + } + + connection.disconnect(); + return response.toString(); + } catch (Exception e) { + throw new NetworkException("Network error while fetching: " + urlString, e); + } + } + + private void parseHeadlines(String html, ScrapedData data) { + List headlines = new ArrayList<>(); + String[] patterns = { + "]*class=\"[^\"]*headline[^\"]*\"[^>]*>([^<]+)<", + "]*class=\"[^\"]*headline[^\"]*\"[^>]*>([^<]+)<", + "]*class=\"[^\"]*headline[^\"]*\"[^>]*>([^<]+)<", + "]*class=\"[^\"]*title[^\"]*\"[^>]*>([^<]+)<", + "]*class=\"[^\"]*title[^\"]*\"[^>]*>([^<]+)<" + }; + + for (String patternStr : patterns) { + Pattern pattern = Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE); + Matcher matcher = pattern.matcher(html); + + while (matcher.find()) { + String headline = matcher.group(1).trim(); + if (!headline.isEmpty() && headline.length() > 10 && !headlines.contains(headline)) { + headlines.add(headline); + } + } + } + + for (int i = 0; i < Math.min(headlines.size(), 15); i++) { + ScrapedItem item = new ScrapedItem( + headlines.get(i), + "", + SOURCE + ); + data.addItem(item); + } + } + + @Override + public String getName() { + return NAME; + } + + @Override + public String getSource() { + return SOURCE; + } +} diff --git a/java-scraper/src/main/java/com/scraper/view/ConsoleView.java b/java-scraper/src/main/java/com/scraper/view/ConsoleView.java new file mode 100644 index 0000000..ba7965c --- /dev/null +++ b/java-scraper/src/main/java/com/scraper/view/ConsoleView.java @@ -0,0 +1,72 @@ +package com.scraper.view; + +import com.scraper.model.ScrapedData; +import com.scraper.model.ScrapedItem; + +import java.util.List; +import java.util.Map; + +/** + * MVC View class for console output + */ +public class ConsoleView { + + public void displayMessage(String message) { + System.out.println(message); + } + + public void displayError(String error) { + System.err.println("[ERROR] " + error); + } + + public void displaySuccess(String message) { + System.out.println("[SUCCESS] " + message); + } + + public void displayStrategies(List> strategies) { + System.out.println("\n=== Available Scrapers ==="); + for (int i = 0; i < strategies.size(); i++) { + Map strategy = strategies.get(i); + System.out.println((i + 1) + ". " + strategy.get("name")); + System.out.println(" Source: " + strategy.get("source")); + } + System.out.println(); + } + + public void displayScrapedData(ScrapedData data, String savedPath) { + System.out.println("\n=== Scraping Results ==="); + System.out.println("Source: " + data.getSource()); + System.out.println("Strategy: " + data.getStrategyName()); + System.out.println("Total Items: " + data.getTotalItems()); + System.out.println("Scraped At: " + data.getScrapedAt()); + + if (savedPath != null) { + System.out.println("Saved To: " + savedPath); + } + + System.out.println("\n--- Items Preview ---"); + List items = data.getItems(); + int displayCount = Math.min(items.size(), 5); + for (int i = 0; i < displayCount; i++) { + ScrapedItem item = items.get(i); + System.out.println((i + 1) + ". " + safeString(item.getTitle())); + if (item.getContent() != null && !item.getContent().isEmpty()) { + String content = safeString(item.getContent()); + String truncated = content.length() > 80 ? content.substring(0, 80) + "..." : content; + System.out.println(" " + truncated); + } + System.out.println(); + } + + if (items.size() > 5) { + System.out.println("... and " + (items.size() - 5) + " more items"); + } + } + + private String safeString(String str) { + if (str == null) { + return ""; + } + return str.replaceAll("[^\\x20-\\x7E]", "?"); + } +} diff --git a/java-scraper/target/classes/com/scraper/Main.class b/java-scraper/target/classes/com/scraper/Main.class new file mode 100644 index 0000000..cefcf5b Binary files /dev/null and b/java-scraper/target/classes/com/scraper/Main.class differ diff --git a/java-scraper/target/classes/com/scraper/command/Command.class b/java-scraper/target/classes/com/scraper/command/Command.class new file mode 100644 index 0000000..b7ab774 Binary files /dev/null and b/java-scraper/target/classes/com/scraper/command/Command.class differ diff --git a/java-scraper/target/classes/com/scraper/command/ListCommand.class b/java-scraper/target/classes/com/scraper/command/ListCommand.class new file mode 100644 index 0000000..1ef3dbd Binary files /dev/null and b/java-scraper/target/classes/com/scraper/command/ListCommand.class differ diff --git a/java-scraper/target/classes/com/scraper/command/ScrapeCommand.class b/java-scraper/target/classes/com/scraper/command/ScrapeCommand.class new file mode 100644 index 0000000..bfa000f Binary files /dev/null and b/java-scraper/target/classes/com/scraper/command/ScrapeCommand.class differ diff --git a/java-scraper/target/classes/com/scraper/controller/ScraperController.class b/java-scraper/target/classes/com/scraper/controller/ScraperController.class new file mode 100644 index 0000000..3e523d4 Binary files /dev/null and b/java-scraper/target/classes/com/scraper/controller/ScraperController.class differ diff --git a/java-scraper/target/classes/com/scraper/exception/NetworkException.class b/java-scraper/target/classes/com/scraper/exception/NetworkException.class new file mode 100644 index 0000000..7e17126 Binary files /dev/null and b/java-scraper/target/classes/com/scraper/exception/NetworkException.class differ diff --git a/java-scraper/target/classes/com/scraper/exception/ParseException.class b/java-scraper/target/classes/com/scraper/exception/ParseException.class new file mode 100644 index 0000000..d2305cf Binary files /dev/null and b/java-scraper/target/classes/com/scraper/exception/ParseException.class differ diff --git a/java-scraper/target/classes/com/scraper/exception/ScraperException.class b/java-scraper/target/classes/com/scraper/exception/ScraperException.class new file mode 100644 index 0000000..ce80753 Binary files /dev/null and b/java-scraper/target/classes/com/scraper/exception/ScraperException.class differ diff --git a/java-scraper/target/classes/com/scraper/exception/StorageException.class b/java-scraper/target/classes/com/scraper/exception/StorageException.class new file mode 100644 index 0000000..8792b37 Binary files /dev/null and b/java-scraper/target/classes/com/scraper/exception/StorageException.class differ diff --git a/java-scraper/target/classes/com/scraper/exception/StrategyException.class b/java-scraper/target/classes/com/scraper/exception/StrategyException.class new file mode 100644 index 0000000..d2a5a5f Binary files /dev/null and b/java-scraper/target/classes/com/scraper/exception/StrategyException.class differ diff --git a/java-scraper/target/classes/com/scraper/model/ScrapedData.class b/java-scraper/target/classes/com/scraper/model/ScrapedData.class new file mode 100644 index 0000000..1bfb882 Binary files /dev/null and b/java-scraper/target/classes/com/scraper/model/ScrapedData.class differ diff --git a/java-scraper/target/classes/com/scraper/model/ScrapedItem.class b/java-scraper/target/classes/com/scraper/model/ScrapedItem.class new file mode 100644 index 0000000..2224595 Binary files /dev/null and b/java-scraper/target/classes/com/scraper/model/ScrapedItem.class differ diff --git a/java-scraper/target/classes/com/scraper/strategy/BooksScraperStrategy.class b/java-scraper/target/classes/com/scraper/strategy/BooksScraperStrategy.class new file mode 100644 index 0000000..d16cac2 Binary files /dev/null and b/java-scraper/target/classes/com/scraper/strategy/BooksScraperStrategy.class differ diff --git a/java-scraper/target/classes/com/scraper/strategy/NewsScraperStrategy.class b/java-scraper/target/classes/com/scraper/strategy/NewsScraperStrategy.class new file mode 100644 index 0000000..99ec07a Binary files /dev/null and b/java-scraper/target/classes/com/scraper/strategy/NewsScraperStrategy.class differ diff --git a/java-scraper/target/classes/com/scraper/strategy/ScraperStrategy.class b/java-scraper/target/classes/com/scraper/strategy/ScraperStrategy.class new file mode 100644 index 0000000..d735b20 Binary files /dev/null and b/java-scraper/target/classes/com/scraper/strategy/ScraperStrategy.class differ diff --git a/java-scraper/target/classes/com/scraper/strategy/TechNewsScraperStrategy.class b/java-scraper/target/classes/com/scraper/strategy/TechNewsScraperStrategy.class new file mode 100644 index 0000000..4549109 Binary files /dev/null and b/java-scraper/target/classes/com/scraper/strategy/TechNewsScraperStrategy.class differ diff --git a/java-scraper/target/classes/com/scraper/view/ConsoleView.class b/java-scraper/target/classes/com/scraper/view/ConsoleView.class new file mode 100644 index 0000000..99bedc0 Binary files /dev/null and b/java-scraper/target/classes/com/scraper/view/ConsoleView.class differ diff --git a/main.py b/main.py new file mode 100644 index 0000000..c25f084 --- /dev/null +++ b/main.py @@ -0,0 +1,102 @@ +import argparse +import sys +import os + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from controllers import ScraperController +from commands import ScrapeCommand, ListCommand +from views import ConsoleView +from exceptions import ScraperException + + +class CLIApplication: + def __init__(self): + self.controller = ScraperController() + self.view = ConsoleView() + + def run(self, args=None): + parser = self._create_parser() + parsed_args = parser.parse_args(args) + + if hasattr(parsed_args, 'func'): + try: + parsed_args.func(parsed_args) + except ScraperException as e: + self.view.display_error(str(e)) + if e.original_exception: + self.view.display_error(f"Original error: {e.original_exception}") + sys.exit(1) + except Exception as e: + self.view.display_error(f"Unexpected error: {str(e)}") + sys.exit(1) + else: + parser.print_help() + + def _create_parser(self) -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description='Web Scraper CLI - MVC + Command Pattern + Strategy Pattern', + formatter_class=argparse.RawDescriptionHelpFormatter + ) + + subparsers = parser.add_subparsers(dest='command', help='Available commands') + + scrape_parser = subparsers.add_parser('scrape', help='Scrape data from a website') + scrape_parser.add_argument( + 'strategy', + choices=['news_scraper', 'books_scraper', 'tech_news_scraper', 'all'], + help='Scraper strategy to use' + ) + scrape_parser.add_argument( + '--output', '-o', + default='data', + help='Output directory for scraped data' + ) + scrape_parser.set_defaults(func=self._handle_scrape) + + list_parser = subparsers.add_parser('list', help='List all available scrapers') + list_parser.set_defaults(func=self._handle_list) + + info_parser = subparsers.add_parser('info', help='Show detailed info about a scraper') + info_parser.add_argument('strategy', help='Strategy name') + info_parser.set_defaults(func=self._handle_info) + + return parser + + def _handle_scrape(self, args): + if args.strategy == 'all': + strategies = ['news_scraper', 'books_scraper', 'tech_news_scraper'] + for strategy in strategies: + self._scrape_single(strategy, args.output) + else: + self._scrape_single(args.strategy, args.output) + + def _scrape_single(self, strategy_name: str, output_dir: str): + self.controller.output_dir = output_dir + command = ScrapeCommand(self.controller, strategy_name) + data, saved_path = command.execute() + self.view.display_success(f"Scraped {data.total_items} items using {strategy_name}") + self.view.display_scraped_data(data, saved_path) + + def _handle_list(self, args): + command = ListCommand(self.controller) + strategies = command.execute() + self.view.display_strategies(strategies) + + def _handle_info(self, args): + strategies = self.controller.list_strategies() + strategy = next((s for s in strategies if s['name'] == args.strategy), None) + if strategy: + self.view.display_message(f"\n=== {strategy['name']} ===") + self.view.display_message(f"Source: {strategy['source']}") + else: + self.view.display_error(f"Strategy '{args.strategy}' not found") + + +def main(): + app = CLIApplication() + app.run() + + +if __name__ == '__main__': + main() diff --git a/models/__init__.py b/models/__init__.py new file mode 100644 index 0000000..55e2bcc --- /dev/null +++ b/models/__init__.py @@ -0,0 +1,3 @@ +from .scraped_data import ScrapedItem, ScrapedData + +__all__ = ['ScrapedItem', 'ScrapedData'] diff --git a/models/__pycache__/__init__.cpython-314.pyc b/models/__pycache__/__init__.cpython-314.pyc new file mode 100644 index 0000000..4575285 Binary files /dev/null and b/models/__pycache__/__init__.cpython-314.pyc differ diff --git a/models/__pycache__/scraped_data.cpython-314.pyc b/models/__pycache__/scraped_data.cpython-314.pyc new file mode 100644 index 0000000..caa53e4 Binary files /dev/null and b/models/__pycache__/scraped_data.cpython-314.pyc differ diff --git a/models/scraped_data.py b/models/scraped_data.py new file mode 100644 index 0000000..80e524b --- /dev/null +++ b/models/scraped_data.py @@ -0,0 +1,41 @@ +from dataclasses import dataclass, field +from datetime import datetime +from typing import List, Dict, Any + + +@dataclass +class ScrapedItem: + title: str + content: str + url: str + timestamp: datetime = field(default_factory=datetime.now) + + def to_dict(self) -> Dict[str, Any]: + return { + 'title': self.title, + 'content': self.content, + 'url': self.url, + 'timestamp': self.timestamp.isoformat() + } + + +@dataclass +class ScrapedData: + source: str + strategy_name: str + items: List[ScrapedItem] = field(default_factory=list) + scraped_at: datetime = field(default_factory=datetime.now) + total_items: int = 0 + + def add_item(self, item: ScrapedItem): + self.items.append(item) + self.total_items = len(self.items) + + def to_dict(self) -> Dict[str, Any]: + return { + 'source': self.source, + 'strategy_name': self.strategy_name, + 'items': [item.to_dict() for item in self.items], + 'scraped_at': self.scraped_at.isoformat(), + 'total_items': self.total_items + } diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..ae45045 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +requests>=2.28.0 +beautifulsoup4>=4.11.0 + +requests is optional, the scraper uses urllib by default. +beautifulsoup4 is optional, the scraper uses html.parser by default. diff --git a/strategies/__init__.py b/strategies/__init__.py new file mode 100644 index 0000000..b52c61e --- /dev/null +++ b/strategies/__init__.py @@ -0,0 +1,11 @@ +from .base_scraper import ScraperStrategy +from .news_scraper import NewsScraperStrategy +from .quotes_scraper import BooksScraperStrategy +from .tech_news_scraper import TechNewsScraperStrategy + +__all__ = [ + 'ScraperStrategy', + 'NewsScraperStrategy', + 'BooksScraperStrategy', + 'TechNewsScraperStrategy' +] diff --git a/strategies/__pycache__/__init__.cpython-314.pyc b/strategies/__pycache__/__init__.cpython-314.pyc new file mode 100644 index 0000000..1b5185d Binary files /dev/null and b/strategies/__pycache__/__init__.cpython-314.pyc differ diff --git a/strategies/__pycache__/base_scraper.cpython-314.pyc b/strategies/__pycache__/base_scraper.cpython-314.pyc new file mode 100644 index 0000000..eda532f Binary files /dev/null and b/strategies/__pycache__/base_scraper.cpython-314.pyc differ diff --git a/strategies/__pycache__/news_scraper.cpython-314.pyc b/strategies/__pycache__/news_scraper.cpython-314.pyc new file mode 100644 index 0000000..5f26ebe Binary files /dev/null and b/strategies/__pycache__/news_scraper.cpython-314.pyc differ diff --git a/strategies/__pycache__/quotes_scraper.cpython-314.pyc b/strategies/__pycache__/quotes_scraper.cpython-314.pyc new file mode 100644 index 0000000..fed4be5 Binary files /dev/null and b/strategies/__pycache__/quotes_scraper.cpython-314.pyc differ diff --git a/strategies/__pycache__/tech_news_scraper.cpython-314.pyc b/strategies/__pycache__/tech_news_scraper.cpython-314.pyc new file mode 100644 index 0000000..915421c Binary files /dev/null and b/strategies/__pycache__/tech_news_scraper.cpython-314.pyc differ diff --git a/strategies/base_scraper.py b/strategies/base_scraper.py new file mode 100644 index 0000000..4da220b --- /dev/null +++ b/strategies/base_scraper.py @@ -0,0 +1,18 @@ +from abc import ABC, abstractmethod +from models import ScrapedData + + +class ScraperStrategy(ABC): + @abstractmethod + def scrape(self) -> ScrapedData: + pass + + @property + @abstractmethod + def name(self) -> str: + pass + + @property + @abstractmethod + def source(self) -> str: + pass diff --git a/strategies/news_scraper.py b/strategies/news_scraper.py new file mode 100644 index 0000000..5e7e238 --- /dev/null +++ b/strategies/news_scraper.py @@ -0,0 +1,72 @@ +from urllib.request import urlopen, Request +import re +from datetime import datetime + +from strategies.base_scraper import ScraperStrategy +from models import ScrapedItem, ScrapedData +from exceptions import NetworkException, ParseException + + +class NewsScraperStrategy(ScraperStrategy): + def __init__(self): + self._name = "news_scraper" + self._source = "http://quotes.toscrape.com" + + @property + def name(self) -> str: + return self._name + + @property + def source(self) -> str: + return self._source + + def scrape(self) -> ScrapedData: + data = ScrapedData(source=self.source, strategy_name=self.name) + try: + request = Request(self.source, headers={'User-Agent': 'Mozilla/5.0'}) + response = urlopen(request, timeout=10) + html = response.read().decode('utf-8') + except Exception as e: + raise NetworkException( + f"Failed to fetch news from {self.source}", + original_exception=e + ) + + try: + quotes = self._extract_quotes(html) + + for quote_text, author in quotes[:10]: + item = ScrapedItem( + title=f"Quote by {author}", + content=quote_text, + url=self.source + ) + data.add_item(item) + + except Exception as e: + raise ParseException( + "Failed to parse news content", + selector="div.quote", + original_exception=e + ) + + return data + + def _extract_quotes(self, html): + quotes = [] + quote_pattern = r'
]*>.*?]*>([^<]+).*?([^<]+)' + + matches = re.findall(quote_pattern, html, re.DOTALL) + for match in matches: + quotes.append((match[0].strip(), match[1].strip())) + + if not quotes: + text_pattern = r'"text">([^<]+)<' + author_pattern = r'author">([^<]+)<' + texts = re.findall(text_pattern, html) + authors = re.findall(author_pattern, html) + + for i in range(min(len(texts), len(authors))): + quotes.append((texts[i].strip(), authors[i].strip())) + + return quotes diff --git a/strategies/quotes_scraper.py b/strategies/quotes_scraper.py new file mode 100644 index 0000000..b2c2dbb --- /dev/null +++ b/strategies/quotes_scraper.py @@ -0,0 +1,67 @@ +from urllib.request import urlopen, Request +import re +from datetime import datetime + +from strategies.base_scraper import ScraperStrategy +from models import ScrapedItem, ScrapedData +from exceptions import NetworkException, ParseException + + +class BooksScraperStrategy(ScraperStrategy): + def __init__(self): + self._name = "books_scraper" + self._source = "https://books.toscrape.com" + + @property + def name(self) -> str: + return self._name + + @property + def source(self) -> str: + return self._source + + def scrape(self) -> ScrapedData: + data = ScrapedData(source=self.source, strategy_name=self.name) + try: + request = Request(self.source, headers={'User-Agent': 'Mozilla/5.0'}) + response = urlopen(request, timeout=10) + html = response.read().decode('utf-8') + except Exception as e: + raise NetworkException( + f"Failed to fetch books from {self.source}", + original_exception=e + ) + + try: + books = self._extract_books(html) + + for title, price in books[:20]: + item = ScrapedItem( + title=title, + content=f"Price: {price}", + url=self.source + ) + data.add_item(item) + + except Exception as e: + raise ParseException( + "Failed to parse book content", + selector="article.product_pod", + original_exception=e + ) + + return data + + def _extract_books(self, html): + books = [] + + title_pattern = r'

([^<]+)<' + + titles = re.findall(title_pattern, html) + prices = re.findall(price_pattern, html) + + for i in range(min(len(titles), len(prices))): + books.append((titles[i].strip(), prices[i].strip())) + + return books diff --git a/strategies/tech_news_scraper.py b/strategies/tech_news_scraper.py new file mode 100644 index 0000000..c8f3969 --- /dev/null +++ b/strategies/tech_news_scraper.py @@ -0,0 +1,81 @@ +from urllib.request import urlopen, Request +import re + +from strategies.base_scraper import ScraperStrategy +from models import ScrapedItem, ScrapedData +from exceptions import NetworkException, ParseException + + +class TechNewsScraperStrategy(ScraperStrategy): + def __init__(self): + self._name = "tech_news_scraper" + self._source = "https://www.bbc.com/news" + + @property + def name(self) -> str: + return self._name + + @property + def source(self) -> str: + return self._source + + def scrape(self) -> ScrapedData: + data = ScrapedData(source=self.source, strategy_name=self.name) + try: + request = Request(self.source, headers={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' + }) + response = urlopen(request, timeout=10) + html = response.read().decode('utf-8') + except Exception as e: + raise NetworkException( + f"Failed to fetch tech news from {self.source}", + original_exception=e + ) + + try: + headlines = self._extract_headlines(html) + + for headline in headlines[:15]: + item = ScrapedItem( + title=headline, + content="", + url=self.source + ) + data.add_item(item) + + except Exception as e: + raise ParseException( + "Failed to parse tech news content", + selector="h1, h2, h3", + original_exception=e + ) + + return data + + def _extract_headlines(self, html): + headlines = [] + + h_patterns = [ + r']*class="[^"]*headline[^"]*"[^>]*>([^<]+)<', + r']*class="[^"]*headline[^"]*"[^>]*>([^<]+)<', + r']*class="[^"]*headline[^"]*"[^>]*>([^<]+)<', + r']*class="[^"]*title[^"]*"[^>]*>([^<]+)<', + r']*class="[^"]*title[^"]*"[^>]*>([^<]+)<', + ] + + for pattern in h_patterns: + matches = re.findall(pattern, html, re.IGNORECASE) + for match in matches: + headline = match.strip() + if headline and len(headline) > 10: + headlines.append(headline) + + seen = set() + unique_headlines = [] + for h in headlines: + if h not in seen: + seen.add(h) + unique_headlines.append(h) + + return unique_headlines diff --git a/views/__init__.py b/views/__init__.py new file mode 100644 index 0000000..c5de988 --- /dev/null +++ b/views/__init__.py @@ -0,0 +1,3 @@ +from .console_view import ConsoleView + +__all__ = ['ConsoleView'] diff --git a/views/__pycache__/__init__.cpython-314.pyc b/views/__pycache__/__init__.cpython-314.pyc new file mode 100644 index 0000000..0a4ac0a Binary files /dev/null and b/views/__pycache__/__init__.cpython-314.pyc differ diff --git a/views/__pycache__/console_view.cpython-314.pyc b/views/__pycache__/console_view.cpython-314.pyc new file mode 100644 index 0000000..474e27c Binary files /dev/null and b/views/__pycache__/console_view.cpython-314.pyc differ diff --git a/views/console_view.py b/views/console_view.py new file mode 100644 index 0000000..b712181 --- /dev/null +++ b/views/console_view.py @@ -0,0 +1,68 @@ +import sys +from typing import List, Dict, Any + + +class ConsoleView: + def display_message(self, message: str): + try: + print(message) + except UnicodeEncodeError: + print(message.encode('utf-8', errors='replace').decode('utf-8')) + + def display_error(self, error: str): + try: + print(f"[ERROR] {error}") + except UnicodeEncodeError: + print(f"[ERROR] {error.encode('utf-8', errors='replace').decode('utf-8')}") + + def display_success(self, message: str): + try: + print(f"[SUCCESS] {message}") + except UnicodeEncodeError: + print(f"[SUCCESS] {message.encode('utf-8', errors='replace').decode('utf-8')}") + + def display_strategies(self, strategies: List[Dict[str, str]]): + print("\n=== Available Scrapers ===") + for idx, strategy in enumerate(strategies, 1): + print(f"{idx}. {strategy['name']}") + print(f" Source: {strategy['source']}") + print() + + def display_scraped_data(self, data: Any, saved_path: str = None): + if hasattr(data, 'to_dict'): + data = data.to_dict() + + print("\n=== Scraping Results ===") + print(f"Source: {data.get('source', 'N/A')}") + print(f"Strategy: {data.get('strategy_name', 'N/A')}") + print(f"Total Items: {data.get('total_items', len(data.get('items', [])))}") + print(f"Scraped At: {data.get('scraped_at', 'N/A')}") + + if saved_path: + print(f"Saved To: {saved_path}") + + print("\n--- Items Preview ---") + items = data.get('items', []) + for idx, item in enumerate(items[:5], 1): + try: + title = item.get('title', 'N/A') + print(f"{idx}. {title}") + except UnicodeEncodeError: + print(f"{idx}. {item.get('title', 'N/A').encode('utf-8', errors='replace').decode('utf-8')}") + + if item.get('content'): + content = item.get('content', '') + try: + truncated = content[:80] + "..." if len(content) > 80 else content + print(f" {truncated}") + except UnicodeEncodeError: + truncated = content[:80].encode('utf-8', errors='replace').decode('utf-8') + print(f" {truncated}") + print() + + if len(items) > 5: + print(f"... and {len(items) - 5} more items") + + def display_list(self, items: List[Any]): + for item in items: + print(item)