Browse Source

期末爬虫项目+实验报告

master
ZhangJinxuan 3 weeks ago
commit
0b572260a8
  1. BIN
      202506050228-张金炫-期末实验报告.docx
  2. BIN
      9919d4711bf7a75e286295928b7eb5f0.png
  3. 5
      commands/__init__.py
  4. BIN
      commands/__pycache__/__init__.cpython-314.pyc
  5. BIN
      commands/__pycache__/base_command.cpython-314.pyc
  6. BIN
      commands/__pycache__/list_command.cpython-314.pyc
  7. BIN
      commands/__pycache__/scrape_command.cpython-314.pyc
  8. 11
      commands/base_command.py
  9. 13
      commands/list_command.py
  10. 25
      commands/scrape_command.py
  11. 3
      controllers/__init__.py
  12. BIN
      controllers/__pycache__/__init__.cpython-314.pyc
  13. BIN
      controllers/__pycache__/scraper_controller.cpython-314.pyc
  14. 112
      controllers/scraper_controller.py
  15. 21
      debug_books.py
  16. 17
      exceptions/__init__.py
  17. BIN
      exceptions/__pycache__/__init__.cpython-314.pyc
  18. BIN
      exceptions/__pycache__/scraper_exceptions.cpython-314.pyc
  19. 34
      exceptions/scraper_exceptions.py
  20. 83
      java-scraper/README.md
  21. 7
      java-scraper/data/books_scraper/scraped_data_20260531_104648.json
  22. 128
      java-scraper/data/books_scraper/scraped_data_20260531_104856.json
  23. 128
      java-scraper/data/books_scraper/scraped_data_20260531_105031.json
  24. 68
      java-scraper/data/news_scraper/scraped_data_20260531_104348.json
  25. 7
      java-scraper/data/news_scraper/scraped_data_20260531_104511.json
  26. 68
      java-scraper/data/news_scraper/scraped_data_20260531_104620.json
  27. 50
      java-scraper/pom.xml
  28. 108
      java-scraper/src/main/java/com/scraper/Main.java
  29. 11
      java-scraper/src/main/java/com/scraper/command/Command.java
  30. 33
      java-scraper/src/main/java/com/scraper/command/ListCommand.java
  31. 42
      java-scraper/src/main/java/com/scraper/command/ScrapeCommand.java
  32. 138
      java-scraper/src/main/java/com/scraper/controller/ScraperController.java
  33. 27
      java-scraper/src/main/java/com/scraper/exception/NetworkException.java
  34. 27
      java-scraper/src/main/java/com/scraper/exception/ParseException.java
  35. 17
      java-scraper/src/main/java/com/scraper/exception/ScraperException.java
  36. 27
      java-scraper/src/main/java/com/scraper/exception/StorageException.java
  37. 27
      java-scraper/src/main/java/com/scraper/exception/StrategyException.java
  38. 77
      java-scraper/src/main/java/com/scraper/model/ScrapedData.java
  39. 57
      java-scraper/src/main/java/com/scraper/model/ScrapedItem.java
  40. 102
      java-scraper/src/main/java/com/scraper/strategy/BooksScraperStrategy.java
  41. 121
      java-scraper/src/main/java/com/scraper/strategy/NewsScraperStrategy.java
  42. 13
      java-scraper/src/main/java/com/scraper/strategy/ScraperStrategy.java
  43. 114
      java-scraper/src/main/java/com/scraper/strategy/TechNewsScraperStrategy.java
  44. 72
      java-scraper/src/main/java/com/scraper/view/ConsoleView.java
  45. BIN
      java-scraper/target/classes/com/scraper/Main.class
  46. BIN
      java-scraper/target/classes/com/scraper/command/Command.class
  47. BIN
      java-scraper/target/classes/com/scraper/command/ListCommand.class
  48. BIN
      java-scraper/target/classes/com/scraper/command/ScrapeCommand.class
  49. BIN
      java-scraper/target/classes/com/scraper/controller/ScraperController.class
  50. BIN
      java-scraper/target/classes/com/scraper/exception/NetworkException.class
  51. BIN
      java-scraper/target/classes/com/scraper/exception/ParseException.class
  52. BIN
      java-scraper/target/classes/com/scraper/exception/ScraperException.class
  53. BIN
      java-scraper/target/classes/com/scraper/exception/StorageException.class
  54. BIN
      java-scraper/target/classes/com/scraper/exception/StrategyException.class
  55. BIN
      java-scraper/target/classes/com/scraper/model/ScrapedData.class
  56. BIN
      java-scraper/target/classes/com/scraper/model/ScrapedItem.class
  57. BIN
      java-scraper/target/classes/com/scraper/strategy/BooksScraperStrategy.class
  58. BIN
      java-scraper/target/classes/com/scraper/strategy/NewsScraperStrategy.class
  59. BIN
      java-scraper/target/classes/com/scraper/strategy/ScraperStrategy.class
  60. BIN
      java-scraper/target/classes/com/scraper/strategy/TechNewsScraperStrategy.class
  61. BIN
      java-scraper/target/classes/com/scraper/view/ConsoleView.class
  62. 102
      main.py
  63. 3
      models/__init__.py
  64. BIN
      models/__pycache__/__init__.cpython-314.pyc
  65. BIN
      models/__pycache__/scraped_data.cpython-314.pyc
  66. 41
      models/scraped_data.py
  67. 5
      requirements.txt
  68. 11
      strategies/__init__.py
  69. BIN
      strategies/__pycache__/__init__.cpython-314.pyc
  70. BIN
      strategies/__pycache__/base_scraper.cpython-314.pyc
  71. BIN
      strategies/__pycache__/news_scraper.cpython-314.pyc
  72. BIN
      strategies/__pycache__/quotes_scraper.cpython-314.pyc
  73. BIN
      strategies/__pycache__/tech_news_scraper.cpython-314.pyc
  74. 18
      strategies/base_scraper.py
  75. 72
      strategies/news_scraper.py
  76. 67
      strategies/quotes_scraper.py
  77. 81
      strategies/tech_news_scraper.py
  78. 3
      views/__init__.py
  79. BIN
      views/__pycache__/__init__.cpython-314.pyc
  80. BIN
      views/__pycache__/console_view.cpython-314.pyc
  81. 68
      views/console_view.py

BIN
202506050228-张金炫-期末实验报告.docx

Binary file not shown.

BIN
9919d4711bf7a75e286295928b7eb5f0.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 68 KiB

5
commands/__init__.py

@ -0,0 +1,5 @@
from .base_command import Command
from .scrape_command import ScrapeCommand
from .list_command import ListCommand
__all__ = ['Command', 'ScrapeCommand', 'ListCommand']

BIN
commands/__pycache__/__init__.cpython-314.pyc

Binary file not shown.

BIN
commands/__pycache__/base_command.cpython-314.pyc

Binary file not shown.

BIN
commands/__pycache__/list_command.cpython-314.pyc

Binary file not shown.

BIN
commands/__pycache__/scrape_command.cpython-314.pyc

Binary file not shown.

11
commands/base_command.py

@ -0,0 +1,11 @@
from abc import ABC, abstractmethod
class Command(ABC):
    """Abstract interface for an action that can be executed and undone.

    Both methods are abstract, so the class cannot be instantiated until a
    subclass overrides each of them.
    """

    @abstractmethod
    def execute(self):
        """Perform the action. The base implementation does nothing."""

    @abstractmethod
    def undo(self):
        """Revert the action. The base implementation does nothing."""

13
commands/list_command.py

@ -0,0 +1,13 @@
from commands.base_command import Command
from controllers import ScraperController
class ListCommand(Command):
    """Command that asks the controller for its registered strategies."""

    def __init__(self, controller: ScraperController):
        # The controller is queried each time execute() is called.
        self.controller = controller

    def execute(self):
        """Return the result of ``controller.list_strategies()``."""
        listing = self.controller.list_strategies()
        return listing

    def undo(self):
        """Do nothing; this command has no undo action."""
        return None

25
commands/scrape_command.py

@ -0,0 +1,25 @@
from commands.base_command import Command
from controllers import ScraperController
from exceptions import ScraperException
class ScrapeCommand(Command):
    """Command that runs one scraping strategy and saves its result.

    Attributes:
        controller: Object providing ``execute_scrape``, ``save_data`` and
            ``delete_data``.
        strategy_name: Name of the strategy passed to the controller.
        scrape_result: Value returned by ``controller.execute_scrape``;
            ``None`` until ``execute`` has run.
        saved_path: Path returned by ``controller.save_data``; ``None`` until
            a save has succeeded.
    """

    def __init__(self, controller: ScraperController, strategy_name: str):
        self.controller = controller
        self.strategy_name = strategy_name
        self.scrape_result = None
        self.saved_path = None

    def execute(self):
        """Scrape with the configured strategy, save the data, return both.

        Returns:
            A ``(scrape_result, saved_path)`` tuple.

        Raises:
            Any exception raised by the controller propagates unchanged.
        """
        # No try/except here: catching an exception only to re-raise it adds
        # nothing, so controller errors are simply allowed to propagate.
        self.scrape_result = self.controller.execute_scrape(self.strategy_name)
        self.saved_path = self.controller.save_data(
            self.scrape_result, self.strategy_name
        )
        return self.scrape_result, self.saved_path

    def undo(self):
        """Delete the file written by the last successful ``execute``.

        Returns:
            ``True`` if the controller reported a deletion, ``False`` if
            nothing had been saved or nothing was deleted.
        """
        if not self.saved_path:
            return False
        if not self.controller.delete_data(self.saved_path):
            return False
        print(f"Successfully undone: deleted {self.saved_path}")
        return True

3
controllers/__init__.py

@ -0,0 +1,3 @@
from .scraper_controller import ScraperController
__all__ = ['ScraperController']

BIN
controllers/__pycache__/__init__.cpython-314.pyc

Binary file not shown.

BIN
controllers/__pycache__/scraper_controller.cpython-314.pyc

Binary file not shown.

112
controllers/scraper_controller.py

@ -0,0 +1,112 @@
import json
import os
from datetime import datetime
from typing import Dict, List
from strategies import (
ScraperStrategy,
NewsScraperStrategy,
BooksScraperStrategy,
TechNewsScraperStrategy
)
from models import ScrapedData
from exceptions import StrategyException, StorageException, ValidationException
class ScraperController:
    """Registry of scraping strategies plus JSON persistence of their output.

    Strategies are stored under their ``name`` attribute. Scraped data is
    written to ``<output_dir>/<strategy_name>/scraped_data_<timestamp>.json``.
    """

    def __init__(self, output_dir: str = "data"):
        """Create the controller and register the default strategies.

        Args:
            output_dir: Root directory under which data files are stored.
        """
        self.output_dir = output_dir
        self.strategies: Dict[str, ScraperStrategy] = {}
        self._register_default_strategies()

    def _register_default_strategies(self):
        """Register one instance of each built-in strategy."""
        self.register_strategy(NewsScraperStrategy())
        self.register_strategy(BooksScraperStrategy())
        self.register_strategy(TechNewsScraperStrategy())

    def register_strategy(self, strategy: ScraperStrategy):
        """Add ``strategy`` to the registry, replacing any entry of the same name."""
        self.strategies[strategy.name] = strategy

    def get_strategy(self, name: str) -> ScraperStrategy:
        """Return the strategy registered as ``name``.

        Raises:
            StrategyException: If no strategy with that name is registered;
                the message lists the available names.
        """
        if name not in self.strategies:
            available = ', '.join(self.strategies.keys())
            raise StrategyException(
                f"Strategy '{name}' not found. Available: {available}",
                strategy_name=name
            )
        return self.strategies[name]

    def list_strategies(self) -> List[Dict[str, str]]:
        """Return one ``{"name": ..., "source": ...}`` dict per strategy."""
        return [
            {"name": s.name, "source": s.source}
            for s in self.strategies.values()
        ]

    def execute_scrape(self, strategy_name: str) -> ScrapedData:
        """Look up ``strategy_name`` and return the result of its ``scrape()``."""
        return self.get_strategy(strategy_name).scrape()

    def save_data(self, data: ScrapedData, strategy_name: str) -> str:
        """Write ``data.to_dict()`` as JSON and return the new file's path.

        Raises:
            StorageException: If creating the folder, serializing the data or
                writing the file fails; the underlying error is chained.
        """
        # Bound before the try block so the error handler can always report a
        # location, even when building the folder path is what failed.
        folder_path = None
        try:
            folder_path = os.path.join(self.output_dir, strategy_name)
            os.makedirs(folder_path, exist_ok=True)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"scraped_data_{timestamp}.json"
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(data.to_dict(), f, ensure_ascii=False, indent=2)
            return file_path
        except Exception as e:
            location = folder_path if folder_path is not None else self.output_dir
            raise StorageException(
                f"Failed to save data to {location}",
                file_path=location,
                original_exception=e
            ) from e

    def delete_data(self, file_path: str) -> bool:
        """Remove ``file_path``.

        Returns:
            ``True`` if the file was removed, ``False`` if it did not exist.

        Raises:
            StorageException: If removal fails for any other reason.
        """
        # Attempt the removal directly instead of checking existence first, so
        # a file vanishing between the check and the removal cannot fail.
        try:
            os.remove(file_path)
        except FileNotFoundError:
            return False
        except Exception as e:
            raise StorageException(
                f"Failed to delete file {file_path}",
                file_path=file_path,
                original_exception=e
            ) from e
        return True

    def load_data(self, strategy_name: str, filename: str = None) -> dict:
        """Load a saved JSON file for ``strategy_name``.

        Args:
            strategy_name: Sub-folder of ``output_dir`` to read from.
            filename: File to load. When omitted, the ``.json`` file whose
                name sorts last is used.

        Returns:
            The parsed JSON content exactly as produced by ``json.load``; it
            is not converted to a ``ScrapedData`` object.

        Raises:
            StorageException: If the folder is missing, contains no JSON
                files, or the file cannot be read or parsed.
        """
        file_path = None  # known to the error handler even on early failure
        try:
            folder_path = os.path.join(self.output_dir, strategy_name)
            if not os.path.exists(folder_path):
                raise StorageException(
                    f"No data found for strategy '{strategy_name}'",
                    file_path=folder_path
                )
            if filename:
                file_path = os.path.join(folder_path, filename)
            else:
                # Names written by save_data embed a YYYYMMDD_HHMMSS stamp,
                # so the lexicographically last name is the newest file.
                files = sorted(
                    f for f in os.listdir(folder_path) if f.endswith('.json')
                )
                if not files:
                    raise StorageException(
                        f"No data files found in {folder_path}",
                        file_path=folder_path
                    )
                file_path = os.path.join(folder_path, files[-1])
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except StorageException:
            # Already a domain error with a precise message; pass it through.
            raise
        except Exception as e:
            raise StorageException(
                "Failed to load data",
                file_path=file_path,
                original_exception=e
            ) from e

21
debug_books.py

@ -0,0 +1,21 @@
from urllib.request import urlopen, Request
import re
# Debug probe: fetch the books.toscrape.com landing page and report which
# price-matching patterns hit. The response is closed by the ``with`` block
# and the timeout keeps an unresponsive server from hanging the script.
with urlopen(
    Request('https://books.toscrape.com', headers={'User-Agent': 'Mozilla/5.0'}),
    timeout=30,
) as r:
    html = r.read().decode('utf-8')

# Pattern 1 tolerates extra class names and attributes on the price element.
price_search = re.search(r'class="price_color[^"]*"[^>]*>([^<]+)<', html)
if price_search:
    print('Found price pattern 1:', price_search.group(1))
else:
    print('Pattern 1 not found')

# Pattern 2 requires the class attribute to end exactly with price_color.
price_search2 = re.search(r'price_color">([^<]+)<', html)
if price_search2:
    print('Found price pattern 2:', price_search2.group(1))
else:
    print('Pattern 2 not found')

idx = html.find('price_color')
# str.find signals "absent" with -1, so index 0 is a legitimate hit.
if idx != -1:
    # Clamp the start: a negative slice start would count from the end of
    # the string and yield the wrong (usually empty) context.
    start = max(idx - 20, 0)
    print('Context around price_color:', html[start:idx+50])

17
exceptions/__init__.py

@ -0,0 +1,17 @@
from .scraper_exceptions import (
ScraperException,
NetworkException,
ParseException,
ValidationException,
StorageException,
StrategyException
)
__all__ = [
'ScraperException',
'NetworkException',
'ParseException',
'ValidationException',
'StorageException',
'StrategyException'
]

BIN
exceptions/__pycache__/__init__.cpython-314.pyc

Binary file not shown.

BIN
exceptions/__pycache__/scraper_exceptions.cpython-314.pyc

Binary file not shown.

34
exceptions/scraper_exceptions.py

@ -0,0 +1,34 @@
class ScraperException(Exception):
    """Base class of the scraper's custom exception hierarchy.

    Args:
        message: Human-readable description, passed on to ``Exception``.
        original_exception: Optional lower-level error that triggered this
            one. It is kept on ``original_exception`` and, when it is an
            exception instance, also linked as ``__cause__`` so tracebacks
            show the root cause.
    """

    def __init__(self, message, original_exception=None):
        super().__init__(message)
        self.original_exception = original_exception
        # Only link genuine exceptions: assigning any other object to
        # __cause__ raises TypeError, and assigning None would needlessly
        # suppress the implicit exception context.
        if isinstance(original_exception, BaseException):
            self.__cause__ = original_exception


class NetworkException(ScraperException):
    """Scraper error that additionally carries a ``status_code``."""

    def __init__(self, message, status_code=None, original_exception=None):
        super().__init__(message, original_exception)
        self.status_code = status_code


class ParseException(ScraperException):
    """Scraper error that additionally carries a ``selector``."""

    def __init__(self, message, selector=None, original_exception=None):
        super().__init__(message, original_exception)
        self.selector = selector


class ValidationException(ScraperException):
    """Scraper error that additionally carries a ``field``."""

    def __init__(self, message, field=None, original_exception=None):
        super().__init__(message, original_exception)
        self.field = field


class StorageException(ScraperException):
    """Scraper error that additionally carries a ``file_path``."""

    def __init__(self, message, file_path=None, original_exception=None):
        super().__init__(message, original_exception)
        self.file_path = file_path


class StrategyException(ScraperException):
    """Scraper error that additionally carries a ``strategy_name``."""

    def __init__(self, message, strategy_name=None, original_exception=None):
        super().__init__(message, original_exception)
        self.strategy_name = strategy_name

83
java-scraper/README.md

@ -0,0 +1,83 @@
# Java Web Scraper
A complete web scraping application demonstrating:
- **CLI Interface**
- **MVC Architecture**
- **Command Pattern**
- **Strategy Pattern**
- **Custom Exception Hierarchy**
## Features
- Three scraping strategies:
  - `news_scraper` - Scrapes quotes from http://quotes.toscrape.com
  - `books_scraper` - Scrapes books from https://books.toscrape.com
  - `tech_news_scraper` - Scrapes news from https://www.bbc.com/news
- Saves data to JSON files
- Command-line interface
- Extensible architecture
## Building
```bash
cd java-scraper
mvn clean package
```
## Usage
### List available scrapers:
```bash
mvn exec:java -Dexec.mainClass="com.scraper.Main" -Dexec.args="list"
```
### Scrape using a specific strategy:
```bash
mvn exec:java -Dexec.mainClass="com.scraper.Main" -Dexec.args="scrape news_scraper"
```
### Scrape all:
```bash
mvn exec:java -Dexec.mainClass="com.scraper.Main" -Dexec.args="scrape all"
```
### Custom output directory:
```bash
mvn exec:java -Dexec.mainClass="com.scraper.Main" -Dexec.args="scrape news_scraper --output my_data"
```
### Using the built JAR:
```bash
java -jar target/java-scraper-1.0-SNAPSHOT.jar list
java -jar target/java-scraper-1.0-SNAPSHOT.jar scrape news_scraper
```
## Architecture
### MVC
- **Model**: `ScrapedItem`, `ScrapedData`
- **View**: `ConsoleView`
- **Controller**: `ScraperController`
### Command Pattern
- `Command` interface
- `ScrapeCommand`
- `ListCommand`
### Strategy Pattern
- `ScraperStrategy` interface
- `NewsScraperStrategy`
- `BooksScraperStrategy`
- `TechNewsScraperStrategy`
### Exception Hierarchy
- `ScraperException` (base)
  - `NetworkException`
  - `ParseException`
  - `StorageException`
  - `StrategyException`
## Requirements
- Java 11 or higher
- Maven

7
java-scraper/data/books_scraper/scraped_data_20260531_104648.json

@ -0,0 +1,7 @@
{
"source": "https://books.toscrape.com",
"strategy_name": "books_scraper",
"items": [],
"scraped_at": "2026-05-31T10:46:46.169175",
"total_items": 0
}

128
java-scraper/data/books_scraper/scraped_data_20260531_104856.json

@ -0,0 +1,128 @@
{
"source": "https://books.toscrape.com",
"strategy_name": "books_scraper",
"items": [
{
"title": "A Light in the Attic",
"content": "Price: £51.77",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:48:56.489985"
},
{
"title": "Tipping the Velvet",
"content": "Price: £53.74",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:48:56.489997"
},
{
"title": "Soumission",
"content": "Price: £50.10",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:48:56.490001"
},
{
"title": "Sharp Objects",
"content": "Price: £47.82",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:48:56.490004"
},
{
"title": "Sapiens: A Brief History of Humankind",
"content": "Price: £54.23",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:48:56.490005"
},
{
"title": "The Requiem Red",
"content": "Price: £22.65",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:48:56.490007"
},
{
"title": "The Dirty Little Secrets of Getting Your Dream Job",
"content": "Price: £33.34",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:48:56.490009"
},
{
"title": "The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull",
"content": "Price: £17.93",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:48:56.490011"
},
{
"title": "The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics",
"content": "Price: £22.60",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:48:56.490012"
},
{
"title": "The Black Maria",
"content": "Price: £52.15",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:48:56.490014"
},
{
"title": "Starving Hearts (Triangular Trade Trilogy, #1)",
"content": "Price: £13.99",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:48:56.490015"
},
{
"title": "Shakespeare&#39;s Sonnets",
"content": "Price: £20.66",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:48:56.490017"
},
{
"title": "Set Me Free",
"content": "Price: £17.46",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:48:56.490019"
},
{
"title": "Scott Pilgrim&#39;s Precious Little Life (Scott Pilgrim #1)",
"content": "Price: £52.29",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:48:56.490020"
},
{
"title": "Rip it Up and Start Again",
"content": "Price: £35.02",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:48:56.490022"
},
{
"title": "Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991",
"content": "Price: £57.25",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:48:56.490023"
},
{
"title": "Olio",
"content": "Price: £23.88",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:48:56.490025"
},
{
"title": "Mesaerion: The Best Science Fiction Stories 1800-1849",
"content": "Price: £37.59",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:48:56.490035"
},
{
"title": "Libertarianism for Beginners",
"content": "Price: £51.33",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:48:56.490037"
},
{
"title": "It&#39;s Only the Himalayas",
"content": "Price: £45.17",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:48:56.490038"
}
],
"scraped_at": "2026-05-31T10:48:54.348792",
"total_items": 20
}

128
java-scraper/data/books_scraper/scraped_data_20260531_105031.json

@ -0,0 +1,128 @@
{
"source": "https://books.toscrape.com",
"strategy_name": "books_scraper",
"items": [
{
"title": "A Light in the Attic",
"content": "Price: £51.77",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:50:31.674011"
},
{
"title": "Tipping the Velvet",
"content": "Price: £53.74",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:50:31.674021"
},
{
"title": "Soumission",
"content": "Price: £50.10",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:50:31.674024"
},
{
"title": "Sharp Objects",
"content": "Price: £47.82",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:50:31.674026"
},
{
"title": "Sapiens: A Brief History of Humankind",
"content": "Price: £54.23",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:50:31.674028"
},
{
"title": "The Requiem Red",
"content": "Price: £22.65",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:50:31.674029"
},
{
"title": "The Dirty Little Secrets of Getting Your Dream Job",
"content": "Price: £33.34",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:50:31.674031"
},
{
"title": "The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull",
"content": "Price: £17.93",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:50:31.674032"
},
{
"title": "The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics",
"content": "Price: £22.60",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:50:31.674033"
},
{
"title": "The Black Maria",
"content": "Price: £52.15",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:50:31.674034"
},
{
"title": "Starving Hearts (Triangular Trade Trilogy, #1)",
"content": "Price: £13.99",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:50:31.674035"
},
{
"title": "Shakespeare&#39;s Sonnets",
"content": "Price: £20.66",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:50:31.674041"
},
{
"title": "Set Me Free",
"content": "Price: £17.46",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:50:31.674043"
},
{
"title": "Scott Pilgrim&#39;s Precious Little Life (Scott Pilgrim #1)",
"content": "Price: £52.29",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:50:31.674044"
},
{
"title": "Rip it Up and Start Again",
"content": "Price: £35.02",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:50:31.674045"
},
{
"title": "Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991",
"content": "Price: £57.25",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:50:31.674046"
},
{
"title": "Olio",
"content": "Price: £23.88",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:50:31.674046"
},
{
"title": "Mesaerion: The Best Science Fiction Stories 1800-1849",
"content": "Price: £37.59",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:50:31.674048"
},
{
"title": "Libertarianism for Beginners",
"content": "Price: £51.33",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:50:31.674049"
},
{
"title": "It&#39;s Only the Himalayas",
"content": "Price: £45.17",
"url": "https://books.toscrape.com",
"timestamp": "2026-05-31T10:50:31.674050"
}
],
"scraped_at": "2026-05-31T10:50:29.355948",
"total_items": 20
}

68
java-scraper/data/news_scraper/scraped_data_20260531_104348.json

@ -0,0 +1,68 @@
{
"source": "http://quotes.toscrape.com",
"strategy_name": "news_scraper",
"items": [
{
"title": "temprop=",
"content": "temprop=",
"url": "http://quotes.toscrape.com",
"timestamp": "2026-05-31T10:43:48.501000"
},
{
"title": "temprop=",
"content": "temprop=",
"url": "http://quotes.toscrape.com",
"timestamp": "2026-05-31T10:43:48.501016"
},
{
"title": "temprop=",
"content": "temprop=",
"url": "http://quotes.toscrape.com",
"timestamp": "2026-05-31T10:43:48.501021"
},
{
"title": "temprop=",
"content": "temprop=",
"url": "http://quotes.toscrape.com",
"timestamp": "2026-05-31T10:43:48.501024"
},
{
"title": "temprop=",
"content": "temprop=",
"url": "http://quotes.toscrape.com",
"timestamp": "2026-05-31T10:43:48.501026"
},
{
"title": "temprop=",
"content": "temprop=",
"url": "http://quotes.toscrape.com",
"timestamp": "2026-05-31T10:43:48.501028"
},
{
"title": "temprop=",
"content": "temprop=",
"url": "http://quotes.toscrape.com",
"timestamp": "2026-05-31T10:43:48.501030"
},
{
"title": "temprop=",
"content": "temprop=",
"url": "http://quotes.toscrape.com",
"timestamp": "2026-05-31T10:43:48.501032"
},
{
"title": "temprop=",
"content": "temprop=",
"url": "http://quotes.toscrape.com",
"timestamp": "2026-05-31T10:43:48.501034"
},
{
"title": "temprop=",
"content": "temprop=",
"url": "http://quotes.toscrape.com",
"timestamp": "2026-05-31T10:43:48.501036"
}
],
"scraped_at": "2026-05-31T10:43:45.907587",
"total_items": 10
}

7
java-scraper/data/news_scraper/scraped_data_20260531_104511.json

@ -0,0 +1,7 @@
{
"source": "http://quotes.toscrape.com",
"strategy_name": "news_scraper",
"items": [],
"scraped_at": "2026-05-31T10:45:10.355276",
"total_items": 0
}

68
java-scraper/data/news_scraper/scraped_data_20260531_104620.json

@ -0,0 +1,68 @@
{
"source": "http://quotes.toscrape.com",
"strategy_name": "news_scraper",
"items": [
{
"title": "Quote by Albert Einstein",
"content": "“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”",
"url": "http://quotes.toscrape.com",
"timestamp": "2026-05-31T10:46:20.434224"
},
{
"title": "Quote by J.K. Rowling",
"content": "“It is our choices, Harry, that show what we truly are, far more than our abilities.”",
"url": "http://quotes.toscrape.com",
"timestamp": "2026-05-31T10:46:20.434236"
},
{
"title": "Quote by Albert Einstein",
"content": "“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”",
"url": "http://quotes.toscrape.com",
"timestamp": "2026-05-31T10:46:20.434250"
},
{
"title": "Quote by Jane Austen",
"content": "“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”",
"url": "http://quotes.toscrape.com",
"timestamp": "2026-05-31T10:46:20.434253"
},
{
"title": "Quote by Marilyn Monroe",
"content": "“Imperfection is beauty, madness is genius and it&#39;s better to be absolutely ridiculous than absolutely boring.”",
"url": "http://quotes.toscrape.com",
"timestamp": "2026-05-31T10:46:20.434255"
},
{
"title": "Quote by Albert Einstein",
"content": "“Try not to become a man of success. Rather become a man of value.”",
"url": "http://quotes.toscrape.com",
"timestamp": "2026-05-31T10:46:20.434257"
},
{
"title": "Quote by André Gide",
"content": "“It is better to be hated for what you are than to be loved for what you are not.”",
"url": "http://quotes.toscrape.com",
"timestamp": "2026-05-31T10:46:20.434259"
},
{
"title": "Quote by Thomas A. Edison",
"content": "“I have not failed. I&#39;ve just found 10,000 ways that won&#39;t work.”",
"url": "http://quotes.toscrape.com",
"timestamp": "2026-05-31T10:46:20.434261"
},
{
"title": "Quote by Eleanor Roosevelt",
"content": "“A woman is like a tea bag; you never know how strong it is until it&#39;s in hot water.”",
"url": "http://quotes.toscrape.com",
"timestamp": "2026-05-31T10:46:20.434262"
},
{
"title": "Quote by Steve Martin",
"content": "“A day without sunshine is like, you know, night.”",
"url": "http://quotes.toscrape.com",
"timestamp": "2026-05-31T10:46:20.434264"
}
],
"scraped_at": "2026-05-31T10:46:18.193675",
"total_items": 10
}

50
java-scraper/pom.xml

@ -0,0 +1,50 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.scraper</groupId>
<artifactId>java-scraper</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- JSON serialization (Gson) -->
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.10.1</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.5.0</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>com.scraper.Main</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

108
java-scraper/src/main/java/com/scraper/Main.java

@ -0,0 +1,108 @@
package com.scraper;
import com.scraper.command.ListCommand;
import com.scraper.command.ScrapeCommand;
import com.scraper.controller.ScraperController;
import com.scraper.exception.ScraperException;
import com.scraper.view.ConsoleView;
/**
 * Command-line entry point for the scraper.
 *
 * <p>Dispatches on the first argument ({@code list}, {@code scrape},
 * {@code help}) and reports failures through the {@link ConsoleView}.
 */
public class Main {
    private final ScraperController controller;
    private final ConsoleView view;

    public Main() {
        this.controller = new ScraperController();
        this.view = new ConsoleView();
    }

    public static void main(String[] args) {
        Main app = new Main();
        app.run(args);
    }

    /**
     * Runs the command named by {@code args[0]}.
     *
     * <p>With no arguments the help text is printed. Any exception raised
     * while running a command is displayed and the process exits with
     * status 1.
     *
     * @param args raw command-line arguments
     */
    public void run(String[] args) {
        if (args.length == 0) {
            printHelp();
            return;
        }
        try {
            switch (args[0]) {
                case "list":
                    handleList();
                    break;
                case "scrape":
                    handleScrape(args);
                    break;
                case "help":
                    printHelp();
                    break;
                default:
                    // Say what was wrong instead of silently showing help.
                    view.displayError("Unknown command: " + args[0]);
                    printHelp();
                    break;
            }
        } catch (ScraperException e) {
            view.displayError(e.getMessage());
            if (e.getCause() != null) {
                view.displayError("Cause: " + e.getCause().getMessage());
            }
            System.exit(1);
        } catch (Exception e) {
            // Some exceptions carry no message; fall back to toString() so
            // the user never sees "Unexpected error: null".
            String detail = e.getMessage() != null ? e.getMessage() : e.toString();
            view.displayError("Unexpected error: " + detail);
            System.exit(1);
        }
    }

    /** Lists the registered strategies through the view. */
    private void handleList() throws ScraperException {
        ListCommand cmd = new ListCommand(controller);
        cmd.execute();
        view.displayStrategies(cmd.getStrategies());
    }

    /**
     * Handles {@code scrape <scraper|all> [--output <dir>]}.
     *
     * @param args raw command-line arguments, with {@code args[0]} being
     *             {@code "scrape"}
     */
    private void handleScrape(String[] args) throws ScraperException {
        if (args.length < 2) {
            view.displayError("Please specify a scraper to use.");
            printHelp();
            return;
        }
        String strategyName = args[1];
        if (args.length >= 3 && "--output".equals(args[2])) {
            if (args.length < 4) {
                // Previously a dangling --output was ignored without notice.
                view.displayError("Option --output requires a directory argument.");
                printHelp();
                return;
            }
            controller.setOutputDir(args[3]);
        }
        if ("all".equals(strategyName)) {
            ListCommand listCmd = new ListCommand(controller);
            listCmd.execute();
            for (java.util.Map<String, String> strategy : listCmd.getStrategies()) {
                scrapeSingle(strategy.get("name"));
            }
        } else {
            scrapeSingle(strategyName);
        }
    }

    /** Runs one strategy, then shows the item count and the saved data. */
    private void scrapeSingle(String strategyName) throws ScraperException {
        ScrapeCommand cmd = new ScrapeCommand(controller, strategyName);
        cmd.execute();
        view.displaySuccess("Scraped " + cmd.getScrapedData().getTotalItems() + " items using " + strategyName);
        view.displayScrapedData(cmd.getScrapedData(), cmd.getSavedPath());
    }

    /** Prints the usage text to standard output. */
    private void printHelp() {
        System.out.println("=== Web Scraper CLI - MVC + Command Pattern + Strategy Pattern ===");
        System.out.println();
        System.out.println("Usage:");
        System.out.println(" java -jar java-scraper.jar list - List all available scrapers");
        System.out.println(" java -jar java-scraper.jar scrape <scraper> - Scrape data using specific scraper");
        System.out.println(" java -jar java-scraper.jar scrape all - Scrape data from all scrapers");
        System.out.println(" java -jar java-scraper.jar scrape <scraper> --output <dir> - Specify output directory");
        System.out.println(" java -jar java-scraper.jar help - Show this help message");
        System.out.println();
        System.out.println("Available scrapers: news_scraper, books_scraper, tech_news_scraper");
    }
}

11
java-scraper/src/main/java/com/scraper/command/Command.java

@ -0,0 +1,11 @@
package com.scraper.command;
import com.scraper.exception.ScraperException;
/**
 * Contract for the Command pattern: an action that can be run and reverted.
 */
public interface Command {

    /**
     * Runs the action.
     *
     * @throws ScraperException if the action fails
     */
    void execute() throws ScraperException;

    /**
     * Reverts the action.
     *
     * @throws ScraperException if reverting fails
     */
    void undo() throws ScraperException;
}

33
java-scraper/src/main/java/com/scraper/command/ListCommand.java

@ -0,0 +1,33 @@
package com.scraper.command;
import com.scraper.controller.ScraperController;
import java.util.List;
import java.util.Map;
/**
 * Command that fetches the controller's strategy listing and keeps it for
 * later retrieval through {@link #getStrategies()}.
 */
public class ListCommand implements Command {
    private final ScraperController controller;
    private List<Map<String, String>> strategies;

    public ListCommand(ScraperController controller) {
        this.controller = controller;
    }

    /** Stores the result of {@code controller.listStrategies()}. */
    @Override
    public void execute() {
        this.strategies = this.controller.listStrategies();
    }

    /** No-op: the list command does not support undo. */
    @Override
    public void undo() {
    }

    /**
     * @return the listing captured by the last {@link #execute()}, or
     *         {@code null} if it has not been executed yet
     */
    public List<Map<String, String>> getStrategies() {
        return this.strategies;
    }
}

42
java-scraper/src/main/java/com/scraper/command/ScrapeCommand.java

@ -0,0 +1,42 @@
package com.scraper.command;
import com.scraper.controller.ScraperController;
import com.scraper.exception.ScraperException;
import com.scraper.model.ScrapedData;
/**
 * Command that scrapes with one named strategy and saves the result.
 */
public class ScrapeCommand implements Command {
    private final ScraperController controller;
    private final String strategyName;
    private ScrapedData scrapedData;
    private String savedPath;

    public ScrapeCommand(ScraperController controller, String strategyName) {
        this.controller = controller;
        this.strategyName = strategyName;
    }

    /**
     * Scrapes through the controller, then saves the data and records the
     * path it was saved to.
     *
     * @throws ScraperException if scraping or saving fails
     */
    @Override
    public void execute() throws ScraperException {
        this.scrapedData = this.controller.executeScrape(this.strategyName);
        this.savedPath = this.controller.saveData(this.scrapedData, this.strategyName);
    }

    /**
     * Deletes the saved file, if one was recorded.
     *
     * @throws ScraperException if deletion fails
     */
    @Override
    public void undo() throws ScraperException {
        if (this.savedPath == null) {
            return;
        }
        this.controller.deleteData(this.savedPath);
    }

    /** @return the data from the last execute, or {@code null} if none */
    public ScrapedData getScrapedData() {
        return this.scrapedData;
    }

    /** @return the path from the last execute, or {@code null} if none */
    public String getSavedPath() {
        return this.savedPath;
    }
}

138
java-scraper/src/main/java/com/scraper/controller/ScraperController.java

@ -0,0 +1,138 @@
package com.scraper.controller;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.scraper.exception.StorageException;
import com.scraper.exception.StrategyException;
import com.scraper.model.ScrapedData;
import com.scraper.strategy.BooksScraperStrategy;
import com.scraper.strategy.NewsScraperStrategy;
import com.scraper.strategy.ScraperStrategy;
import com.scraper.strategy.TechNewsScraperStrategy;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * MVC controller for the scraper application.
 * <p>
 * Keeps a registry of {@link ScraperStrategy} instances keyed by their name,
 * runs a strategy on request, and writes or deletes the JSON files that hold
 * the scraped results.
 */
public class ScraperController {
    private String outputDir;
    private Map<String, ScraperStrategy> strategies;
    private Gson gson;

    /** Creates a controller that stores results under the {@code data} directory. */
    public ScraperController() {
        this("data");
    }

    /**
     * Creates a controller and registers the built-in strategies.
     *
     * @param outputDir base directory under which result files are written
     */
    public ScraperController(String outputDir) {
        this.outputDir = outputDir;
        // Insertion-ordered so listings and the "Available:" hint follow
        // registration order instead of HashMap's unspecified iteration order.
        this.strategies = new java.util.LinkedHashMap<>();
        this.gson = new GsonBuilder().setPrettyPrinting().create();
        registerDefaultStrategies();
    }

    /** Registers the news, books and tech-news strategies. */
    private void registerDefaultStrategies() {
        registerStrategy(new NewsScraperStrategy());
        registerStrategy(new BooksScraperStrategy());
        registerStrategy(new TechNewsScraperStrategy());
    }

    /**
     * Adds a strategy to the registry under {@link ScraperStrategy#getName()},
     * replacing any strategy previously registered under the same name.
     *
     * @param strategy strategy to register
     */
    public void registerStrategy(ScraperStrategy strategy) {
        strategies.put(strategy.getName(), strategy);
    }

    /**
     * Describes every registered strategy.
     *
     * @return one map per strategy with the keys {@code "name"} and {@code "source"},
     *         in registration order
     */
    public List<Map<String, String>> listStrategies() {
        List<Map<String, String>> result = new ArrayList<>();
        for (ScraperStrategy strategy : strategies.values()) {
            Map<String, String> info = new HashMap<>();
            info.put("name", strategy.getName());
            info.put("source", strategy.getSource());
            result.add(info);
        }
        return result;
    }

    /**
     * Runs the strategy registered under the given name.
     *
     * @param strategyName name of the strategy to run
     * @return the data produced by the strategy
     * @throws StrategyException if no such strategy is registered, or if the
     *                           strategy fails (the failure is kept as the cause)
     */
    public ScrapedData executeScrape(String strategyName) throws StrategyException {
        ScraperStrategy strategy = strategies.get(strategyName);
        if (strategy == null) {
            String available = String.join(", ", strategies.keySet());
            throw new StrategyException(
                    "Strategy '" + strategyName + "' not found. Available: " + available,
                    strategyName,
                    null
            );
        }
        try {
            return strategy.scrape();
        } catch (StrategyException e) {
            // Already the right type: pass through without re-wrapping.
            throw e;
        } catch (Exception e) {
            throw new StrategyException(
                    "Error executing strategy: " + strategyName,
                    strategyName,
                    e
            );
        }
    }

    /**
     * Serialises the data as pretty-printed JSON into
     * {@code <outputDir>/<strategyName>/scraped_data_<yyyyMMdd_HHmmss>.json},
     * creating the directory if needed.
     *
     * @param data         data to serialise
     * @param strategyName name of the sub-directory to write into
     * @return the path of the written file
     * @throws StorageException if the directory or file cannot be written
     */
    public String saveData(ScrapedData data, String strategyName) throws StorageException {
        try {
            String folderPath = outputDir + File.separator + strategyName;
            Path folder = Paths.get(folderPath);
            Files.createDirectories(folder);
            String timestamp = LocalDateTime.now().format(
                    DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")
            );
            String filename = "scraped_data_" + timestamp + ".json";
            String filePath = folderPath + File.separator + filename;
            // Write UTF-8 explicitly: FileWriter would use the platform default
            // charset and could corrupt non-ASCII text in the JSON output.
            try (java.io.Writer writer = Files.newBufferedWriter(
                    Paths.get(filePath), java.nio.charset.StandardCharsets.UTF_8)) {
                gson.toJson(data, writer);
            }
            return filePath;
        } catch (IOException e) {
            throw new StorageException(
                    "Failed to save data to: " + outputDir,
                    outputDir,
                    e
            );
        }
    }

    /**
     * Deletes a previously saved file.
     *
     * @param filePath path of the file to delete
     * @return {@code true} if a file was deleted, {@code false} if none existed
     * @throws StorageException if the file exists but cannot be deleted
     */
    public boolean deleteData(String filePath) throws StorageException {
        try {
            // Single call avoids the gap between an exists() check and delete().
            return Files.deleteIfExists(Paths.get(filePath));
        } catch (IOException e) {
            throw new StorageException(
                    "Failed to delete file: " + filePath,
                    filePath,
                    e
            );
        }
    }

    /**
     * Changes the base directory used by subsequent {@link #saveData} calls.
     *
     * @param outputDir new base output directory
     */
    public void setOutputDir(String outputDir) {
        this.outputDir = outputDir;
    }
}

27
java-scraper/src/main/java/com/scraper/exception/NetworkException.java

@ -0,0 +1,27 @@
package com.scraper.exception;
/**
 * Signals a network-related failure, optionally carrying the HTTP status
 * code that accompanied it.
 */
public class NetworkException extends ScraperException {
    private static final long serialVersionUID = 1L;

    /** Status code supplied at construction; {@code null} when none was given. */
    private Integer statusCode;

    /**
     * @param message description of the failure
     */
    public NetworkException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception
     */
    public NetworkException(String message, Throwable cause) {
        this(message, null, cause);
    }

    /**
     * @param message    description of the failure
     * @param statusCode status code associated with the failure, may be {@code null}
     * @param cause      underlying exception, may be {@code null}
     */
    public NetworkException(String message, Integer statusCode, Throwable cause) {
        super(message, cause);
        this.statusCode = statusCode;
    }

    /** @return the status code given at construction, or {@code null} */
    public Integer getStatusCode() {
        return this.statusCode;
    }
}

27
java-scraper/src/main/java/com/scraper/exception/ParseException.java

@ -0,0 +1,27 @@
package com.scraper.exception;
/**
 * Signals a failure while parsing fetched content, optionally naming the
 * selector that was being evaluated.
 */
public class ParseException extends ScraperException {
    private static final long serialVersionUID = 1L;

    /** Selector supplied at construction; {@code null} when none was given. */
    private String selector;

    /**
     * @param message description of the failure
     */
    public ParseException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception
     */
    public ParseException(String message, Throwable cause) {
        this(message, null, cause);
    }

    /**
     * @param message  description of the failure
     * @param selector selector associated with the failure, may be {@code null}
     * @param cause    underlying exception, may be {@code null}
     */
    public ParseException(String message, String selector, Throwable cause) {
        super(message, cause);
        this.selector = selector;
    }

    /** @return the selector given at construction, or {@code null} */
    public String getSelector() {
        return this.selector;
    }
}

17
java-scraper/src/main/java/com/scraper/exception/ScraperException.java

@ -0,0 +1,17 @@
package com.scraper.exception;
/**
 * Root of the scraper application's checked exception hierarchy.
 */
public class ScraperException extends Exception {

    private static final long serialVersionUID = 1L;

    /**
     * Creates an exception that wraps an underlying failure.
     *
     * @param message description of the failure
     * @param cause   underlying exception
     */
    public ScraperException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * Creates an exception with a message only.
     *
     * @param message description of the failure
     */
    public ScraperException(String message) {
        super(message);
    }
}

27
java-scraper/src/main/java/com/scraper/exception/StorageException.java

@ -0,0 +1,27 @@
package com.scraper.exception;
/**
 * Signals a storage-related failure, optionally carrying the path involved.
 */
public class StorageException extends ScraperException {
    private static final long serialVersionUID = 1L;

    /** Path supplied at construction; {@code null} when none was given. */
    private String filePath;

    /**
     * @param message description of the failure
     */
    public StorageException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception
     */
    public StorageException(String message, Throwable cause) {
        this(message, null, cause);
    }

    /**
     * @param message  description of the failure
     * @param filePath path associated with the failure, may be {@code null}
     * @param cause    underlying exception, may be {@code null}
     */
    public StorageException(String message, String filePath, Throwable cause) {
        super(message, cause);
        this.filePath = filePath;
    }

    /** @return the path given at construction, or {@code null} */
    public String getFilePath() {
        return this.filePath;
    }
}

27
java-scraper/src/main/java/com/scraper/exception/StrategyException.java

@ -0,0 +1,27 @@
package com.scraper.exception;
/**
 * Signals a strategy-related failure, optionally naming the strategy involved.
 */
public class StrategyException extends ScraperException {
    private static final long serialVersionUID = 1L;

    /** Strategy name supplied at construction; {@code null} when none was given. */
    private String strategyName;

    /**
     * @param message description of the failure
     */
    public StrategyException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception
     */
    public StrategyException(String message, Throwable cause) {
        this(message, null, cause);
    }

    /**
     * @param message      description of the failure
     * @param strategyName name of the strategy involved, may be {@code null}
     * @param cause        underlying exception, may be {@code null}
     */
    public StrategyException(String message, String strategyName, Throwable cause) {
        super(message, cause);
        this.strategyName = strategyName;
    }

    /** @return the strategy name given at construction, or {@code null} */
    public String getStrategyName() {
        return this.strategyName;
    }
}

77
java-scraper/src/main/java/com/scraper/model/ScrapedData.java

@ -0,0 +1,77 @@
package com.scraper.model;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.List;
/**
 * Container for the outcome of one scrape: where the data came from, which
 * strategy produced it, the items themselves, and when the container was created.
 */
public class ScrapedData {
    private String source;
    private String strategyName;
    private List<ScrapedItem> items;
    private LocalDateTime scrapedAt;
    private int totalItems;

    /** Creates an empty container with no source or strategy name, stamped with the current time. */
    public ScrapedData() {
        this(null, null);
    }

    /**
     * Creates an empty container stamped with the current time.
     *
     * @param source       origin of the data
     * @param strategyName name of the strategy that produced the data
     */
    public ScrapedData(String source, String strategyName) {
        this.source = source;
        this.strategyName = strategyName;
        this.items = new ArrayList<>();
        this.scrapedAt = LocalDateTime.now();
        this.totalItems = 0;
    }

    /**
     * Appends an item and refreshes the item count.
     *
     * @param item item to add
     */
    public void addItem(ScrapedItem item) {
        this.items.add(item);
        this.totalItems = this.items.size();
    }

    // Getters and Setters

    public String getSource() {
        return source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public String getStrategyName() {
        return strategyName;
    }

    public void setStrategyName(String strategyName) {
        this.strategyName = strategyName;
    }

    public List<ScrapedItem> getItems() {
        return items;
    }

    /**
     * Replaces the item list and refreshes the item count.
     *
     * @param items new list, stored by reference; {@code null} is treated as an empty list
     */
    public void setItems(List<ScrapedItem> items) {
        // A null argument used to be stored before items.size() threw, leaving the
        // object with a null list; normalise it to an empty list instead.
        this.items = (items != null) ? items : new ArrayList<>();
        this.totalItems = this.items.size();
    }

    public LocalDateTime getScrapedAt() {
        return scrapedAt;
    }

    public void setScrapedAt(LocalDateTime scrapedAt) {
        this.scrapedAt = scrapedAt;
    }

    public int getTotalItems() {
        return totalItems;
    }

    /**
     * Overrides the stored count directly; it is not checked against the item list.
     *
     * @param totalItems count to store
     */
    public void setTotalItems(int totalItems) {
        this.totalItems = totalItems;
    }
}

57
java-scraper/src/main/java/com/scraper/model/ScrapedItem.java

@ -0,0 +1,57 @@
package com.scraper.model;
import java.time.LocalDateTime;
/**
* Model class representing a single scraped item
*/
public class ScrapedItem {
private String title;
private String content;
private String url;
private LocalDateTime timestamp;
public ScrapedItem() {
this.timestamp = LocalDateTime.now();
}
public ScrapedItem(String title, String content, String url) {
this.title = title;
this.content = content;
this.url = url;
this.timestamp = LocalDateTime.now();
}
// Getters and Setters
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public LocalDateTime getTimestamp() {
return timestamp;
}
public void setTimestamp(LocalDateTime timestamp) {
this.timestamp = timestamp;
}
}

102
java-scraper/src/main/java/com/scraper/strategy/BooksScraperStrategy.java

@ -0,0 +1,102 @@
package com.scraper.strategy;
import com.scraper.exception.NetworkException;
import com.scraper.exception.ParseException;
import com.scraper.model.ScrapedData;
import com.scraper.model.ScrapedItem;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Strategy for scraping books from https://books.toscrape.com.
 * <p>
 * Downloads the page at {@link #SOURCE} and extracts book titles and prices
 * with regular expressions, pairing the n-th title with the n-th price.
 */
public class BooksScraperStrategy implements ScraperStrategy {
    private static final String NAME = "books_scraper";
    private static final String SOURCE = "https://books.toscrape.com";

    /** Upper bound on the number of books collected per run. */
    private static final int MAX_BOOKS = 20;

    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<h3><a href=\"[^\"]*\" title=\"([^\"]+)\"");
    private static final Pattern PRICE_PATTERN =
            Pattern.compile("price_color\">([^<]+)<");

    /**
     * Fetches the source page and parses the books found on it.
     *
     * @return the scraped books, at most {@value #MAX_BOOKS}
     * @throws NetworkException if the page cannot be fetched
     * @throws ParseException   if an unexpected error occurs while parsing
     */
    @Override
    public ScrapedData scrape() throws NetworkException, ParseException {
        ScrapedData data = new ScrapedData(SOURCE, NAME);
        try {
            String html = fetchHTML(SOURCE);
            parseBooks(html, data);
        } catch (NetworkException e) {
            throw e;
        } catch (Exception e) {
            throw new ParseException("Failed to parse books", null, e);
        }
        return data;
    }

    /**
     * Downloads the given URL as UTF-8 text.
     *
     * @param urlString URL to fetch
     * @return the response body, one {@code \n} appended after each line read
     * @throws NetworkException if the response status is not 200 (the status code
     *                          is attached) or if any other error occurs while fetching
     */
    private String fetchHTML(String urlString) throws NetworkException {
        HttpURLConnection connection = null;
        try {
            URL url = new URL(urlString);
            connection = (HttpURLConnection) url.openConnection();
            connection.setRequestMethod("GET");
            connection.setRequestProperty("User-Agent", "Mozilla/5.0");
            connection.setConnectTimeout(10000);
            connection.setReadTimeout(10000);
            int statusCode = connection.getResponseCode();
            if (statusCode != 200) {
                throw new NetworkException("Failed to fetch URL: " + urlString, statusCode, null);
            }
            StringBuilder response = new StringBuilder();
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String inputLine;
                while ((inputLine = in.readLine()) != null) {
                    response.append(inputLine);
                    response.append("\n");
                }
            }
            return response.toString();
        } catch (NetworkException e) {
            // Keep the status-code exception intact; the generic handler below
            // would otherwise re-wrap it and hide the status code from callers.
            throw e;
        } catch (Exception e) {
            throw new NetworkException("Network error while fetching: " + urlString, e);
        } finally {
            // Release the connection on failure paths as well as on success.
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Extracts title/price pairs from the page and adds them to {@code data}.
     * Stops when either pattern runs out of matches or the limit is reached.
     *
     * @param html page markup to search
     * @param data container receiving one item per book
     */
    private void parseBooks(String html, ScrapedData data) {
        Matcher titleMatcher = TITLE_PATTERN.matcher(html);
        Matcher priceMatcher = PRICE_PATTERN.matcher(html);
        int count = 0;
        while (titleMatcher.find() && priceMatcher.find() && count < MAX_BOOKS) {
            String title = titleMatcher.group(1).trim();
            String price = priceMatcher.group(1).trim();
            data.addItem(new ScrapedItem(title, "Price: " + price, SOURCE));
            count++;
        }
    }

    @Override
    public String getName() {
        return NAME;
    }

    @Override
    public String getSource() {
        return SOURCE;
    }
}

121
java-scraper/src/main/java/com/scraper/strategy/NewsScraperStrategy.java

@ -0,0 +1,121 @@
package com.scraper.strategy;
import com.scraper.exception.NetworkException;
import com.scraper.exception.ParseException;
import com.scraper.model.ScrapedData;
import com.scraper.model.ScrapedItem;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Strategy for scraping quotes from http://quotes.toscrape.com.
 * <p>
 * Downloads the page at {@link #SOURCE} and extracts quote text and author
 * with regular expressions, falling back to looser patterns when the primary
 * pattern finds nothing.
 */
public class NewsScraperStrategy implements ScraperStrategy {
    private static final String NAME = "news_scraper";
    private static final String SOURCE = "http://quotes.toscrape.com";

    /** Upper bound on the number of quotes collected per run. */
    private static final int MAX_QUOTES = 10;

    private static final Pattern QUOTE_PATTERN = Pattern.compile(
            "<span class=\"text\">([^<]+)</span>.*?<small class=\"author\">([^<]+)</small>",
            Pattern.DOTALL
    );
    private static final Pattern SIMPLE_TEXT_PATTERN = Pattern.compile("\"text\">([^<]+)<");
    private static final Pattern SIMPLE_AUTHOR_PATTERN = Pattern.compile("author\">([^<]+)<");

    /**
     * Fetches the source page and parses the quotes found on it.
     *
     * @return the scraped quotes, at most {@value #MAX_QUOTES}
     * @throws NetworkException if the page cannot be fetched
     * @throws ParseException   if an unexpected error occurs while parsing
     */
    @Override
    public ScrapedData scrape() throws NetworkException, ParseException {
        ScrapedData data = new ScrapedData(SOURCE, NAME);
        try {
            String html = fetchHTML(SOURCE);
            parseQuotes(html, data);
        } catch (NetworkException e) {
            throw e;
        } catch (Exception e) {
            throw new ParseException("Failed to parse quotes", null, e);
        }
        return data;
    }

    /**
     * Downloads the given URL as UTF-8 text.
     *
     * @param urlString URL to fetch
     * @return the response body, one {@code \n} appended after each line read
     * @throws NetworkException if the response status is not 200 (the status code
     *                          is attached) or if any other error occurs while fetching
     */
    private String fetchHTML(String urlString) throws NetworkException {
        HttpURLConnection connection = null;
        try {
            URL url = new URL(urlString);
            connection = (HttpURLConnection) url.openConnection();
            connection.setRequestMethod("GET");
            connection.setRequestProperty("User-Agent", "Mozilla/5.0");
            connection.setConnectTimeout(10000);
            connection.setReadTimeout(10000);
            int statusCode = connection.getResponseCode();
            if (statusCode != 200) {
                throw new NetworkException("Failed to fetch URL: " + urlString, statusCode, null);
            }
            StringBuilder response = new StringBuilder();
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String inputLine;
                while ((inputLine = in.readLine()) != null) {
                    response.append(inputLine);
                    response.append("\n");
                }
            }
            return response.toString();
        } catch (NetworkException e) {
            // Keep the status-code exception intact; the generic handler below
            // would otherwise re-wrap it and hide the status code from callers.
            throw e;
        } catch (Exception e) {
            throw new NetworkException("Network error while fetching: " + urlString, e);
        } finally {
            // Release the connection on failure paths as well as on success.
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Extracts quotes from the page and adds them to {@code data}.
     * The combined text+author pattern is tried first; only if it yields
     * nothing are the separate text and author patterns paired up in order.
     *
     * @param html page markup to search
     * @param data container receiving one item per quote
     */
    private void parseQuotes(String html, ScrapedData data) {
        Matcher matcher = QUOTE_PATTERN.matcher(html);
        int count = 0;
        while (matcher.find() && count < MAX_QUOTES) {
            String text = matcher.group(1).trim();
            String author = matcher.group(2).trim();
            data.addItem(new ScrapedItem("Quote by " + author, text, SOURCE));
            count++;
        }
        if (count > 0) {
            return;
        }
        // Fallback to simpler patterns
        Matcher textMatcher = SIMPLE_TEXT_PATTERN.matcher(html);
        Matcher authorMatcher = SIMPLE_AUTHOR_PATTERN.matcher(html);
        int itemCount = 0;
        while (textMatcher.find() && authorMatcher.find() && itemCount < MAX_QUOTES) {
            data.addItem(new ScrapedItem(
                    "Quote by " + authorMatcher.group(1).trim(),
                    textMatcher.group(1).trim(),
                    SOURCE
            ));
            itemCount++;
        }
    }

    @Override
    public String getName() {
        return NAME;
    }

    @Override
    public String getSource() {
        return SOURCE;
    }
}

13
java-scraper/src/main/java/com/scraper/strategy/ScraperStrategy.java

@ -0,0 +1,13 @@
package com.scraper.strategy;
import com.scraper.exception.ScraperException;
import com.scraper.model.ScrapedData;
/**
 * Strategy interface for web scrapers.
 * An implementation knows how to scrape one source and exposes the name and
 * source it is associated with.
 */
public interface ScraperStrategy {
/** Runs the scrape and returns the collected data; failures are reported as {@link ScraperException}. */
ScrapedData scrape() throws ScraperException;
/** Returns the name of this strategy. */
String getName();
/** Returns the source this strategy is associated with. */
String getSource();
}

114
java-scraper/src/main/java/com/scraper/strategy/TechNewsScraperStrategy.java

@ -0,0 +1,114 @@
package com.scraper.strategy;
import com.scraper.exception.NetworkException;
import com.scraper.exception.ParseException;
import com.scraper.model.ScrapedData;
import com.scraper.model.ScrapedItem;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Strategy for scraping news from https://www.bbc.com/news.
 * <p>
 * Downloads the page at {@link #SOURCE} and collects heading texts whose
 * {@code class} attribute contains "headline" or "title".
 */
public class TechNewsScraperStrategy implements ScraperStrategy {
    private static final String NAME = "tech_news_scraper";
    private static final String SOURCE = "https://www.bbc.com/news";

    /** Upper bound on the number of headlines collected per run. */
    private static final int MAX_HEADLINES = 15;

    /** Heading patterns, tried in this order; group 1 is the heading text. */
    private static final String[] HEADLINE_PATTERNS = {
        "<h1[^>]*class=\"[^\"]*headline[^\"]*\"[^>]*>([^<]+)<",
        "<h2[^>]*class=\"[^\"]*headline[^\"]*\"[^>]*>([^<]+)<",
        "<h3[^>]*class=\"[^\"]*headline[^\"]*\"[^>]*>([^<]+)<",
        "<h1[^>]*class=\"[^\"]*title[^\"]*\"[^>]*>([^<]+)<",
        "<h2[^>]*class=\"[^\"]*title[^\"]*\"[^>]*>([^<]+)<"
    };

    /**
     * Fetches the source page and parses the headlines found on it.
     *
     * @return the scraped headlines, at most {@value #MAX_HEADLINES}
     * @throws NetworkException if the page cannot be fetched
     * @throws ParseException   if an unexpected error occurs while parsing
     */
    @Override
    public ScrapedData scrape() throws NetworkException, ParseException {
        ScrapedData data = new ScrapedData(SOURCE, NAME);
        try {
            String html = fetchHTML(SOURCE);
            parseHeadlines(html, data);
        } catch (NetworkException e) {
            throw e;
        } catch (Exception e) {
            throw new ParseException("Failed to parse tech news", null, e);
        }
        return data;
    }

    /**
     * Downloads the given URL as UTF-8 text.
     *
     * @param urlString URL to fetch
     * @return the response body, one {@code \n} appended after each line read
     * @throws NetworkException if the response status is not 200 (the status code
     *                          is attached) or if any other error occurs while fetching
     */
    private String fetchHTML(String urlString) throws NetworkException {
        HttpURLConnection connection = null;
        try {
            URL url = new URL(urlString);
            connection = (HttpURLConnection) url.openConnection();
            connection.setRequestMethod("GET");
            connection.setRequestProperty("User-Agent", "Mozilla/5.0");
            connection.setConnectTimeout(10000);
            connection.setReadTimeout(10000);
            int statusCode = connection.getResponseCode();
            if (statusCode != 200) {
                throw new NetworkException("Failed to fetch URL: " + urlString, statusCode, null);
            }
            StringBuilder response = new StringBuilder();
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String inputLine;
                while ((inputLine = in.readLine()) != null) {
                    response.append(inputLine);
                    response.append("\n");
                }
            }
            return response.toString();
        } catch (NetworkException e) {
            // Keep the status-code exception intact; the generic handler below
            // would otherwise re-wrap it and hide the status code from callers.
            throw e;
        } catch (Exception e) {
            throw new NetworkException("Network error while fetching: " + urlString, e);
        } finally {
            // Release the connection on failure paths as well as on success.
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Collects distinct heading texts longer than 10 characters, in pattern
     * order then document order, and adds the first {@value #MAX_HEADLINES}
     * to {@code data} with empty content.
     *
     * @param html page markup to search
     * @param data container receiving one item per headline
     */
    private void parseHeadlines(String html, ScrapedData data) {
        List<String> headlines = new ArrayList<>();
        for (String patternStr : HEADLINE_PATTERNS) {
            Matcher matcher = Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE).matcher(html);
            while (matcher.find()) {
                String headline = matcher.group(1).trim();
                // length > 10 already rules out the empty string
                if (headline.length() > 10 && !headlines.contains(headline)) {
                    headlines.add(headline);
                }
            }
        }
        int limit = Math.min(headlines.size(), MAX_HEADLINES);
        for (String headline : headlines.subList(0, limit)) {
            data.addItem(new ScrapedItem(headline, "", SOURCE));
        }
    }

    @Override
    public String getName() {
        return NAME;
    }

    @Override
    public String getSource() {
        return SOURCE;
    }
}

72
java-scraper/src/main/java/com/scraper/view/ConsoleView.java

@ -0,0 +1,72 @@
package com.scraper.view;
import com.scraper.model.ScrapedData;
import com.scraper.model.ScrapedItem;
import java.util.List;
import java.util.Map;
/**
 * MVC view that renders messages, strategy listings and scrape results on
 * the console.
 */
public class ConsoleView {

    /** Number of items shown in the results preview. */
    private static final int PREVIEW_ITEMS = 5;

    /** Longest content excerpt printed before it is cut and suffixed with "...". */
    private static final int CONTENT_WIDTH = 80;

    /** Prints the message on standard output. */
    public void displayMessage(String message) {
        System.out.println(message);
    }

    /** Prints the error on standard error with an {@code [ERROR]} prefix. */
    public void displayError(String error) {
        System.err.println("[ERROR] " + error);
    }

    /** Prints the message on standard output with a {@code [SUCCESS]} prefix. */
    public void displaySuccess(String message) {
        System.out.println("[SUCCESS] " + message);
    }

    /**
     * Prints a numbered list of strategies.
     *
     * @param strategies one map per strategy, read through the keys
     *                   {@code "name"} and {@code "source"}
     */
    public void displayStrategies(List<Map<String, String>> strategies) {
        System.out.println("\n=== Available Scrapers ===");
        int position = 0;
        for (Map<String, String> entry : strategies) {
            position++;
            System.out.println(position + ". " + entry.get("name"));
            System.out.println(" Source: " + entry.get("source"));
        }
        System.out.println();
    }

    /**
     * Prints a summary of the scrape followed by a preview of the first items.
     *
     * @param data      scrape result to render
     * @param savedPath location the data was saved to; the line is omitted when {@code null}
     */
    public void displayScrapedData(ScrapedData data, String savedPath) {
        System.out.println("\n=== Scraping Results ===");
        System.out.println("Source: " + data.getSource());
        System.out.println("Strategy: " + data.getStrategyName());
        System.out.println("Total Items: " + data.getTotalItems());
        System.out.println("Scraped At: " + data.getScrapedAt());
        if (savedPath != null) {
            System.out.println("Saved To: " + savedPath);
        }

        System.out.println("\n--- Items Preview ---");
        List<ScrapedItem> items = data.getItems();
        int shown = Math.min(items.size(), PREVIEW_ITEMS);
        for (int index = 0; index < shown; index++) {
            ScrapedItem current = items.get(index);
            System.out.println((index + 1) + ". " + safeString(current.getTitle()));

            String rawContent = current.getContent();
            boolean hasContent = rawContent != null && !rawContent.isEmpty();
            if (hasContent) {
                String excerpt = safeString(rawContent);
                if (excerpt.length() > CONTENT_WIDTH) {
                    excerpt = excerpt.substring(0, CONTENT_WIDTH) + "...";
                }
                System.out.println(" " + excerpt);
            }
            System.out.println();
        }

        if (items.size() > PREVIEW_ITEMS) {
            System.out.println("... and " + (items.size() - PREVIEW_ITEMS) + " more items");
        }
    }

    /**
     * Makes a string safe for plain consoles: {@code null} becomes the empty
     * string and every character outside printable ASCII (0x20-0x7E) becomes {@code ?}.
     */
    private String safeString(String str) {
        return (str == null) ? "" : str.replaceAll("[^\\x20-\\x7E]", "?");
    }
}

BIN
java-scraper/target/classes/com/scraper/Main.class

Binary file not shown.

BIN
java-scraper/target/classes/com/scraper/command/Command.class

Binary file not shown.

BIN
java-scraper/target/classes/com/scraper/command/ListCommand.class

Binary file not shown.

BIN
java-scraper/target/classes/com/scraper/command/ScrapeCommand.class

Binary file not shown.

BIN
java-scraper/target/classes/com/scraper/controller/ScraperController.class

Binary file not shown.

BIN
java-scraper/target/classes/com/scraper/exception/NetworkException.class

Binary file not shown.

BIN
java-scraper/target/classes/com/scraper/exception/ParseException.class

Binary file not shown.

BIN
java-scraper/target/classes/com/scraper/exception/ScraperException.class

Binary file not shown.

BIN
java-scraper/target/classes/com/scraper/exception/StorageException.class

Binary file not shown.

BIN
java-scraper/target/classes/com/scraper/exception/StrategyException.class

Binary file not shown.

BIN
java-scraper/target/classes/com/scraper/model/ScrapedData.class

Binary file not shown.

BIN
java-scraper/target/classes/com/scraper/model/ScrapedItem.class

Binary file not shown.

BIN
java-scraper/target/classes/com/scraper/strategy/BooksScraperStrategy.class

Binary file not shown.

BIN
java-scraper/target/classes/com/scraper/strategy/NewsScraperStrategy.class

Binary file not shown.

BIN
java-scraper/target/classes/com/scraper/strategy/ScraperStrategy.class

Binary file not shown.

BIN
java-scraper/target/classes/com/scraper/strategy/TechNewsScraperStrategy.class

Binary file not shown.

BIN
java-scraper/target/classes/com/scraper/view/ConsoleView.class

Binary file not shown.

102
main.py

@ -0,0 +1,102 @@
import argparse
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from controllers import ScraperController
from commands import ScrapeCommand, ListCommand
from views import ConsoleView
from exceptions import ScraperException
class CLIApplication:
    """Command-line front end for the scraper.

    Parses ``scrape``, ``list`` and ``info`` sub-commands with argparse and
    dispatches them to the controller, rendering results through the view.
    """

    # Single source of truth for the scraper names: used both for the
    # ``scrape`` argument choices and for expanding ``scrape all``.
    STRATEGY_NAMES = ('news_scraper', 'books_scraper', 'tech_news_scraper')

    def __init__(self):
        self.controller = ScraperController()
        self.view = ConsoleView()

    def run(self, args=None):
        """Parse *args* (``sys.argv[1:]`` when ``None``) and run the chosen sub-command.

        Prints the help text when no sub-command was given. Any exception
        raised by a handler is shown through the view and the process exits
        with status 1.
        """
        parser = self._create_parser()
        parsed_args = parser.parse_args(args)
        handler = getattr(parsed_args, 'func', None)
        if handler is None:
            parser.print_help()
            return
        try:
            handler(parsed_args)
        except ScraperException as e:
            self.view.display_error(str(e))
            # Tolerate scraper exceptions that do not carry the attribute.
            original = getattr(e, 'original_exception', None)
            if original:
                self.view.display_error(f"Original error: {original}")
            sys.exit(1)
        except Exception as e:
            self.view.display_error(f"Unexpected error: {e}")
            sys.exit(1)

    def _create_parser(self) -> argparse.ArgumentParser:
        """Build the argparse parser; each sub-command stores its handler as ``func``."""
        parser = argparse.ArgumentParser(
            description='Web Scraper CLI - MVC + Command Pattern + Strategy Pattern',
            formatter_class=argparse.RawDescriptionHelpFormatter
        )
        subparsers = parser.add_subparsers(dest='command', help='Available commands')

        scrape_parser = subparsers.add_parser('scrape', help='Scrape data from a website')
        scrape_parser.add_argument(
            'strategy',
            choices=[*self.STRATEGY_NAMES, 'all'],
            help='Scraper strategy to use'
        )
        scrape_parser.add_argument(
            '--output', '-o',
            default='data',
            help='Output directory for scraped data'
        )
        scrape_parser.set_defaults(func=self._handle_scrape)

        list_parser = subparsers.add_parser('list', help='List all available scrapers')
        list_parser.set_defaults(func=self._handle_list)

        info_parser = subparsers.add_parser('info', help='Show detailed info about a scraper')
        info_parser.add_argument('strategy', help='Strategy name')
        info_parser.set_defaults(func=self._handle_info)
        return parser

    def _handle_scrape(self, args):
        """Run one strategy, or every known strategy in order when ``all`` was requested."""
        names = self.STRATEGY_NAMES if args.strategy == 'all' else (args.strategy,)
        for strategy_name in names:
            self._scrape_single(strategy_name, args.output)

    def _scrape_single(self, strategy_name: str, output_dir: str):
        """Scrape with one strategy into *output_dir* and display the outcome."""
        self.controller.output_dir = output_dir
        command = ScrapeCommand(self.controller, strategy_name)
        data, saved_path = command.execute()
        self.view.display_success(f"Scraped {data.total_items} items using {strategy_name}")
        self.view.display_scraped_data(data, saved_path)

    def _handle_list(self, args):
        """Display the strategies returned by the list command."""
        command = ListCommand(self.controller)
        strategies = command.execute()
        self.view.display_strategies(strategies)

    def _handle_info(self, args):
        """Display name and source of the requested strategy, or an error if it is unknown."""
        strategies = self.controller.list_strategies()
        strategy = next((s for s in strategies if s['name'] == args.strategy), None)
        if strategy:
            self.view.display_message(f"\n=== {strategy['name']} ===")
            self.view.display_message(f"Source: {strategy['source']}")
        else:
            self.view.display_error(f"Strategy '{args.strategy}' not found")
def main():
    """Build the CLI application and run it on the process arguments."""
    CLIApplication().run()


if __name__ == '__main__':
    main()

3
models/__init__.py

@ -0,0 +1,3 @@
from .scraped_data import ScrapedItem, ScrapedData
__all__ = ['ScrapedItem', 'ScrapedData']

BIN
models/__pycache__/__init__.cpython-314.pyc

Binary file not shown.

BIN
models/__pycache__/scraped_data.cpython-314.pyc

Binary file not shown.

41
models/scraped_data.py

@ -0,0 +1,41 @@
from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Dict, Any
@dataclass
class ScrapedItem:
    """One scraped record; ``timestamp`` defaults to the creation time."""

    title: str
    content: str
    url: str
    timestamp: datetime = field(default_factory=datetime.now)

    def to_dict(self) -> Dict[str, Any]:
        """Return the item as a plain dict with the timestamp in ISO 8601 form."""
        serialised: Dict[str, Any] = dict(
            title=self.title,
            content=self.content,
            url=self.url,
        )
        serialised['timestamp'] = self.timestamp.isoformat()
        return serialised
@dataclass
class ScrapedData:
    """Result of one scrape: origin, producing strategy, items and creation time.

    ``total_items`` mirrors ``len(items)``; it is refreshed by ``add_item`` and,
    when items are supplied at construction without an explicit count, on creation.
    """

    source: str
    strategy_name: str
    items: List["ScrapedItem"] = field(default_factory=list)
    scraped_at: datetime = field(default_factory=datetime.now)
    total_items: int = 0

    def __post_init__(self):
        # Items passed to the constructor used to leave total_items at 0;
        # derive the count unless the caller provided one explicitly.
        if self.items and not self.total_items:
            self.total_items = len(self.items)

    def add_item(self, item: "ScrapedItem"):
        """Append *item* and refresh ``total_items``."""
        self.items.append(item)
        self.total_items = len(self.items)

    def to_dict(self) -> Dict[str, Any]:
        """Return the data as a plain dict; items via their ``to_dict``, time in ISO 8601 form."""
        return {
            'source': self.source,
            'strategy_name': self.strategy_name,
            'items': [item.to_dict() for item in self.items],
            'scraped_at': self.scraped_at.isoformat(),
            'total_items': self.total_items
        }

5
requirements.txt

@ -0,0 +1,5 @@
requests>=2.28.0
beautifulsoup4>=4.11.0
# requests is optional; the scraper uses urllib by default.
# beautifulsoup4 is optional; the scraper uses html.parser by default.

11
strategies/__init__.py

@ -0,0 +1,11 @@
from .base_scraper import ScraperStrategy
from .news_scraper import NewsScraperStrategy
from .quotes_scraper import BooksScraperStrategy
from .tech_news_scraper import TechNewsScraperStrategy
__all__ = [
'ScraperStrategy',
'NewsScraperStrategy',
'BooksScraperStrategy',
'TechNewsScraperStrategy'
]

BIN
strategies/__pycache__/__init__.cpython-314.pyc

Binary file not shown.

BIN
strategies/__pycache__/base_scraper.cpython-314.pyc

Binary file not shown.

BIN
strategies/__pycache__/news_scraper.cpython-314.pyc

Binary file not shown.

BIN
strategies/__pycache__/quotes_scraper.cpython-314.pyc

Binary file not shown.

BIN
strategies/__pycache__/tech_news_scraper.cpython-314.pyc

Binary file not shown.

18
strategies/base_scraper.py

@ -0,0 +1,18 @@
from abc import ABC, abstractmethod
from models import ScrapedData
class ScraperStrategy(ABC):
    """Abstract base for scraper strategies: a scrape operation plus two read-only properties."""

    @abstractmethod
    def scrape(self) -> ScrapedData:
        """Run the scrape and return the collected data."""

    @property
    @abstractmethod
    def name(self) -> str:
        """Name of the strategy."""

    @property
    @abstractmethod
    def source(self) -> str:
        """Source the strategy is associated with."""

72
strategies/news_scraper.py

@ -0,0 +1,72 @@
from urllib.request import urlopen, Request
import re
from datetime import datetime
from strategies.base_scraper import ScraperStrategy
from models import ScrapedItem, ScrapedData
from exceptions import NetworkException, ParseException
class NewsScraperStrategy(ScraperStrategy):
    """Strategy that fetches http://quotes.toscrape.com and extracts quotes with regexes."""

    def __init__(self):
        self._name = "news_scraper"
        self._source = "http://quotes.toscrape.com"

    @property
    def name(self) -> str:
        return self._name

    @property
    def source(self) -> str:
        return self._source

    def scrape(self) -> ScrapedData:
        """Fetch the source page and return up to 10 quotes.

        Raises:
            NetworkException: if fetching or decoding the page fails.
            ParseException: if extracting the quotes fails.
        """
        data = ScrapedData(source=self.source, strategy_name=self.name)
        try:
            request = Request(self.source, headers={'User-Agent': 'Mozilla/5.0'})
            # The context manager closes the response even if read/decode fails.
            with urlopen(request, timeout=10) as response:
                html = response.read().decode('utf-8')
        except Exception as e:
            raise NetworkException(
                f"Failed to fetch news from {self.source}",
                original_exception=e
            ) from e
        try:
            for quote_text, author in self._extract_quotes(html)[:10]:
                data.add_item(ScrapedItem(
                    title=f"Quote by {author}",
                    content=quote_text,
                    url=self.source
                ))
        except Exception as e:
            raise ParseException(
                "Failed to parse news content",
                selector="div.quote",
                original_exception=e
            ) from e
        return data

    def _extract_quotes(self, html):
        """Return ``(text, author)`` pairs found in *html*.

        The combined pattern is tried first; if it finds nothing, text and
        author are matched separately and paired in document order.
        """
        quote_pattern = r'<div class="quote"[^>]*>.*?<span class="text"[^>]*>([^<]+)</span>.*?<small class="author">([^<]+)</small>'
        quotes = [
            (text.strip(), author.strip())
            for text, author in re.findall(quote_pattern, html, re.DOTALL)
        ]
        if not quotes:
            text_pattern = r'"text">([^<]+)<'
            author_pattern = r'author">([^<]+)<'
            texts = re.findall(text_pattern, html)
            authors = re.findall(author_pattern, html)
            # zip stops at the shorter list, like the former min(len, len) loop
            quotes = [(text.strip(), author.strip()) for text, author in zip(texts, authors)]
        return quotes

67
strategies/quotes_scraper.py

@ -0,0 +1,67 @@
from urllib.request import urlopen, Request
import re
from datetime import datetime
from strategies.base_scraper import ScraperStrategy
from models import ScrapedItem, ScrapedData
from exceptions import NetworkException, ParseException
class BooksScraperStrategy(ScraperStrategy):
    """Strategy that fetches https://books.toscrape.com and extracts book titles and prices."""

    def __init__(self):
        self._name = "books_scraper"
        self._source = "https://books.toscrape.com"

    @property
    def name(self) -> str:
        return self._name

    @property
    def source(self) -> str:
        return self._source

    def scrape(self) -> ScrapedData:
        """Fetch the source page and return up to 20 books.

        Raises:
            NetworkException: if fetching or decoding the page fails.
            ParseException: if extracting the books fails.
        """
        data = ScrapedData(source=self.source, strategy_name=self.name)
        try:
            request = Request(self.source, headers={'User-Agent': 'Mozilla/5.0'})
            # The context manager closes the response even if read/decode fails.
            with urlopen(request, timeout=10) as response:
                html = response.read().decode('utf-8')
        except Exception as e:
            raise NetworkException(
                f"Failed to fetch books from {self.source}",
                original_exception=e
            ) from e
        try:
            for title, price in self._extract_books(html)[:20]:
                data.add_item(ScrapedItem(
                    title=title,
                    content=f"Price: {price}",
                    url=self.source
                ))
        except Exception as e:
            raise ParseException(
                "Failed to parse book content",
                selector="article.product_pod",
                original_exception=e
            ) from e
        return data

    def _extract_books(self, html):
        """Return ``(title, price)`` pairs, matching the n-th title with the n-th price."""
        title_pattern = r'<h3><a href="[^"]*" title="([^"]+)"'
        price_pattern = r'price_color">([^<]+)<'
        titles = re.findall(title_pattern, html)
        prices = re.findall(price_pattern, html)
        # zip stops at the shorter list, like the former min(len, len) loop
        return [(title.strip(), price.strip()) for title, price in zip(titles, prices)]

81
strategies/tech_news_scraper.py

@ -0,0 +1,81 @@
from urllib.request import urlopen, Request
import re
from strategies.base_scraper import ScraperStrategy
from models import ScrapedItem, ScrapedData
from exceptions import NetworkException, ParseException
class TechNewsScraperStrategy(ScraperStrategy):
    """Strategy that fetches https://www.bbc.com/news and extracts headline texts."""

    def __init__(self):
        self._name = "tech_news_scraper"
        self._source = "https://www.bbc.com/news"

    @property
    def name(self) -> str:
        return self._name

    @property
    def source(self) -> str:
        return self._source

    def scrape(self) -> ScrapedData:
        """Fetch the source page and return up to 15 headlines with empty content.

        Raises:
            NetworkException: if fetching or decoding the page fails.
            ParseException: if extracting the headlines fails.
        """
        data = ScrapedData(source=self.source, strategy_name=self.name)
        try:
            request = Request(self.source, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            # The context manager closes the response even if read/decode fails.
            with urlopen(request, timeout=10) as response:
                html = response.read().decode('utf-8')
        except Exception as e:
            raise NetworkException(
                f"Failed to fetch tech news from {self.source}",
                original_exception=e
            ) from e
        try:
            for headline in self._extract_headlines(html)[:15]:
                data.add_item(ScrapedItem(
                    title=headline,
                    content="",
                    url=self.source
                ))
        except Exception as e:
            raise ParseException(
                "Failed to parse tech news content",
                selector="h1, h2, h3",
                original_exception=e
            ) from e
        return data

    def _extract_headlines(self, html):
        """Return distinct heading texts longer than 10 characters.

        Patterns are applied in order; within a pattern, matches keep document
        order. Only the first occurrence of a repeated headline is kept.
        """
        h_patterns = [
            r'<h1[^>]*class="[^"]*headline[^"]*"[^>]*>([^<]+)<',
            r'<h2[^>]*class="[^"]*headline[^"]*"[^>]*>([^<]+)<',
            r'<h3[^>]*class="[^"]*headline[^"]*"[^>]*>([^<]+)<',
            r'<h1[^>]*class="[^"]*title[^"]*"[^>]*>([^<]+)<',
            r'<h2[^>]*class="[^"]*title[^"]*"[^>]*>([^<]+)<',
        ]
        headlines = []
        for pattern in h_patterns:
            for match in re.findall(pattern, html, re.IGNORECASE):
                headline = match.strip()
                # length > 10 already rules out the empty string
                if len(headline) > 10:
                    headlines.append(headline)
        # dict preserves insertion order, so this de-duplicates keeping first occurrences
        return list(dict.fromkeys(headlines))

3
views/__init__.py

@ -0,0 +1,3 @@
from .console_view import ConsoleView
__all__ = ['ConsoleView']

BIN
views/__pycache__/__init__.cpython-314.pyc

Binary file not shown.

BIN
views/__pycache__/console_view.cpython-314.pyc

Binary file not shown.

68
views/console_view.py

@ -0,0 +1,68 @@
import sys
from typing import List, Dict, Any
class ConsoleView:
    """Plain-text console front end for the scraper application.

    Every line is written through :meth:`_safe_print`, so text containing
    characters the console encoding cannot represent is still shown (with
    ``?`` substitutes) instead of raising ``UnicodeEncodeError``.
    """

    # Number of items shown by display_scraped_data before summarising.
    _PREVIEW_ITEMS = 5
    # Item content longer than this is cut and suffixed with "...".
    _PREVIEW_CONTENT_CHARS = 80

    @staticmethod
    def _safe_print(text: Any = "") -> None:
        """Print *text*, replacing characters stdout cannot encode.

        The text is printed as-is first. Only if that raises
        ``UnicodeEncodeError`` is it round-tripped through the encoding of
        ``sys.stdout`` with ``errors='replace'`` and printed again. A UTF-8
        round trip would return the same string and fail the same way.
        """
        try:
            print(text)
        except UnicodeEncodeError:
            # Streams such as io.StringIO report no encoding; fall back to
            # UTF-8, which still neutralises lone surrogates.
            encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
            printable = str(text).encode(encoding, errors="replace")
            print(printable.decode(encoding, errors="replace"))

    def display_message(self, message: str):
        """Print *message* unchanged."""
        self._safe_print(message)

    def display_error(self, error: str):
        """Print *error* prefixed with ``[ERROR]``."""
        self._safe_print(f"[ERROR] {error}")

    def display_success(self, message: str):
        """Print *message* prefixed with ``[SUCCESS]``."""
        self._safe_print(f"[SUCCESS] {message}")

    def display_strategies(self, strategies: List[Dict[str, str]]):
        """Print a numbered list of scrapers.

        Args:
            strategies: mappings that provide the keys ``'name'`` and
                ``'source'``.
        """
        self._safe_print("\n=== Available Scrapers ===")
        for idx, strategy in enumerate(strategies, 1):
            self._safe_print(f"{idx}. {strategy['name']}")
            self._safe_print(f" Source: {strategy['source']}")
            self._safe_print()

    def display_scraped_data(self, data: Any, saved_path: str = None):
        """Print a summary of a scrape result followed by a short preview.

        Args:
            data: a mapping with the keys ``source``, ``strategy_name``,
                ``total_items``, ``scraped_at`` and ``items`` (all optional),
                or any object whose ``to_dict()`` returns such a mapping.
            saved_path: when truthy, shown as the location the data was
                saved to.
        """
        if hasattr(data, 'to_dict'):
            data = data.to_dict()

        items = data.get('items', [])

        self._safe_print("\n=== Scraping Results ===")
        self._safe_print(f"Source: {data.get('source', 'N/A')}")
        self._safe_print(f"Strategy: {data.get('strategy_name', 'N/A')}")
        self._safe_print(f"Total Items: {data.get('total_items', len(items))}")
        self._safe_print(f"Scraped At: {data.get('scraped_at', 'N/A')}")
        if saved_path:
            self._safe_print(f"Saved To: {saved_path}")

        self._safe_print("\n--- Items Preview ---")
        for idx, item in enumerate(items[:self._PREVIEW_ITEMS], 1):
            self._safe_print(f"{idx}. {item.get('title', 'N/A')}")
            content = item.get('content')
            if content:
                limit = self._PREVIEW_CONTENT_CHARS
                truncated = content[:limit] + "..." if len(content) > limit else content
                self._safe_print(f" {truncated}")
            self._safe_print()

        if len(items) > self._PREVIEW_ITEMS:
            self._safe_print(f"... and {len(items) - self._PREVIEW_ITEMS} more items")

    def display_list(self, items: List[Any]):
        """Print each element of *items* on its own line."""
        for item in items:
            self._safe_print(item)
Loading…
Cancel
Save