items = data.getItems();
+ int displayCount = Math.min(items.size(), 5);
+ for (int i = 0; i < displayCount; i++) {
+ ScrapedItem item = items.get(i);
+ System.out.println((i + 1) + ". " + safeString(item.getTitle()));
+ if (item.getContent() != null && !item.getContent().isEmpty()) {
+ String content = safeString(item.getContent());
+ String truncated = content.length() > 80 ? content.substring(0, 80) + "..." : content;
+ System.out.println(" " + truncated);
+ }
+ System.out.println();
+ }
+
+ if (items.size() > 5) {
+ System.out.println("... and " + (items.size() - 5) + " more items");
+ }
+ }
+
+    /**
+     * Returns a copy of {@code str} restricted to printable ASCII.
+     *
+     * <p>A {@code null} input yields the empty string; otherwise every character
+     * outside the range 0x20-0x7E is replaced with {@code '?'}.
+     *
+     * @param str the text to sanitize, may be {@code null}
+     * @return the sanitized text, never {@code null}
+     */
+    private String safeString(String str) {
+        return str == null ? "" : str.replaceAll("[^\\x20-\\x7E]", "?");
+    }
+}
diff --git a/java-scraper/target/classes/com/scraper/Main.class b/java-scraper/target/classes/com/scraper/Main.class
new file mode 100644
index 0000000..cefcf5b
Binary files /dev/null and b/java-scraper/target/classes/com/scraper/Main.class differ
diff --git a/java-scraper/target/classes/com/scraper/command/Command.class b/java-scraper/target/classes/com/scraper/command/Command.class
new file mode 100644
index 0000000..b7ab774
Binary files /dev/null and b/java-scraper/target/classes/com/scraper/command/Command.class differ
diff --git a/java-scraper/target/classes/com/scraper/command/ListCommand.class b/java-scraper/target/classes/com/scraper/command/ListCommand.class
new file mode 100644
index 0000000..1ef3dbd
Binary files /dev/null and b/java-scraper/target/classes/com/scraper/command/ListCommand.class differ
diff --git a/java-scraper/target/classes/com/scraper/command/ScrapeCommand.class b/java-scraper/target/classes/com/scraper/command/ScrapeCommand.class
new file mode 100644
index 0000000..bfa000f
Binary files /dev/null and b/java-scraper/target/classes/com/scraper/command/ScrapeCommand.class differ
diff --git a/java-scraper/target/classes/com/scraper/controller/ScraperController.class b/java-scraper/target/classes/com/scraper/controller/ScraperController.class
new file mode 100644
index 0000000..3e523d4
Binary files /dev/null and b/java-scraper/target/classes/com/scraper/controller/ScraperController.class differ
diff --git a/java-scraper/target/classes/com/scraper/exception/NetworkException.class b/java-scraper/target/classes/com/scraper/exception/NetworkException.class
new file mode 100644
index 0000000..7e17126
Binary files /dev/null and b/java-scraper/target/classes/com/scraper/exception/NetworkException.class differ
diff --git a/java-scraper/target/classes/com/scraper/exception/ParseException.class b/java-scraper/target/classes/com/scraper/exception/ParseException.class
new file mode 100644
index 0000000..d2305cf
Binary files /dev/null and b/java-scraper/target/classes/com/scraper/exception/ParseException.class differ
diff --git a/java-scraper/target/classes/com/scraper/exception/ScraperException.class b/java-scraper/target/classes/com/scraper/exception/ScraperException.class
new file mode 100644
index 0000000..ce80753
Binary files /dev/null and b/java-scraper/target/classes/com/scraper/exception/ScraperException.class differ
diff --git a/java-scraper/target/classes/com/scraper/exception/StorageException.class b/java-scraper/target/classes/com/scraper/exception/StorageException.class
new file mode 100644
index 0000000..8792b37
Binary files /dev/null and b/java-scraper/target/classes/com/scraper/exception/StorageException.class differ
diff --git a/java-scraper/target/classes/com/scraper/exception/StrategyException.class b/java-scraper/target/classes/com/scraper/exception/StrategyException.class
new file mode 100644
index 0000000..d2a5a5f
Binary files /dev/null and b/java-scraper/target/classes/com/scraper/exception/StrategyException.class differ
diff --git a/java-scraper/target/classes/com/scraper/model/ScrapedData.class b/java-scraper/target/classes/com/scraper/model/ScrapedData.class
new file mode 100644
index 0000000..1bfb882
Binary files /dev/null and b/java-scraper/target/classes/com/scraper/model/ScrapedData.class differ
diff --git a/java-scraper/target/classes/com/scraper/model/ScrapedItem.class b/java-scraper/target/classes/com/scraper/model/ScrapedItem.class
new file mode 100644
index 0000000..2224595
Binary files /dev/null and b/java-scraper/target/classes/com/scraper/model/ScrapedItem.class differ
diff --git a/java-scraper/target/classes/com/scraper/strategy/BooksScraperStrategy.class b/java-scraper/target/classes/com/scraper/strategy/BooksScraperStrategy.class
new file mode 100644
index 0000000..d16cac2
Binary files /dev/null and b/java-scraper/target/classes/com/scraper/strategy/BooksScraperStrategy.class differ
diff --git a/java-scraper/target/classes/com/scraper/strategy/NewsScraperStrategy.class b/java-scraper/target/classes/com/scraper/strategy/NewsScraperStrategy.class
new file mode 100644
index 0000000..99ec07a
Binary files /dev/null and b/java-scraper/target/classes/com/scraper/strategy/NewsScraperStrategy.class differ
diff --git a/java-scraper/target/classes/com/scraper/strategy/ScraperStrategy.class b/java-scraper/target/classes/com/scraper/strategy/ScraperStrategy.class
new file mode 100644
index 0000000..d735b20
Binary files /dev/null and b/java-scraper/target/classes/com/scraper/strategy/ScraperStrategy.class differ
diff --git a/java-scraper/target/classes/com/scraper/strategy/TechNewsScraperStrategy.class b/java-scraper/target/classes/com/scraper/strategy/TechNewsScraperStrategy.class
new file mode 100644
index 0000000..4549109
Binary files /dev/null and b/java-scraper/target/classes/com/scraper/strategy/TechNewsScraperStrategy.class differ
diff --git a/java-scraper/target/classes/com/scraper/view/ConsoleView.class b/java-scraper/target/classes/com/scraper/view/ConsoleView.class
new file mode 100644
index 0000000..99bedc0
Binary files /dev/null and b/java-scraper/target/classes/com/scraper/view/ConsoleView.class differ
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..c25f084
--- /dev/null
+++ b/main.py
@@ -0,0 +1,102 @@
+import argparse
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from controllers import ScraperController
+from commands import ScrapeCommand, ListCommand
+from views import ConsoleView
+from exceptions import ScraperException
+
+
+class CLIApplication:
+ def __init__(self):
+ self.controller = ScraperController()
+ self.view = ConsoleView()
+
+ def run(self, args=None):
+ parser = self._create_parser()
+ parsed_args = parser.parse_args(args)
+
+ if hasattr(parsed_args, 'func'):
+ try:
+ parsed_args.func(parsed_args)
+ except ScraperException as e:
+ self.view.display_error(str(e))
+ if e.original_exception:
+ self.view.display_error(f"Original error: {e.original_exception}")
+ sys.exit(1)
+ except Exception as e:
+ self.view.display_error(f"Unexpected error: {str(e)}")
+ sys.exit(1)
+ else:
+ parser.print_help()
+
+ def _create_parser(self) -> argparse.ArgumentParser:
+ parser = argparse.ArgumentParser(
+ description='Web Scraper CLI - MVC + Command Pattern + Strategy Pattern',
+ formatter_class=argparse.RawDescriptionHelpFormatter
+ )
+
+ subparsers = parser.add_subparsers(dest='command', help='Available commands')
+
+ scrape_parser = subparsers.add_parser('scrape', help='Scrape data from a website')
+ scrape_parser.add_argument(
+ 'strategy',
+ choices=['news_scraper', 'books_scraper', 'tech_news_scraper', 'all'],
+ help='Scraper strategy to use'
+ )
+ scrape_parser.add_argument(
+ '--output', '-o',
+ default='data',
+ help='Output directory for scraped data'
+ )
+ scrape_parser.set_defaults(func=self._handle_scrape)
+
+ list_parser = subparsers.add_parser('list', help='List all available scrapers')
+ list_parser.set_defaults(func=self._handle_list)
+
+ info_parser = subparsers.add_parser('info', help='Show detailed info about a scraper')
+ info_parser.add_argument('strategy', help='Strategy name')
+ info_parser.set_defaults(func=self._handle_info)
+
+ return parser
+
+ def _handle_scrape(self, args):
+ if args.strategy == 'all':
+ strategies = ['news_scraper', 'books_scraper', 'tech_news_scraper']
+ for strategy in strategies:
+ self._scrape_single(strategy, args.output)
+ else:
+ self._scrape_single(args.strategy, args.output)
+
+ def _scrape_single(self, strategy_name: str, output_dir: str):
+ self.controller.output_dir = output_dir
+ command = ScrapeCommand(self.controller, strategy_name)
+ data, saved_path = command.execute()
+ self.view.display_success(f"Scraped {data.total_items} items using {strategy_name}")
+ self.view.display_scraped_data(data, saved_path)
+
+ def _handle_list(self, args):
+ command = ListCommand(self.controller)
+ strategies = command.execute()
+ self.view.display_strategies(strategies)
+
+ def _handle_info(self, args):
+ strategies = self.controller.list_strategies()
+ strategy = next((s for s in strategies if s['name'] == args.strategy), None)
+ if strategy:
+ self.view.display_message(f"\n=== {strategy['name']} ===")
+ self.view.display_message(f"Source: {strategy['source']}")
+ else:
+ self.view.display_error(f"Strategy '{args.strategy}' not found")
+
+
+def main():
+ app = CLIApplication()
+ app.run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/models/__init__.py b/models/__init__.py
new file mode 100644
index 0000000..55e2bcc
--- /dev/null
+++ b/models/__init__.py
@@ -0,0 +1,3 @@
+from .scraped_data import ScrapedItem, ScrapedData
+
+__all__ = ['ScrapedItem', 'ScrapedData']
diff --git a/models/__pycache__/__init__.cpython-314.pyc b/models/__pycache__/__init__.cpython-314.pyc
new file mode 100644
index 0000000..4575285
Binary files /dev/null and b/models/__pycache__/__init__.cpython-314.pyc differ
diff --git a/models/__pycache__/scraped_data.cpython-314.pyc b/models/__pycache__/scraped_data.cpython-314.pyc
new file mode 100644
index 0000000..caa53e4
Binary files /dev/null and b/models/__pycache__/scraped_data.cpython-314.pyc differ
diff --git a/models/scraped_data.py b/models/scraped_data.py
new file mode 100644
index 0000000..80e524b
--- /dev/null
+++ b/models/scraped_data.py
@@ -0,0 +1,41 @@
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import List, Dict, Any
+
+
+@dataclass
+class ScrapedItem:
+ title: str
+ content: str
+ url: str
+ timestamp: datetime = field(default_factory=datetime.now)
+
+ def to_dict(self) -> Dict[str, Any]:
+ return {
+ 'title': self.title,
+ 'content': self.content,
+ 'url': self.url,
+ 'timestamp': self.timestamp.isoformat()
+ }
+
+
+@dataclass
+class ScrapedData:
+ source: str
+ strategy_name: str
+ items: List[ScrapedItem] = field(default_factory=list)
+ scraped_at: datetime = field(default_factory=datetime.now)
+ total_items: int = 0
+
+ def add_item(self, item: ScrapedItem):
+ self.items.append(item)
+ self.total_items = len(self.items)
+
+ def to_dict(self) -> Dict[str, Any]:
+ return {
+ 'source': self.source,
+ 'strategy_name': self.strategy_name,
+ 'items': [item.to_dict() for item in self.items],
+ 'scraped_at': self.scraped_at.isoformat(),
+ 'total_items': self.total_items
+ }
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ae45045
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+requests>=2.28.0
+beautifulsoup4>=4.11.0
+
+# requests is optional; the scraper uses urllib by default.
+# beautifulsoup4 is optional; the scraper uses html.parser by default.
diff --git a/strategies/__init__.py b/strategies/__init__.py
new file mode 100644
index 0000000..b52c61e
--- /dev/null
+++ b/strategies/__init__.py
@@ -0,0 +1,11 @@
+from .base_scraper import ScraperStrategy
+from .news_scraper import NewsScraperStrategy
+from .quotes_scraper import BooksScraperStrategy
+from .tech_news_scraper import TechNewsScraperStrategy
+
+__all__ = [
+ 'ScraperStrategy',
+ 'NewsScraperStrategy',
+ 'BooksScraperStrategy',
+ 'TechNewsScraperStrategy'
+]
diff --git a/strategies/__pycache__/__init__.cpython-314.pyc b/strategies/__pycache__/__init__.cpython-314.pyc
new file mode 100644
index 0000000..1b5185d
Binary files /dev/null and b/strategies/__pycache__/__init__.cpython-314.pyc differ
diff --git a/strategies/__pycache__/base_scraper.cpython-314.pyc b/strategies/__pycache__/base_scraper.cpython-314.pyc
new file mode 100644
index 0000000..eda532f
Binary files /dev/null and b/strategies/__pycache__/base_scraper.cpython-314.pyc differ
diff --git a/strategies/__pycache__/news_scraper.cpython-314.pyc b/strategies/__pycache__/news_scraper.cpython-314.pyc
new file mode 100644
index 0000000..5f26ebe
Binary files /dev/null and b/strategies/__pycache__/news_scraper.cpython-314.pyc differ
diff --git a/strategies/__pycache__/quotes_scraper.cpython-314.pyc b/strategies/__pycache__/quotes_scraper.cpython-314.pyc
new file mode 100644
index 0000000..fed4be5
Binary files /dev/null and b/strategies/__pycache__/quotes_scraper.cpython-314.pyc differ
diff --git a/strategies/__pycache__/tech_news_scraper.cpython-314.pyc b/strategies/__pycache__/tech_news_scraper.cpython-314.pyc
new file mode 100644
index 0000000..915421c
Binary files /dev/null and b/strategies/__pycache__/tech_news_scraper.cpython-314.pyc differ
diff --git a/strategies/base_scraper.py b/strategies/base_scraper.py
new file mode 100644
index 0000000..4da220b
--- /dev/null
+++ b/strategies/base_scraper.py
@@ -0,0 +1,18 @@
+from abc import ABC, abstractmethod
+from models import ScrapedData
+
+
+class ScraperStrategy(ABC):
+ @abstractmethod
+ def scrape(self) -> ScrapedData:
+ pass
+
+ @property
+ @abstractmethod
+ def name(self) -> str:
+ pass
+
+ @property
+ @abstractmethod
+ def source(self) -> str:
+ pass
diff --git a/strategies/news_scraper.py b/strategies/news_scraper.py
new file mode 100644
index 0000000..5e7e238
--- /dev/null
+++ b/strategies/news_scraper.py
@@ -0,0 +1,72 @@
+from urllib.request import urlopen, Request
+import re
+from datetime import datetime
+
+from strategies.base_scraper import ScraperStrategy
+from models import ScrapedItem, ScrapedData
+from exceptions import NetworkException, ParseException
+
+
+class NewsScraperStrategy(ScraperStrategy):
+ def __init__(self):
+ self._name = "news_scraper"
+ self._source = "http://quotes.toscrape.com"
+
+ @property
+ def name(self) -> str:
+ return self._name
+
+ @property
+ def source(self) -> str:
+ return self._source
+
+ def scrape(self) -> ScrapedData:
+ data = ScrapedData(source=self.source, strategy_name=self.name)
+ try:
+ request = Request(self.source, headers={'User-Agent': 'Mozilla/5.0'})
+ response = urlopen(request, timeout=10)
+ html = response.read().decode('utf-8')
+ except Exception as e:
+ raise NetworkException(
+ f"Failed to fetch news from {self.source}",
+ original_exception=e
+ )
+
+ try:
+ quotes = self._extract_quotes(html)
+
+ for quote_text, author in quotes[:10]:
+ item = ScrapedItem(
+ title=f"Quote by {author}",
+ content=quote_text,
+ url=self.source
+ )
+ data.add_item(item)
+
+ except Exception as e:
+ raise ParseException(
+ "Failed to parse news content",
+ selector="div.quote",
+ original_exception=e
+ )
+
+ return data
+
+ def _extract_quotes(self, html):
+ quotes = []
+ quote_pattern = r']*>.*?
]*>([^<]+).*?
([^<]+)'
+
+ matches = re.findall(quote_pattern, html, re.DOTALL)
+ for match in matches:
+ quotes.append((match[0].strip(), match[1].strip()))
+
+ if not quotes:
+ text_pattern = r'"text">([^<]+)<'
+ author_pattern = r'author">([^<]+)<'
+ texts = re.findall(text_pattern, html)
+ authors = re.findall(author_pattern, html)
+
+ for i in range(min(len(texts), len(authors))):
+ quotes.append((texts[i].strip(), authors[i].strip()))
+
+ return quotes
diff --git a/strategies/quotes_scraper.py b/strategies/quotes_scraper.py
new file mode 100644
index 0000000..b2c2dbb
--- /dev/null
+++ b/strategies/quotes_scraper.py
@@ -0,0 +1,67 @@
+from urllib.request import urlopen, Request
+import re
+from datetime import datetime
+
+from strategies.base_scraper import ScraperStrategy
+from models import ScrapedItem, ScrapedData
+from exceptions import NetworkException, ParseException
+
+
+class BooksScraperStrategy(ScraperStrategy):
+ def __init__(self):
+ self._name = "books_scraper"
+ self._source = "https://books.toscrape.com"
+
+ @property
+ def name(self) -> str:
+ return self._name
+
+ @property
+ def source(self) -> str:
+ return self._source
+
+ def scrape(self) -> ScrapedData:
+ data = ScrapedData(source=self.source, strategy_name=self.name)
+ try:
+ request = Request(self.source, headers={'User-Agent': 'Mozilla/5.0'})
+ response = urlopen(request, timeout=10)
+ html = response.read().decode('utf-8')
+ except Exception as e:
+ raise NetworkException(
+ f"Failed to fetch books from {self.source}",
+ original_exception=e
+ )
+
+ try:
+ books = self._extract_books(html)
+
+ for title, price in books[:20]:
+ item = ScrapedItem(
+ title=title,
+ content=f"Price: {price}",
+ url=self.source
+ )
+ data.add_item(item)
+
+ except Exception as e:
+ raise ParseException(
+ "Failed to parse book content",
+ selector="article.product_pod",
+ original_exception=e
+ )
+
+ return data
+
+ def _extract_books(self, html):
+ books = []
+
+ title_pattern = r'