diff --git a/w11/java-cli-w11/.gitignore b/w11/java-cli-w11/.gitignore deleted file mode 100644 index 0ebcf1a..0000000 --- a/w11/java-cli-w11/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -*.jar -*.jar -*.class -*.log \ No newline at end of file diff --git a/w11/java-cli-w11/pom.xml b/w11/java-cli-w11/pom.xml deleted file mode 100644 index 9987b1c..0000000 --- a/w11/java-cli-w11/pom.xml +++ /dev/null @@ -1,62 +0,0 @@ - - 4.0.0 - com.example - datacollect-cli - 0.1.0 - - 11 - 11 - - - - org.jsoup - jsoup - 1.17.2 - - - org.slf4j - slf4j-api - 2.0.9 - - - ch.qos.logback - logback-classic - 1.4.14 - - - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.8.1 - - - org.apache.maven.plugins - maven-assembly-plugin - 3.3.0 - - - - com.example.datacollect.Main - - - - jar-with-dependencies - - - - - make-assembly - package - - single - - - - - - - diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/Main.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/Main.java deleted file mode 100644 index ea9d151..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/Main.java +++ /dev/null @@ -1,41 +0,0 @@ -package com.example.datacollect; - -import com.example.datacollect.controller.CrawlerController; -import com.example.datacollect.repository.ArticleRepository; -import com.example.datacollect.strategy.StrategyFactory; -import com.example.datacollect.view.ConsoleView; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -/*- 添加 logger 成员 -- 记录启动日志 -- 添加全局异常处理 */ -public class Main { - private static final Logger logger = LoggerFactory.getLogger(Main.class); - - public static void main(String[] args) { - try { - logger.info("Starting CLI Crawler application"); - - ConsoleView view = new ConsoleView(); - ArticleRepository repository = new ArticleRepository(); - StrategyFactory strategyFactory = new StrategyFactory(); - CrawlerController controller = new CrawlerController(view, repository, strategyFactory); - - view.printSuccess("Welcome to CLI Crawler (w10_3)! Type help for commands."); - logger.info("Application initialized successfully"); - - while (true) { - try { - controller.handle(view.readLine()); - } catch (Exception e) { - view.printError("Error: " + e.getMessage()); - logger.error("Error in main loop: {}", e.getMessage(), e); - } - } - } catch (Exception e) { - logger.error("Fatal error in application: {}", e.getMessage(), e); - System.err.println("Fatal error: " + e.getMessage()); - System.exit(1); - } - } -} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/AnalyzeCommand.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/AnalyzeCommand.java deleted file mode 100644 index ec9bcc3..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/AnalyzeCommand.java +++ /dev/null @@ -1,103 +0,0 @@ -package com.example.datacollect.command; - -import com.example.datacollect.exception.NetworkException; -import com.example.datacollect.exception.ParseException; -import com.example.datacollect.model.Article; -import com.example.datacollect.repository.ArticleRepository; -import com.example.datacollect.strategy.CrawlStrategy; -import com.example.datacollect.strategy.StrategyFactory; -import com.example.datacollect.util.RetryUtils; -import com.example.datacollect.view.ConsoleView; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.List; -import java.util.concurrent.Callable; - -public class AnalyzeCommand implements Command { - private static final Logger logger = LoggerFactory.getLogger(AnalyzeCommand.class); - private final ConsoleView view; - private final StrategyFactory strategyFactory; - - public AnalyzeCommand(ConsoleView view, StrategyFactory strategyFactory) { - this.view = view; - this.strategyFactory = strategyFactory; - } - - @Override - public String getName() { - return "analyze"; - } - - @Override - public void execute(String[] args, ArticleRepository repository) { - if (args.length < 2) { - view.printError("Usage: analyze "); - logger.warn("Invalid command: missing URL argument"); - return; - } - String url = args[1]; - logger.info("Analyze command executed for URL: {}", url); - - try { - CrawlStrategy strategy = strategyFactory.getStrategy(url); - if (strategy == null) { - view.printError("No strategy found for: " + url); - logger.error("No strategy found for URL: {}", url); - return; - } - - Callable fetchTask = () -> { - logger.debug("Fetching document from: {}", url); - try { - return Jsoup.connect(url) - .userAgent("Mozilla/5.0") - .timeout(5000) - .get(); - } catch (IOException e) { - throw new NetworkException("Failed to connect to " + url + ": " + e.getMessage(), e); - } - }; - - Document doc = RetryUtils.executeWithRetry(fetchTask); - logger.info("Successfully fetched document from: {}", url); - - List
articles = strategy.parse(url, doc); - logger.info("Parsed {} articles for analysis", articles.size()); - - int total = articles.size(); - int totalTitleLen = 0; - int totalContentLen = 0; - - for (Article a : articles) { - totalTitleLen += a.getTitle() == null ? 0 : a.getTitle().length(); - totalContentLen += a.getContent() == null ? 0 : a.getContent().length(); - } - - view.printInfo("===== 分析统计结果 ====="); - view.printInfo("文章总数:" + total + " 篇"); - view.printInfo("标题总长度:" + totalTitleLen); - view.printInfo("内容总长度:" + totalContentLen); - if (total > 0) { - view.printInfo("平均标题长度:" + (totalTitleLen / total)); - view.printInfo("平均内容长度:" + (totalContentLen / total)); - } - view.printInfo("======================"); - view.printSuccess("分析完成(数据未保存)"); - - logger.info("Analysis completed: {} articles analyzed", total); - } catch (NetworkException e) { - view.printError("Network error: " + e.getMessage()); - logger.error("Network error while analyzing {}: {}", url, e.getMessage(), e); - } catch (ParseException e) { - view.printError("Parse error: " + e.getMessage()); - logger.error("Parse error while analyzing {}: {}", url, e.getMessage(), e); - } catch (Exception e) { - view.printError("分析失败:" + e.getMessage()); - logger.error("Unexpected error while analyzing {}: {}", url, e.getMessage(), e); - } - } -} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/Command.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/Command.java deleted file mode 100644 index 029cadc..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/Command.java +++ /dev/null @@ -1,8 +0,0 @@ -package com.example.datacollect.command; - -import com.example.datacollect.repository.ArticleRepository; - -public interface Command { - String getName(); - void execute(String[] args, ArticleRepository repository); -} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/CrawlCommand.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/CrawlCommand.java deleted file mode 100644 index dd63594..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/CrawlCommand.java +++ /dev/null @@ -1,87 +0,0 @@ -package com.example.datacollect.command; - -import com.example.datacollect.exception.NetworkException; -import com.example.datacollect.exception.ParseException; -import com.example.datacollect.repository.ArticleRepository; -import com.example.datacollect.strategy.CrawlStrategy; -import com.example.datacollect.strategy.StrategyFactory; -import com.example.datacollect.util.RetryUtils; -import com.example.datacollect.view.ConsoleView; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.concurrent.Callable; - -public class CrawlCommand implements Command { - private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class); - private final ConsoleView view; - private final StrategyFactory strategyFactory; - - public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) { - this.view = view; - this.strategyFactory = strategyFactory; - } - - @Override - public String getName() { - return "crawl"; - } - - @Override - public void execute(String[] args, ArticleRepository repository) { - if (args.length < 2) { - view.printError("Usage: crawl "); - logger.warn("Invalid command: missing URL argument"); - return; - } - String url = args[1]; - logger.info("Crawl started for: {}", url); - - CrawlStrategy strategy = strategyFactory.getStrategy(url); - if (strategy == null) { - view.printError("No strategy found for: " + url); - logger.error("No strategy found for URL: {}", url); - return; - } - - try { - view.printInfo("Crawling: " + url); - - Callable fetchTask = () -> { - logger.debug("Fetching document from: {}", url); - try { - return Jsoup.connect(url) - .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") - .timeout(10000) - .get(); - } catch (IOException e) { - throw new NetworkException("Failed to connect to " + url + ": " + e.getMessage(), e); - } - }; - - Document doc = RetryUtils.executeWithRetry(fetchTask); - logger.info("Successfully fetched document from: {}", url); - - var articles = strategy.parse(url, doc); - logger.info("Parsed {} articles", articles.size()); - - repository.addAll(articles); - logger.info("Successfully added {} articles to repository", articles.size()); - - view.printSuccess("Crawled " + articles.size() + " articles."); - logger.info("Successfully crawled {} articles from {}", articles.size(), url); - } catch (NetworkException e) { - view.printError("Network error: " + e.getMessage()); - logger.error("Network error while crawling {}: {}", url, e.getMessage(), e); - } catch (ParseException e) { - view.printError("Parse error: " + e.getMessage()); - logger.error("Parse error while crawling {}: {}", url, e.getMessage(), e); - } catch (Exception e) { - view.printError("Failed to crawl: " + e.getMessage()); - logger.error("Unexpected error while crawling {}: {}", url, e.getMessage(), e); - } - } -} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/ExitCommand.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/ExitCommand.java deleted file mode 100644 index 0f1d7fd..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/ExitCommand.java +++ /dev/null @@ -1,27 +0,0 @@ -package com.example.datacollect.command; - -import com.example.datacollect.repository.ArticleRepository; -import com.example.datacollect.view.ConsoleView; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class ExitCommand implements Command { - private static final Logger logger = LoggerFactory.getLogger(ExitCommand.class); - private final ConsoleView view; - - public ExitCommand(ConsoleView view) { - this.view = view; - } - - @Override - public String getName() { - return "exit"; - } - - @Override - public void execute(String[] args, ArticleRepository repository) { - logger.info("Exit command executed, shutting down"); - view.printSuccess("Bye!"); - System.exit(0);/*退出程序 */ - } -} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/HelpCommand.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/HelpCommand.java deleted file mode 100644 index 2087695..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/HelpCommand.java +++ /dev/null @@ -1,26 +0,0 @@ -package com.example.datacollect.command; - -import com.example.datacollect.repository.ArticleRepository; -import com.example.datacollect.view.ConsoleView; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class HelpCommand implements Command { - private static final Logger logger = LoggerFactory.getLogger(HelpCommand.class); - private final ConsoleView view; - - public HelpCommand(ConsoleView view) { - this.view = view; - } - - @Override - public String getName() { - return "help"; - } - - @Override - public void execute(String[] args, ArticleRepository repository) { - logger.info("Help command executed"); - view.printInfo("Commands: crawl , list, help, exit, analyze"); - } -} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/ListCommand.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/ListCommand.java deleted file mode 100644 index 9261a3d..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/ListCommand.java +++ /dev/null @@ -1,26 +0,0 @@ -package com.example.datacollect.command; - -import com.example.datacollect.repository.ArticleRepository; -import com.example.datacollect.view.ConsoleView; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class ListCommand implements Command { - private static final Logger logger = LoggerFactory.getLogger(ListCommand.class); - private final ConsoleView view; - - public ListCommand(ConsoleView view) { - this.view = view; - } - - @Override - public String getName() { - return "list"; - } - - @Override - public void execute(String[] args, ArticleRepository repository) { - logger.info("List command executed, showing {} articles", repository.size()); - view.display(repository.getAll()); - } -} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/controller/CrawlerController.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/controller/CrawlerController.java deleted file mode 100644 index 5ef370a..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/controller/CrawlerController.java +++ /dev/null @@ -1,64 +0,0 @@ -package com.example.datacollect.controller; - -import com.example.datacollect.command.AnalyzeCommand; -import com.example.datacollect.command.Command; -import com.example.datacollect.command.CrawlCommand; -import com.example.datacollect.command.ExitCommand; -import com.example.datacollect.command.HelpCommand; -import com.example.datacollect.command.ListCommand; -import com.example.datacollect.repository.ArticleRepository; -import com.example.datacollect.strategy.StrategyFactory; -import com.example.datacollect.view.ConsoleView; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import java.util.HashMap; -import java.util.Map; - -public class CrawlerController { - private static final Logger logger = LoggerFactory.getLogger(CrawlerController.class); - private final Map commands = new HashMap<>(); - private final ConsoleView view; - private final ArticleRepository repository; - - public CrawlerController(ConsoleView view, ArticleRepository repository, StrategyFactory strategyFactory) { - this.view = view; - this.repository = repository; - register(new HelpCommand(view)); - register(new ListCommand(view)); - register(new CrawlCommand(view, strategyFactory)); - register(new ExitCommand(view)); - register(new AnalyzeCommand(view, strategyFactory)); - logger.info("CrawlerController initialized with {} commands", commands.size()); - } - - private void register(Command command) { - commands.put(command.getName(), command); - logger.debug("Registered command: {}", command.getName()); - } - - public void handle(String input) {/* 处理用户输入 */ - String text = input == null ? "" : input.trim();/* 处理空输入 */ - if (text.isEmpty()) { - return; - } - - String[] args = text.split("\\s+");/* 解析命令行参数 */ - String cmdName = args[0].toLowerCase();/* 提取命令名称并转换为小写 */ - - logger.debug("Processing command: {}", cmdName); - - Command command = commands.get(cmdName);/* 获取命令对象 */ - if (command == null) { - view.printError("Unknown command: " + cmdName); - logger.warn("Unknown command attempted: {}", cmdName); - return; - } - - try { - command.execute(args, repository);/* 执行命令 */ - } catch (Exception e) { - view.printError("Command execution failed: " + e.getMessage()); - logger.error("Error executing command {}: {}", cmdName, e.getMessage(), e); - } - } -} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/CrawlerException.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/CrawlerException.java deleted file mode 100644 index 230adb3..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/CrawlerException.java +++ /dev/null @@ -1,10 +0,0 @@ -package com.example.datacollect.exception; - -public class CrawlerException extends Exception { - public CrawlerException(String message) { - super(message); - } - public CrawlerException(String message, Throwable cause) { - super(message, cause); - } -} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/NetworkException.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/NetworkException.java deleted file mode 100644 index 3a24c92..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/NetworkException.java +++ /dev/null @@ -1,10 +0,0 @@ -package com.example.datacollect.exception; - -public class NetworkException extends CrawlerException { - public NetworkException(String message) { - super(message); - } - public NetworkException(String message, Throwable cause) { - super(message, cause); - } -} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/ParseException.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/ParseException.java deleted file mode 100644 index 09f9f20..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/ParseException.java +++ /dev/null @@ -1,10 +0,0 @@ -package com.example.datacollect.exception; - -public class ParseException extends CrawlerException { - public ParseException(String message) { - super(message); - } - public ParseException(String message, Throwable cause) { - super(message, cause); - } -} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/model/Article.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/model/Article.java deleted file mode 100644 index 53b138b..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/model/Article.java +++ /dev/null @@ -1,72 +0,0 @@ -package com.example.datacollect.model; -/*- 文章模型类 -- 添加字段验证 -- 添加 toString() 方法(已有) -- 考虑添加 equals() 和 hashCode() */ -public class Article { - private String title; - private String url; - private String content; - - public Article(String title, String url, String content) { - setTitle(title); - setUrl(url); - setContent(content); - } - - public String getTitle() { - return title; - } - - public void setTitle(String title) { - if (title == null) { - throw new IllegalArgumentException("Title cannot be null"); - } - if (title.trim().isEmpty()) { - throw new IllegalArgumentException("Title cannot be empty"); - } - if (title.length() > 500) { - throw new IllegalArgumentException("Title cannot exceed 500 characters"); - } - this.title = title.trim(); - } - - public String getUrl() { - return url; - } - - public void setUrl(String url) { - if (url == null) { - throw new IllegalArgumentException("URL cannot be null"); - } - if (url.trim().isEmpty()) { - throw new IllegalArgumentException("URL cannot be empty"); - } - if (!url.startsWith("http://") && !url.startsWith("https://")) { - throw new IllegalArgumentException("URL must start with http:// or https://"); - } - this.url = url.trim(); - } - - public String getContent() { - return content; - } - - public void setContent(String content) { - if (content == null) { - this.content = ""; - } else if (content.length() > 10000) { - this.content = content.substring(0, 10000);/* 截断内容到 10000 个字符 */ - } else { - this.content = content; - } - } - - @Override - public String toString() { - return "Article{" - + "title='" + title + '\'' - + ", url='" + url + '\'' - + '}'; - } -} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/repository/ArticleRepository.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/repository/ArticleRepository.java deleted file mode 100644 index 8994efa..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/repository/ArticleRepository.java +++ /dev/null @@ -1,113 +0,0 @@ -package com.example.datacollect.repository; - -import com.example.datacollect.model.Article; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -/* 文章仓库 -- 添加 logger 成员 -- 增强 add() 方法的防御检查 -- 增强 addALL() 方法的防御检查 -- 添加空值检查、重复检查、长度验证 -- 记录操作日志*/ -public class ArticleRepository { - private static final Logger logger = LoggerFactory.getLogger(ArticleRepository.class); - private static final int MAX_TITLE_LENGTH = 500;/* 最大标题长度 */ - private static final int MAX_CONTENT_LENGTH = 10000;/* 最大内容长度 */ - - private final List
articles = new ArrayList<>(); - private final Set urlSet = new HashSet<>(); - - public void add(Article article) { - if (article == null) { - logger.error("Attempted to add null article"); - throw new IllegalArgumentException("Article cannot be null"); - } - - String title = article.getTitle(); - String url = article.getUrl(); - String content = article.getContent(); - - if (title == null || title.trim().isEmpty()) { - logger.warn("Attempted to add article with empty title"); - throw new IllegalArgumentException("Article title cannot be null or empty"); - } - - if (url == null || url.trim().isEmpty()) { - logger.warn("Attempted to add article with empty URL"); - throw new IllegalArgumentException("Article URL cannot be null or empty"); - } - - if (title.length() > MAX_TITLE_LENGTH) { - logger.warn("Article title too long: {} characters (max: {})", title.length(), MAX_TITLE_LENGTH); - throw new IllegalArgumentException("Article title exceeds maximum length of " + MAX_TITLE_LENGTH); - } - - if (content != null && content.length() > MAX_CONTENT_LENGTH) { - logger.warn("Article content too long: {} characters (max: {})", content.length(), MAX_CONTENT_LENGTH); - content = content.substring(0, MAX_CONTENT_LENGTH); - } - - if (!url.startsWith("http://") && !url.startsWith("https://")) { - logger.warn("Invalid URL format: {}", url); - throw new IllegalArgumentException("Article URL must start with http:// or https://"); - } - - if (urlSet.contains(url)) { - logger.warn("Duplicate article URL detected: {}", url); - return;/* 跳过重复文章 */ - } - - Article validatedArticle = new Article(title.trim(), url.trim(), content != null ? content.trim() : "");/* 创建验证后的文章 */ - articles.add(validatedArticle);/* 添加文章到列表 */ - urlSet.add(url);/* 添加URL到集合 */ - logger.debug("Added article: {}", title);/* 记录添加日志 */ - } - - public void addAll(List
articleList) { - if (articleList == null) { - logger.error("Attempted to add null article list"); - throw new IllegalArgumentException("Article list cannot be null"); - } - - int successCount = 0;/* 成功添加的文章数量 */ - int skipCount = 0;/* 跳过的无效文章数量 */ - - for (Article article : articleList) { - if (article != null) { - try { - add(article); - successCount++; - } catch (IllegalArgumentException e) { - logger.warn("Skipped invalid article: {}", e.getMessage()); - skipCount++; - } - } else { - logger.warn("Skipped null article in list"); - skipCount++; - } - } - - logger.info("Added {} articles, skipped {} invalid articles", successCount, skipCount); - } - - public List
getAll() { - logger.debug("Retrieving all articles, total: {}", articles.size()); - return Collections.unmodifiableList(articles);/* 返回不可修改的列表 */ - } - - public int size() { - return articles.size();/* 返回文章数量 */ - } - - public void clear() { - int count = articles.size();/* 记录当前文章数量 */ - articles.clear(); - urlSet.clear(); - logger.info("Cleared repository, removed {} articles", count); - } -} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/BlogStrategy.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/BlogStrategy.java deleted file mode 100644 index 1e23b2b..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/BlogStrategy.java +++ /dev/null @@ -1,25 +0,0 @@ -package com.example.datacollect.strategy; - -import com.example.datacollect.model.Article; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; -import java.util.ArrayList; -import java.util.List; - -public class BlogStrategy implements CrawlStrategy { - @Override - public boolean supports(String url) { - return url.contains("blog.example.com"); - } - - @Override - public List
parse(String url, Document doc) { - List
articles = new ArrayList<>(); - Elements titles = doc.select(".post-title"); - for (Element e : titles) { - articles.add(new Article(e.text(), url, "")); - } - return articles; - } -} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java deleted file mode 100644 index ed69e19..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java +++ /dev/null @@ -1,11 +0,0 @@ -package com.example.datacollect.strategy; - -import com.example.datacollect.exception.ParseException; -import com.example.datacollect.model.Article; -import org.jsoup.nodes.Document; -import java.util.List; - -public interface CrawlStrategy { - List
parse(String url, Document doc) throws ParseException; - boolean supports(String url); -} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java deleted file mode 100644 index 6892510..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java +++ /dev/null @@ -1,77 +0,0 @@ -package com.example.datacollect.strategy; - -import com.example.datacollect.exception.ParseException; -import com.example.datacollect.model.Article; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.List; - -/* HNU News 策略 -- 添加 logger 成员 -- 添加异常处理 -- 实现防御性编程 */ -public class HnuNewsStrategy implements CrawlStrategy { - private static final Logger logger = LoggerFactory.getLogger(HnuNewsStrategy.class); - - @Override - public boolean supports(String url) { - return url.contains("news.hnu.edu.cn");/* 支持 HNU News 网站 */ - } - - @Override - public List
parse(String url, Document doc) throws ParseException { - logger.info("Starting to parse HNU News: {}", url); - List
articles = new ArrayList<>();/* 存储储解析后的文章 */ - - try { - Elements listItems = doc.select("ul.list11 li");/* 选择文章列表项 */ - logger.debug("Found {} list items", listItems.size());/* 记录找到的列表项数量 */ - - for (Element li : listItems) { - try { - Element link = li.selectFirst("a");/* 选择列表项中的链接 */ - if (link == null) { - logger.warn("No link found in list item");/* 记录未找到链接 */ - continue; - } - - String articleUrl = link.attr("href");/* 获取链接的 href 属性值 */ - if (!articleUrl.startsWith("http")) { - articleUrl = "https://news.hnu.edu.cn" + articleUrl.replace("..", "");/* 补全相对路径 */ - } - - String title = "";/* 存储文章标题 */ - Element titleEl = link.selectFirst("h4.l2.h4s2");/* 选择标题元素 */ - if (titleEl != null) { - title = titleEl.text().trim();/* 提取标题文本并移除首尾空格 */ - } - - String content = "";/* 存储文章内容 */ - Element contentEl = link.selectFirst("p.l3.ps3");/* 选择内容元素 */ - if (contentEl != null) { - content = contentEl.text().trim();/* 提取内容文本并移除首尾空格 */ - } - - if (!title.isEmpty()) { - Article article = new Article(title, articleUrl, content);/* 创建文章对象 */ - articles.add(article);/* 将文章添加到列表 */ - } else { - logger.warn("Empty title found, skipping article"); - } - } catch (Exception e) { - logger.error("Error parsing individual article: {}", e.getMessage()); - } - } - - logger.info("Successfully parsed {} articles from HNU News", articles.size()); - return articles; - } catch (Exception e) { - logger.error("Failed to parse HNU News page: {}", e.getMessage(), e); - throw new ParseException("Failed to parse HNU News: " + e.getMessage(), e); - } - } -} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/NewsStrategy.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/NewsStrategy.java deleted file mode 100644 index f6eb4bd..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/NewsStrategy.java +++ /dev/null @@ -1,25 +0,0 @@ -package com.example.datacollect.strategy; - -import com.example.datacollect.model.Article; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; -import java.util.ArrayList; -import java.util.List; - -public class NewsStrategy implements CrawlStrategy { - @Override - public boolean supports(String url) { - return url.contains("news.example.com"); - } - - @Override - public List
parse(String url, Document doc) { - List
articles = new ArrayList<>(); - Elements items = doc.select(".article-headline"); - for (Element e : items) { - articles.add(new Article(e.text(), url, "")); - } - return articles; - } -} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/PeopleStrategy.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/PeopleStrategy.java deleted file mode 100644 index eb25935..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/PeopleStrategy.java +++ /dev/null @@ -1,83 +0,0 @@ -package com.example.datacollect.strategy; - -import com.example.datacollect.exception.ParseException; -import com.example.datacollect.model.Article; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.List; -/* 人民网策略类 */ -public class PeopleStrategy implements CrawlStrategy { - private static final Logger logger = LoggerFactory.getLogger(PeopleStrategy.class); - - @Override - public boolean supports(String url) { - return url.contains("people.com.cn");/* 检查URL是否包含people.com.cn */ - } - - @Override - public List
parse(String url, Document doc) throws ParseException { - logger.info("Starting to parse People's Daily News: {}", url); - List
articles = new ArrayList<>();/* 初始化文章列表 */ - - try { - Elements newsItems = doc.select("div.w1000, div.news-item, li.list_item");/* 选择新闻容器 */ - logger.debug("Found {} news containers", newsItems.size()); - - if (newsItems.isEmpty()) { - newsItems = doc.select("a[href*='/n1/']");/* 选择替代选择器 */ - logger.debug("Trying alternative selector, found {} items", newsItems.size()); - } - - for (Element item : newsItems) { - try { - Element link = item.selectFirst("a");/* 选择链接元素 */ - if (link == null) { - link = item.tagName().equals("a") ? item : null;/* 检查是否为链接元素 */ - } - - if (link == null) { - logger.warn("No link found in news item"); - continue; - } - - String articleUrl = link.attr("href");/* 获取链接URL */ - if (!articleUrl.startsWith("http")) {/* 检查是否为绝对URL */ - if (articleUrl.startsWith("/")) { - articleUrl = "https://www.people.com.cn" + articleUrl; - } else { - articleUrl = "https://www.people.com.cn/" + articleUrl; - } - } - - String title = link.text().trim();/* 获取标题文本 */ - - String content = "";/* 初始化内容文本 */ - Element contentEl = item.selectFirst("p, div.ed, div.summary");/* 选择内容元素 */ - if (contentEl != null) { - content = contentEl.text().trim();/* 获取内容文本 */ - } - - if (!title.isEmpty() && title.length() > 5) { - Article article = new Article(title, articleUrl, content);/* 创建文章对象 */ - articles.add(article);/* 添加文章到列表 */ - logger.debug("Parsed article: {}", title);/* 记录解析文章 */ - } else { - logger.warn("Invalid title found, skipping article");/* 记录无效标题 */ - } - } catch (Exception e) { - logger.error("Error parsing individual article: {}", e.getMessage()); - } - } - - logger.info("Successfully parsed {} articles from People's Daily News", articles.size()); - return articles; - } catch (Exception e) { - logger.error("Failed to parse People's Daily News page: {}", e.getMessage(), e); - throw new ParseException("Failed to parse People's Daily News: " + e.getMessage(), e); - } - } -} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/StrategyFactory.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/StrategyFactory.java deleted file mode 100644 index e28aaac..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/StrategyFactory.java +++ /dev/null @@ -1,36 +0,0 @@ -package com.example.datacollect.strategy; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.List; - -public class StrategyFactory { - private static final Logger logger = LoggerFactory.getLogger(StrategyFactory.class); - private final List strategies = new ArrayList<>(); - - public StrategyFactory() { - strategies.add(new HnuNewsStrategy()); - strategies.add(new YouthStrategy()); - strategies.add(new PeopleStrategy()); - strategies.add(new BlogStrategy()); - strategies.add(new NewsStrategy()); - logger.info("Initialized StrategyFactory with {} strategies", strategies.size()); - } - - public CrawlStrategy getStrategy(String url) { - for (CrawlStrategy s : strategies) { - if (s.supports(url)) { - logger.debug("Found strategy {} for URL: {}", s.getClass().getSimpleName(), url); - return s; - } - } - logger.warn("No strategy found for URL: {}", url); - return null; - } - - public void register(CrawlStrategy strategy) { - strategies.add(strategy); - logger.info("Registered new strategy: {}", strategy.getClass().getSimpleName()); - } -} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/YouthStrategy.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/YouthStrategy.java deleted file mode 100644 index 2bdb8d1..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/YouthStrategy.java +++ /dev/null @@ -1,87 +0,0 @@ -package com.example.datacollect.strategy; - -import com.example.datacollect.exception.ParseException; -import com.example.datacollect.model.Article; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.List; -/* 青年网新闻解析策略*/ -public class YouthStrategy implements CrawlStrategy { - private static final Logger logger = LoggerFactory.getLogger(YouthStrategy.class); - - @Override - public boolean supports(String url) { - return url.contains("youth.cn");/* 检查URL是否包含青年网域名 */ - } - - @Override - public List
parse(String url, Document doc) throws ParseException { - logger.info("Starting to parse Youth News: {}", url); - List
articles = new ArrayList<>(); - - try { - Elements newsItems = doc.select("div.news-item, div.article-item, li.news-list-item");/* 选择新闻项元素 */ - logger.debug("Found {} news items", newsItems.size()); - - if (newsItems.isEmpty()) { - newsItems = doc.select("a[href*='/n1/']");/* 选择替代选择器 */ - logger.debug("Trying alternative selector, found {} items", newsItems.size()); - } - - for (Element item : newsItems) { - try { - Element link = item.selectFirst("a");/* 选择链接元素 */ - if (link == null) { - link = item.tagName().equals("a") ? item : null;/* 检查是否为链接元素 */ - } - - if (link == null) { - logger.warn("No link found in news item"); - continue; - } - - String articleUrl = link.attr("href");/* 获取链接URL */ - - if (!articleUrl.startsWith("http")) {/* 检查URL是否为绝对URL */ - if (articleUrl.startsWith("/")) { - articleUrl = "https://www.youth.cn" + articleUrl; - } else { - articleUrl = "https://www.youth.cn/" + articleUrl; - } - } - - String title = link.text().trim();/* 获取链接文本 */ - if (title.isEmpty()) {/* 检查标题是否为空 */ - continue; - } - - String content = "";/* 初始化内容为空字符串 */ - Element contentEl = item.selectFirst("p.summary, p.desc, div.brief");/* 选择摘要元素 */ - if (contentEl != null) { - content = contentEl.text().trim();/* 获取摘要文本 */ - } - - if (!title.isEmpty() && title.length() > 5) { - Article article = new Article(title, articleUrl, content); - articles.add(article); - logger.debug("Parsed article: {}", title); - } else { - logger.warn("Invalid title found, skipping article"); - } - } catch (Exception e) { - logger.error("Error parsing individual article: {}", e.getMessage()); - } - } - - logger.info("Successfully parsed {} articles from Youth News", articles.size()); - return articles; - } catch (Exception e) { - logger.error("Failed to parse Youth News page: {}", e.getMessage(), e); - throw new ParseException("Failed to parse Youth News: " + e.getMessage(), e); - } - } -} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/util/RetryUtils.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/util/RetryUtils.java deleted file mode 100644 index 96aee20..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/util/RetryUtils.java +++ /dev/null @@ -1,49 +0,0 @@ -package com.example.datacollect.util; - -import com.example.datacollect.exception.NetworkException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import java.util.concurrent.Callable; - -public class RetryUtils { - private static final Logger logger = LoggerFactory.getLogger(RetryUtils.class); - - private static final int DEFAULT_MAX_RETRIES = 3; - private static final long DEFAULT_RETRY_DELAY_MS = 1000; - - public static T executeWithRetry(Callable task) throws Exception { - return executeWithRetry(task, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_DELAY_MS); - } - - public static T executeWithRetry(Callable task, int maxRetries, long retryDelayMs) throws Exception { - Exception lastException = null; - - for (int attempt = 0; attempt <= maxRetries; attempt++) { - try { - if (attempt > 0) { - logger.info("Retry attempt {}/{} for task", attempt, maxRetries); - Thread.sleep(retryDelayMs); - } - - return task.call(); - } catch (Exception e) { - lastException = e; - - if (e instanceof NetworkException) { - logger.warn("Network error on attempt {}: {}", attempt, e.getMessage()); - - if (attempt < maxRetries) { - logger.info("Will retry in {} ms...", retryDelayMs); - continue; - } - } else { - logger.error("Non-retryable error: {}", e.getMessage()); - throw e; - } - } - } - - logger.error("All {} retry attempts failed", maxRetries + 1); - throw lastException; - } -} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/view/ConsoleView.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/view/ConsoleView.java deleted file mode 100644 index 4665db0..0000000 --- a/w11/java-cli-w11/src/main/java/com/example/datacollect/view/ConsoleView.java +++ /dev/null @@ -1,46 +0,0 @@ -package com.example.datacollect.view; - -import com.example.datacollect.model.Article; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import java.util.List; -import java.util.Scanner; - -public class ConsoleView { - private static final Logger logger = LoggerFactory.getLogger(ConsoleView.class); - private static final String ANSI_RESET = "\u001B[0m"; - private static final String ANSI_GREEN = "\u001B[32m"; - private static final String ANSI_RED = "\u001B[31m"; - private static final String ANSI_BLUE = "\u001B[34m"; - - private final Scanner scanner = new Scanner(System.in); - - public String readLine() { - System.out.print("> "); - String input = scanner.nextLine(); - return input;/* 返回用户输入 */ - } - - public void printSuccess(String msg) { - System.out.println(ANSI_GREEN + msg + ANSI_RESET); - } - - public void printError(String msg) { - System.out.println(ANSI_RED + msg + ANSI_RESET); - } - - public void printInfo(String msg) { - System.out.println(ANSI_BLUE + msg + ANSI_RESET); - } - - public void display(List
articles) { - if (articles.isEmpty()) { - printInfo("暂无文章,请先执行 crawl。"); - return; - } - for (int i = 0; i < articles.size(); i++) { - Article a = articles.get(i); - System.out.println((i + 1) + ". " + a.getTitle() + " | " + a.getUrl()); - } - } -} diff --git a/w11/java-cli-w11/src/main/resources/logback.xml b/w11/java-cli-w11/src/main/resources/logback.xml deleted file mode 100644 index aa0a06b..0000000 --- a/w11/java-cli-w11/src/main/resources/logback.xml +++ /dev/null @@ -1,24 +0,0 @@ - - - - - %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n - - - - - logs/crawler.log - - logs/crawler.%d{yyyy-MM-dd}.log - 30 - - - %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n - - - - - - - - diff --git a/w11/java-cli-w11/target/classes/logback.xml b/w11/java-cli-w11/target/classes/logback.xml deleted file mode 100644 index aa0a06b..0000000 --- a/w11/java-cli-w11/target/classes/logback.xml +++ /dev/null @@ -1,24 +0,0 @@ - - - - - %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n - - - - - logs/crawler.log - - logs/crawler.%d{yyyy-MM-dd}.log - 30 - - - %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n - - - - - - - - diff --git a/w11/java-cli-w11/target/maven-archiver/pom.properties b/w11/java-cli-w11/target/maven-archiver/pom.properties deleted file mode 100644 index 5c1de34..0000000 --- a/w11/java-cli-w11/target/maven-archiver/pom.properties +++ /dev/null @@ -1,3 +0,0 @@ -artifactId=datacollect-cli -groupId=com.example -version=0.1.0 diff --git a/w11/java-cli-w11/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst b/w11/java-cli-w11/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst deleted file mode 100644 index 1ead6c5..0000000 --- a/w11/java-cli-w11/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst +++ /dev/null @@ -1,22 +0,0 @@ -com\example\datacollect\command\ListCommand.class -com\example\datacollect\strategy\PeopleStrategy.class -com\example\datacollect\command\CrawlCommand.class -com\example\datacollect\strategy\BlogStrategy.class -com\example\datacollect\repository\ArticleRepository.class -com\example\datacollect\Main.class -com\example\datacollect\view\ConsoleView.class -com\example\datacollect\command\ExitCommand.class -com\example\datacollect\command\HelpCommand.class -com\example\datacollect\util\RetryUtils.class -com\example\datacollect\strategy\NewsStrategy.class -com\example\datacollect\command\Command.class -com\example\datacollect\controller\CrawlerController.class -com\example\datacollect\exception\CrawlerException.class -com\example\datacollect\exception\NetworkException.class -com\example\datacollect\command\AnalyzeCommand.class -com\example\datacollect\strategy\StrategyFactory.class -com\example\datacollect\strategy\HnuNewsStrategy.class -com\example\datacollect\strategy\YouthStrategy.class -com\example\datacollect\exception\ParseException.class -com\example\datacollect\strategy\CrawlStrategy.class -com\example\datacollect\model\Article.class diff --git a/w11/java-cli-w11/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst b/w11/java-cli-w11/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst deleted file mode 100644 index 937e5d7..0000000 --- a/w11/java-cli-w11/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst +++ /dev/null @@ -1,22 +0,0 @@ -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\strategy\NewsStrategy.java -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\controller\CrawlerController.java -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\repository\ArticleRepository.java -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\strategy\HnuNewsStrategy.java -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\command\ExitCommand.java -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\command\Command.java -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\Main.java -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\command\CrawlCommand.java -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\exception\NetworkException.java -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\strategy\StrategyFactory.java -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\strategy\BlogStrategy.java -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\util\RetryUtils.java -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\command\HelpCommand.java -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\exception\CrawlerException.java -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\exception\ParseException.java -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\model\Article.java -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\view\ConsoleView.java -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\command\AnalyzeCommand.java -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\strategy\YouthStrategy.java -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\command\ListCommand.java -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\strategy\CrawlStrategy.java -C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\strategy\PeopleStrategy.java