diff --git a/w11/java-cli-w11/.gitignore b/w11/java-cli-w11/.gitignore new file mode 100644 index 0000000..0ebcf1a --- /dev/null +++ b/w11/java-cli-w11/.gitignore @@ -0,0 +1,4 @@ +*.jar +*.jar +*.class +*.log \ No newline at end of file diff --git a/w11/java-cli-w11/pom.xml b/w11/java-cli-w11/pom.xml new file mode 100644 index 0000000..9987b1c --- /dev/null +++ b/w11/java-cli-w11/pom.xml @@ -0,0 +1,62 @@ + + 4.0.0 + com.example + datacollect-cli + 0.1.0 + + 11 + 11 + + + + org.jsoup + jsoup + 1.17.2 + + + org.slf4j + slf4j-api + 2.0.9 + + + ch.qos.logback + logback-classic + 1.4.14 + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + + org.apache.maven.plugins + maven-assembly-plugin + 3.3.0 + + + + com.example.datacollect.Main + + + + jar-with-dependencies + + + + + make-assembly + package + + single + + + + + + + diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/Main.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/Main.java new file mode 100644 index 0000000..ea9d151 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/Main.java @@ -0,0 +1,41 @@ +package com.example.datacollect; + +import com.example.datacollect.controller.CrawlerController; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +/*- 添加 logger 成员 +- 记录启动日志 +- 添加全局异常处理 */ +public class Main { + private static final Logger logger = LoggerFactory.getLogger(Main.class); + + public static void main(String[] args) { + try { + logger.info("Starting CLI Crawler application"); + + ConsoleView view = new ConsoleView(); + ArticleRepository repository = new ArticleRepository(); + StrategyFactory strategyFactory = new StrategyFactory(); + CrawlerController controller = new CrawlerController(view, repository, strategyFactory); + + view.printSuccess("Welcome to CLI 
Crawler (w10_3)! Type help for commands."); + logger.info("Application initialized successfully"); + + while (true) { + try { + controller.handle(view.readLine()); + } catch (Exception e) { + view.printError("Error: " + e.getMessage()); + logger.error("Error in main loop: {}", e.getMessage(), e); + } + } + } catch (Exception e) { + logger.error("Fatal error in application: {}", e.getMessage(), e); + System.err.println("Fatal error: " + e.getMessage()); + System.exit(1); + } + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/AnalyzeCommand.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/AnalyzeCommand.java new file mode 100644 index 0000000..ec9bcc3 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/AnalyzeCommand.java @@ -0,0 +1,103 @@ +package com.example.datacollect.command; + +import com.example.datacollect.exception.NetworkException; +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.model.Article; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.CrawlStrategy; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.util.RetryUtils; +import com.example.datacollect.view.ConsoleView; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.List; +import java.util.concurrent.Callable; + +public class AnalyzeCommand implements Command { + private static final Logger logger = LoggerFactory.getLogger(AnalyzeCommand.class); + private final ConsoleView view; + private final StrategyFactory strategyFactory; + + public AnalyzeCommand(ConsoleView view, StrategyFactory strategyFactory) { + this.view = view; + this.strategyFactory = strategyFactory; + } + + @Override + public String getName() { + return "analyze"; + } + + @Override + public void 
execute(String[] args, ArticleRepository repository) { + if (args.length < 2) { + view.printError("Usage: analyze "); + logger.warn("Invalid command: missing URL argument"); + return; + } + String url = args[1]; + logger.info("Analyze command executed for URL: {}", url); + + try { + CrawlStrategy strategy = strategyFactory.getStrategy(url); + if (strategy == null) { + view.printError("No strategy found for: " + url); + logger.error("No strategy found for URL: {}", url); + return; + } + + Callable fetchTask = () -> { + logger.debug("Fetching document from: {}", url); + try { + return Jsoup.connect(url) + .userAgent("Mozilla/5.0") + .timeout(5000) + .get(); + } catch (IOException e) { + throw new NetworkException("Failed to connect to " + url + ": " + e.getMessage(), e); + } + }; + + Document doc = RetryUtils.executeWithRetry(fetchTask); + logger.info("Successfully fetched document from: {}", url); + + List
articles = strategy.parse(url, doc); + logger.info("Parsed {} articles for analysis", articles.size()); + + int total = articles.size(); + int totalTitleLen = 0; + int totalContentLen = 0; + + for (Article a : articles) { + totalTitleLen += a.getTitle() == null ? 0 : a.getTitle().length(); + totalContentLen += a.getContent() == null ? 0 : a.getContent().length(); + } + + view.printInfo("===== 分析统计结果 ====="); + view.printInfo("文章总数:" + total + " 篇"); + view.printInfo("标题总长度:" + totalTitleLen); + view.printInfo("内容总长度:" + totalContentLen); + if (total > 0) { + view.printInfo("平均标题长度:" + (totalTitleLen / total)); + view.printInfo("平均内容长度:" + (totalContentLen / total)); + } + view.printInfo("======================"); + view.printSuccess("分析完成(数据未保存)"); + + logger.info("Analysis completed: {} articles analyzed", total); + } catch (NetworkException e) { + view.printError("Network error: " + e.getMessage()); + logger.error("Network error while analyzing {}: {}", url, e.getMessage(), e); + } catch (ParseException e) { + view.printError("Parse error: " + e.getMessage()); + logger.error("Parse error while analyzing {}: {}", url, e.getMessage(), e); + } catch (Exception e) { + view.printError("分析失败:" + e.getMessage()); + logger.error("Unexpected error while analyzing {}: {}", url, e.getMessage(), e); + } + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/Command.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/Command.java new file mode 100644 index 0000000..029cadc --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/Command.java @@ -0,0 +1,8 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; + +public interface Command { + String getName(); + void execute(String[] args, ArticleRepository repository); +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/CrawlCommand.java 
b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/CrawlCommand.java new file mode 100644 index 0000000..dd63594 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/CrawlCommand.java @@ -0,0 +1,87 @@ +package com.example.datacollect.command; + +import com.example.datacollect.exception.NetworkException; +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.CrawlStrategy; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.util.RetryUtils; +import com.example.datacollect.view.ConsoleView; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.concurrent.Callable; + +public class CrawlCommand implements Command { + private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class); + private final ConsoleView view; + private final StrategyFactory strategyFactory; + + public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) { + this.view = view; + this.strategyFactory = strategyFactory; + } + + @Override + public String getName() { + return "crawl"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + if (args.length < 2) { + view.printError("Usage: crawl "); + logger.warn("Invalid command: missing URL argument"); + return; + } + String url = args[1]; + logger.info("Crawl started for: {}", url); + + CrawlStrategy strategy = strategyFactory.getStrategy(url); + if (strategy == null) { + view.printError("No strategy found for: " + url); + logger.error("No strategy found for URL: {}", url); + return; + } + + try { + view.printInfo("Crawling: " + url); + + Callable fetchTask = () -> { + logger.debug("Fetching document from: {}", url); + try { + return Jsoup.connect(url) + .userAgent("Mozilla/5.0 (Windows NT 10.0; 
Win64; x64) AppleWebKit/537.36") + .timeout(10000) + .get(); + } catch (IOException e) { + throw new NetworkException("Failed to connect to " + url + ": " + e.getMessage(), e); + } + }; + + Document doc = RetryUtils.executeWithRetry(fetchTask); + logger.info("Successfully fetched document from: {}", url); + + var articles = strategy.parse(url, doc); + logger.info("Parsed {} articles", articles.size()); + + repository.addAll(articles); + logger.info("Successfully added {} articles to repository", articles.size()); + + view.printSuccess("Crawled " + articles.size() + " articles."); + logger.info("Successfully crawled {} articles from {}", articles.size(), url); + } catch (NetworkException e) { + view.printError("Network error: " + e.getMessage()); + logger.error("Network error while crawling {}: {}", url, e.getMessage(), e); + } catch (ParseException e) { + view.printError("Parse error: " + e.getMessage()); + logger.error("Parse error while crawling {}: {}", url, e.getMessage(), e); + } catch (Exception e) { + view.printError("Failed to crawl: " + e.getMessage()); + logger.error("Unexpected error while crawling {}: {}", url, e.getMessage(), e); + } + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/ExitCommand.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/ExitCommand.java new file mode 100644 index 0000000..0f1d7fd --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/ExitCommand.java @@ -0,0 +1,27 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class ExitCommand implements Command { + private static final Logger logger = LoggerFactory.getLogger(ExitCommand.class); + private final ConsoleView view; + + public ExitCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + 
return "exit"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + logger.info("Exit command executed, shutting down"); + view.printSuccess("Bye!"); + System.exit(0);/*退出程序 */ + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/HelpCommand.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/HelpCommand.java new file mode 100644 index 0000000..2087695 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/HelpCommand.java @@ -0,0 +1,26 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class HelpCommand implements Command { + private static final Logger logger = LoggerFactory.getLogger(HelpCommand.class); + private final ConsoleView view; + + public HelpCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "help"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + logger.info("Help command executed"); + view.printInfo("Commands: crawl , list, help, exit, analyze"); + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/ListCommand.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/ListCommand.java new file mode 100644 index 0000000..9261a3d --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/ListCommand.java @@ -0,0 +1,26 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class ListCommand implements Command { + private static final Logger logger = LoggerFactory.getLogger(ListCommand.class); + private final ConsoleView view; + + public 
ListCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "list"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + logger.info("List command executed, showing {} articles", repository.size()); + view.display(repository.getAll()); + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/controller/CrawlerController.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/controller/CrawlerController.java new file mode 100644 index 0000000..5ef370a --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/controller/CrawlerController.java @@ -0,0 +1,64 @@ +package com.example.datacollect.controller; + +import com.example.datacollect.command.AnalyzeCommand; +import com.example.datacollect.command.Command; +import com.example.datacollect.command.CrawlCommand; +import com.example.datacollect.command.ExitCommand; +import com.example.datacollect.command.HelpCommand; +import com.example.datacollect.command.ListCommand; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.HashMap; +import java.util.Map; + +public class CrawlerController { + private static final Logger logger = LoggerFactory.getLogger(CrawlerController.class); + private final Map commands = new HashMap<>(); + private final ConsoleView view; + private final ArticleRepository repository; + + public CrawlerController(ConsoleView view, ArticleRepository repository, StrategyFactory strategyFactory) { + this.view = view; + this.repository = repository; + register(new HelpCommand(view)); + register(new ListCommand(view)); + register(new CrawlCommand(view, strategyFactory)); + register(new ExitCommand(view)); + register(new AnalyzeCommand(view, strategyFactory)); + logger.info("CrawlerController 
initialized with {} commands", commands.size()); + } + + private void register(Command command) { + commands.put(command.getName(), command); + logger.debug("Registered command: {}", command.getName()); + } + + public void handle(String input) {/* 处理用户输入 */ + String text = input == null ? "" : input.trim();/* 处理空输入 */ + if (text.isEmpty()) { + return; + } + + String[] args = text.split("\\s+");/* 解析命令行参数 */ + String cmdName = args[0].toLowerCase();/* 提取命令名称并转换为小写 */ + + logger.debug("Processing command: {}", cmdName); + + Command command = commands.get(cmdName);/* 获取命令对象 */ + if (command == null) { + view.printError("Unknown command: " + cmdName); + logger.warn("Unknown command attempted: {}", cmdName); + return; + } + + try { + command.execute(args, repository);/* 执行命令 */ + } catch (Exception e) { + view.printError("Command execution failed: " + e.getMessage()); + logger.error("Error executing command {}: {}", cmdName, e.getMessage(), e); + } + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/CrawlerException.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/CrawlerException.java new file mode 100644 index 0000000..230adb3 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/CrawlerException.java @@ -0,0 +1,10 @@ +package com.example.datacollect.exception; + +public class CrawlerException extends Exception { + public CrawlerException(String message) { + super(message); + } + public CrawlerException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/NetworkException.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/NetworkException.java new file mode 100644 index 0000000..3a24c92 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/NetworkException.java @@ -0,0 +1,10 @@ +package com.example.datacollect.exception; + +public class 
NetworkException extends CrawlerException { + public NetworkException(String message) { + super(message); + } + public NetworkException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/ParseException.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/ParseException.java new file mode 100644 index 0000000..09f9f20 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/ParseException.java @@ -0,0 +1,10 @@ +package com.example.datacollect.exception; + +public class ParseException extends CrawlerException { + public ParseException(String message) { + super(message); + } + public ParseException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/model/Article.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/model/Article.java new file mode 100644 index 0000000..53b138b --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/model/Article.java @@ -0,0 +1,72 @@ +package com.example.datacollect.model; +/*- 文章模型类 +- 添加字段验证 +- 添加 toString() 方法(已有) +- 考虑添加 equals() 和 hashCode() */ +public class Article { + private String title; + private String url; + private String content; + + public Article(String title, String url, String content) { + setTitle(title); + setUrl(url); + setContent(content); + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + if (title == null) { + throw new IllegalArgumentException("Title cannot be null"); + } + if (title.trim().isEmpty()) { + throw new IllegalArgumentException("Title cannot be empty"); + } + if (title.length() > 500) { + throw new IllegalArgumentException("Title cannot exceed 500 characters"); + } + this.title = title.trim(); + } + + public String getUrl() { + return url; + } + + public void setUrl(String url) { + if (url == null) { + 
throw new IllegalArgumentException("URL cannot be null"); + } + if (url.trim().isEmpty()) { + throw new IllegalArgumentException("URL cannot be empty"); + } + if (!url.startsWith("http://") && !url.startsWith("https://")) { + throw new IllegalArgumentException("URL must start with http:// or https://"); + } + this.url = url.trim(); + } + + public String getContent() { + return content; + } + + public void setContent(String content) { + if (content == null) { + this.content = ""; + } else if (content.length() > 10000) { + this.content = content.substring(0, 10000);/* 截断内容到 10000 个字符 */ + } else { + this.content = content; + } + } + + @Override + public String toString() { + return "Article{" + + "title='" + title + '\'' + + ", url='" + url + '\'' + + '}'; + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/repository/ArticleRepository.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/repository/ArticleRepository.java new file mode 100644 index 0000000..8994efa --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/repository/ArticleRepository.java @@ -0,0 +1,113 @@ +package com.example.datacollect.repository; + +import com.example.datacollect.model.Article; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +/* 文章仓库 +- 添加 logger 成员 +- 增强 add() 方法的防御检查 +- 增强 addALL() 方法的防御检查 +- 添加空值检查、重复检查、长度验证 +- 记录操作日志*/ +public class ArticleRepository { + private static final Logger logger = LoggerFactory.getLogger(ArticleRepository.class); + private static final int MAX_TITLE_LENGTH = 500;/* 最大标题长度 */ + private static final int MAX_CONTENT_LENGTH = 10000;/* 最大内容长度 */ + + private final List
articles = new ArrayList<>(); + private final Set urlSet = new HashSet<>(); + + public void add(Article article) { + if (article == null) { + logger.error("Attempted to add null article"); + throw new IllegalArgumentException("Article cannot be null"); + } + + String title = article.getTitle(); + String url = article.getUrl(); + String content = article.getContent(); + + if (title == null || title.trim().isEmpty()) { + logger.warn("Attempted to add article with empty title"); + throw new IllegalArgumentException("Article title cannot be null or empty"); + } + + if (url == null || url.trim().isEmpty()) { + logger.warn("Attempted to add article with empty URL"); + throw new IllegalArgumentException("Article URL cannot be null or empty"); + } + + if (title.length() > MAX_TITLE_LENGTH) { + logger.warn("Article title too long: {} characters (max: {})", title.length(), MAX_TITLE_LENGTH); + throw new IllegalArgumentException("Article title exceeds maximum length of " + MAX_TITLE_LENGTH); + } + + if (content != null && content.length() > MAX_CONTENT_LENGTH) { + logger.warn("Article content too long: {} characters (max: {})", content.length(), MAX_CONTENT_LENGTH); + content = content.substring(0, MAX_CONTENT_LENGTH); + } + + if (!url.startsWith("http://") && !url.startsWith("https://")) { + logger.warn("Invalid URL format: {}", url); + throw new IllegalArgumentException("Article URL must start with http:// or https://"); + } + + if (urlSet.contains(url)) { + logger.warn("Duplicate article URL detected: {}", url); + return;/* 跳过重复文章 */ + } + + Article validatedArticle = new Article(title.trim(), url.trim(), content != null ? content.trim() : "");/* 创建验证后的文章 */ + articles.add(validatedArticle);/* 添加文章到列表 */ + urlSet.add(url);/* 添加URL到集合 */ + logger.debug("Added article: {}", title);/* 记录添加日志 */ + } + + public void addAll(List
articleList) { + if (articleList == null) { + logger.error("Attempted to add null article list"); + throw new IllegalArgumentException("Article list cannot be null"); + } + + int successCount = 0;/* 成功添加的文章数量 */ + int skipCount = 0;/* 跳过的无效文章数量 */ + + for (Article article : articleList) { + if (article != null) { + try { + add(article); + successCount++; + } catch (IllegalArgumentException e) { + logger.warn("Skipped invalid article: {}", e.getMessage()); + skipCount++; + } + } else { + logger.warn("Skipped null article in list"); + skipCount++; + } + } + + logger.info("Added {} articles, skipped {} invalid articles", successCount, skipCount); + } + + public List
getAll() { + logger.debug("Retrieving all articles, total: {}", articles.size()); + return Collections.unmodifiableList(articles);/* 返回不可修改的列表 */ + } + + public int size() { + return articles.size();/* 返回文章数量 */ + } + + public void clear() { + int count = articles.size();/* 记录当前文章数量 */ + articles.clear(); + urlSet.clear(); + logger.info("Cleared repository, removed {} articles", count); + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/BlogStrategy.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/BlogStrategy.java new file mode 100644 index 0000000..1e23b2b --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/BlogStrategy.java @@ -0,0 +1,25 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; + +public class BlogStrategy implements CrawlStrategy { + @Override + public boolean supports(String url) { + return url.contains("blog.example.com"); + } + + @Override + public List
parse(String url, Document doc) { + List
articles = new ArrayList<>(); + Elements titles = doc.select(".post-title"); + for (Element e : titles) { + articles.add(new Article(e.text(), url, "")); + } + return articles; + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java new file mode 100644 index 0000000..ed69e19 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java @@ -0,0 +1,11 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import java.util.List; + +public interface CrawlStrategy { + List
parse(String url, Document doc) throws ParseException; + boolean supports(String url); +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java new file mode 100644 index 0000000..6892510 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java @@ -0,0 +1,77 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.ArrayList; +import java.util.List; + +/* HNU News 策略 +- 添加 logger 成员 +- 添加异常处理 +- 实现防御性编程 */ +public class HnuNewsStrategy implements CrawlStrategy { + private static final Logger logger = LoggerFactory.getLogger(HnuNewsStrategy.class); + + @Override + public boolean supports(String url) { + return url.contains("news.hnu.edu.cn");/* 支持 HNU News 网站 */ + } + + @Override + public List
parse(String url, Document doc) throws ParseException { + logger.info("Starting to parse HNU News: {}", url); + List
articles = new ArrayList<>();/* 存储储解析后的文章 */ + + try { + Elements listItems = doc.select("ul.list11 li");/* 选择文章列表项 */ + logger.debug("Found {} list items", listItems.size());/* 记录找到的列表项数量 */ + + for (Element li : listItems) { + try { + Element link = li.selectFirst("a");/* 选择列表项中的链接 */ + if (link == null) { + logger.warn("No link found in list item");/* 记录未找到链接 */ + continue; + } + + String articleUrl = link.attr("href");/* 获取链接的 href 属性值 */ + if (!articleUrl.startsWith("http")) { + articleUrl = "https://news.hnu.edu.cn" + articleUrl.replace("..", "");/* 补全相对路径 */ + } + + String title = "";/* 存储文章标题 */ + Element titleEl = link.selectFirst("h4.l2.h4s2");/* 选择标题元素 */ + if (titleEl != null) { + title = titleEl.text().trim();/* 提取标题文本并移除首尾空格 */ + } + + String content = "";/* 存储文章内容 */ + Element contentEl = link.selectFirst("p.l3.ps3");/* 选择内容元素 */ + if (contentEl != null) { + content = contentEl.text().trim();/* 提取内容文本并移除首尾空格 */ + } + + if (!title.isEmpty()) { + Article article = new Article(title, articleUrl, content);/* 创建文章对象 */ + articles.add(article);/* 将文章添加到列表 */ + } else { + logger.warn("Empty title found, skipping article"); + } + } catch (Exception e) { + logger.error("Error parsing individual article: {}", e.getMessage()); + } + } + + logger.info("Successfully parsed {} articles from HNU News", articles.size()); + return articles; + } catch (Exception e) { + logger.error("Failed to parse HNU News page: {}", e.getMessage(), e); + throw new ParseException("Failed to parse HNU News: " + e.getMessage(), e); + } + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/NewsStrategy.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/NewsStrategy.java new file mode 100644 index 0000000..f6eb4bd --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/NewsStrategy.java @@ -0,0 +1,25 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import 
org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; + +public class NewsStrategy implements CrawlStrategy { + @Override + public boolean supports(String url) { + return url.contains("news.example.com"); + } + + @Override + public List
parse(String url, Document doc) { + List
articles = new ArrayList<>(); + Elements items = doc.select(".article-headline"); + for (Element e : items) { + articles.add(new Article(e.text(), url, "")); + } + return articles; + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/PeopleStrategy.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/PeopleStrategy.java new file mode 100644 index 0000000..eb25935 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/PeopleStrategy.java @@ -0,0 +1,83 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.ArrayList; +import java.util.List; +/* 人民网策略类 */ +public class PeopleStrategy implements CrawlStrategy { + private static final Logger logger = LoggerFactory.getLogger(PeopleStrategy.class); + + @Override + public boolean supports(String url) { + return url.contains("people.com.cn");/* 检查URL是否包含people.com.cn */ + } + + @Override + public List
parse(String url, Document doc) throws ParseException { + logger.info("Starting to parse People's Daily News: {}", url); + List
articles = new ArrayList<>();/* 初始化文章列表 */ + + try { + Elements newsItems = doc.select("div.w1000, div.news-item, li.list_item");/* 选择新闻容器 */ + logger.debug("Found {} news containers", newsItems.size()); + + if (newsItems.isEmpty()) { + newsItems = doc.select("a[href*='/n1/']");/* 选择替代选择器 */ + logger.debug("Trying alternative selector, found {} items", newsItems.size()); + } + + for (Element item : newsItems) { + try { + Element link = item.selectFirst("a");/* 选择链接元素 */ + if (link == null) { + link = item.tagName().equals("a") ? item : null;/* 检查是否为链接元素 */ + } + + if (link == null) { + logger.warn("No link found in news item"); + continue; + } + + String articleUrl = link.attr("href");/* 获取链接URL */ + if (!articleUrl.startsWith("http")) {/* 检查是否为绝对URL */ + if (articleUrl.startsWith("/")) { + articleUrl = "https://www.people.com.cn" + articleUrl; + } else { + articleUrl = "https://www.people.com.cn/" + articleUrl; + } + } + + String title = link.text().trim();/* 获取标题文本 */ + + String content = "";/* 初始化内容文本 */ + Element contentEl = item.selectFirst("p, div.ed, div.summary");/* 选择内容元素 */ + if (contentEl != null) { + content = contentEl.text().trim();/* 获取内容文本 */ + } + + if (!title.isEmpty() && title.length() > 5) { + Article article = new Article(title, articleUrl, content);/* 创建文章对象 */ + articles.add(article);/* 添加文章到列表 */ + logger.debug("Parsed article: {}", title);/* 记录解析文章 */ + } else { + logger.warn("Invalid title found, skipping article");/* 记录无效标题 */ + } + } catch (Exception e) { + logger.error("Error parsing individual article: {}", e.getMessage()); + } + } + + logger.info("Successfully parsed {} articles from People's Daily News", articles.size()); + return articles; + } catch (Exception e) { + logger.error("Failed to parse People's Daily News page: {}", e.getMessage(), e); + throw new ParseException("Failed to parse People's Daily News: " + e.getMessage(), e); + } + } +} diff --git 
a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/StrategyFactory.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/StrategyFactory.java new file mode 100644 index 0000000..e28aaac --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/StrategyFactory.java @@ -0,0 +1,36 @@ +package com.example.datacollect.strategy; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.ArrayList; +import java.util.List;
/**
 * Registry of {@link CrawlStrategy} implementations that selects the one able to handle a URL.
 *
 * <p>Strategies are consulted in registration order and the first one whose
 * {@code supports(url)} returns {@code true} is used. Strategies added through
 * {@link #register(CrawlStrategy)} are appended after the built-in ones, so a built-in
 * strategy that also matches the URL takes precedence.
 */
public class StrategyFactory {
    private static final Logger logger = LoggerFactory.getLogger(StrategyFactory.class);

    /** Registered strategies, in lookup order. */
    private final List<CrawlStrategy> strategies = new ArrayList<>();

    /** Creates a factory pre-loaded with the built-in site strategies. */
    public StrategyFactory() {
        strategies.add(new HnuNewsStrategy());
        strategies.add(new YouthStrategy());
        strategies.add(new PeopleStrategy());
        strategies.add(new BlogStrategy());
        strategies.add(new NewsStrategy());
        logger.info("Initialized StrategyFactory with {} strategies", strategies.size());
    }

    /**
     * Finds the first registered strategy that supports the given URL.
     *
     * @param url the page URL to look up; {@code null} never matches
     * @return the matching strategy, or {@code null} when no strategy supports the URL
     */
    public CrawlStrategy getStrategy(String url) {
        // Do not hand a null URL to the strategies: their supports() implementations
        // dereference it.
        if (url != null) {
            for (CrawlStrategy candidate : strategies) {
                if (candidate.supports(url)) {
                    logger.debug("Found strategy {} for URL: {}", candidate.getClass().getSimpleName(), url);
                    return candidate;
                }
            }
        }
        logger.warn("No strategy found for URL: {}", url);
        return null;
    }

    /**
     * Appends a strategy to the end of the lookup order.
     *
     * @param strategy the strategy to add
     * @throws NullPointerException if {@code strategy} is {@code null}
     */
    public void register(CrawlStrategy strategy) {
        // Validate before mutating so a null is never stored in the list, where it would
        // break every later getStrategy() call.
        if (strategy == null) {
            throw new NullPointerException("strategy must not be null");
        }
        strategies.add(strategy);
        logger.info("Registered new strategy: {}", strategy.getClass().getSimpleName());
    }
}
diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/YouthStrategy.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/YouthStrategy.java new file mode 100644 index 0000000..2bdb8d1 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/YouthStrategy.java @@ -0,0 +1,87 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import
org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.ArrayList; +import java.util.List;
/**
 * Crawl strategy for China Youth Net (youth.cn) listing pages.
 *
 * <p>Extracts one {@link Article} (title, absolute URL, optional summary) per news item
 * found in the document.
 */
public class YouthStrategy implements CrawlStrategy {
    private static final Logger logger = LoggerFactory.getLogger(YouthStrategy.class);

    /** Site root used to absolutize links when the document carries no base URI. */
    private static final String SITE_ROOT = "https://www.youth.cn";

    /** A link text must be longer than this many characters to be kept as a title. */
    private static final int MIN_TITLE_LENGTH = 5;

    /**
     * Reports whether this strategy handles the URL.
     *
     * @param url the page URL; {@code null} is never supported
     * @return {@code true} when the URL contains the youth.cn domain
     */
    @Override
    public boolean supports(String url) {
        return url != null && url.contains("youth.cn");
    }

    /**
     * Extracts articles from a youth.cn listing page.
     *
     * @param url the URL the document was loaded from (used for logging only)
     * @param doc the parsed page
     * @return the articles found, possibly empty; items that fail to parse are logged and skipped
     * @throws ParseException if the page as a whole cannot be processed
     */
    @Override
    public List<Article> parse(String url, Document doc) throws ParseException {
        logger.info("Starting to parse Youth News: {}", url);
        List<Article> articles = new ArrayList<>();

        try {
            Elements newsItems = doc.select("div.news-item, div.article-item, li.news-list-item");
            logger.debug("Found {} news items", newsItems.size());

            if (newsItems.isEmpty()) {
                // NOTE(review): this fallback pattern is identical to the one used for
                // People's Daily; confirm that youth.cn article links really contain '/n1/'.
                newsItems = doc.select("a[href*='/n1/']");
                logger.debug("Trying alternative selector, found {} items", newsItems.size());
            }

            for (Element item : newsItems) {
                try {
                    Article article = toArticle(item);
                    if (article != null) {
                        articles.add(article);
                    }
                } catch (Exception e) {
                    // One malformed item must not abort the whole page; keep the stack trace.
                    logger.error("Error parsing individual article: {}", e.getMessage(), e);
                }
            }

            logger.info("Successfully parsed {} articles from Youth News", articles.size());
            return articles;
        } catch (Exception e) {
            logger.error("Failed to parse Youth News page: {}", e.getMessage(), e);
            throw new ParseException("Failed to parse Youth News: " + e.getMessage(), e);
        }
    }

    /** Converts one news item into an Article, or returns null when the item must be skipped. */
    private static Article toArticle(Element item) {
        Element link = item.selectFirst("a");
        if (link == null && item.tagName().equals("a")) {
            link = item;
        }
        if (link == null) {
            logger.warn("No link found in news item");
            return null;
        }

        String articleUrl = resolveUrl(link);
        if (articleUrl.isEmpty()) {
            // An anchor without an href would otherwise yield an article pointing at the site root.
            logger.warn("No link found in news item");
            return null;
        }

        String title = link.text().trim();
        if (title.isEmpty()) {
            return null;
        }
        if (title.length() <= MIN_TITLE_LENGTH) {
            logger.warn("Invalid title found, skipping article");
            return null;
        }

        String content = "";
        Element summary = item.selectFirst("p.summary, p.desc, div.brief");
        if (summary != null) {
            content = summary.text().trim();
        }

        logger.debug("Parsed article: {}", title);
        return new Article(title, articleUrl, content);
    }

    /**
     * Returns the absolute URL of a link, or an empty string when it has no href.
     *
     * <p>Resolution against the document's base URI is preferred because it handles
     * protocol-relative ({@code //host/path}) and page-relative hrefs correctly. The manual
     * join against the site root is only a fallback for documents parsed without a base URI.
     */
    private static String resolveUrl(Element link) {
        String resolved = link.absUrl("href");
        if (!resolved.isEmpty()) {
            return resolved;
        }

        String href = link.attr("href").trim();
        if (href.isEmpty()) {
            return "";
        }
        if (href.startsWith("//")) {
            return "https:" + href;
        }
        if (href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        return href.startsWith("/") ? SITE_ROOT + href : SITE_ROOT + "/" + href;
    }
}
diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/util/RetryUtils.java
b/w11/java-cli-w11/src/main/java/com/example/datacollect/util/RetryUtils.java new file mode 100644 index 0000000..96aee20 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/util/RetryUtils.java @@ -0,0 +1,49 @@ +package com.example.datacollect.util; + +import com.example.datacollect.exception.NetworkException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.concurrent.Callable;
/**
 * Runs a task and retries it after a fixed delay when it fails with a {@link NetworkException}.
 *
 * <p>Any other exception is treated as non-retryable and propagates immediately.
 */
public class RetryUtils {
    private static final Logger logger = LoggerFactory.getLogger(RetryUtils.class);

    private static final int DEFAULT_MAX_RETRIES = 3;
    private static final long DEFAULT_RETRY_DELAY_MS = 1000;

    /**
     * Runs the task with the default policy: up to 3 retries, 1000 ms apart.
     *
     * @param task the work to run
     * @param <T> the task's result type
     * @return the task's result
     * @throws Exception the last network failure, or the first non-retryable failure
     */
    public static <T> T executeWithRetry(Callable<T> task) throws Exception {
        return executeWithRetry(task, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_DELAY_MS);
    }

    /**
     * Runs the task, retrying on {@link NetworkException}.
     *
     * @param task the work to run
     * @param maxRetries number of retries after the first attempt, so the task runs at most
     *     {@code maxRetries + 1} times; must not be negative
     * @param retryDelayMs pause before each retry, in milliseconds; must not be negative
     * @param <T> the task's result type
     * @return the result of the first successful attempt
     * @throws NullPointerException if {@code task} is {@code null}
     * @throws IllegalArgumentException if {@code maxRetries} or {@code retryDelayMs} is negative
     * @throws InterruptedException if the thread is interrupted; the interrupt flag is restored
     * @throws Exception the last network failure once all attempts are used up, or the first
     *     non-retryable failure
     */
    public static <T> T executeWithRetry(Callable<T> task, int maxRetries, long retryDelayMs) throws Exception {
        if (task == null) {
            throw new NullPointerException("task must not be null");
        }
        // With a negative retry count the loop would never run and there would be no
        // exception to rethrow, so reject it up front.
        if (maxRetries < 0) {
            throw new IllegalArgumentException("maxRetries must not be negative: " + maxRetries);
        }
        if (retryDelayMs < 0) {
            throw new IllegalArgumentException("retryDelayMs must not be negative: " + retryDelayMs);
        }

        NetworkException lastFailure = null;

        for (int attempt = 0; attempt <= maxRetries; attempt++) {
            try {
                if (attempt > 0) {
                    logger.info("Retry attempt {}/{} for task", attempt, maxRetries);
                    Thread.sleep(retryDelayMs);
                }
                return task.call();
            } catch (NetworkException e) {
                lastFailure = e;
                logger.warn("Network error on attempt {}: {}", attempt, e.getMessage());
                if (attempt < maxRetries) {
                    logger.info("Will retry in {} ms...", retryDelayMs);
                }
            } catch (InterruptedException e) {
                // Restore the flag so callers further up can still observe the interruption.
                Thread.currentThread().interrupt();
                logger.warn("Interrupted while executing task, giving up");
                throw e;
            } catch (Exception e) {
                logger.error("Non-retryable error: {}", e.getMessage());
                throw e;
            }
        }

        logger.error("All {} retry attempts failed", maxRetries + 1);
        throw lastFailure;
    }
}
diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/view/ConsoleView.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/view/ConsoleView.java new file mode 100644 index 0000000..4665db0 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/view/ConsoleView.java @@ -0,0 +1,46 @@ +package com.example.datacollect.view; + +import
com.example.datacollect.model.Article; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.List; +import java.util.Scanner; + +public class ConsoleView { + private static final Logger logger = LoggerFactory.getLogger(ConsoleView.class); + private static final String ANSI_RESET = "\u001B[0m"; + private static final String ANSI_GREEN = "\u001B[32m"; + private static final String ANSI_RED = "\u001B[31m"; + private static final String ANSI_BLUE = "\u001B[34m"; + + private final Scanner scanner = new Scanner(System.in); + + public String readLine() { + System.out.print("> "); + String input = scanner.nextLine(); + return input;/* 返回用户输入 */ + } + + public void printSuccess(String msg) { + System.out.println(ANSI_GREEN + msg + ANSI_RESET); + } + + public void printError(String msg) { + System.out.println(ANSI_RED + msg + ANSI_RESET); + } + + public void printInfo(String msg) { + System.out.println(ANSI_BLUE + msg + ANSI_RESET); + } + + public void display(List
articles) { + if (articles.isEmpty()) { + printInfo("暂无文章,请先执行 crawl。"); + return; + } + for (int i = 0; i < articles.size(); i++) { + Article a = articles.get(i); + System.out.println((i + 1) + ". " + a.getTitle() + " | " + a.getUrl()); + } + } +} diff --git a/w11/java-cli-w11/src/main/resources/logback.xml b/w11/java-cli-w11/src/main/resources/logback.xml new file mode 100644 index 0000000..aa0a06b --- /dev/null +++ b/w11/java-cli-w11/src/main/resources/logback.xml @@ -0,0 +1,24 @@ + + + + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + logs/crawler.log + + logs/crawler.%d{yyyy-MM-dd}.log + 30 + + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + + diff --git a/w11/java-cli-w11/target/classes/logback.xml b/w11/java-cli-w11/target/classes/logback.xml new file mode 100644 index 0000000..aa0a06b --- /dev/null +++ b/w11/java-cli-w11/target/classes/logback.xml @@ -0,0 +1,24 @@ + + + + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + logs/crawler.log + + logs/crawler.%d{yyyy-MM-dd}.log + 30 + + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + + diff --git a/w11/java-cli-w11/target/maven-archiver/pom.properties b/w11/java-cli-w11/target/maven-archiver/pom.properties new file mode 100644 index 0000000..5c1de34 --- /dev/null +++ b/w11/java-cli-w11/target/maven-archiver/pom.properties @@ -0,0 +1,3 @@ +artifactId=datacollect-cli +groupId=com.example +version=0.1.0 diff --git a/w11/java-cli-w11/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst b/w11/java-cli-w11/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst new file mode 100644 index 0000000..1ead6c5 --- /dev/null +++ b/w11/java-cli-w11/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst @@ -0,0 +1,22 @@ +com\example\datacollect\command\ListCommand.class 
+com\example\datacollect\strategy\PeopleStrategy.class +com\example\datacollect\command\CrawlCommand.class +com\example\datacollect\strategy\BlogStrategy.class +com\example\datacollect\repository\ArticleRepository.class +com\example\datacollect\Main.class +com\example\datacollect\view\ConsoleView.class +com\example\datacollect\command\ExitCommand.class +com\example\datacollect\command\HelpCommand.class +com\example\datacollect\util\RetryUtils.class +com\example\datacollect\strategy\NewsStrategy.class +com\example\datacollect\command\Command.class +com\example\datacollect\controller\CrawlerController.class +com\example\datacollect\exception\CrawlerException.class +com\example\datacollect\exception\NetworkException.class +com\example\datacollect\command\AnalyzeCommand.class +com\example\datacollect\strategy\StrategyFactory.class +com\example\datacollect\strategy\HnuNewsStrategy.class +com\example\datacollect\strategy\YouthStrategy.class +com\example\datacollect\exception\ParseException.class +com\example\datacollect\strategy\CrawlStrategy.class +com\example\datacollect\model\Article.class diff --git a/w11/java-cli-w11/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst b/w11/java-cli-w11/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst new file mode 100644 index 0000000..937e5d7 --- /dev/null +++ b/w11/java-cli-w11/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst @@ -0,0 +1,22 @@ +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\strategy\NewsStrategy.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\controller\CrawlerController.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\repository\ArticleRepository.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\strategy\HnuNewsStrategy.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\command\ExitCommand.java 
+C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\command\Command.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\Main.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\command\CrawlCommand.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\exception\NetworkException.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\strategy\StrategyFactory.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\strategy\BlogStrategy.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\util\RetryUtils.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\command\HelpCommand.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\exception\CrawlerException.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\exception\ParseException.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\model\Article.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\view\ConsoleView.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\command\AnalyzeCommand.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\strategy\YouthStrategy.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\command\ListCommand.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\strategy\CrawlStrategy.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\strategy\PeopleStrategy.java