22 changed files with 1081 additions and 0 deletions
@ -0,0 +1,34 @@ |
|||||
|
package com.example.datacollect; |
||||
|
|
||||
|
import com.example.datacollect.controller.CrawlerController; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.strategy.StrategyFactory; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
public class Main { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(Main.class); |
||||
|
|
||||
|
public static void main(String[] args) { |
||||
|
logger.info("Starting CLI Crawler (w10_3)"); |
||||
|
ConsoleView view = new ConsoleView(); |
||||
|
ArticleRepository repository = new ArticleRepository(); |
||||
|
StrategyFactory strategyFactory = new StrategyFactory(); |
||||
|
CrawlerController controller = new CrawlerController(view, repository, strategyFactory); |
||||
|
|
||||
|
String welcomeMsg = "Welcome to CLI Crawler (w10_3)! Type help for commands."; |
||||
|
logger.info(welcomeMsg); |
||||
|
view.printSuccess(welcomeMsg); |
||||
|
|
||||
|
try { |
||||
|
while (true) { |
||||
|
controller.handle(view.readLine()); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
logger.error("Unexpected error in main loop", e); |
||||
|
view.printError("System error: " + e.getMessage()); |
||||
|
System.exit(1); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,101 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.strategy.CrawlStrategy; |
||||
|
import com.example.datacollect.strategy.StrategyFactory; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.List; |
||||
|
import java.util.stream.Collectors; |
||||
|
|
||||
|
public class AnalyzeCommand implements Command { |
||||
|
|
||||
|
// 1. 添加 Logger 成员
|
||||
|
private static final Logger logger = LoggerFactory.getLogger(AnalyzeCommand.class); |
||||
|
|
||||
|
private final ConsoleView view; |
||||
|
private final StrategyFactory strategyFactory; |
||||
|
|
||||
|
public AnalyzeCommand(ConsoleView view, StrategyFactory strategyFactory) { |
||||
|
this.view = view; |
||||
|
this.strategyFactory = strategyFactory; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "analyze"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
if (args.length < 2) { |
||||
|
logger.error("指令参数错误,正确用法: analyze <url>"); |
||||
|
view.printError("Usage: analyze <url>"); // 保留控制台提示,方便用户直接看到
|
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
String url = args[1]; |
||||
|
CrawlStrategy strategy = strategyFactory.getStrategy(url); |
||||
|
if (strategy == null) { |
||||
|
logger.error("未找到适用于 URL [{}] 的抓取策略", url); |
||||
|
view.printError("No strategy found for: " + url); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
try { |
||||
|
logger.info("开始分析目标网站: {}", url); |
||||
|
Document doc = Jsoup.connect(url).get(); |
||||
|
|
||||
|
// 调用策略解析,但不存入 Repository
|
||||
|
List<Article> articles = strategy.parse(url, doc); |
||||
|
|
||||
|
// 统计信息
|
||||
|
int total = articles.size(); |
||||
|
double avgTitleLen = articles.stream() |
||||
|
.mapToInt(a -> a.getTitle().length()) |
||||
|
.average() |
||||
|
.orElse(0.0); |
||||
|
|
||||
|
// Top 5 按标题长度排序
|
||||
|
List<Article> top5 = articles.stream() |
||||
|
.sorted((a, b) -> Integer.compare(b.getTitle().length(), a.getTitle().length())) |
||||
|
.limit(5) |
||||
|
.collect(Collectors.toList()); |
||||
|
|
||||
|
// 输出结果到日志
|
||||
|
logger.info("=== 分析结果 ==="); |
||||
|
logger.info("提取文章总数: {}", total); |
||||
|
logger.info("平均标题长度: {:.2f} 字符", avgTitleLen); |
||||
|
logger.info("Top 5 文章 (按标题长度排序):"); |
||||
|
|
||||
|
int rank = 1; |
||||
|
for (Article a : top5) { |
||||
|
logger.info("{}. {} ({} 字符)", rank, a.getTitle(), a.getTitle().length()); |
||||
|
rank++; |
||||
|
} |
||||
|
logger.info("=================="); |
||||
|
|
||||
|
// 保留原有的控制台输出,确保用户交互体验不受影响
|
||||
|
view.printInfo("=== Analysis Result ==="); |
||||
|
view.printInfo("Total Articles: " + total); |
||||
|
view.printInfo("Avg Title Length: " + String.format("%.2f", avgTitleLen)); |
||||
|
view.printInfo("Top 5 Articles (by Title Length):"); |
||||
|
rank = 1; |
||||
|
for (Article a : top5) { |
||||
|
view.printInfo(rank + ". " + a.getTitle() + " (" + a.getTitle().length() + " chars)"); |
||||
|
rank++; |
||||
|
} |
||||
|
view.printInfo("========================"); |
||||
|
|
||||
|
} catch (Exception e) { |
||||
|
logger.error("分析 URL [{}] 时发生异常: ", url, e); // 传入异常对象 e,以便记录完整堆栈
|
||||
|
view.printError("Failed to analyze: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
@ -0,0 +1,8 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
|
||||
|
public interface Command { |
||||
|
String getName(); |
||||
|
void execute(String[] args, ArticleRepository repository); |
||||
|
} |
||||
@ -0,0 +1,115 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.exception.NetworkException; |
||||
|
import com.example.datacollect.exception.ParseException; |
||||
|
import com.example.datacollect.model.Article; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.strategy.CrawlStrategy; |
||||
|
import com.example.datacollect.strategy.StrategyFactory; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.List; |
||||
|
import java.util.concurrent.TimeUnit; |
||||
|
|
||||
|
public class CrawlCommand implements Command { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class); |
||||
|
private static final int MAX_RETRY = 3; // 最大重试次数
|
||||
|
private static final long RETRY_INTERVAL = 1000; // 重试间隔(毫秒)
|
||||
|
|
||||
|
private final ConsoleView view; |
||||
|
private final StrategyFactory strategyFactory; |
||||
|
|
||||
|
public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) { |
||||
|
this.view = view; |
||||
|
this.strategyFactory = strategyFactory; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "crawl"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
if (args.length < 2) { |
||||
|
String errorMsg = "Crawl command usage: crawl <url>"; |
||||
|
logger.error(errorMsg); |
||||
|
view.printError(errorMsg); |
||||
|
return; |
||||
|
} |
||||
|
String url = args[1]; |
||||
|
logger.info("Start crawling url: {}", url); |
||||
|
|
||||
|
CrawlStrategy strategy = strategyFactory.getStrategy(url); |
||||
|
if (strategy == null) { |
||||
|
String errorMsg = "No crawl strategy found for url: " + url; |
||||
|
logger.error(errorMsg); |
||||
|
view.printError(errorMsg); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
// 重试逻辑
|
||||
|
int retryCount = 0; |
||||
|
while (retryCount < MAX_RETRY) { |
||||
|
try { |
||||
|
Document doc = fetchDocumentWithRetry(url, retryCount); |
||||
|
List<Article> articles = strategy.parse(url, doc); |
||||
|
|
||||
|
// 批量添加(复用Repository的addAll方法)
|
||||
|
repository.addAll(articles); |
||||
|
|
||||
|
String successMsg = "Crawled " + articles.size() + " articles from url: " + url; |
||||
|
logger.info(successMsg); |
||||
|
view.printSuccess(successMsg); |
||||
|
return; // 成功则退出重试循环
|
||||
|
} catch (NetworkException e) { |
||||
|
retryCount++; |
||||
|
String retryMsg = String.format("Network error (retry %d/%d): %s", retryCount, MAX_RETRY, e.getMessage()); |
||||
|
logger.warn(retryMsg); |
||||
|
view.printError(retryMsg); |
||||
|
|
||||
|
if (retryCount >= MAX_RETRY) { |
||||
|
String failMsg = "Failed to crawl url after " + MAX_RETRY + " retries: " + url; |
||||
|
logger.error(failMsg, e); |
||||
|
view.printError(failMsg); |
||||
|
} |
||||
|
|
||||
|
// 重试间隔
|
||||
|
try { |
||||
|
TimeUnit.MILLISECONDS.sleep(RETRY_INTERVAL); |
||||
|
} catch (InterruptedException ie) { |
||||
|
Thread.currentThread().interrupt(); |
||||
|
logger.error("Retry sleep interrupted", ie); |
||||
|
break; |
||||
|
} |
||||
|
} catch (ParseException e) { |
||||
|
String errorMsg = "Parse failed for url: " + url; |
||||
|
logger.error(errorMsg, e); |
||||
|
view.printError(errorMsg); |
||||
|
return; // 解析异常不重试
|
||||
|
} catch (Exception e) { |
||||
|
String errorMsg = "Unexpected error when crawling url: " + url; |
||||
|
logger.error(errorMsg, e); |
||||
|
view.printError(errorMsg); |
||||
|
return; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 抽取文档获取逻辑,抛出网络异常
|
||||
|
private Document fetchDocumentWithRetry(String url, int retryCount) throws NetworkException { |
||||
|
try { |
||||
|
logger.debug("Fetching document (retry {}) for url: {}", retryCount, url); |
||||
|
return Jsoup.connect(url) |
||||
|
.timeout(5000) // 超时时间5秒
|
||||
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") |
||||
|
.get(); |
||||
|
} catch (Exception e) { |
||||
|
throw new NetworkException("Failed to fetch document (retry " + retryCount + ") for url: " + url, e); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,34 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
public class ExitCommand implements Command { |
||||
|
|
||||
|
// 1. 添加 Logger 成员
|
||||
|
private static final Logger logger = LoggerFactory.getLogger(ExitCommand.class); |
||||
|
|
||||
|
private final ConsoleView view; |
||||
|
|
||||
|
public ExitCommand(ConsoleView view) { |
||||
|
this.view = view; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "exit"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
// 记录退出日志
|
||||
|
logger.info("用户请求退出程序。"); |
||||
|
view.printSuccess("Bye!"); |
||||
|
|
||||
|
// 在调用 exit 前可以记录一些系统状态,或者直接记录
|
||||
|
logger.info("程序已终止。"); |
||||
|
System.exit(0); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,32 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
public class HelpCommand implements Command { |
||||
|
|
||||
|
// 1. 添加 Logger 成员
|
||||
|
private static final Logger logger = LoggerFactory.getLogger(HelpCommand.class); |
||||
|
|
||||
|
private final ConsoleView view; |
||||
|
|
||||
|
public HelpCommand(ConsoleView view) { |
||||
|
this.view = view; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "help"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
logger.debug("用户请求查看帮助信息。"); |
||||
|
|
||||
|
// 保留原有的帮助信息输出
|
||||
|
view.printInfo("Commands: crawl <url>, list, analyze, help, exit"); |
||||
|
// 建议:将硬编码的命令列表改为动态获取(如果 Command 接口有 getType 或类似方法),目前保持原样
|
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,33 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
public class ListCommand implements Command { |
||||
|
|
||||
|
// 1. 添加 Logger 成员
|
||||
|
private static final Logger logger = LoggerFactory.getLogger(ListCommand.class); |
||||
|
|
||||
|
private final ConsoleView view; |
||||
|
|
||||
|
public ListCommand(ConsoleView view) { |
||||
|
this.view = view; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "list"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
logger.info("正在执行 list 命令,准备展示已抓取的文章列表。"); |
||||
|
|
||||
|
// 保留原有的视图输出
|
||||
|
view.display(repository.getAll()); |
||||
|
|
||||
|
logger.debug("当前仓库中共有 {} 篇文章已加载至视图。", repository.getAll().size()); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,62 @@ |
|||||
|
package com.example.datacollect.controller; |
||||
|
|
||||
|
import com.example.datacollect.command.Command; |
||||
|
import com.example.datacollect.command.AnalyzeCommand; |
||||
|
import com.example.datacollect.command.CrawlCommand; |
||||
|
import com.example.datacollect.command.ExitCommand; |
||||
|
import com.example.datacollect.command.HelpCommand; |
||||
|
import com.example.datacollect.command.ListCommand; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.strategy.StrategyFactory; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.HashMap; |
||||
|
import java.util.Map; |
||||
|
|
||||
|
public class CrawlerController { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(CrawlerController.class); |
||||
|
private final Map<String, Command> commands = new HashMap<>(); |
||||
|
private final ConsoleView view; |
||||
|
private final ArticleRepository repository; |
||||
|
|
||||
|
public CrawlerController(ConsoleView view, ArticleRepository repository, StrategyFactory strategyFactory) { |
||||
|
this.view = view; |
||||
|
this.repository = repository; |
||||
|
logger.info("Registering crawler commands"); |
||||
|
register(new HelpCommand(view)); |
||||
|
register(new ListCommand(view)); |
||||
|
register(new CrawlCommand(view, strategyFactory)); |
||||
|
register(new AnalyzeCommand(view, strategyFactory));//新增
|
||||
|
register(new ExitCommand(view)); |
||||
|
logger.debug("Registered commands: {}", commands.keySet()); |
||||
|
} |
||||
|
|
||||
|
private void register(Command command) { |
||||
|
commands.put(command.getName(), command); |
||||
|
logger.debug("Registered command: {}", command.getName()); |
||||
|
} |
||||
|
|
||||
|
public void handle(String input) { |
||||
|
String text = input == null ? "" : input.trim(); |
||||
|
logger.debug("Handling input: {}", text); |
||||
|
if (text.isEmpty()) { |
||||
|
logger.debug("Empty input, skip handling"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
String[] args = text.split("\\s+"); |
||||
|
String cmdName = args[0].toLowerCase(); |
||||
|
Command command = commands.get(cmdName); |
||||
|
if (command == null) { |
||||
|
String errorMsg = "Unknown command: " + cmdName; |
||||
|
logger.error(errorMsg); |
||||
|
view.printError(errorMsg); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
logger.info("Executing command: {}", cmdName); |
||||
|
command.execute(args, repository); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,20 @@ |
|||||
|
package com.example.datacollect.exception; |
||||
|
|
||||
|
/**
 * Base checked exception for failures raised by the crawler.
 */
public class CrawlerException extends Exception {

    /** Creates an exception with neither message nor cause. */
    public CrawlerException() {
        super();
    }

    /**
     * @param message description of the failure
     */
    public CrawlerException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   the underlying exception
     */
    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param cause the underlying exception
     */
    public CrawlerException(Throwable cause) {
        super(cause);
    }
}
||||
|
|
||||
@ -0,0 +1,19 @@ |
|||||
|
package com.example.datacollect.exception; |
||||
|
|
||||
|
public class NetworkException extends CrawlerException{ |
||||
|
public NetworkException() { |
||||
|
super(); |
||||
|
} |
||||
|
|
||||
|
public NetworkException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
|
||||
|
public NetworkException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
|
||||
|
public NetworkException(Throwable cause) { |
||||
|
super(cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,19 @@ |
|||||
|
package com.example.datacollect.exception; |
||||
|
|
||||
|
public class ParseException extends CrawlerException{ |
||||
|
public ParseException() { |
||||
|
super(); |
||||
|
} |
||||
|
|
||||
|
public ParseException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
|
||||
|
public ParseException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
|
||||
|
public ParseException(Throwable cause) { |
||||
|
super(cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,45 @@ |
|||||
|
package com.example.datacollect.model; |
||||
|
|
||||
|
public class Article { |
||||
|
private String title; |
||||
|
private String url; |
||||
|
private String content; |
||||
|
|
||||
|
public Article(String title, String url, String content) { |
||||
|
this.title = title; |
||||
|
this.url = url; |
||||
|
this.content = content; |
||||
|
} |
||||
|
|
||||
|
public String getTitle() { |
||||
|
return title; |
||||
|
} |
||||
|
|
||||
|
public void setTitle(String title) { |
||||
|
this.title = title; |
||||
|
} |
||||
|
|
||||
|
public String getUrl() { |
||||
|
return url; |
||||
|
} |
||||
|
|
||||
|
public void setUrl(String url) { |
||||
|
this.url = url; |
||||
|
} |
||||
|
|
||||
|
public String getContent() { |
||||
|
return content; |
||||
|
} |
||||
|
|
||||
|
public void setContent(String content) { |
||||
|
this.content = content; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String toString() { |
||||
|
return "Article{" |
||||
|
+ "title='" + title + '\'' |
||||
|
+ ", url='" + url + '\'' |
||||
|
+ '}'; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,100 @@ |
|||||
|
package com.example.datacollect.repository; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.Collections; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class ArticleRepository { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(ArticleRepository.class); |
||||
|
private final List<Article> articles = new ArrayList<>(); |
||||
|
|
||||
|
// 新增:根据索引获取文章(防御索引越界)
|
||||
|
public Article get(int index) { |
||||
|
logger.debug("Getting article at index: {}", index); |
||||
|
if (index < 0 || index >= articles.size()) { |
||||
|
String errorMsg = "Index out of bounds: index=" + index + ", size=" + articles.size(); |
||||
|
logger.error(errorMsg); |
||||
|
throw new IndexOutOfBoundsException(errorMsg); |
||||
|
} |
||||
|
return articles.get(index); |
||||
|
} |
||||
|
|
||||
|
public void add(Article article) { |
||||
|
logger.debug("Adding article: {}", article); |
||||
|
if (article == null) { |
||||
|
String errorMsg = "Article cannot be null"; |
||||
|
logger.error(errorMsg); |
||||
|
throw new IllegalArgumentException(errorMsg); |
||||
|
} |
||||
|
// 新增:防御重复添加(可选)
|
||||
|
if (articles.contains(article)) { |
||||
|
logger.warn("Article already exists: {}", article); |
||||
|
return; |
||||
|
} |
||||
|
articles.add(article); |
||||
|
logger.info("Added article: {}", article.getTitle()); |
||||
|
} |
||||
|
|
||||
|
// ★ 新增:批量添加方法以及注意防御 null
|
||||
|
public void addAll(List<Article> articles) { |
||||
|
logger.debug("Adding batch articles, size: {}", articles == null ? "null" : articles.size()); |
||||
|
if (articles == null) { |
||||
|
String errorMsg = "Articles list cannot be null"; |
||||
|
logger.error(errorMsg); |
||||
|
throw new IllegalArgumentException(errorMsg); |
||||
|
} |
||||
|
if (articles.isEmpty()) { |
||||
|
logger.warn("Articles list is empty, skip addAll"); |
||||
|
return; |
||||
|
} |
||||
|
int addedCount = 0; |
||||
|
for (Article article : articles) { |
||||
|
if (article == null) { |
||||
|
logger.error("Skipping null article in batch add"); |
||||
|
continue; // 或抛出异常,根据业务选择
|
||||
|
} |
||||
|
if (!this.articles.contains(article)) { |
||||
|
this.articles.add(article); |
||||
|
addedCount++; |
||||
|
} |
||||
|
} |
||||
|
logger.info("Batch added {} articles (skipped duplicates/null)", addedCount); |
||||
|
} |
||||
|
|
||||
|
public List<Article> getAll() { |
||||
|
List<Article> unmodifiableList = Collections.unmodifiableList(articles); |
||||
|
logger.debug("Getting all articles, size: {}", unmodifiableList.size()); |
||||
|
return unmodifiableList; |
||||
|
} |
||||
|
|
||||
|
public int size() { |
||||
|
int size = articles.size(); |
||||
|
logger.debug("Repository size: {}", size); |
||||
|
return size; |
||||
|
} |
||||
|
|
||||
|
// 新增:清空前校验 + 日志
|
||||
|
public void clear() { |
||||
|
logger.warn("Clearing all articles (current size: {})", articles.size()); |
||||
|
if (articles.isEmpty()) { |
||||
|
logger.info("Repository is already empty, skip clear"); |
||||
|
return; |
||||
|
} |
||||
|
articles.clear(); |
||||
|
logger.info("Cleared all articles successfully"); |
||||
|
} |
||||
|
|
||||
|
// 新增:检查是否包含指定URL的文章(防御检查)
|
||||
|
public boolean containsUrl(String url) { |
||||
|
logger.debug("Checking if repository contains url: {}", url); |
||||
|
if (url == null || url.isBlank()) { |
||||
|
logger.error("URL cannot be null/blank"); |
||||
|
throw new IllegalArgumentException("URL cannot be null or blank"); |
||||
|
} |
||||
|
return articles.stream().anyMatch(article -> url.equals(article.getUrl())); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,47 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.exception.ParseException; |
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
import java.util.regex.Pattern; |
||||
|
|
||||
|
public class BlogStrategy implements CrawlStrategy { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(BlogStrategy.class); |
||||
|
private static final Pattern URL_PATTERN = Pattern.compile(".*blog\\.example\\.com.*"); |
||||
|
|
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
boolean isSupported = URL_PATTERN.matcher(url).matches(); |
||||
|
logger.debug("URL {} support status: {}", url, isSupported); |
||||
|
return isSupported; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(String url, Document doc) throws ParseException { |
||||
|
try { |
||||
|
logger.info("Start parsing blog articles from url: {}", url); |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
Elements titles = doc.select(".post-title"); |
||||
|
for (Element e : titles) { |
||||
|
articles.add(new Article(e.text(), url, "")); |
||||
|
} |
||||
|
logger.debug("Parsed {} blog articles from url: {}", articles.size(), url); |
||||
|
return articles; |
||||
|
} catch (Exception e) { |
||||
|
logger.error("Failed to parse blog articles from url: {}", url, e); |
||||
|
throw new ParseException("Blog article parse failed for url: " + url, e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public int getPriority() { |
||||
|
return 10; // 优先级高于默认策略
|
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,15 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
import com.example.datacollect.exception.ParseException; |
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public interface CrawlStrategy { |
||||
|
List<Article> parse(String url, Document doc) throws ParseException; |
||||
|
boolean supports(String url); |
||||
|
//增加优先级
|
||||
|
default int getPriority(){ |
||||
|
return 0; |
||||
|
} |
||||
|
|
||||
|
} |
||||
@ -0,0 +1,25 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.exception.ParseException; |
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class DefaultStrategy implements CrawlStrategy { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(DefaultStrategy.class); |
||||
|
|
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
logger.debug("默认策略支持所有 URL:{}", url); |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(String url, Document doc) throws ParseException { |
||||
|
logger.info("使用默认策略解析:{}", url); |
||||
|
// 你的解析逻辑
|
||||
|
return List.of(); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,95 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
import java.util.regex.Pattern; |
||||
|
|
||||
|
public class HnuNewsStrategy implements CrawlStrategy { |
||||
|
|
||||
|
// 1. 添加 Logger 成员
|
||||
|
private static final Logger logger = LoggerFactory.getLogger(HnuNewsStrategy.class); |
||||
|
|
||||
|
// 2. 修正 URL 匹配逻辑(原逻辑仅匹配域名,建议增加路径灵活性)
|
||||
|
private static final Pattern URL_PATTERN = Pattern.compile(".*news\\.hnu\\.edu\\.cn.*"); |
||||
|
|
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
return URL_PATTERN.matcher(url).matches(); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(String url, Document doc) { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
|
||||
|
// 原有逻辑:尝试选择列表项
|
||||
|
// 注意:根据2026年5月的网页结构,实际可能需要调整为 div 或其他容器
|
||||
|
Elements listItems = doc.select("ul.list11 li"); |
||||
|
|
||||
|
if (listItems.isEmpty()) { |
||||
|
logger.warn("在 URL [{}] 中未找到符合选择器 'ul.list11 li' 的新闻列表项。可能网页结构已更新。", url); |
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
for (Element li : listItems) { |
||||
|
Element link = li.selectFirst("a"); |
||||
|
if (link == null) { |
||||
|
logger.debug("跳过一个无链接的列表项: {}", li.toString()); |
||||
|
continue; |
||||
|
} |
||||
|
|
||||
|
String articleUrl = link.attr("href"); |
||||
|
// 3. 修正 URL 拼接逻辑(原逻辑 replace("..") 可能不够健壮)
|
||||
|
if (!articleUrl.startsWith("http")) { |
||||
|
// 使用 URI 或简单的字符串处理来规范化路径
|
||||
|
articleUrl = "https://news.hnu.edu.cn/" + articleUrl; |
||||
|
// 这里简单处理,实际可能需要更复杂的路径规范化
|
||||
|
while (articleUrl.contains("/../")) { |
||||
|
int index = articleUrl.indexOf("/../"); |
||||
|
int prevSlash = articleUrl.lastIndexOf('/', index - 1); |
||||
|
if (prevSlash != -1) { |
||||
|
articleUrl = articleUrl.substring(0, prevSlash) + articleUrl.substring(index + 3); |
||||
|
} else { |
||||
|
break; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
String title = ""; |
||||
|
Element titleEl = link.selectFirst("h4.l2.h4s2"); |
||||
|
if (titleEl != null) { |
||||
|
title = titleEl.text().trim(); |
||||
|
} else { |
||||
|
logger.debug("在链接 [{}] 中未找到标题元素 h4.l2.h4s2", articleUrl); |
||||
|
} |
||||
|
|
||||
|
String content = ""; |
||||
|
Element contentEl = link.selectFirst("p.l3.ps3"); |
||||
|
if (contentEl != null) { |
||||
|
content = contentEl.text().trim(); |
||||
|
} |
||||
|
// 不再输出空内容警告,因 content 可能为空
|
||||
|
|
||||
|
if (!title.isEmpty()) { |
||||
|
articles.add(new Article(title, articleUrl, content)); |
||||
|
logger.debug("解析到新闻条目: [标题] {} - [URL] {}", title, articleUrl); |
||||
|
} else { |
||||
|
logger.trace("跳过空标题的链接: {}", articleUrl); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
logger.info("成功解析 URL [{}],共提取 {} 篇新闻。", url, articles.size()); |
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public int getPriority() { |
||||
|
return 15; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,57 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
import java.util.regex.Pattern; |
||||
|
|
||||
|
public class NewsStrategy implements CrawlStrategy { |
||||
|
|
||||
|
// 1. 添加 Logger 成员
|
||||
|
private static final Logger logger = LoggerFactory.getLogger(NewsStrategy.class); |
||||
|
|
||||
|
// 使用正则匹配
|
||||
|
private static final Pattern URL_PATTERN = Pattern.compile(".*news\\.example\\.com.*"); |
||||
|
|
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
return URL_PATTERN.matcher(url).matches(); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(String url, Document doc) { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
|
||||
|
// 2. 添加解析过程日志
|
||||
|
logger.debug("开始解析 URL: [{}]", url); |
||||
|
|
||||
|
Elements items = doc.select(".article-headline"); |
||||
|
|
||||
|
if (items.isEmpty()) { |
||||
|
logger.warn("在 URL [{}] 中未找到符合选择器 '.article-headline' 的文章标题元素。", url); |
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
for (Element e : items) { |
||||
|
String title = e.text().trim(); |
||||
|
if (!title.isEmpty()) { |
||||
|
articles.add(new Article(title, url, "")); |
||||
|
logger.trace("提取到文章标题: {}", title); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
logger.info("成功解析 URL [{}],共提取 {} 篇文章。", url, articles.size()); |
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public int getPriority() { |
||||
|
return 10; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,29 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class StrategyFactory { |
||||
|
private final List<CrawlStrategy> strategies = new ArrayList<>(); |
||||
|
|
||||
|
public StrategyFactory() { |
||||
|
strategies.add(new HnuNewsStrategy()); |
||||
|
strategies.add(new BlogStrategy()); |
||||
|
strategies.add(new NewsStrategy()); |
||||
|
//注册默认策略
|
||||
|
strategies.add(new DefaultStrategy()); |
||||
|
} |
||||
|
|
||||
|
public CrawlStrategy getStrategy(String url) { |
||||
|
//按优先级降序排序
|
||||
|
return strategies.stream() |
||||
|
.sorted((s1, s2) -> Integer.compare(s2.getPriority(), s1.getPriority())) |
||||
|
.filter(s -> s.supports(url)) |
||||
|
.findFirst() |
||||
|
.orElse(null); // 如果默认策略未匹配到,返回 null 或默认策略本身
|
||||
|
} |
||||
|
|
||||
|
public void register(CrawlStrategy strategy) { |
||||
|
strategies.add(strategy); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,56 @@ |
|||||
|
package com.example.datacollect.view; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.List; |
||||
|
import java.util.Scanner; |
||||
|
|
||||
|
public class ConsoleView { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(ConsoleView.class); |
||||
|
private static final String ANSI_RESET = "\u001B[0m"; |
||||
|
private static final String ANSI_GREEN = "\u001B[32m"; |
||||
|
private static final String ANSI_RED = "\u001B[31m"; |
||||
|
private static final String ANSI_BLUE = "\u001B[34m"; |
||||
|
|
||||
|
private final Scanner scanner = new Scanner(System.in); |
||||
|
|
||||
|
public String readLine() { |
||||
|
System.out.print("> "); |
||||
|
String input = scanner.nextLine(); |
||||
|
logger.debug("User input: {}", input); |
||||
|
return input; |
||||
|
} |
||||
|
|
||||
|
public void printSuccess(String msg) { |
||||
|
logger.info(msg); |
||||
|
System.out.println(ANSI_GREEN + msg + ANSI_RESET); |
||||
|
} |
||||
|
|
||||
|
public void printError(String msg) { |
||||
|
logger.error(msg); |
||||
|
System.out.println(ANSI_RED + msg + ANSI_RESET); |
||||
|
} |
||||
|
|
||||
|
public void printInfo(String msg) { |
||||
|
logger.info(msg); |
||||
|
System.out.println(ANSI_BLUE + msg + ANSI_RESET); |
||||
|
} |
||||
|
|
||||
|
public void display(List<Article> articles) { |
||||
|
logger.debug("Displaying {} articles", articles.size()); |
||||
|
if (articles.isEmpty()) { |
||||
|
String emptyMsg = "暂无文章,请先执行 crawl。"; |
||||
|
logger.info(emptyMsg); |
||||
|
printInfo(emptyMsg); |
||||
|
return; |
||||
|
} |
||||
|
for (int i = 0; i < articles.size(); i++) { |
||||
|
Article a = articles.get(i); |
||||
|
String articleStr = (i + 1) + ". " + a.getTitle() + " | " + a.getUrl(); |
||||
|
System.out.println(articleStr); |
||||
|
logger.debug(articleStr); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,39 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<configuration scan="true" scanPeriod="30 seconds"> |
||||
|
<!-- 控制台输出 --> |
||||
|
<appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender"> |
||||
|
<encoder> |
||||
|
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{50} - %msg%n</pattern> |
||||
|
<charset>UTF-8</charset> |
||||
|
</encoder> |
||||
|
</appender> |
||||
|
|
||||
|
<!-- 文件输出(按天滚动) --> |
||||
|
<appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender"> |
||||
|
<file>logs/crawler.log</file> |
||||
|
<rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy"> |
||||
|
<fileNamePattern>logs/crawler.%d{yyyy-MM-dd}.log</fileNamePattern> |
||||
|
<maxHistory>7</maxHistory> <!-- 保留7天日志 --> |
||||
|
<totalSizeCap>100MB</totalSizeCap> <!-- 总日志大小限制 --> |
||||
|
</rollingPolicy> |
||||
|
<encoder> |
||||
|
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{50} - %msg%n</pattern> |
||||
|
<charset>UTF-8</charset> |
||||
|
</encoder> |
||||
|
</appender> |
||||
|
|
||||
|
<!-- 根日志级别 --> |
||||
|
<root level="INFO"> |
||||
|
<appender-ref ref="CONSOLE"/> |
||||
|
<appender-ref ref="FILE"/> |
||||
|
</root> |
||||
|
|
||||
|
<!-- 自定义包日志级别 --> |
||||
|
<logger name="com.example.datacollect" level="DEBUG" additivity="false"> |
||||
|
<appender-ref ref="CONSOLE"/> |
||||
|
<appender-ref ref="FILE"/> |
||||
|
</logger> |
||||
|
|
||||
|
<!-- 第三方库日志级别(降低jsoup日志) --> |
||||
|
<logger name="org.jsoup" level="WARN"/> |
||||
|
</configuration> |
||||
@ -0,0 +1,96 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" |
||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 |
||||
|
https://maven.apache.org/xsd/maven-4.0.0.xsd"> |
||||
|
<modelVersion>4.0.0</modelVersion> |
||||
|
<groupId>com.example</groupId> |
||||
|
<artifactId>datacollect-cli</artifactId> |
||||
|
<version>0.1.0</version> |
||||
|
<packaging>jar</packaging> |
||||
|
|
||||
|
<properties> |
||||
|
<maven.compiler.source>11</maven.compiler.source> |
||||
|
<maven.compiler.target>11</maven.compiler.target> |
||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> |
||||
|
<!-- 统一日志版本,避免冲突 --> |
||||
|
<slf4j.version>2.0.9</slf4j.version> |
||||
|
<logback.version>1.4.14</logback.version> |
||||
|
</properties> |
||||
|
|
||||
|
<repositories> |
||||
|
<!-- 添加阿里云镜像,加速下载 --> |
||||
|
<repository> |
||||
|
<id>aliyun</id> |
||||
|
<name>Aliyun Maven</name> |
||||
|
<url>https://maven.aliyun.com/repository/public</url> |
||||
|
<releases> |
||||
|
<enabled>true</enabled> |
||||
|
</releases> |
||||
|
<snapshots> |
||||
|
<enabled>true</enabled> |
||||
|
</snapshots> |
||||
|
</repository> |
||||
|
</repositories> |
||||
|
|
||||
|
<dependencies> |
||||
|
<dependency> |
||||
|
<groupId>org.jsoup</groupId> |
||||
|
<artifactId>jsoup</artifactId> |
||||
|
<version>1.17.2</version> |
||||
|
</dependency> |
||||
|
|
||||
|
<!-- SLF4J 核心 API --> |
||||
|
<dependency> |
||||
|
<groupId>org.slf4j</groupId> |
||||
|
<artifactId>slf4j-api</artifactId> |
||||
|
<version>${slf4j.version}</version> |
||||
|
</dependency> |
||||
|
|
||||
|
<!-- Logback 实现 --> |
||||
|
<dependency> |
||||
|
<groupId>ch.qos.logback</groupId> |
||||
|
<artifactId>logback-classic</artifactId> |
||||
|
<version>${logback.version}</version> |
||||
|
</dependency> |
||||
|
</dependencies> |
||||
|
|
||||
|
<build> |
||||
|
<plugins> |
||||
|
<plugin> |
||||
|
<groupId>org.apache.maven.plugins</groupId> |
||||
|
<artifactId>maven-compiler-plugin</artifactId> |
||||
|
<version>3.8.1</version> |
||||
|
<configuration> |
||||
|
<source>${maven.compiler.source}</source> |
||||
|
<target>${maven.compiler.target}</target> |
||||
|
</configuration> |
||||
|
</plugin> |
||||
|
|
||||
|
<plugin> |
||||
|
<groupId>org.apache.maven.plugins</groupId> |
||||
|
<artifactId>maven-assembly-plugin</artifactId> |
||||
|
<version>3.3.0</version> |
||||
|
<configuration> |
||||
|
<archive> |
||||
|
<manifest> |
||||
|
<mainClass>com.example.datacollect.Main</mainClass> |
||||
|
</manifest> |
||||
|
</archive> |
||||
|
<descriptorRefs> |
||||
|
<descriptorRef>jar-with-dependencies</descriptorRef> |
||||
|
</descriptorRefs> |
||||
|
</configuration> |
||||
|
<executions> |
||||
|
<execution> |
||||
|
<id>make-assembly</id> |
||||
|
<phase>package</phase> |
||||
|
<goals> |
||||
|
<goal>single</goal> |
||||
|
</goals> |
||||
|
</execution> |
||||
|
</executions> |
||||
|
</plugin> |
||||
|
</plugins> |
||||
|
</build> |
||||
|
</project> |
||||
Loading…
Reference in new issue