You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

104 lines
3.9 KiB

package com.example.datacollect.command;
import com.example.datacollect.exception.NetworkException;
import com.example.datacollect.exception.ParseException;
import com.example.datacollect.model.Article;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.strategy.CrawlStrategy;
import com.example.datacollect.strategy.StrategyFactory;
import com.example.datacollect.view.ConsoleView;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
/**
 * Console command {@code crawl <URL>}: downloads the page at the given URL, parses it with
 * the {@link CrawlStrategy} selected for that URL, and stores only those articles whose URL
 * the repository does not already contain.
 *
 * <p>A {@link NetworkException} raised during an attempt is retried, up to {@link #MAX_RETRY}
 * attempts in total with a pause of {@link #RETRY_INTERVAL} milliseconds between attempts.
 * Any other failure aborts the command immediately. All outcomes are reported through the
 * {@link ConsoleView}; failures are additionally logged together with their cause.
 */
public class CrawlCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class);

    /** Maximum number of attempts (first try included) before the crawl is abandoned. */
    private static final int MAX_RETRY = 3;

    /** Pause between two consecutive attempts, in milliseconds. */
    private static final long RETRY_INTERVAL = 1000;

    private final ConsoleView view;
    private final StrategyFactory strategyFactory;

    /**
     * @param view            sink for user-facing info, success and error messages
     * @param strategyFactory used to look up the parsing strategy for a URL
     */
    public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) {
        this.view = view;
        this.strategyFactory = strategyFactory;
    }

    /** @return the command keyword, {@code "crawl"} */
    @Override
    public String getName() {
        return "crawl";
    }

    /**
     * Runs one crawl.
     *
     * @param args       command tokens; {@code args[1]} must hold the URL to crawl
     * @param repository store that is queried for already-known URLs and receives new articles
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        // A missing or blank URL cannot be crawled; show the usage line instead of failing later.
        if (args == null || args.length < 2 || args[1] == null || args[1].isBlank()) {
            view.printError("用法: crawl <URL> (示例: crawl https://news.hnu.edu.cn)");
            return;
        }

        String url = args[1];
        logger.info("开始抓取URL: {}", url);

        CrawlStrategy strategy = strategyFactory.getStrategy(url);
        if (strategy == null) {
            view.printError("无适配的抓取策略: " + url);
            return;
        }

        // Retry loop: only NetworkException leads to another attempt.
        for (int attempt = 1; attempt <= MAX_RETRY; attempt++) {
            try {
                // The helper receives the number of failures so far (0 on the first attempt).
                Document doc = fetchDocumentWithRetry(url, attempt - 1);
                List<Article> allArticles = strategy.parse(url, doc);

                // Core of the incremental crawl: keep only URLs the repository has not seen.
                List<Article> newArticles = selectNewArticles(allArticles, repository);
                if (newArticles.isEmpty()) {
                    view.printInfo("ℹ️ 无新文章(所有URL已存在)");
                    return;
                }

                // Store all new articles in one batch.
                repository.addAll(newArticles);
                view.printSuccess("✅ 抓取成功 | 新增: " + newArticles.size()
                        + " 篇 | 总计解析: " + allArticles.size() + " 篇");
                return;
            } catch (NetworkException e) {
                logger.warn("网络异常 ({}/{}): {}", attempt, MAX_RETRY, url, e);
                view.printError("⚠️ 网络异常(重试 " + attempt + "/" + MAX_RETRY + "): " + e.getMessage());
                if (attempt >= MAX_RETRY) {
                    view.printError("❌ 抓取失败(超出最大重试次数): " + url);
                    // Give up right away; sleeping after the last attempt only delays the caller.
                    return;
                }
                if (!pauseBeforeRetry()) {
                    logger.warn("抓取被中断: {}", url);
                    return;
                }
            } catch (ParseException e) {
                logger.error("解析失败: {}", url, e);
                view.printError("❌ 解析失败: " + e.getMessage());
                return;
            } catch (Exception e) {
                // Command boundary: report anything unexpected instead of propagating it.
                logger.error("未知异常: {}", url, e);
                view.printError("❌ 未知异常: " + e.getMessage());
                return;
            }
        }
    }

    /**
     * Filters the parsed articles down to those that have a non-blank URL which the
     * repository does not contain yet.
     */
    private List<Article> selectNewArticles(List<Article> parsed, ArticleRepository repository) {
        return parsed.stream()
                .filter(article -> article.getUrl() != null && !article.getUrl().isBlank())
                .filter(article -> !repository.containsUrl(article.getUrl()))
                .collect(Collectors.toList());
    }

    /**
     * Sleeps for {@link #RETRY_INTERVAL} milliseconds.
     *
     * @return {@code true} if the pause completed; {@code false} if the thread was interrupted,
     *         in which case the interrupt flag has been restored
     */
    private boolean pauseBeforeRetry() {
        try {
            TimeUnit.MILLISECONDS.sleep(RETRY_INTERVAL);
            return true;
        } catch (InterruptedException ie) {
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /**
     * Performs a single GET of {@code url} via jsoup. Despite its name this method does not
     * retry by itself; the retry loop lives in {@link #execute}.
     *
     * @param url        address to fetch
     * @param retryCount number of attempts that already failed; used only in the error message
     * @return the fetched document
     * @throws NetworkException wrapping any exception raised while connecting or fetching
     */
    private Document fetchDocumentWithRetry(String url, int retryCount) throws NetworkException {
        try {
            return Jsoup.connect(url)
                    .timeout(5000)
                    .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
                    .get();
        } catch (Exception e) {
            // Keep the original exception as the cause so the log shows the real failure.
            throw new NetworkException("获取文档失败(重试" + retryCount + "): " + url, e);
        }
    }
}