You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
104 lines
3.9 KiB
104 lines
3.9 KiB
package com.example.datacollect.command;
|
|
|
|
import com.example.datacollect.exception.NetworkException;
|
|
import com.example.datacollect.exception.ParseException;
|
|
import com.example.datacollect.model.Article;
|
|
import com.example.datacollect.repository.ArticleRepository;
|
|
import com.example.datacollect.strategy.CrawlStrategy;
|
|
import com.example.datacollect.strategy.StrategyFactory;
|
|
import com.example.datacollect.view.ConsoleView;
|
|
import org.jsoup.Jsoup;
|
|
import org.jsoup.nodes.Document;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
import java.util.List;
|
|
import java.util.concurrent.TimeUnit;
|
|
import java.util.stream.Collectors;
|
|
|
|
public class CrawlCommand implements Command {
|
|
private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class);
|
|
private static final int MAX_RETRY = 3;
|
|
private static final long RETRY_INTERVAL = 1000;
|
|
private final ConsoleView view;
|
|
private final StrategyFactory strategyFactory;
|
|
|
|
public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) {
|
|
this.view = view;
|
|
this.strategyFactory = strategyFactory;
|
|
}
|
|
|
|
@Override
|
|
public String getName() {
|
|
return "crawl";
|
|
}
|
|
|
|
@Override
|
|
public void execute(String[] args, ArticleRepository repository) {
|
|
if (args.length < 2) {
|
|
view.printError("用法: crawl <URL> (示例: crawl https://news.hnu.edu.cn)");
|
|
return;
|
|
}
|
|
String url = args[1];
|
|
logger.info("开始抓取URL: {}", url);
|
|
|
|
CrawlStrategy strategy = strategyFactory.getStrategy(url);
|
|
if (strategy == null) {
|
|
view.printError("无适配的抓取策略: " + url);
|
|
return;
|
|
}
|
|
|
|
// 重试逻辑
|
|
int retryCount = 0;
|
|
while (retryCount < MAX_RETRY) {
|
|
try {
|
|
Document doc = fetchDocumentWithRetry(url, retryCount);
|
|
List<Article> allArticles = strategy.parse(url, doc);
|
|
|
|
// 增量抓取核心:过滤已存在的URL
|
|
List<Article> newArticles = allArticles.stream()
|
|
.filter(article -> article.getUrl() != null && !article.getUrl().isBlank())
|
|
.filter(article -> !repository.containsUrl(article.getUrl()))
|
|
.collect(Collectors.toList());
|
|
|
|
if (newArticles.isEmpty()) {
|
|
view.printInfo("ℹ️ 无新文章(所有URL已存在)");
|
|
return;
|
|
}
|
|
|
|
// 批量添加新文章
|
|
repository.addAll(newArticles);
|
|
view.printSuccess("✅ 抓取成功 | 新增: " + newArticles.size() + " 篇 | 总计解析: " + allArticles.size() + " 篇");
|
|
return;
|
|
|
|
} catch (NetworkException e) {
|
|
retryCount++;
|
|
view.printError("⚠️ 网络异常(重试 " + retryCount + "/" + MAX_RETRY + "): " + e.getMessage());
|
|
if (retryCount >= MAX_RETRY) {
|
|
view.printError("❌ 抓取失败(超出最大重试次数): " + url);
|
|
}
|
|
try { TimeUnit.MILLISECONDS.sleep(RETRY_INTERVAL); }
|
|
catch (InterruptedException ie) { Thread.currentThread().interrupt(); break; }
|
|
|
|
} catch (ParseException e) {
|
|
view.printError("❌ 解析失败: " + e.getMessage());
|
|
return;
|
|
|
|
} catch (Exception e) {
|
|
view.printError("❌ 未知异常: " + e.getMessage());
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
private Document fetchDocumentWithRetry(String url, int retryCount) throws NetworkException {
|
|
try {
|
|
return Jsoup.connect(url)
|
|
.timeout(5000)
|
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
|
|
.get();
|
|
} catch (Exception e) {
|
|
throw new NetworkException("获取文档失败(重试" + retryCount + "): " + url, e);
|
|
}
|
|
}
|
|
}
|
|
|