You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
115 lines
4.4 KiB
115 lines
4.4 KiB
package com.example.datacollect.command;
|
|
|
|
import com.example.datacollect.exception.NetworkException;
|
|
import com.example.datacollect.exception.ParseException;
|
|
import com.example.datacollect.model.Article;
|
|
import com.example.datacollect.repository.ArticleRepository;
|
|
import com.example.datacollect.strategy.CrawlStrategy;
|
|
import com.example.datacollect.strategy.StrategyFactory;
|
|
import com.example.datacollect.view.ConsoleView;
|
|
import org.jsoup.Jsoup;
|
|
import org.jsoup.nodes.Document;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
import java.util.List;
|
|
import java.util.concurrent.TimeUnit;
|
|
|
|
public class CrawlCommand implements Command {
|
|
private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class);
|
|
private static final int MAX_RETRY = 3; // 最大重试次数
|
|
private static final long RETRY_INTERVAL = 1000; // 重试间隔(毫秒)
|
|
|
|
private final ConsoleView view;
|
|
private final StrategyFactory strategyFactory;
|
|
|
|
public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) {
|
|
this.view = view;
|
|
this.strategyFactory = strategyFactory;
|
|
}
|
|
|
|
@Override
|
|
public String getName() {
|
|
return "crawl";
|
|
}
|
|
|
|
@Override
|
|
public void execute(String[] args, ArticleRepository repository) {
|
|
if (args.length < 2) {
|
|
String errorMsg = "Crawl command usage: crawl <url>";
|
|
logger.error(errorMsg);
|
|
view.printError(errorMsg);
|
|
return;
|
|
}
|
|
String url = args[1];
|
|
logger.info("Start crawling url: {}", url);
|
|
|
|
CrawlStrategy strategy = strategyFactory.getStrategy(url);
|
|
if (strategy == null) {
|
|
String errorMsg = "No crawl strategy found for url: " + url;
|
|
logger.error(errorMsg);
|
|
view.printError(errorMsg);
|
|
return;
|
|
}
|
|
|
|
// 重试逻辑
|
|
int retryCount = 0;
|
|
while (retryCount < MAX_RETRY) {
|
|
try {
|
|
Document doc = fetchDocumentWithRetry(url, retryCount);
|
|
List<Article> articles = strategy.parse(url, doc);
|
|
|
|
// 批量添加(复用Repository的addAll方法)
|
|
repository.addAll(articles);
|
|
|
|
String successMsg = "Crawled " + articles.size() + " articles from url: " + url;
|
|
logger.info(successMsg);
|
|
view.printSuccess(successMsg);
|
|
return; // 成功则退出重试循环
|
|
} catch (NetworkException e) {
|
|
retryCount++;
|
|
String retryMsg = String.format("Network error (retry %d/%d): %s", retryCount, MAX_RETRY, e.getMessage());
|
|
logger.warn(retryMsg);
|
|
view.printError(retryMsg);
|
|
|
|
if (retryCount >= MAX_RETRY) {
|
|
String failMsg = "Failed to crawl url after " + MAX_RETRY + " retries: " + url;
|
|
logger.error(failMsg, e);
|
|
view.printError(failMsg);
|
|
}
|
|
|
|
// 重试间隔
|
|
try {
|
|
TimeUnit.MILLISECONDS.sleep(RETRY_INTERVAL);
|
|
} catch (InterruptedException ie) {
|
|
Thread.currentThread().interrupt();
|
|
logger.error("Retry sleep interrupted", ie);
|
|
break;
|
|
}
|
|
} catch (ParseException e) {
|
|
String errorMsg = "Parse failed for url: " + url;
|
|
logger.error(errorMsg, e);
|
|
view.printError(errorMsg);
|
|
return; // 解析异常不重试
|
|
} catch (Exception e) {
|
|
String errorMsg = "Unexpected error when crawling url: " + url;
|
|
logger.error(errorMsg, e);
|
|
view.printError(errorMsg);
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
// 抽取文档获取逻辑,抛出网络异常
|
|
private Document fetchDocumentWithRetry(String url, int retryCount) throws NetworkException {
|
|
try {
|
|
logger.debug("Fetching document (retry {}) for url: {}", retryCount, url);
|
|
return Jsoup.connect(url)
|
|
.timeout(5000) // 超时时间5秒
|
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
|
|
.get();
|
|
} catch (Exception e) {
|
|
throw new NetworkException("Failed to fetch document (retry " + retryCount + ") for url: " + url, e);
|
|
}
|
|
}
|
|
}
|
|
|