package com.example.datacollect.command;

import com.example.datacollect.exception.NetworkException;
import com.example.datacollect.exception.ParseException;
import com.example.datacollect.model.Article;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.strategy.CrawlStrategy;
import com.example.datacollect.strategy.StrategyFactory;
import com.example.datacollect.view.ConsoleView;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.List;
import java.util.concurrent.TimeUnit;

/**
 * The {@code crawl} command: fetches the page at a URL, parses it with the
 * strategy selected for that URL, and stores the resulting articles in the
 * repository.
 *
 * <p>Fetching is attempted up to {@link #MAX_RETRY} times when a
 * {@link NetworkException} occurs. Parse failures and any other exception
 * are reported once and are not retried.
 */
public class CrawlCommand implements Command {

    private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class);

    /** Maximum number of fetch attempts before giving up. */
    private static final int MAX_RETRY = 3;

    /** Pause between two consecutive attempts, in milliseconds. */
    private static final long RETRY_INTERVAL = 1000;

    /** Timeout handed to Jsoup for a single request, in milliseconds. */
    private static final int FETCH_TIMEOUT_MS = 5000;

    /** User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

    private final ConsoleView view;
    private final StrategyFactory strategyFactory;

    /**
     * Creates the command.
     *
     * @param view            console view used to report success and errors
     * @param strategyFactory factory used to look up the crawl strategy for a URL
     */
    public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) {
        this.view = view;
        this.strategyFactory = strategyFactory;
    }

    /**
     * Returns the name of this command.
     *
     * @return the literal {@code "crawl"}
     */
    @Override
    public String getName() {
        return "crawl";
    }

    /**
     * Crawls the URL given in {@code args[1]} and adds the parsed articles to
     * {@code repository}.
     *
     * <p>All failures are logged and reported through the view; this method
     * does not propagate exceptions raised while crawling.
     *
     * @param args       command arguments; {@code args[1]} must be the URL to crawl
     * @param repository repository that receives the parsed articles
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        if (args == null || args.length < 2) {
            reportError("Crawl command usage: crawl <url>");
            return;
        }

        String url = args[1];
        logger.info("Start crawling url: {}", url);

        CrawlStrategy strategy = strategyFactory.getStrategy(url);
        if (strategy == null) {
            reportError("No crawl strategy found for url: " + url);
            return;
        }

        // Retry loop: only NetworkException leads to another attempt.
        for (int attempt = 0; attempt < MAX_RETRY; attempt++) {
            try {
                Document doc = fetchDocumentWithRetry(url, attempt);
                List<Article> articles = strategy.parse(url, doc);

                // Store everything in one call via the repository's bulk method.
                repository.addAll(articles);

                String successMsg = "Crawled " + articles.size() + " articles from url: " + url;
                logger.info(successMsg);
                view.printSuccess(successMsg);
                return; // success ends the retry loop
            } catch (NetworkException e) {
                int failedAttempts = attempt + 1;
                String retryMsg = String.format("Network error (retry %d/%d): %s",
                        failedAttempts, MAX_RETRY, e.getMessage());
                logger.warn(retryMsg);
                view.printError(retryMsg);

                if (failedAttempts >= MAX_RETRY) {
                    reportError("Failed to crawl url after " + MAX_RETRY + " retries: " + url, e);
                    return; // nothing left to retry, so do not sleep again
                }

                if (!sleepBeforeRetry()) {
                    view.printError("Crawl interrupted while waiting to retry url: " + url);
                    return;
                }
            } catch (ParseException e) {
                // A parse failure is not retried.
                reportError("Parse failed for url: " + url, e);
                return;
            } catch (Exception e) {
                reportError("Unexpected error when crawling url: " + url, e);
                return;
            }
        }
    }

    /**
     * Waits {@link #RETRY_INTERVAL} milliseconds before the next attempt.
     *
     * @return {@code true} if the wait completed, {@code false} if the thread
     *         was interrupted (the interrupt flag is restored in that case)
     */
    private boolean sleepBeforeRetry() {
        try {
            TimeUnit.MILLISECONDS.sleep(RETRY_INTERVAL);
            return true;
        } catch (InterruptedException ie) {
            Thread.currentThread().interrupt();
            logger.error("Retry sleep interrupted", ie);
            return false;
        }
    }

    /** Logs {@code message} at error level and shows it on the console view. */
    private void reportError(String message) {
        logger.error(message);
        view.printError(message);
    }

    /** Logs {@code message} with its cause at error level and shows the message on the console view. */
    private void reportError(String message, Throwable cause) {
        logger.error(message, cause);
        view.printError(message);
    }

    /**
     * Performs a single HTTP GET of {@code url} through Jsoup.
     *
     * <p>Only {@link IOException} is translated into {@link NetworkException}.
     * Runtime exceptions (for example an argument rejected by Jsoup) propagate
     * unchanged so the caller does not retry them as if they were network errors.
     *
     * @param url        address of the page to fetch
     * @param retryCount zero-based attempt index, used only in log and error messages
     * @return the fetched document
     * @throws NetworkException if the request fails with an I/O error
     */
    private Document fetchDocumentWithRetry(String url, int retryCount) throws NetworkException {
        try {
            logger.debug("Fetching document (retry {}) for url: {}", retryCount, url);
            return Jsoup.connect(url)
                    .timeout(FETCH_TIMEOUT_MS)
                    .userAgent(USER_AGENT)
                    .get();
        } catch (IOException e) {
            throw new NetworkException(
                    "Failed to fetch document (retry " + retryCount + ") for url: " + url, e);
        }
    }
}