package command; import model.CrawlResult; import model.Statistics; import model.ResultContainer; import strategy.CrawlStrategy; import exception.CrawlerException; import exception.NetworkException; import exception.ParseException; import view.CrawlerView; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.ArrayList; import java.util.List; public class CrawlCommand implements Command { private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class); private final CrawlStrategy strategy; private final int startPage; private final int endPage; private final String outputFile; private final CrawlerView view; private final int maxRetries; private final int baseRetryDelay; private final Statistics statistics; private final int pageDelay; public CrawlCommand(CrawlStrategy strategy, int startPage, int endPage, String outputFile, CrawlerView view) { this(strategy, startPage, endPage, outputFile, view, 3, 1500, 1500); } public CrawlCommand(CrawlStrategy strategy, int startPage, int endPage, String outputFile, CrawlerView view, int maxRetries, int baseRetryDelay, int pageDelay) { validateConstructorParams(strategy, startPage, endPage, view); this.strategy = strategy; this.startPage = startPage; this.endPage = endPage; this.outputFile = outputFile; this.view = view; this.maxRetries = maxRetries; this.baseRetryDelay = baseRetryDelay; this.pageDelay = pageDelay; this.statistics = new Statistics<>("CrawlCommand_" + strategy.getSiteName()); logger.debug("CrawlCommand 初始化: site={}, pages={}-{}, pageDelay={}ms", strategy.getSiteName(), startPage, endPage, pageDelay); } private void validateConstructorParams(CrawlStrategy strategy, int startPage, int endPage, CrawlerView view) { if (strategy == null) { throw new IllegalArgumentException("Strategy cannot be null"); } if (startPage < 1) { throw new IllegalArgumentException("Start page must be >= 1"); } if (endPage < startPage) { throw new IllegalArgumentException("End page must be >= start page"); } if (view == null) { throw new IllegalArgumentException("View cannot be null"); } } @Override public List execute() throws CrawlerException { List allResults = new ArrayList<>(); int consecutiveFailures = 0; logger.info("开始爬取: {} (页码 {} 到 {})", strategy.getSiteName(), startPage, endPage); view.showMessage("开始爬取: " + strategy.getSiteName()); view.showMessage("目标: " + (endPage - startPage + 1) + " 页数据,每页间隔 " + pageDelay + "ms"); view.showLine(); for (int page = startPage; page <= endPage; page++) { List pageResults = null; boolean pageSuccess = false; for (int retry = 0; retry < maxRetries; retry++) { try { logger.info("正在爬取第 {} 页...", page); view.showMessage("正在爬取第 " + page + " 页..."); pageResults = strategy.crawlPage(page); if (pageResults != null && !pageResults.isEmpty()) { consecutiveFailures = 0; statistics.increment("success_pages"); logger.debug("第 {} 页爬取成功,获取 {} 条数据", page, pageResults.size()); pageSuccess = true; break; } logger.warn("第 {} 页返回空结果", page); view.showWarning("第 " + page + " 页返回空结果"); if (retry < maxRetries - 1) { int delay = baseRetryDelay * (int) Math.pow(2, retry); view.showMessage("等待 " + delay + "ms 后重试..."); Thread.sleep(delay); } } catch (ParseException e) { consecutiveFailures++; statistics.increment("parse_failures"); logger.error("第 {} 页解析失败 (尝试 {}/{}): {}", page, retry + 1, maxRetries, e.getMessage()); view.showError("解析失败: " + e.getMessage()); if (retry < maxRetries - 1) { try { int delay = baseRetryDelay * (int) Math.pow(2, retry); view.showMessage("等待 " + delay + "ms 后重试..."); Thread.sleep(delay); } catch (InterruptedException ie) { Thread.currentThread().interrupt(); logger.warn("爬取过程被中断"); throw new CrawlerException("爬取被中断", ie); } } } catch (IOException e) { consecutiveFailures++; statistics.increment("network_failures"); logger.error("【网络异常】第 {} 页网络请求失败 (尝试 {}/{}): {}", page, retry + 1, maxRetries, e.getMessage()); view.showError("【网络异常】" + e.getMessage()); String exceptionType = getNetworkExceptionType(e); if (exceptionType != null) { logger.error("【断网检测】检测到网络中断类型: {}", exceptionType); view.showError("【断网检测】" + exceptionType); } if (consecutiveFailures >= 2) { logger.error("【断网检测】连续失败2次,判定为网络异常,立即停止爬取"); view.showError("【断网检测】连续失败2次,判定为网络异常,停止爬取"); throw new NetworkException("【断网异常】网络请求连续失败,请检查网络连接状态: " + e.getMessage(), e); } if (retry < maxRetries - 1) { try { int delay = baseRetryDelay * (int) Math.pow(2, retry); view.showMessage("等待 " + delay + "ms 后重试..."); Thread.sleep(delay); } catch (InterruptedException ie) { Thread.currentThread().interrupt(); logger.warn("爬取过程被中断"); throw new NetworkException("爬取被中断", ie); } } else { throw new NetworkException("【网络连接失败】网络请求失败,请检查网络连接: " + e.getMessage(), e); } } catch (InterruptedException e) { Thread.currentThread().interrupt(); logger.warn("爬取过程被中断"); throw new CrawlerException("爬取被中断", e); } } if (pageSuccess && pageResults != null) { allResults.addAll(pageResults); view.showSuccess("Page " + page + ": " + pageResults.size() + " items"); statistics.increment("total_items", pageResults.size()); } if (page < endPage && pageSuccess) { try { logger.debug("翻页间隔等待 {}ms", pageDelay); Thread.sleep(pageDelay); } catch (InterruptedException e) { Thread.currentThread().interrupt(); logger.warn("翻页等待被中断"); break; } } if (allResults.size() >= 200) { logger.info("已达到最大数据量 200 条,停止爬取"); view.showWarning("已达到最大数据量 200 条,停止爬取"); break; } } view.showLine(); view.showMessage("爬取完成,共获取 " + allResults.size() + " 条数据"); statistics.record("total_items", allResults.size()); logger.info("爬取完成: {} 共获取 {} 条数据", strategy.getSiteName(), allResults.size()); if (allResults.isEmpty()) { view.showError("警告: 未能获取到任何数据!"); logger.error("未能获取到任何数据"); } return allResults; } @Override public String getName() { return "CrawlCommand[" + strategy.getSiteName() + "]"; } public String getOutputFile() { return outputFile; } public CrawlStrategy getStrategy() { return strategy; } public Statistics getStatistics() { return statistics; } private String getNetworkExceptionType(IOException e) { Throwable cause = e.getCause(); if (cause instanceof java.net.ConnectException) { return "【网络连接失败】无法连接到服务器,请检查网络连接"; } else if (cause instanceof java.net.UnknownHostException) { return "【DNS解析失败】无法解析域名,请检查网络或DNS设置"; } else if (cause instanceof java.net.NoRouteToHostException) { return "【路由不可达】无法到达目标主机,请检查网络连接"; } else if (cause instanceof java.net.SocketException) { return "【Socket异常】网络连接异常,请检查网络状态"; } else if (cause instanceof java.net.SocketTimeoutException) { return "【连接超时】网络请求超时,请检查网络稳定性"; } String message = e.getMessage(); if (message != null) { if (message.contains("Connection refused")) { return "【连接被拒绝】服务器拒绝连接,请检查网络或稍后重试"; } else if (message.contains("UnknownHost")) { return "【域名解析失败】无法解析域名,请检查网络连接"; } else if (message.contains("timeout")) { return "【请求超时】网络请求超时,请检查网络稳定性"; } else if (message.contains("Network is unreachable")) { return "【网络不可达】网络连接不可用,请检查网络状态"; } else if (message.contains("Connection reset")) { return "【连接重置】连接被服务器重置,请检查网络"; } } return null; } }