You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

244 lines
11 KiB

package command;
import model.CrawlResult;
import model.Statistics;
import model.ResultContainer;
import strategy.CrawlStrategy;
import exception.CrawlerException;
import exception.NetworkException;
import exception.ParseException;
import view.CrawlerView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Command that crawls an inclusive range of pages through a {@link CrawlStrategy},
 * reporting progress to a {@link CrawlerView} and counting outcomes in a
 * {@link Statistics} instance.
 *
 * <p>Each page is attempted up to {@code maxRetries} times. Between attempts the command
 * sleeps for {@code baseRetryDelay * 2^retry} milliseconds, and between successfully crawled
 * pages it sleeps for {@code pageDelay} milliseconds. Crawling stops early once
 * {@value #MAX_RESULTS} or more items have been collected.
 */
public class CrawlCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class);

    /** Attempts per page used by the five-argument constructor. */
    private static final int DEFAULT_MAX_RETRIES = 3;
    /** Base back-off delay (ms) used by the five-argument constructor. */
    private static final int DEFAULT_BASE_RETRY_DELAY_MS = 1500;
    /** Delay between pages (ms) used by the five-argument constructor. */
    private static final int DEFAULT_PAGE_DELAY_MS = 1500;
    /** Crawling stops once at least this many items have been collected. */
    private static final int MAX_RESULTS = 200;
    /** Failure count at which a network error aborts the whole crawl. */
    private static final int MAX_CONSECUTIVE_FAILURES = 2;
    /** Upper bound on the back-off exponent so the delay computation cannot overflow. */
    private static final int MAX_BACKOFF_SHIFT = 30;
    /** Upper bound on cause-chain traversal; guards against cyclic cause chains. */
    private static final int MAX_CAUSE_DEPTH = 16;

    private final CrawlStrategy strategy;
    private final int startPage;
    private final int endPage;
    private final String outputFile;
    private final CrawlerView view;
    private final int maxRetries;
    private final int baseRetryDelay;
    private final Statistics<String> statistics;
    private final int pageDelay;

    /**
     * Creates a command with the default retry count and delays.
     *
     * @param strategy   site-specific crawling strategy; must not be {@code null}
     * @param startPage  first page to crawl; must be {@code >= 1}
     * @param endPage    last page to crawl (inclusive); must be {@code >= startPage}
     * @param outputFile output file name exposed through {@link #getOutputFile()}
     * @param view       view that receives progress messages; must not be {@code null}
     * @throws IllegalArgumentException if any argument violates the constraints above
     */
    public CrawlCommand(CrawlStrategy strategy, int startPage, int endPage,
                        String outputFile, CrawlerView view) {
        this(strategy, startPage, endPage, outputFile, view,
                DEFAULT_MAX_RETRIES, DEFAULT_BASE_RETRY_DELAY_MS, DEFAULT_PAGE_DELAY_MS);
    }

    /**
     * Creates a command with explicit retry and delay settings.
     *
     * @param strategy       site-specific crawling strategy; must not be {@code null}
     * @param startPage      first page to crawl; must be {@code >= 1}
     * @param endPage        last page to crawl (inclusive); must be {@code >= startPage}
     * @param outputFile     output file name exposed through {@link #getOutputFile()}
     * @param view           view that receives progress messages; must not be {@code null}
     * @param maxRetries     attempts per page; must be {@code >= 1}
     * @param baseRetryDelay base back-off delay in milliseconds; must be {@code >= 0}
     * @param pageDelay      delay between pages in milliseconds; must be {@code >= 0}
     * @throws IllegalArgumentException if any argument violates the constraints above
     */
    public CrawlCommand(CrawlStrategy strategy, int startPage, int endPage,
                        String outputFile, CrawlerView view, int maxRetries,
                        int baseRetryDelay, int pageDelay) {
        validateConstructorParams(strategy, startPage, endPage, view);
        validateTimingParams(maxRetries, baseRetryDelay, pageDelay);
        this.strategy = strategy;
        this.startPage = startPage;
        this.endPage = endPage;
        this.outputFile = outputFile;
        this.view = view;
        this.maxRetries = maxRetries;
        this.baseRetryDelay = baseRetryDelay;
        this.pageDelay = pageDelay;
        this.statistics = new Statistics<>("CrawlCommand_" + strategy.getSiteName());
        logger.debug("CrawlCommand 初始化: site={}, pages={}-{}, pageDelay={}ms",
                strategy.getSiteName(), startPage, endPage, pageDelay);
    }

    /** Rejects a missing strategy or view and an invalid page range. */
    private void validateConstructorParams(CrawlStrategy strategy, int startPage,
                                           int endPage, CrawlerView view) {
        if (strategy == null) {
            throw new IllegalArgumentException("Strategy cannot be null");
        }
        if (startPage < 1) {
            throw new IllegalArgumentException("Start page must be >= 1");
        }
        if (endPage < startPage) {
            throw new IllegalArgumentException("End page must be >= start page");
        }
        if (view == null) {
            throw new IllegalArgumentException("View cannot be null");
        }
    }

    /**
     * Rejects retry/delay settings that would otherwise fail late or silently:
     * a non-positive retry count makes {@link #execute()} skip every page, and a
     * negative delay makes {@link Thread#sleep(long)} throw in the middle of a crawl.
     */
    private static void validateTimingParams(int maxRetries, int baseRetryDelay, int pageDelay) {
        if (maxRetries < 1) {
            throw new IllegalArgumentException("Max retries must be >= 1");
        }
        if (baseRetryDelay < 0) {
            throw new IllegalArgumentException("Base retry delay must be >= 0");
        }
        if (pageDelay < 0) {
            throw new IllegalArgumentException("Page delay must be >= 0");
        }
    }

    /**
     * Crawls pages {@code startPage..endPage} and returns every item collected.
     *
     * <p>Per page: an empty or {@code null} result and a {@link ParseException} are retried
     * with back-off and, once the attempts are used up, the page is skipped. An
     * {@link IOException} is retried with back-off but aborts the crawl with a
     * {@link NetworkException} when the failure counter reaches
     * {@value #MAX_CONSECUTIVE_FAILURES} or the last attempt fails.
     *
     * <p>An interrupt during a retry wait aborts with an exception; an interrupt during the
     * inter-page wait stops crawling and returns what has been collected so far. In both
     * cases the thread's interrupt flag is restored.
     *
     * @return the collected results, possibly empty
     * @throws CrawlerException if the crawl is interrupted while waiting to retry
     * @throws NetworkException if network failures abort the crawl
     */
    @Override
    public List<CrawlResult> execute() throws CrawlerException {
        List<CrawlResult> allResults = new ArrayList<>();
        // NOTE(review): parse failures also advance this counter, so one parse failure
        // followed by one network failure triggers the "network outage" abort below.
        // Kept as in the original; confirm whether only network failures should count.
        int consecutiveFailures = 0;

        logger.info("开始爬取: {} (页码 {} 到 {})", strategy.getSiteName(), startPage, endPage);
        view.showMessage("开始爬取: " + strategy.getSiteName());
        view.showMessage("目标: " + (endPage - startPage + 1) + " 页数据,每页间隔 " + pageDelay + "ms");
        view.showLine();

        for (int page = startPage; page <= endPage; page++) {
            List<CrawlResult> pageResults = null;
            boolean pageSuccess = false;

            for (int retry = 0; retry < maxRetries; retry++) {
                boolean hasMoreAttempts = retry < maxRetries - 1;
                try {
                    logger.info("正在爬取第 {} 页...", page);
                    view.showMessage("正在爬取第 " + page + " 页...");
                    pageResults = strategy.crawlPage(page);

                    if (pageResults != null && !pageResults.isEmpty()) {
                        consecutiveFailures = 0;
                        statistics.increment("success_pages");
                        logger.debug("第 {} 页爬取成功,获取 {} 条数据", page, pageResults.size());
                        pageSuccess = true;
                        break;
                    }

                    logger.warn("第 {} 页返回空结果", page);
                    view.showWarning("第 " + page + " 页返回空结果");
                    if (hasMoreAttempts) {
                        waitBeforeRetry(retry);
                    }
                } catch (ParseException e) {
                    consecutiveFailures++;
                    statistics.increment("parse_failures");
                    logger.error("第 {} 页解析失败 (尝试 {}/{}): {}",
                            page, retry + 1, maxRetries, e.getMessage());
                    view.showError("解析失败: " + e.getMessage());
                    if (hasMoreAttempts) {
                        try {
                            waitBeforeRetry(retry);
                        } catch (InterruptedException ie) {
                            Thread.currentThread().interrupt();
                            logger.warn("爬取过程被中断");
                            throw new CrawlerException("爬取被中断", ie);
                        }
                    }
                } catch (IOException e) {
                    consecutiveFailures++;
                    statistics.increment("network_failures");
                    logger.error("【网络异常】第 {} 页网络请求失败 (尝试 {}/{}): {}",
                            page, retry + 1, maxRetries, e.getMessage());
                    view.showError("【网络异常】" + e.getMessage());

                    String exceptionType = getNetworkExceptionType(e);
                    if (exceptionType != null) {
                        logger.error("【断网检测】检测到网络中断类型: {}", exceptionType);
                        view.showError("【断网检测】" + exceptionType);
                    }

                    if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
                        logger.error("【断网检测】连续失败{}次,判定为网络异常,立即停止爬取",
                                MAX_CONSECUTIVE_FAILURES);
                        view.showError("【断网检测】连续失败" + MAX_CONSECUTIVE_FAILURES
                                + "次,判定为网络异常,停止爬取");
                        throw new NetworkException(
                                "【断网异常】网络请求连续失败,请检查网络连接状态: " + e.getMessage(), e);
                    }

                    if (!hasMoreAttempts) {
                        throw new NetworkException(
                                "【网络连接失败】网络请求失败,请检查网络连接: " + e.getMessage(), e);
                    }
                    try {
                        waitBeforeRetry(retry);
                    } catch (InterruptedException ie) {
                        Thread.currentThread().interrupt();
                        logger.warn("爬取过程被中断");
                        // Exception type kept as-is so existing callers see the same type.
                        throw new NetworkException("爬取被中断", ie);
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    logger.warn("爬取过程被中断");
                    throw new CrawlerException("爬取被中断", e);
                }
            }

            if (pageSuccess) {
                allResults.addAll(pageResults);
                view.showSuccess("Page " + page + ": " + pageResults.size() + " items");
                statistics.increment("total_items", pageResults.size());
            }

            // Check the cap before the inter-page wait so we do not sleep just to stop.
            if (allResults.size() >= MAX_RESULTS) {
                logger.info("已达到最大数据量 {} 条,停止爬取", MAX_RESULTS);
                view.showWarning("已达到最大数据量 " + MAX_RESULTS + " 条,停止爬取");
                break;
            }

            if (page < endPage && pageSuccess) {
                try {
                    logger.debug("翻页间隔等待 {}ms", pageDelay);
                    Thread.sleep(pageDelay);
                } catch (InterruptedException e) {
                    // Best effort: keep what was collected and stop crawling.
                    Thread.currentThread().interrupt();
                    logger.warn("翻页等待被中断");
                    break;
                }
            }
        }

        view.showLine();
        view.showMessage("爬取完成,共获取 " + allResults.size() + " 条数据");
        statistics.record("total_items", allResults.size());
        logger.info("爬取完成: {} 共获取 {} 条数据", strategy.getSiteName(), allResults.size());

        if (allResults.isEmpty()) {
            view.showError("警告: 未能获取到任何数据!");
            logger.error("未能获取到任何数据");
        }
        return allResults;
    }

    /**
     * Announces and performs the back-off wait that precedes attempt {@code retry + 1}.
     *
     * @param retry zero-based index of the attempt that just failed
     * @throws InterruptedException if the thread is interrupted while sleeping
     */
    private void waitBeforeRetry(int retry) throws InterruptedException {
        long delay = retryDelayMillis(retry);
        view.showMessage("等待 " + delay + "ms 后重试...");
        Thread.sleep(delay);
    }

    /**
     * Computes {@code baseRetryDelay * 2^retry} in {@code long} arithmetic, capping the
     * exponent so large retry counts cannot overflow into a negative delay.
     */
    private long retryDelayMillis(int retry) {
        int shift = Math.max(0, Math.min(retry, MAX_BACKOFF_SHIFT));
        return (long) baseRetryDelay << shift;
    }

    /** Returns the display name {@code "CrawlCommand[<site name>]"}. */
    @Override
    public String getName() {
        return "CrawlCommand[" + strategy.getSiteName() + "]";
    }

    /** Returns the output file name passed to the constructor. */
    public String getOutputFile() {
        return outputFile;
    }

    /** Returns the strategy this command crawls with. */
    public CrawlStrategy getStrategy() {
        return strategy;
    }

    /** Returns the statistics collected by this command. */
    public Statistics<String> getStatistics() {
        return statistics;
    }

    /**
     * Maps a network failure to a user-facing diagnosis.
     *
     * <p>The exception itself and then its causes are examined by type; if none match,
     * the exception message is matched case-insensitively against well-known fragments.
     *
     * @param e the I/O failure to classify
     * @return a diagnostic message, or {@code null} if the failure is not recognized
     */
    private String getNetworkExceptionType(IOException e) {
        Throwable current = e;
        for (int depth = 0; current != null && depth < MAX_CAUSE_DEPTH; depth++) {
            String diagnosis = describeByType(current);
            if (diagnosis != null) {
                return diagnosis;
            }
            current = current.getCause();
        }
        return describeByMessage(e.getMessage());
    }

    /** Classifies a throwable by its {@code java.net} exception type, or returns {@code null}. */
    private static String describeByType(Throwable t) {
        // ConnectException and NoRouteToHostException extend SocketException,
        // so they must be tested before the generic SocketException branch.
        if (t instanceof java.net.ConnectException) {
            return "【网络连接失败】无法连接到服务器,请检查网络连接";
        }
        if (t instanceof java.net.UnknownHostException) {
            return "【DNS解析失败】无法解析域名,请检查网络或DNS设置";
        }
        if (t instanceof java.net.NoRouteToHostException) {
            return "【路由不可达】无法到达目标主机,请检查网络连接";
        }
        if (t instanceof java.net.SocketException) {
            return "【Socket异常】网络连接异常,请检查网络状态";
        }
        if (t instanceof java.net.SocketTimeoutException) {
            return "【连接超时】网络请求超时,请检查网络稳定性";
        }
        return null;
    }

    /** Classifies a failure by fragments of its message, or returns {@code null}. */
    private static String describeByMessage(String message) {
        if (message == null) {
            return null;
        }
        String text = message.toLowerCase(java.util.Locale.ROOT);
        if (text.contains("connection refused")) {
            return "【连接被拒绝】服务器拒绝连接,请检查网络或稍后重试";
        }
        if (text.contains("unknownhost")) {
            return "【域名解析失败】无法解析域名,请检查网络连接";
        }
        // JDK timeout messages usually read "Read timed out" / "connect timed out".
        if (text.contains("timeout") || text.contains("timed out")) {
            return "【请求超时】网络请求超时,请检查网络稳定性";
        }
        if (text.contains("network is unreachable")) {
            return "【网络不可达】网络连接不可用,请检查网络状态";
        }
        if (text.contains("connection reset")) {
            return "【连接重置】连接被服务器重置,请检查网络";
        }
        return null;
    }
}