You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
244 lines
11 KiB
244 lines
11 KiB
package command;
|
|
|
|
import model.CrawlResult;
|
|
import model.Statistics;
|
|
import model.ResultContainer;
|
|
import strategy.CrawlStrategy;
|
|
import exception.CrawlerException;
|
|
import exception.NetworkException;
|
|
import exception.ParseException;
|
|
import view.CrawlerView;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
import java.io.IOException;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
|
|
public class CrawlCommand implements Command {
|
|
private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class);
|
|
|
|
private final CrawlStrategy strategy;
|
|
private final int startPage;
|
|
private final int endPage;
|
|
private final String outputFile;
|
|
private final CrawlerView view;
|
|
private final int maxRetries;
|
|
private final int baseRetryDelay;
|
|
private final Statistics<String> statistics;
|
|
private final int pageDelay;
|
|
|
|
public CrawlCommand(CrawlStrategy strategy, int startPage, int endPage,
|
|
String outputFile, CrawlerView view) {
|
|
this(strategy, startPage, endPage, outputFile, view, 3, 1500, 1500);
|
|
}
|
|
|
|
public CrawlCommand(CrawlStrategy strategy, int startPage, int endPage,
|
|
String outputFile, CrawlerView view, int maxRetries, int baseRetryDelay, int pageDelay) {
|
|
validateConstructorParams(strategy, startPage, endPage, view);
|
|
this.strategy = strategy;
|
|
this.startPage = startPage;
|
|
this.endPage = endPage;
|
|
this.outputFile = outputFile;
|
|
this.view = view;
|
|
this.maxRetries = maxRetries;
|
|
this.baseRetryDelay = baseRetryDelay;
|
|
this.pageDelay = pageDelay;
|
|
this.statistics = new Statistics<>("CrawlCommand_" + strategy.getSiteName());
|
|
logger.debug("CrawlCommand 初始化: site={}, pages={}-{}, pageDelay={}ms",
|
|
strategy.getSiteName(), startPage, endPage, pageDelay);
|
|
}
|
|
|
|
private void validateConstructorParams(CrawlStrategy strategy, int startPage,
|
|
int endPage, CrawlerView view) {
|
|
if (strategy == null) {
|
|
throw new IllegalArgumentException("Strategy cannot be null");
|
|
}
|
|
if (startPage < 1) {
|
|
throw new IllegalArgumentException("Start page must be >= 1");
|
|
}
|
|
if (endPage < startPage) {
|
|
throw new IllegalArgumentException("End page must be >= start page");
|
|
}
|
|
if (view == null) {
|
|
throw new IllegalArgumentException("View cannot be null");
|
|
}
|
|
}
|
|
|
|
@Override
|
|
public List<CrawlResult> execute() throws CrawlerException {
|
|
List<CrawlResult> allResults = new ArrayList<>();
|
|
int consecutiveFailures = 0;
|
|
logger.info("开始爬取: {} (页码 {} 到 {})", strategy.getSiteName(), startPage, endPage);
|
|
view.showMessage("开始爬取: " + strategy.getSiteName());
|
|
view.showMessage("目标: " + (endPage - startPage + 1) + " 页数据,每页间隔 " + pageDelay + "ms");
|
|
view.showLine();
|
|
|
|
for (int page = startPage; page <= endPage; page++) {
|
|
List<CrawlResult> pageResults = null;
|
|
boolean pageSuccess = false;
|
|
|
|
for (int retry = 0; retry < maxRetries; retry++) {
|
|
try {
|
|
logger.info("正在爬取第 {} 页...", page);
|
|
view.showMessage("正在爬取第 " + page + " 页...");
|
|
pageResults = strategy.crawlPage(page);
|
|
|
|
if (pageResults != null && !pageResults.isEmpty()) {
|
|
consecutiveFailures = 0;
|
|
statistics.increment("success_pages");
|
|
logger.debug("第 {} 页爬取成功,获取 {} 条数据", page, pageResults.size());
|
|
pageSuccess = true;
|
|
break;
|
|
}
|
|
logger.warn("第 {} 页返回空结果", page);
|
|
view.showWarning("第 " + page + " 页返回空结果");
|
|
|
|
if (retry < maxRetries - 1) {
|
|
int delay = baseRetryDelay * (int) Math.pow(2, retry);
|
|
view.showMessage("等待 " + delay + "ms 后重试...");
|
|
Thread.sleep(delay);
|
|
}
|
|
} catch (ParseException e) {
|
|
consecutiveFailures++;
|
|
statistics.increment("parse_failures");
|
|
logger.error("第 {} 页解析失败 (尝试 {}/{}): {}",
|
|
page, retry + 1, maxRetries, e.getMessage());
|
|
view.showError("解析失败: " + e.getMessage());
|
|
|
|
if (retry < maxRetries - 1) {
|
|
try {
|
|
int delay = baseRetryDelay * (int) Math.pow(2, retry);
|
|
view.showMessage("等待 " + delay + "ms 后重试...");
|
|
Thread.sleep(delay);
|
|
} catch (InterruptedException ie) {
|
|
Thread.currentThread().interrupt();
|
|
logger.warn("爬取过程被中断");
|
|
throw new CrawlerException("爬取被中断", ie);
|
|
}
|
|
}
|
|
} catch (IOException e) {
|
|
consecutiveFailures++;
|
|
statistics.increment("network_failures");
|
|
logger.error("【网络异常】第 {} 页网络请求失败 (尝试 {}/{}): {}",
|
|
page, retry + 1, maxRetries, e.getMessage());
|
|
view.showError("【网络异常】" + e.getMessage());
|
|
|
|
String exceptionType = getNetworkExceptionType(e);
|
|
if (exceptionType != null) {
|
|
logger.error("【断网检测】检测到网络中断类型: {}", exceptionType);
|
|
view.showError("【断网检测】" + exceptionType);
|
|
}
|
|
|
|
if (consecutiveFailures >= 2) {
|
|
logger.error("【断网检测】连续失败2次,判定为网络异常,立即停止爬取");
|
|
view.showError("【断网检测】连续失败2次,判定为网络异常,停止爬取");
|
|
throw new NetworkException("【断网异常】网络请求连续失败,请检查网络连接状态: " + e.getMessage(), e);
|
|
}
|
|
|
|
if (retry < maxRetries - 1) {
|
|
try {
|
|
int delay = baseRetryDelay * (int) Math.pow(2, retry);
|
|
view.showMessage("等待 " + delay + "ms 后重试...");
|
|
Thread.sleep(delay);
|
|
} catch (InterruptedException ie) {
|
|
Thread.currentThread().interrupt();
|
|
logger.warn("爬取过程被中断");
|
|
throw new NetworkException("爬取被中断", ie);
|
|
}
|
|
} else {
|
|
throw new NetworkException("【网络连接失败】网络请求失败,请检查网络连接: " + e.getMessage(), e);
|
|
}
|
|
} catch (InterruptedException e) {
|
|
Thread.currentThread().interrupt();
|
|
logger.warn("爬取过程被中断");
|
|
throw new CrawlerException("爬取被中断", e);
|
|
}
|
|
}
|
|
|
|
if (pageSuccess && pageResults != null) {
|
|
allResults.addAll(pageResults);
|
|
view.showSuccess("Page " + page + ": " + pageResults.size() + " items");
|
|
statistics.increment("total_items", pageResults.size());
|
|
}
|
|
|
|
if (page < endPage && pageSuccess) {
|
|
try {
|
|
logger.debug("翻页间隔等待 {}ms", pageDelay);
|
|
Thread.sleep(pageDelay);
|
|
} catch (InterruptedException e) {
|
|
Thread.currentThread().interrupt();
|
|
logger.warn("翻页等待被中断");
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (allResults.size() >= 200) {
|
|
logger.info("已达到最大数据量 200 条,停止爬取");
|
|
view.showWarning("已达到最大数据量 200 条,停止爬取");
|
|
break;
|
|
}
|
|
}
|
|
|
|
view.showLine();
|
|
view.showMessage("爬取完成,共获取 " + allResults.size() + " 条数据");
|
|
statistics.record("total_items", allResults.size());
|
|
logger.info("爬取完成: {} 共获取 {} 条数据", strategy.getSiteName(), allResults.size());
|
|
|
|
if (allResults.isEmpty()) {
|
|
view.showError("警告: 未能获取到任何数据!");
|
|
logger.error("未能获取到任何数据");
|
|
}
|
|
|
|
return allResults;
|
|
}
|
|
|
|
@Override
|
|
public String getName() {
|
|
return "CrawlCommand[" + strategy.getSiteName() + "]";
|
|
}
|
|
|
|
public String getOutputFile() {
|
|
return outputFile;
|
|
}
|
|
|
|
public CrawlStrategy getStrategy() {
|
|
return strategy;
|
|
}
|
|
|
|
public Statistics<String> getStatistics() {
|
|
return statistics;
|
|
}
|
|
|
|
private String getNetworkExceptionType(IOException e) {
|
|
Throwable cause = e.getCause();
|
|
if (cause instanceof java.net.ConnectException) {
|
|
return "【网络连接失败】无法连接到服务器,请检查网络连接";
|
|
} else if (cause instanceof java.net.UnknownHostException) {
|
|
return "【DNS解析失败】无法解析域名,请检查网络或DNS设置";
|
|
} else if (cause instanceof java.net.NoRouteToHostException) {
|
|
return "【路由不可达】无法到达目标主机,请检查网络连接";
|
|
} else if (cause instanceof java.net.SocketException) {
|
|
return "【Socket异常】网络连接异常,请检查网络状态";
|
|
} else if (cause instanceof java.net.SocketTimeoutException) {
|
|
return "【连接超时】网络请求超时,请检查网络稳定性";
|
|
}
|
|
|
|
String message = e.getMessage();
|
|
if (message != null) {
|
|
if (message.contains("Connection refused")) {
|
|
return "【连接被拒绝】服务器拒绝连接,请检查网络或稍后重试";
|
|
} else if (message.contains("UnknownHost")) {
|
|
return "【域名解析失败】无法解析域名,请检查网络连接";
|
|
} else if (message.contains("timeout")) {
|
|
return "【请求超时】网络请求超时,请检查网络稳定性";
|
|
} else if (message.contains("Network is unreachable")) {
|
|
return "【网络不可达】网络连接不可用,请检查网络状态";
|
|
} else if (message.contains("Connection reset")) {
|
|
return "【连接重置】连接被服务器重置,请检查网络";
|
|
}
|
|
}
|
|
|
|
return null;
|
|
}
|
|
}
|