You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
331 lines
14 KiB
331 lines
14 KiB
package controller;
|
|
|
|
import model.CrawlResult;
import model.Statistics;
import model.ResultContainer;
import repository.Repository;
import command.Command;
import command.CommandInvoker;
import command.CrawlCommand;
import command.RetryCommand;
import strategy.CrawlStrategy;
import strategy.DangDangStrategy;
import strategy.WeatherStrategy;
import strategy.MovieStrategy;
import strategy.Train12306Strategy;
import strategy.CsdnBlogStrategy;
import exception.CrawlerException;
import exception.NetworkException;
import view.CrawlerView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.function.Supplier;
|
|
|
|
public class CrawlerController {
|
|
private static final Logger logger = LoggerFactory.getLogger(CrawlerController.class);
|
|
|
|
private final CrawlerView view;
|
|
private final CommandInvoker invoker;
|
|
private final Repository<CrawlResult> dataRepository;
|
|
private final Statistics<String> statistics;
|
|
|
|
public CrawlerController(CrawlerView view) {
|
|
if (view == null) {
|
|
throw new IllegalArgumentException("View cannot be null");
|
|
}
|
|
this.view = view;
|
|
this.invoker = new CommandInvoker(view);
|
|
this.dataRepository = new Repository<>(CrawlResult.class);
|
|
this.statistics = new Statistics<>("CrawlerController");
|
|
logger.info("CrawlerController 初始化完成");
|
|
}
|
|
|
|
public ResultContainer<List<CrawlResult>> runDangDangCrawler() {
|
|
logger.info("开始执行当当网图书爬虫");
|
|
statistics.record("dangdang_start", System.currentTimeMillis());
|
|
try {
|
|
CrawlStrategy strategy = new DangDangStrategy();
|
|
Command command = new CrawlCommand(strategy, 1, 5, "dangdang_books.txt", view);
|
|
Command retryCommand = new RetryCommand(command, 3, view);
|
|
|
|
List<CrawlResult> results = retryCommand.execute();
|
|
logger.info("当当网爬虫执行成功,获取 {} 条数据", results.size());
|
|
return processResults(results, "dangdang_books.txt", "当当网图书");
|
|
} catch (NetworkException e) {
|
|
logger.error("【断网异常】当当网爬虫网络请求失败: {}", e.getMessage());
|
|
view.showError("【断网异常】当当网爬虫网络请求失败: " + e.getMessage());
|
|
statistics.increment("dangdang_failures");
|
|
return ResultContainer.failure("【断网异常】当当网爬虫失败 - 网络连接异常: " + e.getMessage(), e);
|
|
} catch (CrawlerException e) {
|
|
logger.error("当当网爬虫执行失败: {}", e.getMessage());
|
|
statistics.increment("dangdang_failures");
|
|
return ResultContainer.failure("当当网爬虫失败: " + e.getMessage(), e);
|
|
}
|
|
}
|
|
|
|
public ResultContainer<List<CrawlResult>> runWeatherCrawler() {
|
|
logger.info("开始执行中国天气网爬虫");
|
|
statistics.record("weather_start", System.currentTimeMillis());
|
|
try {
|
|
CrawlStrategy strategy = new WeatherStrategy();
|
|
Command command = new CrawlCommand(strategy, 1, 14, "weather_cities.txt", view);
|
|
Command retryCommand = new RetryCommand(command, 3, view);
|
|
|
|
List<CrawlResult> results = retryCommand.execute();
|
|
logger.info("中国天气网爬虫执行成功,获取 {} 条数据", results.size());
|
|
return processResults(results, "weather_cities.txt", "中国天气网");
|
|
} catch (NetworkException e) {
|
|
logger.error("【断网异常】中国天气网爬虫网络请求失败: {}", e.getMessage());
|
|
view.showError("【断网异常】中国天气网爬虫网络请求失败: " + e.getMessage());
|
|
statistics.increment("weather_failures");
|
|
return ResultContainer.failure("【断网异常】中国天气网爬虫失败 - 网络连接异常: " + e.getMessage(), e);
|
|
} catch (CrawlerException e) {
|
|
logger.error("中国天气网爬虫执行失败: {}", e.getMessage());
|
|
statistics.increment("weather_failures");
|
|
return ResultContainer.failure("天气网爬虫失败: " + e.getMessage(), e);
|
|
}
|
|
}
|
|
|
|
public ResultContainer<List<CrawlResult>> runMaoyanMovieCrawler() {
|
|
logger.info("开始执行猫眼电影爬虫");
|
|
statistics.record("maoyan_start", System.currentTimeMillis());
|
|
try {
|
|
CrawlStrategy strategy = new MovieStrategy();
|
|
Command command = new CrawlCommand(strategy, 1, 10, "maoyan_top100.txt", view);
|
|
Command retryCommand = new RetryCommand(command, 3, view);
|
|
|
|
List<CrawlResult> results = retryCommand.execute();
|
|
logger.info("猫眼电影爬虫执行成功,获取 {} 条数据", results.size());
|
|
return processResults(results, "maoyan_top100.txt", "猫眼电影");
|
|
} catch (NetworkException e) {
|
|
logger.error("【断网异常】猫眼电影爬虫网络请求失败: {}", e.getMessage());
|
|
view.showError("【断网异常】猫眼电影爬虫网络请求失败: " + e.getMessage());
|
|
statistics.increment("maoyan_failures");
|
|
return ResultContainer.failure("【断网异常】猫眼电影爬虫失败 - 网络连接异常: " + e.getMessage(), e);
|
|
} catch (CrawlerException e) {
|
|
logger.error("猫眼电影爬虫执行失败: {}", e.getMessage());
|
|
statistics.increment("maoyan_failures");
|
|
return ResultContainer.failure("猫眼电影爬虫失败: " + e.getMessage(), e);
|
|
}
|
|
}
|
|
|
|
public ResultContainer<List<CrawlResult>> runTrain12306Crawler() {
|
|
logger.info("开始执行12306火车票爬虫");
|
|
statistics.record("12306_start", System.currentTimeMillis());
|
|
try {
|
|
CrawlStrategy strategy = new Train12306Strategy();
|
|
Command command = new CrawlCommand(strategy, 1, 10, "train_12306.txt", view);
|
|
Command retryCommand = new RetryCommand(command, 3, view);
|
|
|
|
List<CrawlResult> results = retryCommand.execute();
|
|
logger.info("12306爬虫执行成功,获取 {} 条数据", results.size());
|
|
return processResults(results, "train_12306.txt", "12306火车票");
|
|
} catch (NetworkException e) {
|
|
logger.error("【断网异常】12306爬虫网络请求失败: {}", e.getMessage());
|
|
view.showError("【断网异常】12306爬虫网络请求失败: " + e.getMessage());
|
|
statistics.increment("12306_failures");
|
|
return ResultContainer.failure("【断网异常】12306爬虫失败 - 网络连接异常: " + e.getMessage(), e);
|
|
} catch (CrawlerException e) {
|
|
logger.error("12306爬虫执行失败: {}", e.getMessage());
|
|
statistics.increment("12306_failures");
|
|
return ResultContainer.failure("12306爬虫失败: " + e.getMessage(), e);
|
|
}
|
|
}
|
|
|
|
public ResultContainer<List<CrawlResult>> runCsdnBlogCrawler() {
|
|
logger.info("开始执行CSDN博客爬虫");
|
|
statistics.record("csdn_start", System.currentTimeMillis());
|
|
try {
|
|
CrawlStrategy strategy = new CsdnBlogStrategy();
|
|
Command command = new CrawlCommand(strategy, 1, 15, "csdn_blogs.txt", view);
|
|
Command retryCommand = new RetryCommand(command, 3, view);
|
|
|
|
List<CrawlResult> results = retryCommand.execute();
|
|
logger.info("CSDN博客爬虫执行成功,获取 {} 条数据", results.size());
|
|
return processResults(results, "csdn_blogs.txt", "CSDN博客");
|
|
} catch (NetworkException e) {
|
|
logger.error("【断网异常】CSDN博客爬虫网络请求失败: {}", e.getMessage());
|
|
view.showError("【断网异常】CSDN博客爬虫网络请求失败: " + e.getMessage());
|
|
statistics.increment("csdn_failures");
|
|
return ResultContainer.failure("【断网异常】CSDN博客爬虫失败 - 网络连接异常: " + e.getMessage(), e);
|
|
} catch (CrawlerException e) {
|
|
logger.error("CSDN博客爬虫执行失败: {}", e.getMessage());
|
|
statistics.increment("csdn_failures");
|
|
return ResultContainer.failure("CSDN博客爬虫失败: " + e.getMessage(), e);
|
|
}
|
|
}
|
|
|
|
private ResultContainer<List<CrawlResult>> processResults(List<CrawlResult> results,
|
|
String filename, String siteName) {
|
|
if (results == null || results.isEmpty()) {
|
|
logger.warn("{} 爬取结果为空", siteName);
|
|
return ResultContainer.failure(siteName + "爬取结果为空");
|
|
}
|
|
|
|
for (CrawlResult result : results) {
|
|
dataRepository.add(result);
|
|
}
|
|
|
|
saveToFile(results, filename);
|
|
saveToJson(results, filename.replace(".txt", ".json"));
|
|
|
|
statistics.record(siteName + "_count", results.size());
|
|
statistics.record(siteName + "_end", System.currentTimeMillis());
|
|
statistics.increment("total_items", results.size());
|
|
|
|
logger.info("{} 爬取完成,共 {} 条数据,已保存到 {}", siteName, results.size(), filename);
|
|
return ResultContainer.success(results, siteName + "爬取完成,共 " + results.size() + " 条数据");
|
|
}
|
|
|
|
public void runAllCrawlers() {
|
|
logger.info("开始执行所有爬虫");
|
|
int successCount = 0;
|
|
int failCount = 0;
|
|
|
|
ResultContainer<List<CrawlResult>> result;
|
|
|
|
view.showHeader("当当网图书爬虫");
|
|
result = runDangDangCrawler();
|
|
if (result.isSuccess()) {
|
|
successCount++;
|
|
view.showSuccess(result.getMessage());
|
|
} else {
|
|
failCount++;
|
|
view.showError(result.getMessage());
|
|
}
|
|
|
|
view.showHeader("中国天气网爬虫");
|
|
result = runWeatherCrawler();
|
|
if (result.isSuccess()) {
|
|
successCount++;
|
|
view.showSuccess(result.getMessage());
|
|
} else {
|
|
failCount++;
|
|
view.showError(result.getMessage());
|
|
}
|
|
|
|
view.showHeader("猫眼电影爬虫");
|
|
result = runMaoyanMovieCrawler();
|
|
if (result.isSuccess()) {
|
|
successCount++;
|
|
view.showSuccess(result.getMessage());
|
|
} else {
|
|
failCount++;
|
|
view.showError(result.getMessage());
|
|
}
|
|
|
|
view.showHeader("12306火车票爬虫");
|
|
result = runTrain12306Crawler();
|
|
if (result.isSuccess()) {
|
|
successCount++;
|
|
view.showSuccess(result.getMessage());
|
|
} else {
|
|
failCount++;
|
|
view.showError(result.getMessage());
|
|
}
|
|
|
|
view.showHeader("CSDN博客爬虫");
|
|
result = runCsdnBlogCrawler();
|
|
if (result.isSuccess()) {
|
|
successCount++;
|
|
view.showSuccess(result.getMessage());
|
|
} else {
|
|
failCount++;
|
|
view.showError(result.getMessage());
|
|
}
|
|
|
|
view.showLine();
|
|
view.showMessage("所有爬虫执行完成");
|
|
view.showMessage("成功: " + successCount + " 个");
|
|
view.showMessage("失败: " + failCount + " 个");
|
|
view.showMessage("总计采集数据: " + statistics.getCount("total_items") + " 条");
|
|
|
|
statistics.record("success_count", successCount);
|
|
statistics.record("fail_count", failCount);
|
|
logger.info("所有爬虫执行完成,成功: {},失败: {},总计数据: {}",
|
|
successCount, failCount, statistics.getCount("total_items"));
|
|
}
|
|
|
|
public void saveToFile(List<CrawlResult> results, String filename) {
|
|
if (filename == null || filename.trim().isEmpty()) {
|
|
logger.error("文件名为空,无法保存");
|
|
view.showError("文件名不能为空");
|
|
return;
|
|
}
|
|
|
|
try {
|
|
File file = new File(filename);
|
|
File parentDir = file.getParentFile();
|
|
if (parentDir != null && !parentDir.exists()) {
|
|
parentDir.mkdirs();
|
|
}
|
|
|
|
try (PrintWriter writer = new PrintWriter(new OutputStreamWriter(new FileOutputStream(file), "UTF-8"))) {
|
|
writer.println("Title,Price,OriginalPrice,Discount,ImageUrl,Author");
|
|
for (CrawlResult result : results) {
|
|
if (result != null) {
|
|
writer.println(result.toString());
|
|
}
|
|
}
|
|
}
|
|
logger.info("文件保存成功: {}", filename);
|
|
view.showSuccess("文件保存成功: " + filename);
|
|
} catch (IOException e) {
|
|
logger.error("保存文件失败: {} - {}", filename, e.getMessage());
|
|
view.showError("保存文件失败: " + filename + " (" + e.getMessage() + ")");
|
|
}
|
|
}
|
|
|
|
public void saveToJson(List<CrawlResult> results, String filename) {
|
|
if (filename == null || filename.trim().isEmpty()) {
|
|
logger.error("JSON文件名为空,无法保存");
|
|
view.showError("文件名不能为空");
|
|
return;
|
|
}
|
|
|
|
try {
|
|
File file = new File(filename);
|
|
File parentDir = file.getParentFile();
|
|
if (parentDir != null && !parentDir.exists()) {
|
|
parentDir.mkdirs();
|
|
}
|
|
|
|
StringBuilder json = new StringBuilder();
|
|
json.append("[\n");
|
|
for (int i = 0; i < results.size(); i++) {
|
|
CrawlResult result = results.get(i);
|
|
if (result != null) {
|
|
json.append(" ").append(result.toJson());
|
|
if (i < results.size() - 1) {
|
|
json.append(",");
|
|
}
|
|
json.append("\n");
|
|
}
|
|
}
|
|
json.append("]");
|
|
|
|
try (PrintWriter writer = new PrintWriter(new OutputStreamWriter(new FileOutputStream(file), "UTF-8"))) {
|
|
writer.write(json.toString());
|
|
}
|
|
logger.info("JSON文件保存成功: {}", filename);
|
|
view.showSuccess("JSON文件保存成功: " + filename);
|
|
} catch (IOException e) {
|
|
logger.error("保存JSON文件失败: {} - {}", filename, e.getMessage());
|
|
view.showError("保存JSON文件失败: " + filename + " (" + e.getMessage() + ")");
|
|
}
|
|
}
|
|
|
|
public Repository<CrawlResult> getDataRepository() {
|
|
return dataRepository;
|
|
}
|
|
|
|
public Statistics<String> getStatistics() {
|
|
return statistics;
|
|
}
|
|
}
|