You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

197 lines
8.4 KiB

package com.example.moviecli.command;
import com.example.moviecli.exception.CrawlFailedException;
import com.example.moviecli.exception.ParseFailedException;
import com.example.moviecli.exception.SaveFailedException;
import com.example.moviecli.model.Movie;
import com.example.moviecli.repository.MovieRepository;
import com.example.moviecli.strategy.MovieCrawlStrategy;
import com.example.moviecli.strategy.MovieStrategyFactory;
import com.example.moviecli.view.ConsoleView;
import com.opencsv.CSVWriter;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * CLI command {@code crawl <url>}.
 *
 * <p>Looks up a {@link MovieCrawlStrategy} for the URL, downloads the page(s) with Jsoup, lets the
 * strategy parse them, adds the parsed items to the supplied {@link MovieRepository} and exports
 * them to a CSV file (relative file name, UTF-8 encoded).
 *
 * <p>Failures are reported through the {@link ConsoleView} and printed as stack traces; they are
 * never propagated to the caller of {@link #execute(String[], MovieRepository)}.
 */
public class CrawlCommand implements Command {

    /** User agent sent to the Douban and Sina endpoints. */
    private static final String BROWSER_USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

    /** User agent sent when crawling a URL that has no dedicated crawl routine. */
    private static final String FALLBACK_USER_AGENT = "Mozilla/5.0";

    /** Connection timeout for the Douban and Sina endpoints, in milliseconds. */
    private static final int SITE_TIMEOUT_MILLIS = 15000;

    /** Connection timeout for the single-page fallback, in milliseconds. */
    private static final int FALLBACK_TIMEOUT_MILLIS = 10000;

    /** Step of the {@code start} query parameter between two list pages. */
    private static final int PAGE_SIZE = 25;

    /** Pause between two consecutive page requests, in milliseconds. */
    private static final long PAGE_DELAY_MILLIS = 1500L;

    private final ConsoleView view;
    private final MovieStrategyFactory factory;

    /**
     * @param view    console used for all info, success and error output
     * @param factory source of the parsing strategy for a given URL
     */
    public CrawlCommand(ConsoleView view, MovieStrategyFactory factory) {
        this.view = view;
        this.factory = factory;
    }

    /** @return the command keyword, {@code "crawl"} */
    @Override
    public String getName() {
        return "crawl";
    }

    /**
     * Runs the crawl for the URL given in {@code args[1]}.
     *
     * <p>Prints usage help when the URL argument is missing, and an error when the factory has no
     * strategy for the URL. Otherwise the URL is matched by substring against the known sites;
     * anything else is crawled as a single page.
     *
     * @param args       command line tokens; {@code args[1]} is the URL
     * @param repository repository that receives every parsed item
     */
    @Override
    public void execute(String[] args, MovieRepository repository) {
        if (args == null || args.length < 2) {
            view.printError("用法: crawl <url>");
            view.printInfo("支持的 URL 示例:");
            view.printInfo(" https://movie.douban.com/top250");
            view.printInfo(" https://news.sina.com.cn/");
            view.printInfo(" https://book.douban.com/top250");
            return;
        }

        String url = args[1];
        MovieCrawlStrategy strategy = factory.getStrategy(url);
        if (strategy == null) {
            view.printError("不支持该 URL 的爬取策略: " + url);
            return;
        }

        if (url.contains("movie.douban.com/top250")) {
            crawlDoubanTop250(strategy, repository);
        } else if (url.contains("news.sina.com.cn")) {
            crawlSinaNews(strategy, repository);
        } else if (url.contains("book.douban.com/top250")) {
            crawlDoubanBookTop50(strategy, repository);
        } else {
            crawlSinglePage(url, strategy, repository);
        }
    }

    /** Douban movie Top250 (ten list pages) -> douban_movies.csv */
    private void crawlDoubanTop250(MovieCrawlStrategy strategy, MovieRepository repository) {
        crawlPaged(
                "https://movie.douban.com/top250",
                250,
                "豆瓣电影爬取失败: ",
                "豆瓣电影 Top250 全部爬取完成,共 ",
                "douban_movies.csv",
                strategy,
                repository);
    }

    /** Sina news front page -> sina_news.csv. Always fetches the fixed front-page URL. */
    private void crawlSinaNews(MovieCrawlStrategy strategy, MovieRepository repository) {
        crawlOnePage(
                "https://news.sina.com.cn/",
                BROWSER_USER_AGENT,
                SITE_TIMEOUT_MILLIS,
                "正在爬取新浪新闻: ",
                "新浪新闻爬取失败: ",
                "新浪新闻爬取完成,共 ",
                "sina_news.csv",
                strategy,
                repository);
    }

    /** Douban book list, first 50 entries (two list pages) -> douban_books.csv */
    private void crawlDoubanBookTop50(MovieCrawlStrategy strategy, MovieRepository repository) {
        crawlPaged(
                "https://book.douban.com/top250",
                50,
                "豆瓣图书爬取失败: ",
                "豆瓣图书 Top50 爬取完成,共 ",
                "douban_books.csv",
                strategy,
                repository);
    }

    /** Single-page fallback for URLs that match none of the known sites -> unknown.csv */
    private void crawlSinglePage(String url, MovieCrawlStrategy strategy, MovieRepository repository) {
        crawlOnePage(
                url,
                FALLBACK_USER_AGENT,
                FALLBACK_TIMEOUT_MILLIS,
                "正在爬取: ",
                "爬取失败: ",
                "爬取完成!共 ",
                "unknown.csv",
                strategy,
                repository);
    }

    /**
     * Crawls a list that is paged through a {@code ?start=N} query parameter.
     *
     * <p>A page that fails to download or parse is reported and skipped; the remaining pages are
     * still attempted. If the thread is interrupted while waiting between pages, the interrupt
     * flag is restored, paging stops and whatever was collected so far is still saved.
     *
     * @param listUrl        list URL without the query string
     * @param itemLimit      exclusive upper bound for the {@code start} parameter
     * @param failurePrefix  message prefix used when a page cannot be fetched
     * @param successPrefix  message prefix of the final summary (item count is appended)
     * @param csvFile        name of the CSV file the collected items are written to
     * @param strategy       parser for the downloaded documents
     * @param repository     repository that receives every parsed item
     */
    private void crawlPaged(
            String listUrl,
            int itemLimit,
            String failurePrefix,
            String successPrefix,
            String csvFile,
            MovieCrawlStrategy strategy,
            MovieRepository repository) {
        List<Movie> collected = new ArrayList<>();
        boolean interrupted = false;

        for (int start = 0; start < itemLimit; start += PAGE_SIZE) {
            String pageUrl = listUrl + "?start=" + start;
            try {
                view.printInfo("正在爬取: " + pageUrl);
                Document doc = fetch(pageUrl, BROWSER_USER_AGENT, SITE_TIMEOUT_MILLIS);
                List<Movie> pageItems = strategy.parse(doc);
                collected.addAll(pageItems);
                repository.addAll(pageItems);
                view.printInfo("已累计爬取 " + collected.size() + " 条...");
                // Politeness delay between requests; pointless after the last page.
                if (start + PAGE_SIZE < itemLimit) {
                    Thread.sleep(PAGE_DELAY_MILLIS);
                }
            } catch (ParseFailedException e) {
                view.printError("解析失败: " + e.getMessage());
                e.printStackTrace();
            } catch (InterruptedException e) {
                // Do not treat an interrupt as a crawl failure: keep the flag for the caller
                // and stop issuing further requests.
                Thread.currentThread().interrupt();
                view.printError("爬取被中断: " + pageUrl);
                interrupted = true;
                break;
            } catch (Exception e) {
                CrawlFailedException ex = new CrawlFailedException(failurePrefix + pageUrl, e);
                view.printError(ex.getMessage());
                ex.printStackTrace();
            }
        }

        if (!interrupted) {
            view.printSuccess(successPrefix + collected.size() + " 条记录。");
        }
        saveToCsv(collected, csvFile);
    }

    /**
     * Downloads and parses exactly one page, stores the items and exports them to CSV.
     *
     * @param url            page to download
     * @param userAgent      user agent header for the request
     * @param timeoutMillis  request timeout in milliseconds
     * @param startPrefix    info message prefix printed before the request (URL is appended)
     * @param failurePrefix  message prefix used when the page cannot be fetched (URL is appended)
     * @param successPrefix  message prefix of the summary (item count is appended)
     * @param csvFile        name of the CSV file the items are written to
     * @param strategy       parser for the downloaded document
     * @param repository     repository that receives every parsed item
     */
    private void crawlOnePage(
            String url,
            String userAgent,
            int timeoutMillis,
            String startPrefix,
            String failurePrefix,
            String successPrefix,
            String csvFile,
            MovieCrawlStrategy strategy,
            MovieRepository repository) {
        try {
            view.printInfo(startPrefix + url);
            Document doc = fetch(url, userAgent, timeoutMillis);
            List<Movie> items = strategy.parse(doc);
            repository.addAll(items);
            view.printSuccess(successPrefix + items.size() + " 条记录。");
            saveToCsv(items, csvFile);
        } catch (ParseFailedException e) {
            view.printError("解析失败: " + e.getMessage());
            e.printStackTrace();
        } catch (Exception e) {
            CrawlFailedException ex = new CrawlFailedException(failurePrefix + url, e);
            view.printError(ex.getMessage());
            ex.printStackTrace();
        }
    }

    /**
     * Issues a GET request with Jsoup and returns the parsed document.
     *
     * @throws IOException if the request fails
     */
    private Document fetch(String url, String userAgent, int timeoutMillis) throws IOException {
        return Jsoup.connect(url)
                .userAgent(userAgent)
                .timeout(timeoutMillis)
                .get();
    }

    /**
     * Writes the items to a CSV file with the header
     * {@code Rank, Title, OriginalTitle, Score, Year, Director}.
     *
     * <p>An existing file of the same name is overwritten. Nothing is written when there are no
     * items. The file is encoded as UTF-8 regardless of the platform default charset. Write
     * errors are reported through the view and not rethrown.
     *
     * @param items    items to export; {@code null} is treated like an empty list
     * @param filename target file name
     */
    private void saveToCsv(List<Movie> items, String filename) {
        if (items == null || items.isEmpty()) {
            view.printInfo("没有数据可保存到 " + filename);
            return;
        }
        // java.io streams are used on purpose: they are not closed by a pending thread interrupt,
        // so partial results can still be saved after an interrupted crawl.
        try (Writer out = new OutputStreamWriter(new FileOutputStream(filename), StandardCharsets.UTF_8);
                CSVWriter writer = new CSVWriter(out)) {
            String[] header = {"Rank", "Title", "OriginalTitle", "Score", "Year", "Director"};
            writer.writeNext(header);
            for (Movie m : items) {
                String[] line = {
                    String.valueOf(m.getRank()),
                    m.getTitle(),
                    m.getOriginalTitle(),
                    m.getScore(),
                    m.getYear(),
                    m.getDirector()
                };
                writer.writeNext(line);
            }
            view.printSuccess("已保存 " + items.size() + " 条记录到 " + filename);
        } catch (Exception e) {
            SaveFailedException ex = new SaveFailedException("保存 " + filename + " 失败", e);
            view.printError(ex.getMessage());
            ex.printStackTrace();
        }
    }
}