You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
197 lines
8.4 KiB
197 lines
8.4 KiB
package com.example.moviecli.command;
|
|
|
|
import com.example.moviecli.exception.CrawlFailedException;
import com.example.moviecli.exception.ParseFailedException;
import com.example.moviecli.exception.SaveFailedException;
import com.example.moviecli.model.Movie;
import com.example.moviecli.repository.MovieRepository;
import com.example.moviecli.strategy.MovieCrawlStrategy;
import com.example.moviecli.strategy.MovieStrategyFactory;
import com.example.moviecli.view.ConsoleView;
import com.opencsv.CSVWriter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
|
|
|
|
public class CrawlCommand implements Command {
|
|
private final ConsoleView view;
|
|
private final MovieStrategyFactory factory;
|
|
|
|
public CrawlCommand(ConsoleView view, MovieStrategyFactory factory) {
|
|
this.view = view;
|
|
this.factory = factory;
|
|
}
|
|
|
|
@Override
|
|
public String getName() {
|
|
return "crawl";
|
|
}
|
|
|
|
@Override
|
|
public void execute(String[] args, MovieRepository repository) {
|
|
if (args.length < 2) {
|
|
view.printError("用法: crawl <url>");
|
|
view.printInfo("支持的 URL 示例:");
|
|
view.printInfo(" https://movie.douban.com/top250");
|
|
view.printInfo(" https://news.sina.com.cn/");
|
|
view.printInfo(" https://book.douban.com/top250");
|
|
return;
|
|
}
|
|
String url = args[1];
|
|
MovieCrawlStrategy strategy = factory.getStrategy(url);
|
|
if (strategy == null) {
|
|
view.printError("不支持该 URL 的爬取策略: " + url);
|
|
return;
|
|
}
|
|
|
|
if (url.contains("movie.douban.com/top250")) {
|
|
crawlDoubanTop250(strategy, repository);
|
|
} else if (url.contains("news.sina.com.cn")) {
|
|
crawlSinaNews(strategy, repository);
|
|
} else if (url.contains("book.douban.com/top250")) {
|
|
crawlDoubanBookTop50(strategy, repository);
|
|
} else {
|
|
crawlSinglePage(url, strategy, repository);
|
|
}
|
|
}
|
|
|
|
/** 豆瓣电影 Top250 -> douban_movies.csv */
|
|
private void crawlDoubanTop250(MovieCrawlStrategy strategy, MovieRepository repository) {
|
|
List<Movie> allMovies = new ArrayList<>();
|
|
int total = 0;
|
|
for (int start = 0; start < 250; start += 25) {
|
|
String pageUrl = "https://movie.douban.com/top250?start=" + start;
|
|
try {
|
|
view.printInfo("正在爬取: " + pageUrl);
|
|
Document doc = Jsoup.connect(pageUrl)
|
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
|
|
.timeout(15000)
|
|
.get();
|
|
List<Movie> pageMovies = strategy.parse(doc);
|
|
allMovies.addAll(pageMovies);
|
|
repository.addAll(pageMovies);
|
|
total += pageMovies.size();
|
|
view.printInfo("已累计爬取 " + total + " 条...");
|
|
Thread.sleep(1500);
|
|
} catch (ParseFailedException e) {
|
|
view.printError("解析失败: " + e.getMessage());
|
|
e.printStackTrace();
|
|
} catch (Exception e) {
|
|
CrawlFailedException ex = new CrawlFailedException("豆瓣电影爬取失败: " + pageUrl, e);
|
|
view.printError(ex.getMessage());
|
|
ex.printStackTrace();
|
|
}
|
|
}
|
|
view.printSuccess("豆瓣电影 Top250 全部爬取完成,共 " + total + " 条记录。");
|
|
saveToCsv(allMovies, "douban_movies.csv");
|
|
}
|
|
|
|
/** 新浪新闻首页 -> sina_news.csv */
|
|
private void crawlSinaNews(MovieCrawlStrategy strategy, MovieRepository repository) {
|
|
String url = "https://news.sina.com.cn/";
|
|
try {
|
|
view.printInfo("正在爬取新浪新闻: " + url);
|
|
Document doc = Jsoup.connect(url)
|
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
|
|
.timeout(15000)
|
|
.get();
|
|
List<Movie> news = strategy.parse(doc);
|
|
repository.addAll(news);
|
|
view.printSuccess("新浪新闻爬取完成,共 " + news.size() + " 条记录。");
|
|
saveToCsv(news, "sina_news.csv");
|
|
} catch (ParseFailedException e) {
|
|
view.printError("解析失败: " + e.getMessage());
|
|
e.printStackTrace();
|
|
} catch (Exception e) {
|
|
CrawlFailedException ex = new CrawlFailedException("新浪新闻爬取失败: " + url, e);
|
|
view.printError(ex.getMessage());
|
|
ex.printStackTrace();
|
|
}
|
|
}
|
|
|
|
/** 豆瓣图书 Top50 -> douban_books.csv */
|
|
private void crawlDoubanBookTop50(MovieCrawlStrategy strategy, MovieRepository repository) {
|
|
List<Movie> allMovies = new ArrayList<>();
|
|
int total = 0;
|
|
for (int start = 0; start < 50; start += 25) {
|
|
String pageUrl = "https://book.douban.com/top250?start=" + start;
|
|
try {
|
|
view.printInfo("正在爬取: " + pageUrl);
|
|
Document doc = Jsoup.connect(pageUrl)
|
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
|
|
.timeout(15000)
|
|
.get();
|
|
List<Movie> pageMovies = strategy.parse(doc);
|
|
allMovies.addAll(pageMovies);
|
|
repository.addAll(pageMovies);
|
|
total += pageMovies.size();
|
|
view.printInfo("已累计爬取 " + total + " 条...");
|
|
Thread.sleep(1500);
|
|
} catch (ParseFailedException e) {
|
|
view.printError("解析失败: " + e.getMessage());
|
|
e.printStackTrace();
|
|
} catch (Exception e) {
|
|
CrawlFailedException ex = new CrawlFailedException("豆瓣图书爬取失败: " + pageUrl, e);
|
|
view.printError(ex.getMessage());
|
|
ex.printStackTrace();
|
|
}
|
|
}
|
|
view.printSuccess("豆瓣图书 Top50 爬取完成,共 " + total + " 条记录。");
|
|
saveToCsv(allMovies, "douban_books.csv");
|
|
}
|
|
|
|
/** 单页兜底(未匹配的URL) */
|
|
private void crawlSinglePage(String url, MovieCrawlStrategy strategy, MovieRepository repository) {
|
|
List<Movie> allMovies = new ArrayList<>();
|
|
try {
|
|
view.printInfo("正在爬取: " + url);
|
|
Document doc = Jsoup.connect(url)
|
|
.userAgent("Mozilla/5.0")
|
|
.timeout(10000)
|
|
.get();
|
|
List<Movie> movies = strategy.parse(doc);
|
|
allMovies.addAll(movies);
|
|
repository.addAll(movies);
|
|
view.printSuccess("爬取完成!共 " + movies.size() + " 条记录。");
|
|
saveToCsv(allMovies, "unknown.csv");
|
|
} catch (ParseFailedException e) {
|
|
view.printError("解析失败: " + e.getMessage());
|
|
e.printStackTrace();
|
|
} catch (Exception e) {
|
|
CrawlFailedException ex = new CrawlFailedException("爬取失败: " + url, e);
|
|
view.printError(ex.getMessage());
|
|
ex.printStackTrace();
|
|
}
|
|
}
|
|
|
|
/** 保存电影/新闻列表到 CSV 文件 */
|
|
private void saveToCsv(List<Movie> items, String filename) {
|
|
if (items.isEmpty()) {
|
|
view.printInfo("没有数据可保存到 " + filename);
|
|
return;
|
|
}
|
|
try (CSVWriter writer = new CSVWriter(new FileWriter(filename))) {
|
|
String[] header = {"Rank", "Title", "OriginalTitle", "Score", "Year", "Director"};
|
|
writer.writeNext(header);
|
|
for (Movie m : items) {
|
|
String[] line = {
|
|
String.valueOf(m.getRank()),
|
|
m.getTitle(),
|
|
m.getOriginalTitle(),
|
|
m.getScore(),
|
|
m.getYear(),
|
|
m.getDirector()
|
|
};
|
|
writer.writeNext(line);
|
|
}
|
|
view.printSuccess("已保存 " + items.size() + " 条记录到 " + filename);
|
|
} catch (Exception e) {
|
|
SaveFailedException ex = new SaveFailedException("保存 " + filename + " 失败", e);
|
|
view.printError(ex.getMessage());
|
|
ex.printStackTrace();
|
|
}
|
|
}
|
|
}
|