From 196ea736526077f4c1c411f582fdadcffe2c1249 Mon Sep 17 00:00:00 2001 From: LiuZihan <1353843969@qq.com> Date: Sun, 24 May 2026 17:27:09 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=A0=E9=99=A4=20'project/CrawlCommand.java?= =?UTF-8?q?'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- project/CrawlCommand.java | 197 -------------------------------------- 1 file changed, 197 deletions(-) delete mode 100644 project/CrawlCommand.java diff --git a/project/CrawlCommand.java b/project/CrawlCommand.java deleted file mode 100644 index 1ec3ade..0000000 --- a/project/CrawlCommand.java +++ /dev/null @@ -1,197 +0,0 @@ -package com.example.moviecli.command; - -import com.example.moviecli.model.Movie; -import com.example.moviecli.repository.MovieRepository; -import com.example.moviecli.strategy.MovieCrawlStrategy; -import com.example.moviecli.strategy.MovieStrategyFactory; -import com.example.moviecli.view.ConsoleView; -import com.example.moviecli.exception.CrawlFailedException; -import com.example.moviecli.exception.ParseFailedException; -import com.example.moviecli.exception.SaveFailedException; -import com.opencsv.CSVWriter; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; - -import java.io.FileWriter; -import java.util.ArrayList; -import java.util.List; - -public class CrawlCommand implements Command { - private final ConsoleView view; - private final MovieStrategyFactory factory; - - public CrawlCommand(ConsoleView view, MovieStrategyFactory factory) { - this.view = view; - this.factory = factory; - } - - @Override - public String getName() { - return "crawl"; - } - - @Override - public void execute(String[] args, MovieRepository repository) { - if (args.length < 2) { - view.printError("用法: crawl "); - view.printInfo("支持的 URL 示例:"); - view.printInfo(" https://movie.douban.com/top250"); - view.printInfo(" https://news.sina.com.cn/"); - view.printInfo(" https://book.douban.com/top250"); - return; - } - String url = args[1]; - MovieCrawlStrategy strategy = factory.getStrategy(url); - if (strategy == null) { - view.printError("不支持该 URL 的爬取策略: " + url); - return; - } - - if (url.contains("movie.douban.com/top250")) { - crawlDoubanTop250(strategy, repository); - } else if (url.contains("news.sina.com.cn")) { - crawlSinaNews(strategy, repository); - } else if (url.contains("book.douban.com/top250")) { - crawlDoubanBookTop50(strategy, repository); - } else { - crawlSinglePage(url, strategy, repository); - } - } - - /** 豆瓣电影 Top250 -> douban_movies.csv */ - private void crawlDoubanTop250(MovieCrawlStrategy strategy, MovieRepository repository) { - List allMovies = new ArrayList<>(); - int total = 0; - for (int start = 0; start < 250; start += 25) { - String pageUrl = "https://movie.douban.com/top250?start=" + start; - try { - view.printInfo("正在爬取: " + pageUrl); - Document doc = Jsoup.connect(pageUrl) - .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") - .timeout(15000) - .get(); - List pageMovies = strategy.parse(doc); - allMovies.addAll(pageMovies); - repository.addAll(pageMovies); - total += pageMovies.size(); - view.printInfo("已累计爬取 " + total + " 条..."); - Thread.sleep(1500); - } catch (ParseFailedException e) { - view.printError("解析失败: " + e.getMessage()); - e.printStackTrace(); - } catch (Exception e) { - CrawlFailedException ex = new CrawlFailedException("豆瓣电影爬取失败: " + pageUrl, e); - view.printError(ex.getMessage()); - ex.printStackTrace(); - } - } - view.printSuccess("豆瓣电影 Top250 全部爬取完成,共 " + total + " 条记录。"); - saveToCsv(allMovies, "douban_movies.csv"); - } - - /** 新浪新闻首页 -> sina_news.csv */ - private void crawlSinaNews(MovieCrawlStrategy strategy, MovieRepository repository) { - String url = "https://news.sina.com.cn/"; - try { - view.printInfo("正在爬取新浪新闻: " + url); - Document doc = Jsoup.connect(url) - .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") - .timeout(15000) - .get(); - List news = strategy.parse(doc); - repository.addAll(news); - view.printSuccess("新浪新闻爬取完成,共 " + news.size() + " 条记录。"); - saveToCsv(news, "sina_news.csv"); - } catch (ParseFailedException e) { - view.printError("解析失败: " + e.getMessage()); - e.printStackTrace(); - } catch (Exception e) { - CrawlFailedException ex = new CrawlFailedException("新浪新闻爬取失败: " + url, e); - view.printError(ex.getMessage()); - ex.printStackTrace(); - } - } - - /** 豆瓣图书 Top50 -> douban_books.csv */ - private void crawlDoubanBookTop50(MovieCrawlStrategy strategy, MovieRepository repository) { - List allMovies = new ArrayList<>(); - int total = 0; - for (int start = 0; start < 50; start += 25) { - String pageUrl = "https://book.douban.com/top250?start=" + start; - try { - view.printInfo("正在爬取: " + pageUrl); - Document doc = Jsoup.connect(pageUrl) - .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") - .timeout(15000) - .get(); - List pageMovies = strategy.parse(doc); - allMovies.addAll(pageMovies); - repository.addAll(pageMovies); - total += pageMovies.size(); - view.printInfo("已累计爬取 " + total + " 条..."); - Thread.sleep(1500); - } catch (ParseFailedException e) { - view.printError("解析失败: " + e.getMessage()); - e.printStackTrace(); - } catch (Exception e) { - CrawlFailedException ex = new CrawlFailedException("豆瓣图书爬取失败: " + pageUrl, e); - view.printError(ex.getMessage()); - ex.printStackTrace(); - } - } - view.printSuccess("豆瓣图书 Top50 爬取完成,共 " + total + " 条记录。"); - saveToCsv(allMovies, "douban_books.csv"); - } - - /** 单页兜底(未匹配的URL) */ - private void crawlSinglePage(String url, MovieCrawlStrategy strategy, MovieRepository repository) { - List allMovies = new ArrayList<>(); - try { - view.printInfo("正在爬取: " + url); - Document doc = Jsoup.connect(url) - .userAgent("Mozilla/5.0") - .timeout(10000) - .get(); - List movies = strategy.parse(doc); - allMovies.addAll(movies); - repository.addAll(movies); - view.printSuccess("爬取完成!共 " + movies.size() + " 条记录。"); - saveToCsv(allMovies, "unknown.csv"); - } catch (ParseFailedException e) { - view.printError("解析失败: " + e.getMessage()); - e.printStackTrace(); - } catch (Exception e) { - CrawlFailedException ex = new CrawlFailedException("爬取失败: " + url, e); - view.printError(ex.getMessage()); - ex.printStackTrace(); - } - } - - /** 保存电影/新闻列表到 CSV 文件 */ - private void saveToCsv(List items, String filename) { - if (items.isEmpty()) { - view.printInfo("没有数据可保存到 " + filename); - return; - } - try (CSVWriter writer = new CSVWriter(new FileWriter(filename))) { - String[] header = {"Rank", "Title", "OriginalTitle", "Score", "Year", "Director"}; - writer.writeNext(header); - for (Movie m : items) { - String[] line = { - String.valueOf(m.getRank()), - m.getTitle(), - m.getOriginalTitle(), - m.getScore(), - m.getYear(), - m.getDirector() - }; - writer.writeNext(line); - } - view.printSuccess("已保存 " + items.size() + " 条记录到 " + filename); - } catch (Exception e) { - SaveFailedException ex = new SaveFailedException("保存 " + filename + " 失败", e); - view.printError(ex.getMessage()); - ex.printStackTrace(); - } - } -} \ No newline at end of file