import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.*; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; import java.util.*; // ==================== 1. 异常体系 ==================== enum ErrorCode { CONNECTION_ERROR("连接失败"), PARSE_ERROR("解析失败"), VALIDATION_ERROR("参数验证失败"), IO_ERROR("IO操作失败"), UNKNOWN_ERROR("未知错误"), NO_RESULTS_ERROR("未获取到数据"); private final String message; ErrorCode(String message) { this.message = message; } public String getMessage() { return message; } } class CrawlerException extends Exception { private final ErrorCode errorCode; public CrawlerException(ErrorCode errorCode, String message) { super(message); this.errorCode = errorCode; } public CrawlerException(ErrorCode errorCode, String message, Throwable cause) { super(message, cause); this.errorCode = errorCode; } public ErrorCode getErrorCode() { return errorCode; } } // ==================== 2. 数据模型 (Model) ==================== class Book { private String title; private String authors; private String publisher; private String publishDate; private String price; private double rating; private String summary; private String url; public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getAuthors() { return authors; } public void setAuthors(String authors) { this.authors = authors; } public String getPublisher() { return publisher; } public void setPublisher(String publisher) { this.publisher = publisher; } public String getPublishDate() { return publishDate; } public void setPublishDate(String publishDate) { this.publishDate = publishDate; } public String getPrice() { return price; } public void setPrice(String price) { this.price = price; } public double getRating() { return rating; } public void setRating(double rating) { this.rating = rating; } public String getSummary() { return summary; } public void setSummary(String summary) { this.summary = summary; } public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } } class Movie { private String title; private String director; private String actors; private String year; private String type; private double rating; private String url; public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getDirector() { return director; } public void setDirector(String director) { this.director = director; } public String getActors() { return actors; } public void setActors(String actors) { this.actors = actors; } public String getYear() { return year; } public void setYear(String year) { this.year = year; } public String getType() { return type; } public void setType(String type) { this.type = type; } public double getRating() { return rating; } public void setRating(double rating) { this.rating = rating; } public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } } class Music { private String title; private String artist; private String album; private String releaseDate; private double rating; private String url; public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getArtist() { return artist; } public void setArtist(String artist) { this.artist = artist; } public String getAlbum() { return album; } public void setAlbum(String album) { this.album = album; } public String getReleaseDate() { return releaseDate; } public void setReleaseDate(String releaseDate) { this.releaseDate = releaseDate; } public double getRating() { return rating; } public void setRating(double rating) { this.rating = rating; } public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } } class Article { private String title; private String author; private String publishDate; private String views; private String url; public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getAuthor() { return author; } public void setAuthor(String author) { this.author = author; } public String getPublishDate() { return publishDate; } public void setPublishDate(String publishDate) { this.publishDate = publishDate; } public String getViews() { return views; } public void setViews(String views) { this.views = views; } public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } } class CrawlResult { private String source; private List items; private long crawlTime; private int totalCount; public CrawlResult(String source, List items, long crawlTime) { this.source = source; this.items = items; this.crawlTime = crawlTime; this.totalCount = items.size(); } public String getSource() { return source; } public List getItems() { return items; } public long getCrawlTime() { return crawlTime; } public int getTotalCount() { return totalCount; } } // ==================== 3. 策略模式 (Strategy) ==================== interface CrawlStrategy { List crawl(int pageCount) throws CrawlerException; String getSourceName(); String getItemType(); } class DoubanBookStrategy implements CrawlStrategy { @Override public List crawl(int pageCount) throws CrawlerException { List books = new ArrayList<>(); try { for (int page = 0; page < pageCount && books.size() < 35; page++) { String url = "https://book.douban.com/tag/计算机?start=" + (page * 20); System.out.println("正在爬取豆瓣读书: " + url); Document doc = HttpUtil.getDocument(url); Elements bookElements = doc.select(".subject-item"); for (Element element : bookElements) { Book book = new Book(); Element titleElement = element.selectFirst(".info h2 a"); if (titleElement != null) { book.setTitle(titleElement.text().trim()); book.setUrl(titleElement.attr("href")); } Element infoElement = element.selectFirst(".info .pub"); if (infoElement != null) { String[] parts = infoElement.text().trim().split("/"); if (parts.length >= 4) { book.setAuthors(parts[0].trim()); book.setPublisher(parts[1].trim()); book.setPublishDate(parts[2].trim()); book.setPrice(parts[3].trim()); } } Element ratingElement = element.selectFirst(".info .rating_nums"); if (ratingElement != null) { try { book.setRating(Double.parseDouble(ratingElement.text().trim())); } catch (NumberFormatException e) { book.setRating(0.0); } } if (book.getTitle() != null && !book.getTitle().isEmpty()) books.add(book); if (books.size() >= 35) break; } Thread.sleep(800); } } catch (IOException e) { throw new CrawlerException(ErrorCode.CONNECTION_ERROR, "连接豆瓣失败", e); } catch (InterruptedException e) { Thread.currentThread().interrupt(); throw new CrawlerException(ErrorCode.UNKNOWN_ERROR, "爬取被中断", e); } if (books.isEmpty()) throw new CrawlerException(ErrorCode.NO_RESULTS_ERROR, "未获取到豆瓣图书数据"); return books; } @Override public String getSourceName() { return "豆瓣读书"; } @Override public String getItemType() { return "Book"; } } class DoubanMovieStrategy implements CrawlStrategy { @Override public List crawl(int pageCount) throws CrawlerException { List movies = new ArrayList<>(); try { for (int page = 0; page < pageCount && movies.size() < 35; page++) { String url = "https://movie.douban.com/top250?start=" + (page * 25); System.out.println("正在爬取豆瓣电影: " + url); Document doc = HttpUtil.getDocument(url); Elements movieElements = doc.select(".item"); for (Element element : movieElements) { Movie movie = new Movie(); Element titleElement = element.selectFirst(".hd a"); if (titleElement != null) { movie.setTitle(titleElement.text().replaceAll("\\s+", " ").trim()); movie.setUrl(titleElement.attr("href")); } Element infoElement = element.selectFirst(".bd p:first-child"); if (infoElement != null) { String info = infoElement.text(); if (info.contains("导演")) { String[] parts = info.split("\\s+"); movie.setDirector(parts[1]); StringBuilder actors = new StringBuilder(); for (int i = 3; i < parts.length && i < 6; i++) actors.append(parts[i]).append(" "); movie.setActors(actors.toString().trim()); } } Element ratingElement = element.selectFirst(".rating_num"); if (ratingElement != null) { try { movie.setRating(Double.parseDouble(ratingElement.text().trim())); } catch (NumberFormatException e) { movie.setRating(0.0); } } Element yearElement = element.selectFirst(".bd p:nth-child(2)"); if (yearElement != null) { String text = yearElement.text(); int idx = text.indexOf("上映日期:"); if (idx > 0) movie.setYear(text.substring(idx + 5).trim().substring(0, 4)); } if (movie.getTitle() != null && !movie.getTitle().isEmpty()) movies.add(movie); if (movies.size() >= 35) break; } Thread.sleep(800); } } catch (IOException e) { throw new CrawlerException(ErrorCode.CONNECTION_ERROR, "连接豆瓣电影失败", e); } catch (InterruptedException e) { Thread.currentThread().interrupt(); throw new CrawlerException(ErrorCode.UNKNOWN_ERROR, "爬取被中断", e); } if (movies.isEmpty()) throw new CrawlerException(ErrorCode.NO_RESULTS_ERROR, "未获取到豆瓣电影数据"); return movies; } @Override public String getSourceName() { return "豆瓣电影"; } @Override public String getItemType() { return "Movie"; } } class DoubanMusicStrategy implements CrawlStrategy { @Override public List crawl(int pageCount) throws CrawlerException { List musics = new ArrayList<>(); try { for (int page = 0; page < pageCount && musics.size() < 35; page++) { String url = "https://music.douban.com/chart"; System.out.println("正在爬取豆瓣音乐: " + url); Document doc = HttpUtil.getDocument(url); Elements musicElements = doc.select(".clearfix"); for (Element element : musicElements) { Music music = new Music(); Element titleElement = element.selectFirst(".song-name a"); if (titleElement != null) { music.setTitle(titleElement.text().trim()); music.setUrl(titleElement.attr("href")); } Element artistElement = element.selectFirst(".artist a"); if (artistElement != null) music.setArtist(artistElement.text().trim()); Element albumElement = element.selectFirst(".album a"); if (albumElement != null) music.setAlbum(albumElement.text().trim()); Element ratingElement = element.selectFirst(".rating_nums"); if (ratingElement != null) { try { music.setRating(Double.parseDouble(ratingElement.text().trim())); } catch (NumberFormatException e) { music.setRating(0.0); } } if (music.getTitle() != null && !music.getTitle().isEmpty()) musics.add(music); if (musics.size() >= 35) break; } Thread.sleep(800); } } catch (IOException e) { throw new CrawlerException(ErrorCode.CONNECTION_ERROR, "连接豆瓣音乐失败", e); } catch (InterruptedException e) { Thread.currentThread().interrupt(); throw new CrawlerException(ErrorCode.UNKNOWN_ERROR, "爬取被中断", e); } if (musics.isEmpty()) throw new CrawlerException(ErrorCode.NO_RESULTS_ERROR, "未获取到豆瓣音乐数据"); return musics; } @Override public String getSourceName() { return "豆瓣音乐"; } @Override public String getItemType() { return "Music"; } } class CsdnBlogStrategy implements CrawlStrategy { @Override public List crawl(int pageCount) throws CrawlerException { List articles = new ArrayList<>(); try { for (int page = 1; page <= pageCount && articles.size() < 35; page++) { String url = "https://www.csdn.net/nav/java?page=" + page; System.out.println("正在爬取CSDN博客: " + url); Document doc = HttpUtil.getDocument(url); Elements articleElements = doc.select(".list-item"); for (Element element : articleElements) { Article article = new Article(); Element titleElement = element.selectFirst(".title a"); if (titleElement != null) { article.setTitle(titleElement.text().trim()); article.setUrl(titleElement.attr("href")); } Element authorElement = element.selectFirst(".name a"); if (authorElement != null) article.setAuthor(authorElement.text().trim()); Element timeElement = element.selectFirst(".time"); if (timeElement != null) article.setPublishDate(timeElement.text().trim()); Element viewElement = element.selectFirst(".view"); if (viewElement != null) article.setViews(viewElement.text().trim()); if (article.getTitle() != null && !article.getTitle().isEmpty()) articles.add(article); if (articles.size() >= 35) break; } Thread.sleep(1000); } } catch (IOException e) { throw new CrawlerException(ErrorCode.CONNECTION_ERROR, "连接CSDN失败", e); } catch (InterruptedException e) { Thread.currentThread().interrupt(); throw new CrawlerException(ErrorCode.UNKNOWN_ERROR, "爬取被中断", e); } if (articles.isEmpty()) throw new CrawlerException(ErrorCode.NO_RESULTS_ERROR, "未获取到CSDN文章数据"); return articles; } @Override public String getSourceName() { return "CSDN博客"; } @Override public String getItemType() { return "Article"; } } // ==================== 4. 工具类 ==================== class HttpUtil { private static final String[] USER_AGENTS = { "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15" }; public static Document getDocument(String url) throws IOException { String userAgent = USER_AGENTS[(int) (Math.random() * USER_AGENTS.length)]; return Jsoup.connect(url).userAgent(userAgent).timeout(15000).get(); } } // ==================== 5. Command模式 ==================== interface Command { void execute(); String getCommandName(); } class CrawlCommand implements Command { private CrawlStrategy strategy; private CLIView view; private List results; private int pageCount; public CrawlCommand(CrawlStrategy strategy, CLIView view, List results, int pageCount) { this.strategy = strategy; this.view = view; this.results = results; this.pageCount = pageCount; } @Override public void execute() { view.showMessage("开始爬取 " + strategy.getSourceName() + "..."); try { long startTime = System.currentTimeMillis(); List items = strategy.crawl(pageCount); long endTime = System.currentTimeMillis(); results.add(new CrawlResult(strategy.getSourceName(), items, endTime - startTime)); view.showMessage("爬取完成!获取 " + items.size() + " 条数据,耗时 " + (endTime - startTime) + "ms"); } catch (CrawlerException e) { view.showError("爬取失败: " + e.getMessage()); } } @Override public String getCommandName() { return "CrawlCommand(" + strategy.getSourceName() + ")"; } } class SaveCommand implements Command { private List results; private CLIView view; public SaveCommand(List results, CLIView view) { this.results = results; this.view = view; } @Override public void execute() { if (results.isEmpty()) { view.showError("没有可保存的数据"); return; } String fileName = "e:\\w14\\crawl_results_" + LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")) + ".txt"; try (BufferedWriter writer = new BufferedWriter(new FileWriter(fileName))) { writer.write("爬虫结果报告"); writer.newLine(); writer.write("生成时间: " + LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"))); writer.newLine(); writer.write("=".repeat(60)); writer.newLine(); for (CrawlResult result : results) { writer.write("\n【来源】: " + result.getSource()); writer.newLine(); writer.write("【耗时】: " + result.getCrawlTime() + "ms"); writer.newLine(); writer.write("【数量】: " + result.getTotalCount() + " 条"); writer.newLine(); writer.write("-".repeat(60)); writer.newLine(); int index = 1; for (Object item : result.getItems()) { writer.write("\n" + index + ". "); if (item instanceof Book) { Book b = (Book) item; writer.write(b.getTitle()); writer.newLine(); if (b.getAuthors() != null) writer.write(" 作者: " + b.getAuthors() + "\n"); if (b.getPublisher() != null) writer.write(" 出版社: " + b.getPublisher() + "\n"); if (b.getPublishDate() != null) writer.write(" 出版时间: " + b.getPublishDate() + "\n"); if (b.getRating() > 0) writer.write(" 评分: " + b.getRating() + "\n"); } else if (item instanceof Movie) { Movie m = (Movie) item; writer.write(m.getTitle()); writer.newLine(); if (m.getDirector() != null) writer.write(" 导演: " + m.getDirector() + "\n"); if (m.getActors() != null) writer.write(" 主演: " + m.getActors() + "\n"); if (m.getYear() != null) writer.write(" 年份: " + m.getYear() + "\n"); if (m.getRating() > 0) writer.write(" 评分: " + m.getRating() + "\n"); } else if (item instanceof Music) { Music m = (Music) item; writer.write(m.getTitle()); writer.newLine(); if (m.getArtist() != null) writer.write(" 歌手: " + m.getArtist() + "\n"); if (m.getAlbum() != null) writer.write(" 专辑: " + m.getAlbum() + "\n"); if (m.getRating() > 0) writer.write(" 评分: " + m.getRating() + "\n"); } else if (item instanceof Article) { Article a = (Article) item; writer.write(a.getTitle()); writer.newLine(); if (a.getAuthor() != null) writer.write(" 作者: " + a.getAuthor() + "\n"); if (a.getPublishDate() != null) writer.write(" 发布时间: " + a.getPublishDate() + "\n"); if (a.getViews() != null) writer.write(" 阅读量: " + a.getViews() + "\n"); } index++; } writer.write("=".repeat(60)); writer.newLine(); } view.showMessage("数据已保存到文件: " + fileName); } catch (IOException e) { view.showError("保存文件失败: " + e.getMessage()); } } @Override public String getCommandName() { return "SaveCommand"; } } // ==================== 6. MVC视图 (View) ==================== class CLIView { private Scanner scanner; public CLIView() { this.scanner = new Scanner(System.in); } public int showMenu() { System.out.println("\n========== 爬虫系统 v1.0 =========="); System.out.println("1. 爬取豆瓣读书"); System.out.println("2. 爬取豆瓣电影"); System.out.println("3. 爬取豆瓣音乐"); System.out.println("4. 爬取CSDN博客"); System.out.println("5. 爬取所有网站"); System.out.println("6. 查看爬取结果"); System.out.println("7. 保存结果到文件"); System.out.println("0. 退出"); System.out.print("请输入选择: "); try { return Integer.parseInt(scanner.nextLine()); } catch (NumberFormatException e) { return -1; } } public void showMessage(String message) { System.out.println("[信息] " + message); } public void showError(String error) { System.err.println("[错误] " + error); } public void displayResults(List results) { if (results.isEmpty()) { System.out.println("暂无爬取结果"); return; } System.out.println("\n========== 爬取结果 =========="); for (CrawlResult result : results) { System.out.println("\n【来源】: " + result.getSource()); System.out.println("【耗时】: " + result.getCrawlTime() + "ms | 【数量】: " + result.getTotalCount() + " 条"); System.out.println("-".repeat(50)); int index = 1; for (Object item : result.getItems()) { if (item instanceof Book) { Book b = (Book) item; System.out.println(index + ". " + b.getTitle()); if (b.getAuthors() != null) System.out.println(" 作者: " + b.getAuthors()); if (b.getRating() > 0) System.out.println(" 评分: " + b.getRating()); } else if (item instanceof Movie) { Movie m = (Movie) item; System.out.println(index + ". " + m.getTitle()); if (m.getDirector() != null) System.out.println(" 导演: " + m.getDirector()); if (m.getRating() > 0) System.out.println(" 评分: " + m.getRating()); } else if (item instanceof Music) { Music m = (Music) item; System.out.println(index + ". " + m.getTitle()); if (m.getArtist() != null) System.out.println(" 歌手: " + m.getArtist()); if (m.getRating() > 0) System.out.println(" 评分: " + m.getRating()); } else if (item instanceof Article) { Article a = (Article) item; System.out.println(index + ". " + a.getTitle()); if (a.getAuthor() != null) System.out.println(" 作者: " + a.getAuthor()); if (a.getViews() != null) System.out.println(" 阅读量: " + a.getViews()); } if (index++ >= 10) { System.out.println(" ... (仅显示前10条)"); break; } } } } } // ==================== 7. MVC控制器 (Controller) ==================== class CrawlerController { private CLIView view; private List results; public CrawlerController(CLIView view) { this.view = view; this.results = new ArrayList<>(); } public void executeCrawlCommand(String source) { List commands = new ArrayList<>(); int pageCount = 3; // 确保获取至少30条数据 switch (source.toLowerCase()) { case "doubanbook": commands.add(new CrawlCommand(new DoubanBookStrategy(), view, results, pageCount)); break; case "doubanmovie": commands.add(new CrawlCommand(new DoubanMovieStrategy(), view, results, pageCount)); break; case "doubanmusic": commands.add(new CrawlCommand(new DoubanMusicStrategy(), view, results, pageCount)); break; case "csdn": commands.add(new CrawlCommand(new CsdnBlogStrategy(), view, results, pageCount)); break; case "all": commands.add(new CrawlCommand(new DoubanBookStrategy(), view, results, pageCount)); commands.add(new CrawlCommand(new DoubanMovieStrategy(), view, results, pageCount)); commands.add(new CrawlCommand(new DoubanMusicStrategy(), view, results, pageCount)); commands.add(new CrawlCommand(new CsdnBlogStrategy(), view, results, pageCount)); break; default: view.showError("未知的爬取源: " + source); return; } for (Command command : commands) { command.execute(); } } public void saveResults() { new SaveCommand(results, view).execute(); } public List getResults() { return new ArrayList<>(results); } } // ==================== 8. 主程序入口 ==================== public class CrawlerApp { public static void main(String[] args) { CLIView view = new CLIView(); CrawlerController controller = new CrawlerController(view); System.out.println("========== 爬虫系统 v1.0 自动运行模式 =========="); System.out.println("开始爬取所有4个网站...\n"); controller.executeCrawlCommand("all"); System.out.println("\n========== 爬取结果汇总 =========="); view.displayResults(controller.getResults()); System.out.println("\n========== 保存结果到文件 =========="); controller.saveResults(); System.out.println("\n========== 程序结束 =========="); } }