import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.jfree.chart.ChartFactory; import org.jfree.chart.ChartUtils; import org.jfree.chart.JFreeChart; import org.jfree.chart.plot.PlotOrientation; import org.jfree.chart.title.TextTitle; import org.jfree.chart.axis.CategoryAxis; import org.jfree.chart.axis.NumberAxis; import org.jfree.chart.axis.ValueAxis; import org.jfree.chart.plot.CategoryPlot; import org.jfree.chart.plot.PiePlot; import org.jfree.chart.plot.XYPlot; import org.jfree.data.category.DefaultCategoryDataset; import org.jfree.data.general.DefaultPieDataset; import org.jfree.data.xy.XYSeries; import org.jfree.data.xy.XYSeriesCollection; import java.awt.Font; import java.io.*; import java.util.*; import java.util.stream.Collectors; public class DoubanMovieCrawler { public static void main(String[] args) { List movies = new ArrayList<>(); // 爬取豆瓣电影Top250的10个页面 for (int start = 0; start < 250; start += 25) { String url = "https://movie.douban.com/top250?start=" + start; try { // 发送请求并获取页面内容 Document document = Jsoup.connect(url) .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") .timeout(10000) .get(); // 解析电影列表 Elements movieElements = document.select(".grid_view li"); for (Element movieElement : movieElements) { // 提取电影名 String title = movieElement.select(".title").first().text(); // 提取评分 String rating = movieElement.select(".rating_num").first().text(); // 提取年份、导演、主演、国家和类型 String info = movieElement.select(".bd p").first().text(); // 解析年份信息 String year = ""; java.util.regex.Pattern yearPattern = java.util.regex.Pattern.compile("(\\d{4})"); java.util.regex.Matcher yearMatcher = yearPattern.matcher(info); if (yearMatcher.find()) { year = yearMatcher.group(1); } // 提取导演信息 String director = ""; if (info.contains("导演:")) { int directorIndex = info.indexOf("导演:"); int actorsIndex = info.indexOf("主演:", directorIndex); int endIndex = info.indexOf("/", directorIndex); // 优先使用主演作为结束点 if (actorsIndex != -1 && (endIndex == -1 || actorsIndex < endIndex)) { endIndex = actorsIndex; } if (endIndex == -1) { endIndex = info.length(); } director = info.substring(directorIndex + 3, endIndex).trim(); } // 提取主演信息 String actors = ""; if (info.contains("主演:")) { int actorsIndex = info.indexOf("主演:"); int endIndex = info.indexOf("/", actorsIndex); if (endIndex == -1) { endIndex = info.length(); } actors = info.substring(actorsIndex + 3, endIndex).trim(); // 移除省略号和年份 // 移除年份(4位数字) actors = actors.replaceAll("\\s+\\d{4}$", ""); // 移除省略号 actors = actors.replaceAll("\\s*\\.\\.\\.", ""); // 移除多余的空格 actors = actors.trim(); } // 提取国家和类型信息 String country = ""; String genre = ""; if (info.contains("/")) { // 找到年份后的第一个斜杠 int yearEndIndex = info.indexOf(year) + 4; if (yearEndIndex < info.length()) { String afterYear = info.substring(yearEndIndex).trim(); if (afterYear.startsWith("/")) { afterYear = afterYear.substring(1).trim(); } // 分割国家和类型 String[] parts = afterYear.split("\\s*/\\s*"); if (parts.length > 0) { country = parts[0]; } if (parts.length > 1) { genre = parts[parts.length - 1]; } } } // 创建电影对象并添加到列表 movies.add(new Movie(title, rating, year, director, actors, country, genre)); } // 休眠一下,避免请求过于频繁 Thread.sleep(1000); } catch (IOException | InterruptedException e) { e.printStackTrace(); } } // 数据清洗 List cleanedMovies = cleanData(movies); // 保存为不同格式文件 saveToCSV(cleanedMovies, "douban_top250.csv"); saveToJSON(cleanedMovies, "douban_top250.json"); saveToXML(cleanedMovies, "douban_top250.xml"); // 数据分析 analyzeData(cleanedMovies); // 结果展示 displayResults(cleanedMovies); // 生成图表 generateCharts(cleanedMovies); System.out.println("爬取完成,共获取" + cleanedMovies.size() + "部电影"); System.out.println("数据已保存到CSV、JSON和XML文件"); System.out.println("图表已生成并保存为PNG图片"); } // 数据清洗 private static List cleanData(List movies) { return movies.stream() .map(movie -> { // 清洗电影名 String cleanTitle = movie.getTitle().trim(); // 清洗年份 String cleanYear = movie.getYear().trim(); // 清洗导演 String cleanDirector = movie.getDirector().trim(); // 清洗主演 String cleanActors = movie.getActors().trim(); // 清洗国家 String cleanCountry = movie.getCountry().trim(); // 清洗类型 String cleanGenre = movie.getGenre().trim(); // 处理缺失值 if (cleanYear.isEmpty()) cleanYear = "未知"; if (cleanDirector.isEmpty()) cleanDirector = "未知"; if (cleanActors.isEmpty()) cleanActors = "未知"; if (cleanCountry.isEmpty()) cleanCountry = "未知"; if (cleanGenre.isEmpty()) cleanGenre = "未知"; return new Movie(cleanTitle, movie.getRating(), cleanYear, cleanDirector, cleanActors, cleanCountry, cleanGenre); }) .collect(Collectors.toList()); } // 保存为JSON文件 private static void saveToJSON(List movies, String filename) { try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename))) { writer.write("["); for (int i = 0; i < movies.size(); i++) { Movie movie = movies.get(i); writer.write("{"); writer.write(String.format("\"title\":\"%s\",\"rating\":\"%s\",\"year\":\"%s\",\"director\":\"%s\",\"actors\":\"%s\",\"country\":\"%s\",\"genre\":\"%s\"", escapeJson(movie.getTitle()), escapeJson(movie.getRating()), escapeJson(movie.getYear()), escapeJson(movie.getDirector()), escapeJson(movie.getActors()), escapeJson(movie.getCountry()), escapeJson(movie.getGenre()))); writer.write("}"); if (i < movies.size() - 1) { writer.write(","); } writer.newLine(); } writer.write("]"); System.out.println("数据已保存到" + filename); } catch (IOException e) { e.printStackTrace(); } } // 保存为XML文件 private static void saveToXML(List movies, String filename) { try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename))) { writer.write(""); writer.newLine(); writer.write(""); writer.newLine(); for (Movie movie : movies) { writer.write(" "); writer.newLine(); writer.write(String.format(" %s", escapeXml(movie.getTitle()))); writer.newLine(); writer.write(String.format(" %s", escapeXml(movie.getRating()))); writer.newLine(); writer.write(String.format(" %s", escapeXml(movie.getYear()))); writer.newLine(); writer.write(String.format(" %s", escapeXml(movie.getDirector()))); writer.newLine(); writer.write(String.format(" %s", escapeXml(movie.getActors()))); writer.newLine(); writer.write(String.format(" %s", escapeXml(movie.getCountry()))); writer.newLine(); writer.write(String.format(" %s", escapeXml(movie.getGenre()))); writer.newLine(); writer.write(" "); writer.newLine(); } writer.write(""); System.out.println("数据已保存到" + filename); } catch (IOException e) { e.printStackTrace(); } } // 数据分析 private static void analyzeData(List movies) { System.out.println("\n=== 数据分析结果 ==="); // 1. 评分分布 System.out.println("\n1. 评分分布:"); Map ratingDistribution = movies.stream() .collect(Collectors.groupingBy(Movie::getRating, Collectors.counting())); ratingDistribution.entrySet().stream() .sorted(Map.Entry.comparingByKey()) .forEach(entry -> System.out.printf("评分 %s: %d部\n", entry.getKey(), entry.getValue())); // 2. 年份与评分相关性 System.out.println("\n2. 年份与评分统计:"); Map yearRatingMap = movies.stream() .collect(Collectors.groupingBy(Movie::getYear, Collectors.averagingDouble(m -> Double.parseDouble(m.getRating())))); yearRatingMap.entrySet().stream() .sorted(Map.Entry.comparingByKey()) .forEach(entry -> System.out.printf("年份 %s: 平均评分 %.2f\n", entry.getKey(), entry.getValue())); // 3. 导演作品数排行 System.out.println("\n3. 导演作品数排行:"); Map directorWorksCount = movies.stream() .collect(Collectors.groupingBy(Movie::getDirector, Collectors.counting())); directorWorksCount.entrySet().stream() .sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())) .limit(10) .forEach(entry -> System.out.printf("%s: %d部\n", entry.getKey(), entry.getValue())); // 4. 国家/地区分布 System.out.println("\n4. 国家/地区分布:"); Map countryDistribution = movies.stream() .collect(Collectors.groupingBy(Movie::getCountry, Collectors.counting())); countryDistribution.entrySet().stream() .sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())) .limit(10) .forEach(entry -> System.out.printf("%s: %d部\n", entry.getKey(), entry.getValue())); // 5. 类型分布 System.out.println("\n5. 类型分布:"); Map genreDistribution = movies.stream() .flatMap(movie -> Arrays.stream(movie.getGenre().split("\\s+"))) .collect(Collectors.groupingBy(genre -> genre, Collectors.counting())); genreDistribution.entrySet().stream() .sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())) .limit(10) .forEach(entry -> System.out.printf("%s: %d部\n", entry.getKey(), entry.getValue())); } // 结果展示 private static void displayResults(List movies) { System.out.println("\n=== 电影Top10 ==="); System.out.printf("%-30s %-10s %-10s %-20s %-30s %-20s %-20s\n", "电影名", "评分", "年份", "导演", "主演", "国家", "类型"); System.out.println("----------------------------------------------------------------------------------------------------------------------------------------------------------------"); movies.stream() .limit(10) .forEach(movie -> { System.out.printf("%-30s %-10s %-10s %-20s %-30s %-20s %-20s\n", truncate(movie.getTitle(), 30), movie.getRating(), movie.getYear(), truncate(movie.getDirector(), 20), truncate(movie.getActors(), 30), truncate(movie.getCountry(), 20), truncate(movie.getGenre(), 20)); }); } // 辅助方法:截断字符串 private static String truncate(String str, int maxLength) { if (str.length() <= maxLength) return str; return str.substring(0, maxLength - 3) + "..."; } // 辅助方法:转义JSON字符串 private static String escapeJson(String str) { return str.replaceAll("\\\"", "\\\\\"") .replaceAll("\\n", "\\\\n") .replaceAll("\\r", "\\\\r"); } // 辅助方法:转义XML字符串 private static String escapeXml(String str) { return str.replaceAll("&", "&") .replaceAll("<", "<") .replaceAll(">", ">") .replaceAll("'", "'") .replaceAll("\"", """); } // 生成图表 private static void generateCharts(List movies) { try { // 创建中文字体 Font chineseFont = new Font("SimHei", Font.PLAIN, 12); Font titleFont = new Font("SimHei", Font.BOLD, 16); // 1. 评分分布柱状图 DefaultCategoryDataset ratingDataset = new DefaultCategoryDataset(); Map ratingDistribution = movies.stream() .collect(Collectors.groupingBy(Movie::getRating, Collectors.counting())); ratingDistribution.entrySet().stream() .sorted(Map.Entry.comparingByKey()) .forEach(entry -> ratingDataset.addValue(entry.getValue(), "电影数量", entry.getKey())); JFreeChart ratingChart = ChartFactory.createBarChart( "豆瓣电影Top250评分分布", "评分", "电影数量", ratingDataset, PlotOrientation.VERTICAL, true, true, false ); // 设置中文字体 ratingChart.setTitle(new TextTitle("豆瓣电影Top250评分分布", titleFont)); CategoryPlot ratingPlot = ratingChart.getCategoryPlot(); CategoryAxis ratingDomainAxis = ratingPlot.getDomainAxis(); ratingDomainAxis.setLabelFont(chineseFont); ratingDomainAxis.setTickLabelFont(chineseFont); NumberAxis ratingRangeAxis = (NumberAxis) ratingPlot.getRangeAxis(); ratingRangeAxis.setLabelFont(chineseFont); ratingRangeAxis.setTickLabelFont(chineseFont); ratingChart.getLegend().setItemFont(chineseFont); ChartUtils.saveChartAsPNG(new File("rating_distribution.png"), ratingChart, 800, 600); System.out.println("评分分布图表已保存: rating_distribution.png"); // 2. 国家/地区分布饼图 DefaultPieDataset countryDataset = new DefaultPieDataset(); Map countryDistribution = movies.stream() .collect(Collectors.groupingBy(Movie::getCountry, Collectors.counting())); countryDistribution.entrySet().stream() .sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())) .limit(10) .forEach(entry -> countryDataset.setValue(entry.getKey(), entry.getValue())); JFreeChart countryChart = ChartFactory.createPieChart( "豆瓣电影Top250国家/地区分布", countryDataset, true, true, false ); // 设置中文字体 countryChart.setTitle(new TextTitle("豆瓣电影Top250国家/地区分布", titleFont)); PiePlot countryPlot = (PiePlot) countryChart.getPlot(); countryPlot.setLabelFont(chineseFont); countryChart.getLegend().setItemFont(chineseFont); ChartUtils.saveChartAsPNG(new File("country_distribution.png"), countryChart, 800, 600); System.out.println("国家分布图表已保存: country_distribution.png"); // 3. 年份与评分相关性折线图 XYSeries yearRatingSeries = new XYSeries("平均评分"); Map yearRatingMap = movies.stream() .collect(Collectors.groupingBy(Movie::getYear, Collectors.averagingDouble(m -> Double.parseDouble(m.getRating())))); yearRatingMap.entrySet().stream() .filter(entry -> !entry.getKey().equals("未知")) .sorted(Map.Entry.comparingByKey()) .forEach(entry -> { try { int year = Integer.parseInt(entry.getKey()); yearRatingSeries.add(year, entry.getValue()); } catch (NumberFormatException e) { // 跳过非数字年份 } }); XYSeriesCollection yearRatingDataset = new XYSeriesCollection(yearRatingSeries); JFreeChart yearRatingChart = ChartFactory.createXYLineChart( "豆瓣电影Top250年份与评分相关性", "年份", "平均评分", yearRatingDataset, PlotOrientation.VERTICAL, true, true, false ); // 设置中文字体 yearRatingChart.setTitle(new TextTitle("豆瓣电影Top250年份与评分相关性", titleFont)); XYPlot yearPlot = yearRatingChart.getXYPlot(); ValueAxis yearDomainAxis = yearPlot.getDomainAxis(); yearDomainAxis.setLabelFont(chineseFont); yearDomainAxis.setTickLabelFont(chineseFont); ValueAxis yearRangeAxis = yearPlot.getRangeAxis(); yearRangeAxis.setLabelFont(chineseFont); yearRangeAxis.setTickLabelFont(chineseFont); yearRatingChart.getLegend().setItemFont(chineseFont); ChartUtils.saveChartAsPNG(new File("year_rating_correlation.png"), yearRatingChart, 800, 600); System.out.println("年份评分相关性图表已保存: year_rating_correlation.png"); // 4. 类型分布柱状图 DefaultCategoryDataset genreDataset = new DefaultCategoryDataset(); Map genreDistribution = movies.stream() .flatMap(movie -> Arrays.stream(movie.getGenre().split("\\s+"))) .collect(Collectors.groupingBy(genre -> genre, Collectors.counting())); genreDistribution.entrySet().stream() .sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())) .limit(10) .forEach(entry -> genreDataset.addValue(entry.getValue(), "电影数量", entry.getKey())); JFreeChart genreChart = ChartFactory.createBarChart( "豆瓣电影Top250类型分布", "类型", "电影数量", genreDataset, PlotOrientation.VERTICAL, true, true, false ); // 设置中文字体 genreChart.setTitle(new TextTitle("豆瓣电影Top250类型分布", titleFont)); CategoryPlot genrePlot = genreChart.getCategoryPlot(); CategoryAxis genreDomainAxis = genrePlot.getDomainAxis(); genreDomainAxis.setLabelFont(chineseFont); genreDomainAxis.setTickLabelFont(chineseFont); NumberAxis genreRangeAxis = (NumberAxis) genrePlot.getRangeAxis(); genreRangeAxis.setLabelFont(chineseFont); genreRangeAxis.setTickLabelFont(chineseFont); genreChart.getLegend().setItemFont(chineseFont); ChartUtils.saveChartAsPNG(new File("genre_distribution.png"), genreChart, 800, 600); System.out.println("类型分布图表已保存: genre_distribution.png"); } catch (IOException e) { System.err.println("生成图表时出错: " + e.getMessage()); e.printStackTrace(); } } private static void saveToCSV(List movies, String filename) { try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename))) { // 写入CSV表头 writer.write("电影名,评分,年份,导演,主演,国家,类型"); writer.newLine(); // 写入电影数据 for (Movie movie : movies) { writer.write(String.format("%s,%s,%s,%s,%s,%s,%s", movie.getTitle(), movie.getRating(), movie.getYear(), movie.getDirector(), movie.getActors(), movie.getCountry(), movie.getGenre())); writer.newLine(); } System.out.println("数据已保存到" + filename); } catch (IOException e) { e.printStackTrace(); } } // 电影类 static class Movie { private String title; private String rating; private String year; private String director; private String actors; private String country; private String genre; public Movie(String title, String rating, String year, String director, String actors, String country, String genre) { this.title = title; this.rating = rating; this.year = year; this.director = director; this.actors = actors; this.country = country; this.genre = genre; } public String getTitle() { return title; } public String getRating() { return rating; } public String getYear() { return year; } public String getDirector() { return director; } public String getActors() { return actors; } public String getCountry() { return country; } public String getGenre() { return genre; } } }