You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

554 lines
25 KiB

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.jfree.chart.ChartFactory;
import org.jfree.chart.ChartUtils;
import org.jfree.chart.JFreeChart;
import org.jfree.chart.plot.PlotOrientation;
import org.jfree.chart.title.TextTitle;
import org.jfree.chart.axis.CategoryAxis;
import org.jfree.chart.axis.NumberAxis;
import org.jfree.chart.axis.ValueAxis;
import org.jfree.chart.plot.CategoryPlot;
import org.jfree.chart.plot.PiePlot;
import org.jfree.chart.plot.XYPlot;
import org.jfree.data.category.DefaultCategoryDataset;
import org.jfree.data.general.DefaultPieDataset;
import org.jfree.data.xy.XYSeries;
import org.jfree.data.xy.XYSeriesCollection;
import java.awt.Font;
import java.io.*;
import java.util.*;
import java.util.stream.Collectors;
public class DoubanMovieCrawler {
/** Finds the first run of four consecutive digits in an info line; that run is used as the release year. */
private static final java.util.regex.Pattern YEAR_PATTERN = java.util.regex.Pattern.compile("(\\d{4})");

/**
 * Entry point: crawls the Douban Top 250 list page by page, then cleans the
 * records, exports them to CSV/JSON/XML, prints statistics and a top-ten
 * table, and renders the charts.
 *
 * <p>A page that fails with an {@link IOException} is reported and skipped.
 * If the thread is interrupted during the pause between pages, the interrupt
 * flag is restored and crawling stops; whatever was collected so far is still
 * processed.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    List<Movie> movies = new ArrayList<>();
    // The list is requested in steps of 25: start = 0, 25, ..., 225.
    for (int start = 0; start < 250; start += 25) {
        String url = "https://movie.douban.com/top250?start=" + start;
        try {
            Document document = Jsoup.connect(url)
                    .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
                    .timeout(10000)
                    .get();
            for (Element movieElement : document.select(".grid_view li")) {
                Movie movie = parseMovie(movieElement);
                if (movie != null) {
                    movies.add(movie);
                }
            }
            // Pause between pages so the requests are not sent back to back.
            Thread.sleep(1000);
        } catch (IOException e) {
            System.err.println("抓取页面失败: " + url);
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Keep the interrupt visible to callers and stop issuing requests.
            Thread.currentThread().interrupt();
            break;
        }
    }
    // Data cleaning
    List<Movie> cleanedMovies = cleanData(movies);
    // Export in the three supported formats
    saveToCSV(cleanedMovies, "douban_top250.csv");
    saveToJSON(cleanedMovies, "douban_top250.json");
    saveToXML(cleanedMovies, "douban_top250.xml");
    // Statistics, top-ten table, charts
    analyzeData(cleanedMovies);
    displayResults(cleanedMovies);
    generateCharts(cleanedMovies);
    System.out.println("爬取完成,共获取" + cleanedMovies.size() + "部电影");
    System.out.println("数据已保存到CSV、JSON和XML文件");
    System.out.println("图表已生成并保存为PNG图片");
}

/**
 * Extracts one movie from a list item of the page.
 *
 * @param movieElement a list item selected with {@code .grid_view li}
 * @return the parsed movie, or {@code null} when the item has no
 *         {@code .title}, {@code .rating_num} or {@code .bd p} element
 */
private static Movie parseMovie(Element movieElement) {
    Element titleElement = movieElement.select(".title").first();
    Element ratingElement = movieElement.select(".rating_num").first();
    Element infoElement = movieElement.select(".bd p").first();
    if (titleElement == null || ratingElement == null || infoElement == null) {
        // first() returns null when nothing matched; skip the item instead of crashing the run.
        System.err.println("跳过缺少标题、评分或简介的条目");
        return null;
    }
    return parseMovieInfo(titleElement.text(), ratingElement.text(), infoElement.text());
}

/**
 * Splits the free-text info line into year, director, actors, country and genre.
 *
 * <p>Fields that cannot be located are left as empty strings.
 *
 * @param title  movie title, stored as given
 * @param rating rating text, stored as given
 * @param info   text of the item's first {@code .bd p} element
 * @return a movie built from the extracted fields
 */
private static Movie parseMovieInfo(String title, String rating, String info) {
    // Year: first four-digit run. Its end offset marks where the country/genre tail begins.
    String year = "";
    int yearEnd = -1;
    java.util.regex.Matcher yearMatcher = YEAR_PATTERN.matcher(info);
    if (yearMatcher.find()) {
        year = yearMatcher.group(1);
        yearEnd = yearMatcher.end();
    }

    // Director: text after the director label up to the actors label or the next slash, whichever comes first.
    String director = "";
    int directorIndex = info.indexOf("导演:");
    if (directorIndex != -1) {
        int actorsIndex = info.indexOf("主演:", directorIndex);
        int endIndex = info.indexOf("/", directorIndex);
        if (actorsIndex != -1 && (endIndex == -1 || actorsIndex < endIndex)) {
            endIndex = actorsIndex;
        }
        if (endIndex == -1) {
            endIndex = info.length();
        }
        director = info.substring(directorIndex + "导演:".length(), endIndex).trim();
    }

    // Actors: text after the actors label up to the next slash, minus a trailing year and any ellipsis.
    String actors = "";
    int actorsStart = info.indexOf("主演:");
    if (actorsStart != -1) {
        int endIndex = info.indexOf("/", actorsStart);
        if (endIndex == -1) {
            endIndex = info.length();
        }
        actors = info.substring(actorsStart + "主演:".length(), endIndex).trim();
        actors = actors.replaceAll("\\s+\\d{4}$", "");
        actors = actors.replaceAll("\\s*\\.\\.\\.", "");
        actors = actors.trim();
    }

    // Country and genre: slash-separated segments after the year. Without a year there is
    // no anchor for the tail, so both stay empty rather than being cut from an arbitrary offset.
    String country = "";
    String genre = "";
    if (yearEnd != -1 && yearEnd < info.length() && info.contains("/")) {
        String afterYear = info.substring(yearEnd).trim();
        if (afterYear.startsWith("/")) {
            afterYear = afterYear.substring(1).trim();
        }
        String[] parts = afterYear.split("\\s*/\\s*");
        if (parts.length > 0) {
            country = parts[0];
        }
        if (parts.length > 1) {
            genre = parts[parts.length - 1];
        }
    }
    return new Movie(title, rating, year, director, actors, country, genre);
}
// 数据清洗
/**
 * Produces a cleaned copy of the crawled records.
 *
 * <p>Title, year, director, actors, country and genre are trimmed; the
 * rating is carried over untouched. Any of year, director, actors, country
 * or genre that is empty after trimming is replaced by the placeholder
 * {@code "未知"}.
 *
 * @param movies records to clean
 * @return a new list with one cleaned record per input record, in the same order
 */
private static List<Movie> cleanData(List<Movie> movies) {
    List<Movie> cleaned = new ArrayList<>(movies.size());
    for (Movie raw : movies) {
        String title = raw.getTitle().trim();
        // Order matches the Movie constructor: year, director, actors, country, genre.
        String[] optional = {
            raw.getYear().trim(),
            raw.getDirector().trim(),
            raw.getActors().trim(),
            raw.getCountry().trim(),
            raw.getGenre().trim()
        };
        // Fill missing values with the placeholder.
        for (int i = 0; i < optional.length; i++) {
            if (optional[i].isEmpty()) {
                optional[i] = "未知";
            }
        }
        cleaned.add(new Movie(title, raw.getRating(),
                optional[0], optional[1], optional[2], optional[3], optional[4]));
    }
    return cleaned;
}
// 保存为JSON文件
/**
 * Writes the movies to a JSON file as an array of objects, one object per line.
 *
 * <p>The file is always encoded as UTF-8, independent of the platform default
 * charset. Every value is written as a JSON string and passed through
 * {@code escapeJson}. An {@link IOException} is reported on standard error;
 * the success message is printed only after the file has been closed
 * without error.
 *
 * @param movies   records to export
 * @param filename path of the file to create or overwrite
 */
private static void saveToJSON(List<Movie> movies, String filename) {
    try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
            new FileOutputStream(filename), java.nio.charset.StandardCharsets.UTF_8))) {
        writer.write("[");
        for (int i = 0; i < movies.size(); i++) {
            Movie movie = movies.get(i);
            writer.write("{");
            writer.write(String.format("\"title\":\"%s\",\"rating\":\"%s\",\"year\":\"%s\",\"director\":\"%s\",\"actors\":\"%s\",\"country\":\"%s\",\"genre\":\"%s\"",
                    escapeJson(movie.getTitle()),
                    escapeJson(movie.getRating()),
                    escapeJson(movie.getYear()),
                    escapeJson(movie.getDirector()),
                    escapeJson(movie.getActors()),
                    escapeJson(movie.getCountry()),
                    escapeJson(movie.getGenre())));
            writer.write("}");
            // Comma after every object except the last one.
            if (i < movies.size() - 1) {
                writer.write(",");
            }
            writer.newLine();
        }
        writer.write("]");
    } catch (IOException e) {
        e.printStackTrace();
        return;
    }
    System.out.println("数据已保存到" + filename);
}
// 保存为XML文件
/**
 * Writes the movies to an XML file with a {@code <movies>} root and one
 * {@code <movie>} element per record.
 *
 * <p>The file is encoded as UTF-8 so that its content matches the encoding
 * named in the XML declaration, independent of the platform default charset.
 * Every value is passed through {@code escapeXml}. An {@link IOException} is
 * reported on standard error; the success message is printed only after the
 * file has been closed without error.
 *
 * @param movies   records to export
 * @param filename path of the file to create or overwrite
 */
private static void saveToXML(List<Movie> movies, String filename) {
    try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
            new FileOutputStream(filename), java.nio.charset.StandardCharsets.UTF_8))) {
        writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
        writer.newLine();
        writer.write("<movies>");
        writer.newLine();
        for (Movie movie : movies) {
            writer.write(" <movie>");
            writer.newLine();
            writer.write(String.format(" <title>%s</title>", escapeXml(movie.getTitle())));
            writer.newLine();
            writer.write(String.format(" <rating>%s</rating>", escapeXml(movie.getRating())));
            writer.newLine();
            writer.write(String.format(" <year>%s</year>", escapeXml(movie.getYear())));
            writer.newLine();
            writer.write(String.format(" <director>%s</director>", escapeXml(movie.getDirector())));
            writer.newLine();
            writer.write(String.format(" <actors>%s</actors>", escapeXml(movie.getActors())));
            writer.newLine();
            writer.write(String.format(" <country>%s</country>", escapeXml(movie.getCountry())));
            writer.newLine();
            writer.write(String.format(" <genre>%s</genre>", escapeXml(movie.getGenre())));
            writer.newLine();
            writer.write(" </movie>");
            writer.newLine();
        }
        writer.write("</movies>");
    } catch (IOException e) {
        e.printStackTrace();
        return;
    }
    System.out.println("数据已保存到" + filename);
}
// 数据分析
/**
 * Prints five summaries of the movie list to standard output: movies per
 * rating, average rating per year, and the ten most frequent directors,
 * countries and genre tokens.
 *
 * <p>Ratings are parsed with {@link Double#parseDouble(String)} for the
 * per-year average, so a non-numeric rating raises a
 * {@link NumberFormatException}.
 *
 * @param movies records to summarize
 */
private static void analyzeData(List<Movie> movies) {
    System.out.println("\n=== 数据分析结果 ===");

    // 1. Rating distribution, ordered by the rating string.
    System.out.println("\n1. 评分分布:");
    Map<String, Long> perRating = movies.stream()
            .collect(Collectors.groupingBy(Movie::getRating, TreeMap::new, Collectors.counting()));
    for (Map.Entry<String, Long> bucket : perRating.entrySet()) {
        System.out.printf("评分 %s: %d部\n", bucket.getKey(), bucket.getValue());
    }

    // 2. Average rating per year, ordered by the year string.
    System.out.println("\n2. 年份与评分统计:");
    Map<String, Double> meanPerYear = movies.stream()
            .collect(Collectors.groupingBy(Movie::getYear, TreeMap::new,
                    Collectors.averagingDouble(m -> Double.parseDouble(m.getRating()))));
    for (Map.Entry<String, Double> bucket : meanPerYear.entrySet()) {
        System.out.printf("年份 %s: 平均评分 %.2f\n", bucket.getKey(), bucket.getValue());
    }

    // 3. Directors with the most entries.
    System.out.println("\n3. 导演作品数排行:");
    Map<String, Long> perDirector = movies.stream()
            .collect(Collectors.groupingBy(Movie::getDirector, Collectors.counting()));
    printTopTen(perDirector);

    // 4. Countries/regions with the most entries.
    System.out.println("\n4. 国家/地区分布:");
    Map<String, Long> perCountry = movies.stream()
            .collect(Collectors.groupingBy(Movie::getCountry, Collectors.counting()));
    printTopTen(perCountry);

    // 5. Genre tokens: each genre string is split on whitespace and every token is counted.
    System.out.println("\n5. 类型分布:");
    Map<String, Long> perGenre = movies.stream()
            .flatMap(movie -> Arrays.stream(movie.getGenre().split("\\s+")))
            .collect(Collectors.groupingBy(genre -> genre, Collectors.counting()));
    printTopTen(perGenre);
}

/** Prints at most ten entries of {@code counts}, highest count first, one per line. */
private static void printTopTen(Map<String, Long> counts) {
    List<Map.Entry<String, Long>> ranked = new ArrayList<>(counts.entrySet());
    ranked.sort(Map.Entry.<String, Long>comparingByValue(Comparator.reverseOrder()));
    int shown = Math.min(10, ranked.size());
    for (Map.Entry<String, Long> entry : ranked.subList(0, shown)) {
        System.out.printf("%s: %d部\n", entry.getKey(), entry.getValue());
    }
}
// 结果展示
/**
 * Prints a fixed-width table with a header row, a separator line and the
 * first ten movies of the list (fewer if the list is shorter).
 *
 * <p>Title, director, actors, country and genre are shortened with
 * {@code truncate} to the width of their column.
 *
 * @param movies records to show, in the order they should appear
 */
private static void displayResults(List<Movie> movies) {
    // One format for header and data rows keeps the columns aligned.
    final String rowFormat = "%-30s %-10s %-10s %-20s %-30s %-20s %-20s\n";
    System.out.println("\n=== 电影Top10 ===");
    System.out.printf(rowFormat, "电影名", "评分", "年份", "导演", "主演", "国家", "类型");
    System.out.println("----------------------------------------------------------------------------------------------------------------------------------------------------------------");
    int printed = 0;
    for (Movie movie : movies) {
        if (printed == 10) {
            break;
        }
        System.out.printf(rowFormat,
                truncate(movie.getTitle(), 30),
                movie.getRating(),
                movie.getYear(),
                truncate(movie.getDirector(), 20),
                truncate(movie.getActors(), 30),
                truncate(movie.getCountry(), 20),
                truncate(movie.getGenre(), 20));
        printed++;
    }
}
// 辅助方法:截断字符串
/**
 * Shortens a string to at most {@code maxLength} characters.
 *
 * <p>A string that already fits is returned unchanged. A longer string is
 * cut to {@code maxLength - 3} characters and suffixed with {@code "..."}.
 * When {@code maxLength} is below 3 there is no room for the ellipsis, so the
 * string is simply cut to {@code maxLength} characters (empty for zero or
 * negative values) instead of failing on a negative end index.
 *
 * @param str       text to shorten; must not be {@code null}
 * @param maxLength maximum length of the result
 * @return the original or shortened text
 */
private static String truncate(String str, int maxLength) {
    if (str.length() <= maxLength) {
        return str;
    }
    if (maxLength < 3) {
        return str.substring(0, Math.max(0, maxLength));
    }
    return str.substring(0, maxLength - 3) + "...";
}
// 辅助方法:转义JSON字符串
/**
 * Escapes a string for use inside a double-quoted JSON string literal.
 *
 * <p>Double quote and backslash are prefixed with a backslash; newline,
 * carriage return, tab, backspace and form feed use their short escapes;
 * any other character below U+0020 is written as a six-character
 * hexadecimal escape. All other characters are copied unchanged.
 *
 * @param str text to escape; must not be {@code null}
 * @return the escaped text, without surrounding quotes
 */
private static String escapeJson(String str) {
    StringBuilder out = new StringBuilder(str.length() + 16);
    for (int i = 0; i < str.length(); i++) {
        char c = str.charAt(i);
        switch (c) {
            case '"':
                out.append("\\\"");
                break;
            case '\\':
                out.append("\\\\");
                break;
            case '\n':
                out.append("\\n");
                break;
            case '\r':
                out.append("\\r");
                break;
            case '\t':
                out.append("\\t");
                break;
            case '\b':
                out.append("\\b");
                break;
            case '\f':
                out.append("\\f");
                break;
            default:
                if (c < 0x20) {
                    // Remaining control characters are not allowed raw inside a JSON string.
                    out.append(String.format("\\u%04x", (int) c));
                } else {
                    out.append(c);
                }
        }
    }
    return out.toString();
}
// 辅助方法:转义XML字符串
/**
 * Replaces the five XML special characters with their predefined entities:
 * {@code &}, {@code <}, {@code >}, apostrophe and double quote.
 *
 * @param str text to escape; must not be {@code null}
 * @return the text with every special character replaced by its entity
 */
private static String escapeXml(String str) {
    StringBuilder escaped = new StringBuilder(str.length());
    for (char c : str.toCharArray()) {
        switch (c) {
            case '&':
                escaped.append("&amp;");
                break;
            case '<':
                escaped.append("&lt;");
                break;
            case '>':
                escaped.append("&gt;");
                break;
            case '\'':
                escaped.append("&apos;");
                break;
            case '"':
                escaped.append("&quot;");
                break;
            default:
                escaped.append(c);
        }
    }
    return escaped.toString();
}
// 生成图表
/**
 * Renders four 800x600 PNG charts into the working directory: rating
 * distribution (bar), top ten countries (pie), average rating per year
 * (line) and top ten genre tokens (bar).
 *
 * <p>The charts are written in that order; an {@link IOException} while
 * saving one of them is reported on standard error and the remaining charts
 * are not produced. Ratings are parsed with
 * {@link Double#parseDouble(String)} for the per-year average.
 *
 * @param movies records to chart
 */
private static void generateCharts(List<Movie> movies) {
    try {
        // Fonts are requested by the name "SimHei" for the Chinese labels.
        Font bodyFont = new Font("SimHei", Font.PLAIN, 12);
        Font headingFont = new Font("SimHei", Font.BOLD, 16);

        // 1. Bar chart: movies per rating, ordered by the rating string.
        DefaultCategoryDataset ratingData = new DefaultCategoryDataset();
        Map<String, Long> perRating = movies.stream()
                .collect(Collectors.groupingBy(Movie::getRating, TreeMap::new, Collectors.counting()));
        for (Map.Entry<String, Long> bar : perRating.entrySet()) {
            ratingData.addValue(bar.getValue(), "电影数量", bar.getKey());
        }
        writeBarChart("豆瓣电影Top250评分分布", "评分", ratingData, bodyFont, headingFont,
                "rating_distribution.png");
        System.out.println("评分分布图表已保存: rating_distribution.png");

        // 2. Pie chart: the ten countries/regions with the most movies.
        DefaultPieDataset countryData = new DefaultPieDataset();
        Map<String, Long> perCountry = movies.stream()
                .collect(Collectors.groupingBy(Movie::getCountry, Collectors.counting()));
        for (Map.Entry<String, Long> slice : topTenByCount(perCountry)) {
            countryData.setValue(slice.getKey(), slice.getValue());
        }
        JFreeChart countryChart = ChartFactory.createPieChart(
                "豆瓣电影Top250国家/地区分布", countryData, true, true, false);
        countryChart.setTitle(new TextTitle("豆瓣电影Top250国家/地区分布", headingFont));
        PiePlot countryPlot = (PiePlot) countryChart.getPlot();
        countryPlot.setLabelFont(bodyFont);
        countryChart.getLegend().setItemFont(bodyFont);
        ChartUtils.saveChartAsPNG(new File("country_distribution.png"), countryChart, 800, 600);
        System.out.println("国家分布图表已保存: country_distribution.png");

        // 3. Line chart: average rating per numeric year.
        XYSeries averageByYear = new XYSeries("平均评分");
        Map<String, Double> meanPerYear = movies.stream()
                .collect(Collectors.groupingBy(Movie::getYear, TreeMap::new,
                        Collectors.averagingDouble(m -> Double.parseDouble(m.getRating()))));
        for (Map.Entry<String, Double> point : meanPerYear.entrySet()) {
            if (point.getKey().equals("未知")) {
                continue;
            }
            try {
                int year = Integer.parseInt(point.getKey());
                averageByYear.add(year, point.getValue());
            } catch (NumberFormatException ignored) {
                // A year label that is not a number is left off the chart.
            }
        }
        XYSeriesCollection yearData = new XYSeriesCollection(averageByYear);
        JFreeChart yearChart = ChartFactory.createXYLineChart(
                "豆瓣电影Top250年份与评分相关性", "年份", "平均评分", yearData,
                PlotOrientation.VERTICAL, true, true, false);
        yearChart.setTitle(new TextTitle("豆瓣电影Top250年份与评分相关性", headingFont));
        XYPlot yearPlot = yearChart.getXYPlot();
        ValueAxis yearDomainAxis = yearPlot.getDomainAxis();
        yearDomainAxis.setLabelFont(bodyFont);
        yearDomainAxis.setTickLabelFont(bodyFont);
        ValueAxis yearRangeAxis = yearPlot.getRangeAxis();
        yearRangeAxis.setLabelFont(bodyFont);
        yearRangeAxis.setTickLabelFont(bodyFont);
        yearChart.getLegend().setItemFont(bodyFont);
        ChartUtils.saveChartAsPNG(new File("year_rating_correlation.png"), yearChart, 800, 600);
        System.out.println("年份评分相关性图表已保存: year_rating_correlation.png");

        // 4. Bar chart: the ten most frequent genre tokens (genre strings split on whitespace).
        DefaultCategoryDataset genreData = new DefaultCategoryDataset();
        Map<String, Long> perGenre = movies.stream()
                .flatMap(movie -> Arrays.stream(movie.getGenre().split("\\s+")))
                .collect(Collectors.groupingBy(genre -> genre, Collectors.counting()));
        for (Map.Entry<String, Long> bar : topTenByCount(perGenre)) {
            genreData.addValue(bar.getValue(), "电影数量", bar.getKey());
        }
        writeBarChart("豆瓣电影Top250类型分布", "类型", genreData, bodyFont, headingFont,
                "genre_distribution.png");
        System.out.println("类型分布图表已保存: genre_distribution.png");
    } catch (IOException e) {
        System.err.println("生成图表时出错: " + e.getMessage());
        e.printStackTrace();
    }
}

/** Returns at most ten entries of {@code counts}, highest count first. */
private static List<Map.Entry<String, Long>> topTenByCount(Map<String, Long> counts) {
    List<Map.Entry<String, Long>> ranked = new ArrayList<>(counts.entrySet());
    ranked.sort(Map.Entry.<String, Long>comparingByValue(Comparator.reverseOrder()));
    return ranked.subList(0, Math.min(10, ranked.size()));
}

/** Builds a vertical bar chart over {@code data}, applies the fonts and saves it as an 800x600 PNG. */
private static void writeBarChart(String heading, String categoryLabel, DefaultCategoryDataset data,
        Font bodyFont, Font headingFont, String fileName) throws IOException {
    JFreeChart chart = ChartFactory.createBarChart(
            heading, categoryLabel, "电影数量", data,
            PlotOrientation.VERTICAL, true, true, false);
    chart.setTitle(new TextTitle(heading, headingFont));
    CategoryPlot plot = chart.getCategoryPlot();
    CategoryAxis domainAxis = plot.getDomainAxis();
    domainAxis.setLabelFont(bodyFont);
    domainAxis.setTickLabelFont(bodyFont);
    NumberAxis rangeAxis = (NumberAxis) plot.getRangeAxis();
    rangeAxis.setLabelFont(bodyFont);
    rangeAxis.setTickLabelFont(bodyFont);
    chart.getLegend().setItemFont(bodyFont);
    ChartUtils.saveChartAsPNG(new File(fileName), chart, 800, 600);
}
/**
 * Writes the movies to a CSV file with a header row followed by one row per
 * movie.
 *
 * <p>The file is always encoded as UTF-8, independent of the platform default
 * charset. A field that contains a comma, a double quote or a line break is
 * wrapped in double quotes with embedded quotes doubled, so such values stay
 * in their own column; all other fields are written unchanged. An
 * {@link IOException} is reported on standard error; the success message is
 * printed only after the file has been closed without error.
 *
 * @param movies   records to export
 * @param filename path of the file to create or overwrite
 */
private static void saveToCSV(List<Movie> movies, String filename) {
    try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
            new FileOutputStream(filename), java.nio.charset.StandardCharsets.UTF_8))) {
        // Header row
        writer.write("电影名,评分,年份,导演,主演,国家,类型");
        writer.newLine();
        // One row per movie
        for (Movie movie : movies) {
            writer.write(String.join(",",
                    escapeCsv(movie.getTitle()),
                    escapeCsv(movie.getRating()),
                    escapeCsv(movie.getYear()),
                    escapeCsv(movie.getDirector()),
                    escapeCsv(movie.getActors()),
                    escapeCsv(movie.getCountry()),
                    escapeCsv(movie.getGenre())));
            writer.newLine();
        }
    } catch (IOException e) {
        e.printStackTrace();
        return;
    }
    System.out.println("数据已保存到" + filename);
}

/** Quotes a CSV field when it contains a comma, double quote or line break; a null value is written as the text "null". */
private static String escapeCsv(String value) {
    String text = String.valueOf(value);
    boolean needsQuoting = text.indexOf(',') >= 0 || text.indexOf('"') >= 0
            || text.indexOf('\n') >= 0 || text.indexOf('\r') >= 0;
    if (!needsQuoting) {
        return text;
    }
    return "\"" + text.replace("\"", "\"\"") + "\"";
}
// 电影类
/**
 * Immutable record of one movie. All values, including rating and year, are
 * kept as strings exactly as passed to the constructor.
 */
static class Movie {
    private final String title;
    private final String rating;
    private final String year;
    private final String director;
    private final String actors;
    private final String country;
    private final String genre;

    /**
     * Creates a movie record; the arguments are stored without validation.
     *
     * @param title    movie title
     * @param rating   rating text
     * @param year     release year text
     * @param director director text
     * @param actors   actors text
     * @param country  country/region text
     * @param genre    genre text
     */
    public Movie(String title, String rating, String year, String director, String actors, String country, String genre) {
        this.title = title;
        this.rating = rating;
        this.year = year;
        this.director = director;
        this.actors = actors;
        this.country = country;
        this.genre = genre;
    }

    /** @return the movie title */
    public String getTitle() {
        return title;
    }

    /** @return the rating text */
    public String getRating() {
        return rating;
    }

    /** @return the release year text */
    public String getYear() {
        return year;
    }

    /** @return the director text */
    public String getDirector() {
        return director;
    }

    /** @return the actors text */
    public String getActors() {
        return actors;
    }

    /** @return the country/region text */
    public String getCountry() {
        return country;
    }

    /** @return the genre text */
    public String getGenre() {
        return genre;
    }
}
}