You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
520 lines
19 KiB
520 lines
19 KiB
import java.io.*;
|
|
import java.net.*;
|
|
import java.nio.charset.StandardCharsets;
|
|
import java.time.LocalDateTime;
|
|
import java.time.format.DateTimeFormatter;
|
|
import java.util.*;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
import java.util.stream.Collectors;
|
|
|
|
/**
 * Crawls the Douban Top 250 movie list, prints summary statistics to the console
 * and writes the collected records to CSV, JSON and plain-text files.
 *
 * <p>Pages are fetched one after another with a fixed pause between requests.
 * The HTML is parsed with regular expressions, so the parser is tied to the
 * markup those expressions describe.
 */
public class MovieCrawler {

    private static final String BASE_URL = "https://movie.douban.com/top250";
    private static final int MAX_PAGES = 10;
    private static final String OUTPUT_DIR = "D:/";

    /** Number of list entries per page; also the step of the {@code start} query parameter. */
    private static final int PAGE_SIZE = 25;
    /** Pause between two page requests, in milliseconds. */
    private static final long REQUEST_DELAY_MS = 2000L;
    /** Connect and read timeout for a single request, in milliseconds. */
    private static final int TIMEOUT_MS = 10000;
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

    /** Placeholder stored when a year or director cannot be extracted. */
    private static final String UNKNOWN = "未知";
    private static final String DIRECTOR_LABEL = "导演:";
    private static final String CAST_LABEL = "主演:";
    private static final String GENRE_LABEL = "类型:";

    // Patterns are compiled once. Regex escapes need a doubled backslash inside a
    // Java string literal; a single one is either rejected by the compiler or
    // silently turned into a different character.
    private static final Pattern ITEM_PATTERN =
            Pattern.compile("<div class=\"item\">(.*?)</div>\\s*</div>", Pattern.DOTALL);
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<span class=\"title\">(.*?)</span>");
    private static final Pattern RATING_PATTERN =
            Pattern.compile("<span class=\"rating_num\" property=\"v:average\">(.*?)</span>");
    private static final Pattern RATING_PEOPLE_PATTERN =
            Pattern.compile("<span>(\\d+)人评价</span>");
    private static final Pattern INFO_PATTERN =
            Pattern.compile("<div class=\"bd\">\\s*<p class=\"\">(.*?)</p>", Pattern.DOTALL);
    private static final Pattern QUOTE_PATTERN =
            Pattern.compile("<span class=\"inq\">(.*?)</span>");
    private static final Pattern YEAR_PATTERN = Pattern.compile("(\\d{4})");
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

    private static final DateTimeFormatter TIMESTAMP_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    /**
     * Runs the crawl, the console analysis and the file export in sequence.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println("=".repeat(80));
        System.out.println("豆瓣电影Top250爬虫与数据分析系统");
        System.out.println("=".repeat(80));

        long startTime = System.currentTimeMillis();

        try {
            List<Movie> movies = crawlMovies();

            if (movies.isEmpty()) {
                System.err.println("未能爬取到任何电影数据,请检查网络连接或稍后重试。");
                return;
            }

            System.out.println("\n数据爬取完成,共获取 " + movies.size() + " 部电影数据");
            System.out.println("开始数据分析...");

            analyzeMovies(movies);

            System.out.println("\n开始保存数据...");

            saveData(movies);

            long duration = (System.currentTimeMillis() - startTime) / 1000;

            System.out.println("\n" + "=".repeat(80));
            System.out.println("所有任务完成!");
            System.out.println("总耗时: " + duration + " 秒");
            System.out.println("输出文件位置: " + OUTPUT_DIR);
            System.out.println("=".repeat(80));
        } catch (InterruptedException e) {
            // Restore the flag so anything running after main still sees the interrupt.
            Thread.currentThread().interrupt();
            System.err.println("程序执行出错: " + e.getMessage());
        } catch (Exception e) {
            System.err.println("程序执行出错: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Fetches every list page and collects the movies parsed from each.
     *
     * <p>A page that fails to download or parse is reported on stderr and skipped.
     * The pause between requests is applied after failed pages as well.
     *
     * @return the movies from all pages that could be read, in page order
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     */
    private static List<Movie> crawlMovies() throws InterruptedException {
        List<Movie> movies = new ArrayList<>();

        for (int page = 0; page < MAX_PAGES; page++) {
            int start = page * PAGE_SIZE;
            String url = BASE_URL + "?start=" + start;

            try {
                System.out.println("正在爬取第 " + (page + 1) + " 页...");
                movies.addAll(crawlPage(url, start + 1));
            } catch (IOException | RuntimeException e) {
                System.err.println("爬取第 " + (page + 1) + " 页失败: " + e.getMessage());
            }

            // The sleep sits outside the try block so an interrupt propagates
            // instead of being reported as a failed page.
            if (page < MAX_PAGES - 1) {
                Thread.sleep(REQUEST_DELAY_MS);
            }
        }

        return movies;
    }

    /**
     * Downloads one list page and parses the movies on it.
     *
     * @param url       address of the page
     * @param startRank rank assigned to the first entry on the page
     * @return the movies parsed from the page
     * @throws IOException if the page cannot be downloaded
     */
    private static List<Movie> crawlPage(String url, int startRank) throws IOException {
        return parsePage(fetchHtml(url), startRank);
    }

    /**
     * Extracts every list entry from a page's HTML and prints one line per parsed movie.
     *
     * @param html      page markup
     * @param startRank rank assigned to the first entry; later entries count up from it
     * @return the parsed movies; entries without a title are left out
     */
    private static List<Movie> parsePage(String html, int startRank) {
        List<Movie> movies = new ArrayList<>();
        Matcher itemMatcher = ITEM_PATTERN.matcher(html);

        int rank = startRank;
        while (itemMatcher.find()) {
            Movie movie = parseMovie(itemMatcher.group(1), rank++);
            if (movie != null) {
                movies.add(movie);
                System.out.printf(" [%d] %s - %.1f分\n",
                        movie.getRank(), movie.getTitle(), movie.getRating());
            }
        }

        return movies;
    }

    /**
     * Downloads a page as UTF-8 text.
     *
     * @param url address to request
     * @return the response body with every line terminated by {@code \n}
     * @throws IOException if the request fails or the status is not 200
     */
    private static String fetchHtml(String url) throws IOException {
        HttpURLConnection con = (HttpURLConnection) new URL(url).openConnection();
        try {
            con.setRequestMethod("GET");
            con.setRequestProperty("User-Agent", USER_AGENT);
            con.setConnectTimeout(TIMEOUT_MS);
            con.setReadTimeout(TIMEOUT_MS);

            int responseCode = con.getResponseCode();
            if (responseCode != HttpURLConnection.HTTP_OK) {
                throw new IOException("HTTP error code: " + responseCode);
            }

            StringBuilder content = new StringBuilder();
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(con.getInputStream(), StandardCharsets.UTF_8))) {
                String inputLine;
                while ((inputLine = in.readLine()) != null) {
                    content.append(inputLine).append("\n");
                }
            }
            return content.toString();
        } finally {
            // Runs on every exit path, so a failed read does not leave the connection open.
            con.disconnect();
        }
    }

    /**
     * Builds a {@link Movie} from the markup of a single list entry.
     *
     * @param html markup of one entry
     * @param rank rank to assign
     * @return the parsed movie, or {@code null} when the entry has no title
     */
    private static Movie parseMovie(String html, int rank) {
        // Title: an entry without one is not usable, so it is dropped.
        Matcher titleMatcher = TITLE_PATTERN.matcher(html);
        if (!titleMatcher.find()) {
            return null;
        }

        Movie movie = new Movie();
        movie.setRank(rank);
        movie.setTitle(cleanText(titleMatcher.group(1)));

        // Rating
        Matcher ratingMatcher = RATING_PATTERN.matcher(html);
        if (ratingMatcher.find()) {
            try {
                movie.setRating(Double.parseDouble(cleanText(ratingMatcher.group(1))));
            } catch (NumberFormatException e) {
                movie.setRating(0.0);
            }
        }

        // Number of ratings
        Matcher ratingPeopleMatcher = RATING_PEOPLE_PATTERN.matcher(html);
        if (ratingPeopleMatcher.find()) {
            try {
                movie.setRatingPeople(Integer.parseInt(ratingPeopleMatcher.group(1)));
            } catch (NumberFormatException e) {
                movie.setRatingPeople(0);
            }
        }

        // Detail line. The defaults keep the grouping and export code away from
        // null values when the detail block is absent.
        movie.setInfo("");
        movie.setYear(UNKNOWN);
        movie.setDirector(UNKNOWN);
        Matcher infoMatcher = INFO_PATTERN.matcher(html);
        if (infoMatcher.find()) {
            String info = cleanText(infoMatcher.group(1));
            movie.setInfo(info);
            movie.setYear(extractYear(info));
            movie.setDirector(extractDirector(info));
        }

        // One-line recommendation
        Matcher quoteMatcher = QUOTE_PATTERN.matcher(html);
        if (quoteMatcher.find()) {
            movie.setQuote(cleanText(quoteMatcher.group(1)));
        } else {
            movie.setQuote("暂无推荐语");
        }

        return movie;
    }

    /**
     * Strips tags, decodes the common HTML entities and collapses whitespace.
     *
     * @param text raw fragment, may be {@code null}
     * @return the cleaned text, or an empty string for {@code null} or empty input
     */
    private static String cleanText(String text) {
        if (text == null || text.isEmpty()) {
            return "";
        }
        String withoutTags = TAG_PATTERN.matcher(text).replaceAll("");
        // The ampersand entity is decoded last so that an escaped entity name
        // is not decoded a second time.
        String decoded = withoutTags
                .replace("&nbsp;", " ")
                .replace('\u00A0', ' ')
                .replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&quot;", "\"")
                .replace("&#39;", "'")
                .replace("&amp;", "&");
        return WHITESPACE_PATTERN.matcher(decoded).replaceAll(" ").trim();
    }

    /**
     * Returns the first run of four digits found in the text.
     *
     * @param text cleaned detail line, may be {@code null}
     * @return the four digits, or the unknown placeholder when there are none
     */
    private static String extractYear(String text) {
        if (text == null || text.isEmpty()) {
            return UNKNOWN;
        }
        Matcher matcher = YEAR_PATTERN.matcher(text);
        return matcher.find() ? matcher.group(1) : UNKNOWN;
    }

    /**
     * Extracts the first director from a detail line that starts with the director label.
     *
     * <p>The director section ends at the cast label, else at the genre label,
     * else at the end of the text. Several directors are separated by {@code /}
     * and only the first one is returned.
     *
     * @param info cleaned detail line, may be {@code null}
     * @return the first director, or the unknown placeholder when none can be found
     */
    private static String extractDirector(String info) {
        if (info == null || info.isEmpty()) {
            return UNKNOWN;
        }
        String cleaned = info.trim();
        if (!cleaned.startsWith(DIRECTOR_LABEL)) {
            return UNKNOWN;
        }

        int endIndex = cleaned.indexOf(CAST_LABEL);
        if (endIndex == -1) {
            endIndex = cleaned.indexOf(GENRE_LABEL);
        }
        if (endIndex == -1) {
            endIndex = cleaned.length();
        }

        String directorPart = cleaned.substring(DIRECTOR_LABEL.length(), endIndex).trim();
        String[] directors = directorPart.split("/");
        String first = directors.length > 0 ? directors[0].trim() : "";
        return first.isEmpty() ? UNKNOWN : first;
    }

    /**
     * Prints the full analysis report to the console.
     *
     * @param movies movies to analyse
     */
    private static void analyzeMovies(List<Movie> movies) {
        System.out.println("\n" + "=".repeat(80));
        System.out.println("豆瓣电影Top250数据分析报告");
        System.out.println("=".repeat(80));

        analyzeRatingDistribution(movies);
        analyzeYearDistribution(movies);
        analyzeDirectorRanking(movies);
        analyzeTopRatedMovies(movies);
        analyzeMostPopularMovies(movies);
    }

    /**
     * Prints how many movies fall on each rating (rounded to one decimal), each
     * rating's share of the total, and the average rating.
     *
     * @param movies movies to analyse
     */
    private static void analyzeRatingDistribution(List<Movie> movies) {
        System.out.println("\n【评分分布统计】");
        System.out.println("-".repeat(80));

        Map<Double, Long> ratingDistribution = movies.stream()
                .collect(Collectors.groupingBy(
                        movie -> Math.round(movie.getRating() * 10.0) / 10.0,
                        TreeMap::new,
                        Collectors.counting()));

        System.out.printf("%-10s %-15s %-15s\n", "评分区间", "电影数量", "占比");
        System.out.println("-".repeat(80));

        int total = movies.size();
        for (Map.Entry<Double, Long> entry : ratingDistribution.entrySet()) {
            double percentage = (entry.getValue() * 100.0) / total;
            System.out.printf("%-10.1f %-15d %-14.2f%%\n",
                    entry.getKey(), entry.getValue(), percentage);
        }

        double avgRating = movies.stream()
                .mapToDouble(Movie::getRating)
                .average()
                .orElse(0.0);
        System.out.printf("\n平均评分: %.2f\n", avgRating);
    }

    /**
     * Prints the ten years with the most movies and the single most frequent year.
     * Movies whose year is missing or unknown are left out.
     *
     * @param movies movies to analyse
     */
    private static void analyzeYearDistribution(List<Movie> movies) {
        System.out.println("\n【年份分布统计】");
        System.out.println("-".repeat(80));

        // groupingBy rejects null keys, so movies without a year are filtered out first.
        Map<String, Long> yearDistribution = movies.stream()
                .filter(movie -> movie.getYear() != null && !UNKNOWN.equals(movie.getYear()))
                .collect(Collectors.groupingBy(
                        Movie::getYear,
                        TreeMap::new,
                        Collectors.counting()));

        System.out.printf("%-10s %-15s\n", "年份", "电影数量");
        System.out.println("-".repeat(80));

        yearDistribution.entrySet().stream()
                .sorted(Map.Entry.<String, Long>comparingByValue().reversed())
                .limit(10)
                .forEach(entry -> System.out.printf("%-10s %-15d\n", entry.getKey(), entry.getValue()));

        String mostProductiveYear = yearDistribution.entrySet().stream()
                .max(Map.Entry.comparingByValue())
                .map(Map.Entry::getKey)
                .orElse(UNKNOWN);

        System.out.printf("\n电影产量最高的年份: %s\n", mostProductiveYear);
    }

    /**
     * Prints the ten directors with the most movies in the list. Movies whose
     * director is missing or unknown are left out.
     *
     * @param movies movies to analyse
     */
    private static void analyzeDirectorRanking(List<Movie> movies) {
        System.out.println("\n【导演作品排行】");
        System.out.println("-".repeat(80));

        // groupingBy rejects null keys, so movies without a director are filtered out first.
        Map<String, Long> directorCount = movies.stream()
                .filter(movie -> movie.getDirector() != null && !UNKNOWN.equals(movie.getDirector()))
                .collect(Collectors.groupingBy(
                        Movie::getDirector,
                        Collectors.counting()));

        // Ties are broken by name so the output does not depend on hash map iteration order.
        List<Map.Entry<String, Long>> topDirectors = directorCount.entrySet().stream()
                .sorted(Map.Entry.<String, Long>comparingByValue().reversed()
                        .thenComparing(Map.Entry.<String, Long>comparingByKey()))
                .limit(10)
                .collect(Collectors.toList());

        System.out.printf("%-5s %-30s %-15s\n", "排名", "导演", "作品数量");
        System.out.println("-".repeat(80));

        for (int i = 0; i < topDirectors.size(); i++) {
            Map.Entry<String, Long> entry = topDirectors.get(i);
            System.out.printf("%-5d %-30s %-15d\n", i + 1, entry.getKey(), entry.getValue());
        }
    }

    /**
     * Prints the ten movies with the highest rating.
     *
     * @param movies movies to analyse
     */
    private static void analyzeTopRatedMovies(List<Movie> movies) {
        System.out.println("\n【高分电影TOP10】");
        System.out.println("-".repeat(80));

        System.out.printf("%-5s %-40s %-10s %-10s\n", "排名", "电影名称", "评分", "年份");
        System.out.println("-".repeat(80));

        movies.stream()
                .sorted(Comparator.comparingDouble(Movie::getRating).reversed())
                .limit(10)
                .forEach(movie -> System.out.printf("%-5d %-40s %-10.1f %-10s\n",
                        movie.getRank(), movie.getTitle(), movie.getRating(), movie.getYear()));
    }

    /**
     * Prints the ten movies with the largest number of ratings.
     *
     * @param movies movies to analyse
     */
    private static void analyzeMostPopularMovies(List<Movie> movies) {
        System.out.println("\n【最受关注电影TOP10】");
        System.out.println("-".repeat(80));

        System.out.printf("%-5s %-40s %-15s %-10s\n", "排名", "电影名称", "评价人数", "评分");
        System.out.println("-".repeat(80));

        movies.stream()
                .sorted(Comparator.comparingInt(Movie::getRatingPeople).reversed())
                .limit(10)
                .forEach(movie -> System.out.printf("%-5d %-40s %-15d %-10.1f\n",
                        movie.getRank(), movie.getTitle(), movie.getRatingPeople(), movie.getRating()));
    }

    /**
     * Writes the movies to the CSV, JSON and text files in the output directory.
     *
     * @param movies movies to export
     */
    private static void saveData(List<Movie> movies) {
        saveToCSV(movies, OUTPUT_DIR + "douban_movies.csv");
        saveToJSON(movies, OUTPUT_DIR + "douban_movies.json");
        saveToText(movies, OUTPUT_DIR + "douban_movies.txt");
    }

    /**
     * Writes the movies as UTF-8 CSV with a header row. A write failure is
     * reported on stderr and not rethrown.
     *
     * @param movies   movies to export
     * @param filename target file path
     */
    private static void saveToCSV(List<Movie> movies, String filename) {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename, StandardCharsets.UTF_8))) {
            writer.write("排名,电影名称,评分,评价人数,年份,导演,推荐语\n");

            for (Movie movie : movies) {
                // Locale.ROOT keeps the decimal point a dot; a locale that uses a
                // decimal comma would otherwise add a column to every row.
                writer.write(String.format(Locale.ROOT, "%d,%s,%.1f,%d,%s,%s,%s\n",
                        movie.getRank(),
                        escapeCSV(movie.getTitle()),
                        movie.getRating(),
                        movie.getRatingPeople(),
                        escapeCSV(movie.getYear()),
                        escapeCSV(movie.getDirector()),
                        escapeCSV(movie.getQuote())));
            }

            System.out.println("CSV数据已保存到: " + filename);
        } catch (IOException e) {
            System.err.println("保存CSV文件失败: " + e.getMessage());
        }
    }

    /**
     * Quotes a CSV field when it contains a comma, a double quote or a line
     * break, doubling any embedded double quotes.
     *
     * @param text field value, may be {@code null}
     * @return the field ready to be written, or an empty string for {@code null}
     */
    private static String escapeCSV(String text) {
        if (text == null) {
            return "";
        }
        boolean needsQuoting = text.contains(",") || text.contains("\"")
                || text.contains("\n") || text.contains("\r");
        if (needsQuoting) {
            return "\"" + text.replace("\"", "\"\"") + "\"";
        }
        return text;
    }

    /**
     * Writes the movies as a UTF-8 JSON array of objects. A write failure is
     * reported on stderr and not rethrown.
     *
     * @param movies   movies to export
     * @param filename target file path
     */
    private static void saveToJSON(List<Movie> movies, String filename) {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename, StandardCharsets.UTF_8))) {
            StringBuilder json = new StringBuilder();
            json.append("[\n");

            for (int i = 0; i < movies.size(); i++) {
                Movie movie = movies.get(i);
                json.append(" {\n");
                json.append(" \"rank\": ").append(movie.getRank()).append(",\n");
                json.append(" \"title\": \"").append(escapeJSON(movie.getTitle())).append("\",\n");
                json.append(" \"rating\": ").append(movie.getRating()).append(",\n");
                json.append(" \"ratingPeople\": ").append(movie.getRatingPeople()).append(",\n");
                json.append(" \"year\": \"").append(escapeJSON(movie.getYear())).append("\",\n");
                json.append(" \"director\": \"").append(escapeJSON(movie.getDirector())).append("\",\n");
                json.append(" \"quote\": \"").append(escapeJSON(movie.getQuote())).append("\"\n");
                json.append(" }");

                if (i < movies.size() - 1) {
                    json.append(",");
                }
                json.append("\n");
            }

            json.append("]");
            writer.write(json.toString());
            System.out.println("JSON数据已保存到: " + filename);
        } catch (IOException e) {
            System.err.println("保存JSON文件失败: " + e.getMessage());
        }
    }

    /**
     * Escapes a value for use inside a JSON string literal: backslash, double
     * quote and every control character below 0x20.
     *
     * @param text value to escape, may be {@code null}
     * @return the escaped text without surrounding quotes, or an empty string for {@code null}
     */
    private static String escapeJSON(String text) {
        if (text == null) {
            return "";
        }
        StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '\\':
                    escaped.append("\\\\");
                    break;
                case '"':
                    escaped.append("\\\"");
                    break;
                case '\n':
                    escaped.append("\\n");
                    break;
                case '\r':
                    escaped.append("\\r");
                    break;
                case '\t':
                    escaped.append("\\t");
                    break;
                case '\b':
                    escaped.append("\\b");
                    break;
                case '\f':
                    escaped.append("\\f");
                    break;
                default:
                    if (c < 0x20) {
                        // Remaining control characters have no short form in JSON.
                        escaped.append(String.format(Locale.ROOT, "\\u%04x", (int) c));
                    } else {
                        escaped.append(c);
                    }
                    break;
            }
        }
        return escaped.toString();
    }

    /**
     * Writes the movies as a human-readable UTF-8 text report. A write failure
     * is reported on stderr and not rethrown.
     *
     * @param movies   movies to export
     * @param filename target file path
     */
    private static void saveToText(List<Movie> movies, String filename) {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename, StandardCharsets.UTF_8))) {
            writer.write("豆瓣电影Top250评分数据\n");
            writer.write("爬取时间: " + LocalDateTime.now().format(TIMESTAMP_FORMAT) + "\n");
            writer.write("=".repeat(80) + "\n\n");

            for (Movie movie : movies) {
                writer.write(String.format("排名: %d\n", movie.getRank()));
                writer.write(String.format("电影名称: %s\n", movie.getTitle()));
                writer.write(String.format("评分: %.1f\n", movie.getRating()));
                writer.write(String.format("评价人数: %d人\n", movie.getRatingPeople()));
                writer.write(String.format("年份: %s\n", movie.getYear()));
                writer.write(String.format("导演: %s\n", movie.getDirector()));
                writer.write(String.format("详细信息: %s\n", movie.getInfo()));
                writer.write(String.format("推荐语: %s\n", movie.getQuote()));
                writer.write("-".repeat(80) + "\n\n");
            }

            System.out.println("文本数据已保存到: " + filename);
        } catch (IOException e) {
            System.err.println("保存文本文件失败: " + e.getMessage());
        }
    }

    /** Mutable record of one list entry. */
    static class Movie {

        private int rank;
        private String title;
        private double rating;
        private int ratingPeople;
        private String year;
        private String director;
        private String info;
        private String quote;

        public int getRank() {
            return rank;
        }

        public void setRank(int rank) {
            this.rank = rank;
        }

        public String getTitle() {
            return title;
        }

        public void setTitle(String title) {
            this.title = title;
        }

        public double getRating() {
            return rating;
        }

        public void setRating(double rating) {
            this.rating = rating;
        }

        public int getRatingPeople() {
            return ratingPeople;
        }

        public void setRatingPeople(int ratingPeople) {
            this.ratingPeople = ratingPeople;
        }

        public String getYear() {
            return year;
        }

        public void setYear(String year) {
            this.year = year;
        }

        public String getDirector() {
            return director;
        }

        public void setDirector(String director) {
            this.director = director;
        }

        public String getInfo() {
            return info;
        }

        public void setInfo(String info) {
            this.info = info;
        }

        public String getQuote() {
            return quote;
        }

        public void setQuote(String quote) {
            this.quote = quote;
        }
    }
}
|