Browse Source

移动"MovieCrawler.java"至project

main
GaoGeng 2 weeks ago
parent
commit
72390a0009
  1. 520
      MovieCrawler.java

520
MovieCrawler.java

@ -1,520 +0,0 @@
import java.io.*;
import java.net.*;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Command-line crawler for the Douban movie Top 250 list.
 *
 * <p>The program downloads the list page by page, extracts one {@link Movie} per list item
 * with regular expressions, prints several summary statistics to standard output and writes
 * the collected data as CSV, JSON and plain text files.
 *
 * <p>Usage: {@code java MovieCrawler [outputDir]}. When no directory is given the files are
 * written to {@code D:/}.
 */
public class MovieCrawler {
    private static final String BASE_URL = "https://movie.douban.com/top250";
    private static final int MAX_PAGES = 10;
    /** Number of list items per page; also the step of the {@code start} query parameter. */
    private static final int PAGE_SIZE = 25;
    /** Pause between two page requests, in milliseconds. */
    private static final long REQUEST_DELAY_MS = 2000L;
    /** Connect and read timeout for every HTTP request, in milliseconds. */
    private static final int TIMEOUT_MS = 10000;
    /** Default output directory, used when no directory is passed on the command line. */
    private static final String OUTPUT_DIR = "D:/";
    /** Placeholder stored for a year or director that could not be extracted. */
    private static final String UNKNOWN = "未知";
    private static final String NO_QUOTE = "暂无推荐语";
    private static final String DIRECTOR_LABEL = "导演:";
    /** Number of rows shown in each "top N" report section. */
    private static final int TOP_N = 10;
    private static final String SEPARATOR_LINE = "=".repeat(80);
    private static final String DIVIDER_LINE = "-".repeat(80);

    // Regular expressions are compiled once; Pattern instances are immutable and reusable.
    private static final Pattern ITEM_PATTERN =
            Pattern.compile("<div class=\"item\">(.*?)</div>\\s*</div>", Pattern.DOTALL);
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<span class=\"title\">(.*?)</span>");
    private static final Pattern RATING_PATTERN =
            Pattern.compile("<span class=\"rating_num\" property=\"v:average\">(.*?)</span>");
    private static final Pattern RATING_PEOPLE_PATTERN =
            Pattern.compile("<span>(\\d+)人评价</span>");
    private static final Pattern INFO_PATTERN =
            Pattern.compile("<div class=\"bd\">\\s*<p class=\"\">(.*?)</p>", Pattern.DOTALL);
    private static final Pattern QUOTE_PATTERN =
            Pattern.compile("<span class=\"inq\">(.*?)</span>");
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
    private static final Pattern YEAR_PATTERN = Pattern.compile("(\\d{4})");
    private static final DateTimeFormatter TIMESTAMP_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    /**
     * Program entry point: crawl, analyze, then save.
     *
     * @param args optional; {@code args[0]} overrides the default output directory
     */
    public static void main(String[] args) {
        String outputDir = (args != null && args.length > 0 && !args[0].isEmpty())
                ? args[0]
                : OUTPUT_DIR;
        System.out.println(SEPARATOR_LINE);
        System.out.println("豆瓣电影Top250爬虫与数据分析系统");
        System.out.println(SEPARATOR_LINE);
        long startTime = System.currentTimeMillis();
        try {
            List<Movie> movies = crawlMovies();
            if (movies.isEmpty()) {
                System.err.println("未能爬取到任何电影数据,请检查网络连接或稍后重试。");
                return;
            }
            System.out.println("\n数据爬取完成,共获取 " + movies.size() + " 部电影数据");
            System.out.println("开始数据分析...");
            analyzeMovies(movies);
            System.out.println("\n开始保存数据...");
            saveData(movies, outputDir);
            long duration = (System.currentTimeMillis() - startTime) / 1000;
            System.out.println("\n" + SEPARATOR_LINE);
            System.out.println("所有任务完成!");
            System.out.println("总耗时: " + duration + " 秒");
            System.out.println("输出文件位置: " + outputDir);
            System.out.println(SEPARATOR_LINE);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the JVM / caller can still observe the request.
            Thread.currentThread().interrupt();
            System.err.println("程序执行出错: " + e.getMessage());
        } catch (Exception e) {
            System.err.println("程序执行出错: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Crawls all {@link #MAX_PAGES} list pages. A page that fails is reported on standard
     * error and skipped; the remaining pages are still attempted.
     *
     * @return the movies of every page that could be fetched, in list order
     * @throws InterruptedException if the thread is interrupted while waiting between requests
     */
    private static List<Movie> crawlMovies() throws InterruptedException {
        List<Movie> movies = new ArrayList<>();
        for (int page = 0; page < MAX_PAGES; page++) {
            int start = page * PAGE_SIZE;
            String url = BASE_URL + "?start=" + start;
            System.out.println("正在爬取第 " + (page + 1) + " 页...");
            try {
                movies.addAll(crawlPage(url, start + 1));
            } catch (IOException | RuntimeException e) {
                // Best effort: one bad page must not abort the whole crawl.
                System.err.println("爬取第 " + (page + 1) + " 页失败: " + e.getMessage());
            }
            // The delay sits outside the try block so that it also applies after a failed
            // page and so that an interrupt is propagated instead of being swallowed.
            if (page < MAX_PAGES - 1) {
                Thread.sleep(REQUEST_DELAY_MS);
            }
        }
        return movies;
    }

    /**
     * Downloads one list page and parses every list item found in it.
     *
     * @param url       page URL
     * @param startRank rank assigned to the first item on the page; later items count up
     * @return the parsed movies; items without a title are skipped
     * @throws IOException if the page cannot be downloaded
     */
    private static List<Movie> crawlPage(String url, int startRank) throws IOException {
        String html = fetchHtml(url);
        List<Movie> movies = new ArrayList<>();
        Matcher itemMatcher = ITEM_PATTERN.matcher(html);
        int rank = startRank;
        while (itemMatcher.find()) {
            Movie movie = parseMovie(itemMatcher.group(1), rank++);
            if (movie != null) {
                movies.add(movie);
                System.out.printf(" [%d] %s - %.1f分\n",
                        movie.getRank(), movie.getTitle(), movie.getRating());
            }
        }
        return movies;
    }

    /**
     * Performs an HTTP GET and returns the response body decoded as UTF-8, with every line
     * terminated by {@code '\n'}.
     *
     * @param url address to fetch
     * @return the response body
     * @throws IOException on a connection failure or a response status other than 200
     */
    private static String fetchHtml(String url) throws IOException {
        HttpURLConnection con = (HttpURLConnection) URI.create(url).toURL().openConnection();
        try {
            con.setRequestMethod("GET");
            con.setRequestProperty("User-Agent",
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
            con.setConnectTimeout(TIMEOUT_MS);
            con.setReadTimeout(TIMEOUT_MS);
            int responseCode = con.getResponseCode();
            if (responseCode != HttpURLConnection.HTTP_OK) {
                throw new IOException("HTTP error code: " + responseCode);
            }
            StringBuilder content = new StringBuilder();
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(con.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = in.readLine()) != null) {
                    content.append(line).append('\n');
                }
            }
            return content.toString();
        } finally {
            // Release the connection on success and on every failure path.
            con.disconnect();
        }
    }

    /**
     * Builds a {@link Movie} from the HTML of a single list item.
     *
     * <p>Fields that cannot be found keep a safe default: rating {@code 0.0}, vote count
     * {@code 0}, year and director "未知", info {@code ""}, quote "暂无推荐语".
     *
     * @param html inner HTML of one list item
     * @param rank rank to assign to the movie
     * @return the parsed movie, or {@code null} when the item contains no title
     */
    static Movie parseMovie(String html, int rank) {
        Matcher titleMatcher = TITLE_PATTERN.matcher(html);
        if (!titleMatcher.find()) {
            return null;
        }
        Movie movie = new Movie();
        movie.setRank(rank);
        movie.setTitle(cleanText(titleMatcher.group(1)));
        // Defaults keep later grouping and formatting free of null values.
        movie.setYear(UNKNOWN);
        movie.setDirector(UNKNOWN);
        movie.setInfo("");
        movie.setQuote(NO_QUOTE);

        Matcher ratingMatcher = RATING_PATTERN.matcher(html);
        if (ratingMatcher.find()) {
            try {
                movie.setRating(Double.parseDouble(cleanText(ratingMatcher.group(1))));
            } catch (NumberFormatException e) {
                movie.setRating(0.0);
            }
        }

        Matcher ratingPeopleMatcher = RATING_PEOPLE_PATTERN.matcher(html);
        if (ratingPeopleMatcher.find()) {
            try {
                movie.setRatingPeople(Integer.parseInt(ratingPeopleMatcher.group(1)));
            } catch (NumberFormatException e) {
                // The digit run does not fit into an int.
                movie.setRatingPeople(0);
            }
        }

        Matcher infoMatcher = INFO_PATTERN.matcher(html);
        if (infoMatcher.find()) {
            String info = cleanText(infoMatcher.group(1));
            movie.setInfo(info);
            movie.setYear(extractYear(info));
            movie.setDirector(extractDirector(info));
        }

        Matcher quoteMatcher = QUOTE_PATTERN.matcher(html);
        if (quoteMatcher.find()) {
            movie.setQuote(cleanText(quoteMatcher.group(1)));
        }
        return movie;
    }

    /**
     * Turns an HTML fragment into plain text: removes tags, decodes the most common
     * character entities, collapses whitespace runs to a single space and trims.
     *
     * @param text HTML fragment, may be {@code null}
     * @return the plain text, never {@code null}
     */
    static String cleanText(String text) {
        if (text == null || text.isEmpty()) {
            return "";
        }
        String stripped = TAG_PATTERN.matcher(text).replaceAll("");
        // Entities are decoded after tag removal so a decoded '<' is never taken for a tag.
        // "&amp;" goes last so that e.g. "&amp;lt;" is decoded exactly once.
        String decoded = stripped
                .replace("&nbsp;", " ")
                .replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&quot;", "\"")
                .replace("&#39;", "'")
                .replace("&amp;", "&");
        return WHITESPACE_PATTERN.matcher(decoded).replaceAll(" ").trim();
    }

    /**
     * Returns the first run of four digits in {@code text}.
     *
     * @param text text to search, may be {@code null}
     * @return the four digits, or "未知" when there are none
     */
    static String extractYear(String text) {
        if (text == null || text.isEmpty()) {
            return UNKNOWN;
        }
        Matcher matcher = YEAR_PATTERN.matcher(text);
        return matcher.find() ? matcher.group(1) : UNKNOWN;
    }

    /**
     * Extracts the first director from an info line that starts with "导演:".
     * The director section ends at "主演:", otherwise at "类型:", otherwise at the end of
     * the text; several directors are separated by {@code '/'}.
     *
     * @param info cleaned info text, may be {@code null}
     * @return the first director name, or "未知" when none can be determined
     */
    static String extractDirector(String info) {
        if (info == null || info.isEmpty()) {
            return UNKNOWN;
        }
        String cleaned = info.trim();
        if (!cleaned.startsWith(DIRECTOR_LABEL)) {
            return UNKNOWN;
        }
        int endIndex = cleaned.indexOf("主演:");
        if (endIndex == -1) {
            endIndex = cleaned.indexOf("类型:");
        }
        if (endIndex == -1) {
            endIndex = cleaned.length();
        }
        String[] directors = cleaned.substring(DIRECTOR_LABEL.length(), endIndex).split("/");
        if (directors.length > 0 && !directors[0].trim().isEmpty()) {
            return directors[0].trim();
        }
        return UNKNOWN;
    }

    /** Prints the complete analysis report to standard output. */
    private static void analyzeMovies(List<Movie> movies) {
        System.out.println("\n" + SEPARATOR_LINE);
        System.out.println("豆瓣电影Top250数据分析报告");
        System.out.println(SEPARATOR_LINE);
        analyzeRatingDistribution(movies);
        analyzeYearDistribution(movies);
        analyzeDirectorRanking(movies);
        analyzeTopRatedMovies(movies);
        analyzeMostPopularMovies(movies);
    }

    /** Prints how many movies share each rating (one decimal place) and the mean rating. */
    private static void analyzeRatingDistribution(List<Movie> movies) {
        System.out.println("\n【评分分布统计】");
        System.out.println(DIVIDER_LINE);
        Map<Double, Long> ratingDistribution = movies.stream()
                .collect(Collectors.groupingBy(
                        movie -> Math.round(movie.getRating() * 10.0) / 10.0,
                        TreeMap::new,
                        Collectors.counting()));
        System.out.printf("%-10s %-15s %-15s\n", "评分区间", "电影数量", "占比");
        System.out.println(DIVIDER_LINE);
        int total = movies.size();
        for (Map.Entry<Double, Long> entry : ratingDistribution.entrySet()) {
            double percentage = (entry.getValue() * 100.0) / total;
            System.out.printf("%-10.1f %-15d %-14.2f%%\n",
                    entry.getKey(), entry.getValue(), percentage);
        }
        double avgRating = movies.stream()
                .mapToDouble(Movie::getRating)
                .average()
                .orElse(0.0);
        System.out.printf("\n平均评分: %.2f\n", avgRating);
    }

    /** Prints the ten years with the most movies and the single most frequent year. */
    private static void analyzeYearDistribution(List<Movie> movies) {
        System.out.println("\n【年份分布统计】");
        System.out.println(DIVIDER_LINE);
        Map<String, Long> yearDistribution = movies.stream()
                // groupingBy rejects null keys, so unset years are dropped together with "未知".
                .filter(movie -> movie.getYear() != null && !UNKNOWN.equals(movie.getYear()))
                .collect(Collectors.groupingBy(
                        Movie::getYear,
                        TreeMap::new,
                        Collectors.counting()));
        System.out.printf("%-10s %-15s\n", "年份", "电影数量");
        System.out.println(DIVIDER_LINE);
        yearDistribution.entrySet().stream()
                .sorted(Map.Entry.<String, Long>comparingByValue().reversed())
                .limit(TOP_N)
                .forEach(entry ->
                        System.out.printf("%-10s %-15d\n", entry.getKey(), entry.getValue()));
        String mostProductiveYear = yearDistribution.entrySet().stream()
                .max(Map.Entry.comparingByValue())
                .map(Map.Entry::getKey)
                .orElse(UNKNOWN);
        System.out.printf("\n电影产量最高的年份: %s\n", mostProductiveYear);
    }

    /** Prints the ten directors with the most movies in the list. */
    private static void analyzeDirectorRanking(List<Movie> movies) {
        System.out.println("\n【导演作品排行】");
        System.out.println(DIVIDER_LINE);
        Map<String, Long> directorCount = movies.stream()
                // groupingBy rejects null keys, so unset directors are dropped as well.
                .filter(movie -> movie.getDirector() != null
                        && !UNKNOWN.equals(movie.getDirector()))
                .collect(Collectors.groupingBy(
                        Movie::getDirector,
                        Collectors.counting()));
        List<Map.Entry<String, Long>> topDirectors = directorCount.entrySet().stream()
                .sorted(Map.Entry.<String, Long>comparingByValue().reversed())
                .limit(TOP_N)
                .collect(Collectors.toList());
        System.out.printf("%-5s %-30s %-15s\n", "排名", "导演", "作品数量");
        System.out.println(DIVIDER_LINE);
        for (int i = 0; i < topDirectors.size(); i++) {
            Map.Entry<String, Long> entry = topDirectors.get(i);
            System.out.printf("%-5d %-30s %-15d\n", i + 1, entry.getKey(), entry.getValue());
        }
    }

    /** Prints the ten movies with the highest rating. */
    private static void analyzeTopRatedMovies(List<Movie> movies) {
        System.out.println("\n【高分电影TOP10】");
        System.out.println(DIVIDER_LINE);
        System.out.printf("%-5s %-40s %-10s %-10s\n", "排名", "电影名称", "评分", "年份");
        System.out.println(DIVIDER_LINE);
        movies.stream()
                .sorted(Comparator.comparingDouble(Movie::getRating).reversed())
                .limit(TOP_N)
                .forEach(movie -> System.out.printf("%-5d %-40s %-10.1f %-10s\n",
                        movie.getRank(), movie.getTitle(), movie.getRating(), movie.getYear()));
    }

    /** Prints the ten movies with the largest number of ratings. */
    private static void analyzeMostPopularMovies(List<Movie> movies) {
        System.out.println("\n【最受关注电影TOP10】");
        System.out.println(DIVIDER_LINE);
        System.out.printf("%-5s %-40s %-15s %-10s\n", "排名", "电影名称", "评价人数", "评分");
        System.out.println(DIVIDER_LINE);
        movies.stream()
                .sorted(Comparator.comparingInt(Movie::getRatingPeople).reversed())
                .limit(TOP_N)
                .forEach(movie -> System.out.printf("%-5d %-40s %-15d %-10.1f\n",
                        movie.getRank(), movie.getTitle(),
                        movie.getRatingPeople(), movie.getRating()));
    }

    /**
     * Writes the movies to {@code douban_movies.csv}, {@code .json} and {@code .txt} inside
     * {@code outputDir}. A failure of one format is reported and does not stop the others.
     */
    private static void saveData(List<Movie> movies, String outputDir) {
        saveToCSV(movies, resolveOutputPath(outputDir, "douban_movies.csv"));
        saveToJSON(movies, resolveOutputPath(outputDir, "douban_movies.json"));
        saveToText(movies, resolveOutputPath(outputDir, "douban_movies.txt"));
    }

    /**
     * Joins a directory and a file name, inserting a separator only when the directory does
     * not already end with one. An empty directory yields the bare file name.
     */
    static String resolveOutputPath(String outputDir, String fileName) {
        if (outputDir.isEmpty() || outputDir.endsWith("/") || outputDir.endsWith(File.separator)) {
            return outputDir + fileName;
        }
        return outputDir + File.separator + fileName;
    }

    /** Writes the movies as UTF-8 CSV with a header row. */
    private static void saveToCSV(List<Movie> movies, String filename) {
        try (BufferedWriter writer =
                new BufferedWriter(new FileWriter(filename, StandardCharsets.UTF_8))) {
            writer.write("排名,电影名称,评分,评价人数,年份,导演,推荐语\n");
            for (Movie movie : movies) {
                // Locale.ROOT keeps '.' as the decimal separator; a locale that uses ','
                // would otherwise split the rating into two CSV columns.
                String line = String.format(Locale.ROOT, "%d,%s,%.1f,%d,%s,%s,%s\n",
                        movie.getRank(),
                        escapeCSV(movie.getTitle()),
                        movie.getRating(),
                        movie.getRatingPeople(),
                        escapeCSV(movie.getYear()),
                        escapeCSV(movie.getDirector()),
                        escapeCSV(movie.getQuote()));
                writer.write(line);
            }
            System.out.println("CSV数据已保存到: " + filename);
        } catch (IOException e) {
            System.err.println("保存CSV文件失败: " + e.getMessage());
        }
    }

    /**
     * Quotes a CSV field when it contains a comma, a double quote or a line break; embedded
     * double quotes are doubled.
     *
     * @param text field value, may be {@code null}
     * @return the field ready to be written; {@code ""} for {@code null}
     */
    static String escapeCSV(String text) {
        if (text == null) {
            return "";
        }
        boolean needsQuoting = text.contains(",") || text.contains("\"")
                || text.contains("\n") || text.contains("\r");
        if (needsQuoting) {
            return "\"" + text.replace("\"", "\"\"") + "\"";
        }
        return text;
    }

    /** Writes the movies as a UTF-8 JSON array of objects. */
    private static void saveToJSON(List<Movie> movies, String filename) {
        try (BufferedWriter writer =
                new BufferedWriter(new FileWriter(filename, StandardCharsets.UTF_8))) {
            StringBuilder json = new StringBuilder();
            json.append("[\n");
            for (int i = 0; i < movies.size(); i++) {
                Movie movie = movies.get(i);
                json.append(" {\n");
                json.append(" \"rank\": ").append(movie.getRank()).append(",\n");
                json.append(" \"title\": \"").append(escapeJSON(movie.getTitle())).append("\",\n");
                json.append(" \"rating\": ").append(movie.getRating()).append(",\n");
                json.append(" \"ratingPeople\": ").append(movie.getRatingPeople()).append(",\n");
                json.append(" \"year\": \"").append(escapeJSON(movie.getYear())).append("\",\n");
                json.append(" \"director\": \"").append(escapeJSON(movie.getDirector())).append("\",\n");
                json.append(" \"quote\": \"").append(escapeJSON(movie.getQuote())).append("\"\n");
                json.append(" }");
                if (i < movies.size() - 1) {
                    json.append(",");
                }
                json.append("\n");
            }
            json.append("]");
            writer.write(json.toString());
            System.out.println("JSON数据已保存到: " + filename);
        } catch (IOException e) {
            System.err.println("保存JSON文件失败: " + e.getMessage());
        }
    }

    /**
     * Escapes a value for use inside a JSON string literal: backslash, double quote and all
     * control characters below U+0020 (short escapes where JSON defines one, a four-digit
     * hexadecimal escape otherwise).
     *
     * @param text value to escape, may be {@code null}
     * @return the escaped text without surrounding quotes; {@code ""} for {@code null}
     */
    static String escapeJSON(String text) {
        if (text == null) {
            return "";
        }
        StringBuilder out = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '\\':
                    out.append("\\\\");
                    break;
                case '"':
                    out.append("\\\"");
                    break;
                case '\n':
                    out.append("\\n");
                    break;
                case '\r':
                    out.append("\\r");
                    break;
                case '\t':
                    out.append("\\t");
                    break;
                case '\b':
                    out.append("\\b");
                    break;
                case '\f':
                    out.append("\\f");
                    break;
                default:
                    if (c < 0x20) {
                        // JSON forbids raw control characters inside strings.
                        out.append(String.format("\\u%04x", (int) c));
                    } else {
                        out.append(c);
                    }
            }
        }
        return out.toString();
    }

    /** Writes the movies as a human-readable UTF-8 text report, one section per movie. */
    private static void saveToText(List<Movie> movies, String filename) {
        try (BufferedWriter writer =
                new BufferedWriter(new FileWriter(filename, StandardCharsets.UTF_8))) {
            writer.write("豆瓣电影Top250评分数据\n");
            writer.write("爬取时间: " + LocalDateTime.now().format(TIMESTAMP_FORMAT) + "\n");
            writer.write(SEPARATOR_LINE + "\n\n");
            for (Movie movie : movies) {
                writer.write(String.format(Locale.ROOT, "排名: %d\n", movie.getRank()));
                writer.write(String.format(Locale.ROOT, "电影名称: %s\n", movie.getTitle()));
                writer.write(String.format(Locale.ROOT, "评分: %.1f\n", movie.getRating()));
                writer.write(String.format(Locale.ROOT, "评价人数: %d人\n", movie.getRatingPeople()));
                writer.write(String.format(Locale.ROOT, "年份: %s\n", movie.getYear()));
                writer.write(String.format(Locale.ROOT, "导演: %s\n", movie.getDirector()));
                writer.write(String.format(Locale.ROOT, "详细信息: %s\n", movie.getInfo()));
                writer.write(String.format(Locale.ROOT, "推荐语: %s\n", movie.getQuote()));
                writer.write(DIVIDER_LINE + "\n\n");
            }
            System.out.println("文本数据已保存到: " + filename);
        } catch (IOException e) {
            System.err.println("保存文本文件失败: " + e.getMessage());
        }
    }

    /**
     * Mutable data holder for one list entry. A freshly constructed instance has rank 0,
     * rating 0.0, vote count 0 and {@code null} for every text field.
     */
    static class Movie {
        /** Position in the list. */
        private int rank;
        private String title;
        private double rating;
        /** Number of users who rated the movie. */
        private int ratingPeople;
        private String year;
        private String director;
        /** Cleaned text of the item's info paragraph. */
        private String info;
        /** One-line recommendation quote. */
        private String quote;

        public int getRank() {
            return rank;
        }

        public void setRank(int rank) {
            this.rank = rank;
        }

        public String getTitle() {
            return title;
        }

        public void setTitle(String title) {
            this.title = title;
        }

        public double getRating() {
            return rating;
        }

        public void setRating(double rating) {
            this.rating = rating;
        }

        public int getRatingPeople() {
            return ratingPeople;
        }

        public void setRatingPeople(int ratingPeople) {
            this.ratingPeople = ratingPeople;
        }

        public String getYear() {
            return year;
        }

        public void setYear(String year) {
            this.year = year;
        }

        public String getDirector() {
            return director;
        }

        public void setDirector(String director) {
            this.director = director;
        }

        public String getInfo() {
            return info;
        }

        public void setInfo(String info) {
            this.info = info;
        }

        public String getQuote() {
            return quote;
        }

        public void setQuote(String quote) {
            this.quote = quote;
        }
    }
}
Loading…
Cancel
Save