import java.io.*;
import java.net.*;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Crawls the Douban movie Top250 list, prints a statistical report to the console and
 * exports the collected records as CSV, JSON and plain text.
 *
 * <p>NOTE(review): the HTML selectors inside the {@code *_PATTERN} constants were
 * reconstructed, because the markup portion of the original regex literals was missing
 * from the source. They follow the commonly published structure of the Top250 list page
 * (an {@code li} wrapping {@code div class="item"}, {@code span class="title"},
 * {@code span class="rating_num"}, {@code p class="quote"}) and should be confirmed
 * against the live page before relying on the output.
 */
public class MovieCrawler {
    private static final String BASE_URL = "https://movie.douban.com/top250";
    private static final int MAX_PAGES = 10;
    /** Number of entries per list page; also the step of the {@code start} query parameter. */
    private static final int PAGE_SIZE = 25;
    /** Pause between two page requests, in milliseconds. */
    private static final long REQUEST_DELAY_MS = 2000L;
    /** Connect and read timeout for a single HTTP request, in milliseconds. */
    private static final int TIMEOUT_MS = 10000;
    private static final String OUTPUT_DIR = "D:/";

    /** Placeholder stored in text fields whose value could not be extracted. */
    private static final String UNKNOWN = "未知";
    /** Placeholder stored when an entry has no one-line quote. */
    private static final String NO_QUOTE = "暂无推荐语";
    private static final String DIRECTOR_LABEL = "导演:";
    private static final String CAST_LABEL = "主演:";
    private static final String GENRE_LABEL = "类型:";

    private static final Pattern MOVIE_ITEM_PATTERN = Pattern.compile(
            "<li>\\s*<div class=\"item\">(.*?)</div>\\s*</li>", Pattern.DOTALL);
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<span class=\"title\">(.*?)</span>");
    private static final Pattern RATING_PATTERN =
            Pattern.compile("<span class=\"rating_num\"[^>]*>(.*?)</span>");
    private static final Pattern RATING_PEOPLE_PATTERN = Pattern.compile("(\\d+)人评价");
    private static final Pattern INFO_PATTERN = Pattern.compile(
            "<div class=\"bd\">\\s*<p[^>]*>(.*?)</p>", Pattern.DOTALL);
    private static final Pattern QUOTE_PATTERN = Pattern.compile(
            "<p class=\"quote\">\\s*<span[^>]*>(.*?)</span>", Pattern.DOTALL);
    private static final Pattern YEAR_PATTERN = Pattern.compile("(\\d{4})");
    /** Matches one HTML tag; the negated class also covers tags that span several lines. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

    private static final DateTimeFormatter TIMESTAMP_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    /**
     * Program entry point: crawl, analyze, save, then print the elapsed time.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println("=".repeat(80));
        System.out.println("豆瓣电影Top250爬虫与数据分析系统");
        System.out.println("=".repeat(80));
        long startTime = System.currentTimeMillis();
        try {
            List<Movie> movies = crawlMovies();
            if (movies.isEmpty()) {
                System.err.println("未能爬取到任何电影数据,请检查网络连接或稍后重试。");
                return;
            }
            System.out.println("\n数据爬取完成,共获取 " + movies.size() + " 部电影数据");
            System.out.println("开始数据分析...");
            analyzeMovies(movies);
            System.out.println("\n开始保存数据...");
            saveData(movies);
            long duration = (System.currentTimeMillis() - startTime) / 1000;
            System.out.println("\n" + "=".repeat(80));
            System.out.println("所有任务完成!");
            System.out.println("总耗时: " + duration + " 秒");
            System.out.println("输出文件位置: " + OUTPUT_DIR);
            System.out.println("=".repeat(80));
        } catch (InterruptedException e) {
            // Restore the flag so code further up the stack can still observe the interruption.
            Thread.currentThread().interrupt();
            System.err.println("程序执行出错: " + e.getMessage());
        } catch (Exception e) {
            // Top-level boundary: report anything unexpected instead of dying silently.
            System.err.println("程序执行出错: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Downloads and parses every list page in order.
     *
     * <p>A page that fails is reported on stderr and skipped, so one bad page does not
     * discard the others. The delay is applied between all requests, including after a
     * failed one.
     *
     * @return the movies of all pages that could be fetched, in rank order
     * @throws InterruptedException if the thread is interrupted while waiting between requests
     */
    private static List<Movie> crawlMovies() throws InterruptedException {
        List<Movie> movies = new ArrayList<>(MAX_PAGES * PAGE_SIZE);
        for (int page = 0; page < MAX_PAGES; page++) {
            int start = page * PAGE_SIZE;
            String url = BASE_URL + "?start=" + start;
            System.out.println("正在爬取第 " + (page + 1) + " 页...");
            try {
                movies.addAll(crawlPage(url, start + 1));
            } catch (IOException | RuntimeException e) {
                System.err.println("爬取第 " + (page + 1) + " 页失败: " + e.getMessage());
            }
            if (page < MAX_PAGES - 1) {
                Thread.sleep(REQUEST_DELAY_MS);
            }
        }
        return movies;
    }

    /**
     * Fetches one list page and parses each movie entry found in it.
     *
     * @param url       absolute URL of the list page
     * @param startRank rank assigned to the first entry on the page; later entries count up
     * @return the parsed movies, in page order
     * @throws IOException if the page cannot be downloaded
     */
    private static List<Movie> crawlPage(String url, int startRank) throws IOException {
        String html = fetchHtml(url);
        List<Movie> movies = new ArrayList<>();
        Matcher movieMatcher = MOVIE_ITEM_PATTERN.matcher(html);
        int rank = startRank;
        while (movieMatcher.find()) {
            Movie movie = parseMovie(movieMatcher.group(1), rank++);
            if (movie != null) {
                movies.add(movie);
                System.out.printf(" [%d] %s - %.1f分\n",
                        movie.getRank(), movie.getTitle(), movie.getRating());
            }
        }
        return movies;
    }

    /**
     * Performs an HTTP GET and returns the response body decoded as UTF-8.
     *
     * @param url absolute URL to request
     * @return the body, with every line terminated by {@code \n}
     * @throws IOException if the request fails or the status code is not 200
     */
    private static String fetchHtml(String url) throws IOException {
        HttpURLConnection con = (HttpURLConnection) URI.create(url).toURL().openConnection();
        try {
            con.setRequestMethod("GET");
            con.setRequestProperty("User-Agent",
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
            con.setConnectTimeout(TIMEOUT_MS);
            con.setReadTimeout(TIMEOUT_MS);
            int responseCode = con.getResponseCode();
            if (responseCode != HttpURLConnection.HTTP_OK) {
                throw new IOException("HTTP error code: " + responseCode);
            }
            StringBuilder content = new StringBuilder();
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(con.getInputStream(), StandardCharsets.UTF_8))) {
                String inputLine;
                while ((inputLine = in.readLine()) != null) {
                    content.append(inputLine).append('\n');
                }
            }
            return content.toString();
        } finally {
            // Runs on every path, so a failed request does not leave the connection open.
            con.disconnect();
        }
    }

    /**
     * Builds a {@link Movie} from the HTML fragment of a single list entry.
     *
     * <p>Every text field starts out with a placeholder, so a pattern that does not match
     * never leaves a {@code null} behind for the grouping collectors used in the analysis.
     *
     * @param html inner HTML of one entry; may be {@code null}
     * @param rank rank to assign to the movie
     * @return a movie, never {@code null}
     */
    private static Movie parseMovie(String html, int rank) {
        Movie movie = new Movie();
        movie.setRank(rank);
        movie.setTitle(UNKNOWN);
        movie.setYear(UNKNOWN);
        movie.setDirector(UNKNOWN);
        movie.setInfo("");
        movie.setQuote(NO_QUOTE);
        if (html == null) {
            return movie;
        }

        // Title: the first title span is used.
        Matcher titleMatcher = TITLE_PATTERN.matcher(html);
        if (titleMatcher.find()) {
            movie.setTitle(cleanText(titleMatcher.group(1)));
        }

        // Rating
        Matcher ratingMatcher = RATING_PATTERN.matcher(html);
        if (ratingMatcher.find()) {
            try {
                movie.setRating(Double.parseDouble(cleanText(ratingMatcher.group(1))));
            } catch (NumberFormatException e) {
                movie.setRating(0.0);
            }
        }

        // Number of ratings
        Matcher ratingPeopleMatcher = RATING_PEOPLE_PATTERN.matcher(html);
        if (ratingPeopleMatcher.find()) {
            try {
                movie.setRatingPeople(Integer.parseInt(ratingPeopleMatcher.group(1)));
            } catch (NumberFormatException e) {
                // Only reachable when the digit run does not fit into an int.
                movie.setRatingPeople(0);
            }
        }

        // Detail paragraph (director / cast / year / country / genre)
        Matcher infoMatcher = INFO_PATTERN.matcher(html);
        if (infoMatcher.find()) {
            String info = cleanText(infoMatcher.group(1));
            movie.setInfo(info);
            movie.setYear(extractYear(info));
            movie.setDirector(extractDirector(info));
        }

        // One-line quote
        Matcher quoteMatcher = QUOTE_PATTERN.matcher(html);
        if (quoteMatcher.find()) {
            movie.setQuote(cleanText(quoteMatcher.group(1)));
        }
        return movie;
    }

    /**
     * Strips HTML tags, decodes a few common entities and collapses whitespace runs.
     *
     * @param text raw HTML fragment; may be {@code null}
     * @return the cleaned text, or an empty string for {@code null} or empty input
     */
    private static String cleanText(String text) {
        if (text == null || text.isEmpty()) {
            return "";
        }
        String withoutTags = TAG_PATTERN.matcher(text).replaceAll("");
        String decoded = decodeEntities(withoutTags);
        return WHITESPACE_PATTERN.matcher(decoded).replaceAll(" ").trim();
    }

    /** Replaces the handful of HTML entities handled here with their plain characters. */
    private static String decodeEntities(String text) {
        return text.replace("&nbsp;", " ")
                .replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&quot;", "\"")
                .replace("&#39;", "'")
                // Ampersand goes last so that "&amp;lt;" decodes to "&lt;" and not to "<".
                .replace("&amp;", "&");
    }

    /**
     * Returns the first run of four digits in {@code text}.
     *
     * @param text cleaned detail text; may be {@code null}
     * @return the four digits, or the unknown placeholder if there are none
     */
    private static String extractYear(String text) {
        if (text == null || text.isEmpty()) {
            return UNKNOWN;
        }
        Matcher matcher = YEAR_PATTERN.matcher(text);
        return matcher.find() ? matcher.group(1) : UNKNOWN;
    }

    /**
     * Extracts the first director name from the cleaned detail text.
     *
     * <p>The text has to start with the director label. The name segment ends at the cast
     * label, else at the genre label, else at the end of the text, and only the part
     * before the first {@code /} is returned.
     *
     * @param info cleaned detail text; may be {@code null}
     * @return the first director, or the unknown placeholder if none can be determined
     */
    private static String extractDirector(String info) {
        if (info == null || info.isEmpty()) {
            return UNKNOWN;
        }
        String cleaned = info.trim();
        if (!cleaned.startsWith(DIRECTOR_LABEL)) {
            return UNKNOWN;
        }
        int from = DIRECTOR_LABEL.length();
        // Search after the prefix so the end index can never be smaller than the start.
        int endIndex = cleaned.indexOf(CAST_LABEL, from);
        if (endIndex == -1) {
            endIndex = cleaned.indexOf(GENRE_LABEL, from);
        }
        if (endIndex == -1) {
            endIndex = cleaned.length();
        }
        String[] directors = cleaned.substring(from, endIndex).trim().split("/");
        if (directors.length == 0) {
            return UNKNOWN;
        }
        String first = directors[0].trim();
        return first.isEmpty() ? UNKNOWN : first;
    }

    /** Prints the report header followed by each analysis section. */
    private static void analyzeMovies(List<Movie> movies) {
        System.out.println("\n" + "=".repeat(80));
        System.out.println("豆瓣电影Top250数据分析报告");
        System.out.println("=".repeat(80));
        analyzeRatingDistribution(movies);
        analyzeYearDistribution(movies);
        analyzeDirectorRanking(movies);
        analyzeTopRatedMovies(movies);
        analyzeMostPopularMovies(movies);
    }

    /** Prints how many movies fall on each rating (rounded to one decimal) and the mean rating. */
    private static void analyzeRatingDistribution(List<Movie> movies) {
        System.out.println("\n【评分分布统计】");
        System.out.println("-".repeat(80));
        Map<Double, Long> ratingDistribution = movies.stream()
                .collect(Collectors.groupingBy(
                        movie -> Math.round(movie.getRating() * 10.0) / 10.0,
                        TreeMap::new,
                        Collectors.counting()));
        System.out.printf("%-10s %-15s %-15s\n", "评分区间", "电影数量", "占比");
        System.out.println("-".repeat(80));
        int total = movies.size();
        for (Map.Entry<Double, Long> entry : ratingDistribution.entrySet()) {
            double percentage = (entry.getValue() * 100.0) / total;
            System.out.printf("%-10.1f %-15d %-14.2f%%\n",
                    entry.getKey(), entry.getValue(), percentage);
        }
        double avgRating = movies.stream()
                .mapToDouble(Movie::getRating)
                .average()
                .orElse(0.0);
        System.out.printf("\n平均评分: %.2f\n", avgRating);
    }

    /** Prints the ten years with the most movies and the single most frequent year. */
    private static void analyzeYearDistribution(List<Movie> movies) {
        System.out.println("\n【年份分布统计】");
        System.out.println("-".repeat(80));
        Map<String, Long> yearDistribution = movies.stream()
                .map(Movie::getYear)
                // groupingBy rejects null keys, so drop missing years along with the placeholder.
                .filter(year -> year != null && !UNKNOWN.equals(year))
                .collect(Collectors.groupingBy(year -> year, TreeMap::new, Collectors.counting()));
        System.out.printf("%-10s %-15s\n", "年份", "电影数量");
        System.out.println("-".repeat(80));
        yearDistribution.entrySet().stream()
                .sorted(Map.Entry.<String, Long>comparingByValue().reversed())
                .limit(10)
                .forEach(entry ->
                        System.out.printf("%-10s %-15d\n", entry.getKey(), entry.getValue()));
        String mostProductiveYear = yearDistribution.entrySet().stream()
                .max(Map.Entry.comparingByValue())
                .map(Map.Entry::getKey)
                .orElse(UNKNOWN);
        System.out.printf("\n电影产量最高的年份: %s\n", mostProductiveYear);
    }

    /** Prints the ten directors with the most movies in the list. */
    private static void analyzeDirectorRanking(List<Movie> movies) {
        System.out.println("\n【导演作品排行】");
        System.out.println("-".repeat(80));
        Map<String, Long> directorCount = movies.stream()
                .map(Movie::getDirector)
                .filter(director -> director != null && !UNKNOWN.equals(director))
                .collect(Collectors.groupingBy(director -> director, Collectors.counting()));
        List<Map.Entry<String, Long>> topDirectors = directorCount.entrySet().stream()
                // Ties are broken by name so the ranking does not depend on hash order.
                .sorted(Map.Entry.<String, Long>comparingByValue().reversed()
                        .thenComparing(Map.Entry.comparingByKey()))
                .limit(10)
                .collect(Collectors.toList());
        System.out.printf("%-5s %-30s %-15s\n", "排名", "导演", "作品数量");
        System.out.println("-".repeat(80));
        for (int i = 0; i < topDirectors.size(); i++) {
            Map.Entry<String, Long> entry = topDirectors.get(i);
            System.out.printf("%-5d %-30s %-15d\n", i + 1, entry.getKey(), entry.getValue());
        }
    }

    /** Prints the ten movies with the highest rating. */
    private static void analyzeTopRatedMovies(List<Movie> movies) {
        System.out.println("\n【高分电影TOP10】");
        System.out.println("-".repeat(80));
        System.out.printf("%-5s %-40s %-10s %-10s\n", "排名", "电影名称", "评分", "年份");
        System.out.println("-".repeat(80));
        movies.stream()
                .sorted(Comparator.comparingDouble(Movie::getRating).reversed())
                .limit(10)
                .forEach(movie -> System.out.printf("%-5d %-40s %-10.1f %-10s\n",
                        movie.getRank(), movie.getTitle(), movie.getRating(), movie.getYear()));
    }

    /** Prints the ten movies with the largest number of ratings. */
    private static void analyzeMostPopularMovies(List<Movie> movies) {
        System.out.println("\n【最受关注电影TOP10】");
        System.out.println("-".repeat(80));
        System.out.printf("%-5s %-40s %-15s %-10s\n", "排名", "电影名称", "评价人数", "评分");
        System.out.println("-".repeat(80));
        movies.stream()
                .sorted(Comparator.comparingInt(Movie::getRatingPeople).reversed())
                .limit(10)
                .forEach(movie -> System.out.printf("%-5d %-40s %-15d %-10.1f\n",
                        movie.getRank(), movie.getTitle(),
                        movie.getRatingPeople(), movie.getRating()));
    }

    /** Writes the CSV, JSON and text exports into the output directory. */
    private static void saveData(List<Movie> movies) {
        saveToCSV(movies, OUTPUT_DIR + "douban_movies.csv");
        saveToJSON(movies, OUTPUT_DIR + "douban_movies.json");
        saveToText(movies, OUTPUT_DIR + "douban_movies.txt");
    }

    /**
     * Writes one CSV row per movie, preceded by a header row.
     *
     * <p>Numbers are formatted with {@link Locale#ROOT} so the decimal separator is always
     * a dot and cannot be confused with the field separator.
     *
     * @param movies   records to export
     * @param filename target file; failures are reported on stderr
     */
    private static void saveToCSV(List<Movie> movies, String filename) {
        try (BufferedWriter writer =
                new BufferedWriter(new FileWriter(filename, StandardCharsets.UTF_8))) {
            writer.write("排名,电影名称,评分,评价人数,年份,导演,推荐语\n");
            for (Movie movie : movies) {
                writer.write(String.format(Locale.ROOT, "%d,%s,%.1f,%d,%s,%s,%s\n",
                        movie.getRank(),
                        escapeCSV(movie.getTitle()),
                        movie.getRating(),
                        movie.getRatingPeople(),
                        escapeCSV(movie.getYear()),
                        escapeCSV(movie.getDirector()),
                        escapeCSV(movie.getQuote())));
            }
            System.out.println("CSV数据已保存到: " + filename);
        } catch (IOException e) {
            System.err.println("保存CSV文件失败: " + e.getMessage());
        }
    }

    /**
     * Quotes a CSV field when it contains a comma, a double quote or a line break.
     *
     * @param text field value; may be {@code null}
     * @return the field ready to be written, or an empty string for {@code null}
     */
    private static String escapeCSV(String text) {
        if (text == null) {
            return "";
        }
        boolean needsQuoting = text.contains(",") || text.contains("\"")
                || text.contains("\n") || text.contains("\r");
        if (!needsQuoting) {
            return text;
        }
        return "\"" + text.replace("\"", "\"\"") + "\"";
    }

    /**
     * Writes all movies as a JSON array of objects.
     *
     * @param movies   records to export
     * @param filename target file; failures are reported on stderr
     */
    private static void saveToJSON(List<Movie> movies, String filename) {
        try (BufferedWriter writer =
                new BufferedWriter(new FileWriter(filename, StandardCharsets.UTF_8))) {
            StringBuilder json = new StringBuilder();
            json.append("[\n");
            for (int i = 0; i < movies.size(); i++) {
                Movie movie = movies.get(i);
                json.append(" {\n");
                json.append(" \"rank\": ").append(movie.getRank()).append(",\n");
                json.append(" \"title\": \"").append(escapeJSON(movie.getTitle())).append("\",\n");
                json.append(" \"rating\": ").append(movie.getRating()).append(",\n");
                json.append(" \"ratingPeople\": ").append(movie.getRatingPeople()).append(",\n");
                json.append(" \"year\": \"").append(escapeJSON(movie.getYear())).append("\",\n");
                json.append(" \"director\": \"")
                        .append(escapeJSON(movie.getDirector())).append("\",\n");
                json.append(" \"quote\": \"").append(escapeJSON(movie.getQuote())).append("\"\n");
                json.append(" }");
                if (i < movies.size() - 1) {
                    json.append(",");
                }
                json.append("\n");
            }
            json.append("]");
            writer.write(json.toString());
            System.out.println("JSON数据已保存到: " + filename);
        } catch (IOException e) {
            System.err.println("保存JSON文件失败: " + e.getMessage());
        }
    }

    /**
     * Escapes a value for use inside a JSON string literal.
     *
     * <p>Backslash, double quote and the usual short escapes are handled explicitly; any
     * other character below 0x20 is written as a four-digit hexadecimal escape.
     *
     * @param text value to escape; may be {@code null}
     * @return the escaped value, or an empty string for {@code null}
     */
    private static String escapeJSON(String text) {
        if (text == null) {
            return "";
        }
        StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '\\':
                    escaped.append("\\\\");
                    break;
                case '"':
                    escaped.append("\\\"");
                    break;
                case '\n':
                    escaped.append("\\n");
                    break;
                case '\r':
                    escaped.append("\\r");
                    break;
                case '\t':
                    escaped.append("\\t");
                    break;
                case '\b':
                    escaped.append("\\b");
                    break;
                case '\f':
                    escaped.append("\\f");
                    break;
                default:
                    if (c < 0x20) {
                        escaped.append(String.format(Locale.ROOT, "\\u%04x", (int) c));
                    } else {
                        escaped.append(c);
                    }
            }
        }
        return escaped.toString();
    }

    /**
     * Writes a human-readable listing of all movies, headed by the crawl timestamp.
     *
     * @param movies   records to export
     * @param filename target file; failures are reported on stderr
     */
    private static void saveToText(List<Movie> movies, String filename) {
        try (BufferedWriter writer =
                new BufferedWriter(new FileWriter(filename, StandardCharsets.UTF_8))) {
            writer.write("豆瓣电影Top250评分数据\n");
            writer.write("爬取时间: " + LocalDateTime.now().format(TIMESTAMP_FORMAT) + "\n");
            writer.write("=".repeat(80) + "\n\n");
            for (Movie movie : movies) {
                writer.write(String.format("排名: %d\n", movie.getRank()));
                writer.write(String.format("电影名称: %s\n", movie.getTitle()));
                writer.write(String.format("评分: %.1f\n", movie.getRating()));
                writer.write(String.format("评价人数: %d人\n", movie.getRatingPeople()));
                writer.write(String.format("年份: %s\n", movie.getYear()));
                writer.write(String.format("导演: %s\n", movie.getDirector()));
                writer.write(String.format("详细信息: %s\n", movie.getInfo()));
                writer.write(String.format("推荐语: %s\n", movie.getQuote()));
                writer.write("-".repeat(80) + "\n\n");
            }
            System.out.println("文本数据已保存到: " + filename);
        } catch (IOException e) {
            System.err.println("保存文本文件失败: " + e.getMessage());
        }
    }

    /** Mutable holder for the fields extracted from one list entry. */
    static class Movie {
        private int rank;
        private String title;
        private double rating;
        private int ratingPeople;
        private String year;
        private String director;
        private String info;
        private String quote;

        public int getRank() {
            return rank;
        }

        public void setRank(int rank) {
            this.rank = rank;
        }

        public String getTitle() {
            return title;
        }

        public void setTitle(String title) {
            this.title = title;
        }

        public double getRating() {
            return rating;
        }

        public void setRating(double rating) {
            this.rating = rating;
        }

        public int getRatingPeople() {
            return ratingPeople;
        }

        public void setRatingPeople(int ratingPeople) {
            this.ratingPeople = ratingPeople;
        }

        public String getYear() {
            return year;
        }

        public void setYear(String year) {
            this.year = year;
        }

        public String getDirector() {
            return director;
        }

        public void setDirector(String director) {
            this.director = director;
        }

        public String getInfo() {
            return info;
        }

        public void setInfo(String info) {
            this.info = info;
        }

        public String getQuote() {
            return quote;
        }

        public void setQuote(String quote) {
            this.quote = quote;
        }
    }
}