diff --git a/MovieCrawler.java b/MovieCrawler.java deleted file mode 100644 index 3445e2a..0000000 --- a/MovieCrawler.java +++ /dev/null @@ -1,520 +0,0 @@ -import java.io.*; -import java.net.*; -import java.nio.charset.StandardCharsets; -import java.time.LocalDateTime; -import java.time.format.DateTimeFormatter; -import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - -public class MovieCrawler { - - private static final String BASE_URL = "https://movie.douban.com/top250"; - private static final int MAX_PAGES = 10; - private static final String OUTPUT_DIR = "D:/"; - - public static void main(String[] args) { - System.out.println("=".repeat(80)); - System.out.println("豆瓣电影Top250爬虫与数据分析系统"); - System.out.println("=".repeat(80)); - - long startTime = System.currentTimeMillis(); - - try { - List movies = crawlMovies(); - - if (movies.isEmpty()) { - System.err.println("未能爬取到任何电影数据,请检查网络连接或稍后重试。"); - return; - } - - System.out.println("\n数据爬取完成,共获取 " + movies.size() + " 部电影数据"); - System.out.println("开始数据分析..."); - - analyzeMovies(movies); - - System.out.println("\n开始保存数据..."); - - saveData(movies); - - long endTime = System.currentTimeMillis(); - long duration = (endTime - startTime) / 1000; - - System.out.println("\n" + "=".repeat(80)); - System.out.println("所有任务完成!"); - System.out.println("总耗时: " + duration + " 秒"); - System.out.println("输出文件位置: " + OUTPUT_DIR); - System.out.println("=".repeat(80)); - - } catch (Exception e) { - System.err.println("程序执行出错: " + e.getMessage()); - e.printStackTrace(); - } - } - - private static List crawlMovies() throws InterruptedException { - List movies = new ArrayList<>(); - - for (int page = 0; page < MAX_PAGES; page++) { - int start = page * 25; - String url = BASE_URL + "?start=" + start; - - try { - System.out.println("正在爬取第 " + (page + 1) + " 页..."); - List pageMovies = crawlPage(url, page * 25 + 1); - movies.addAll(pageMovies); - - if (page < MAX_PAGES - 1) { - Thread.sleep(2000); - } - } catch (Exception e) { - System.err.println("爬取第 " + (page + 1) + " 页失败: " + e.getMessage()); - } - } - - return movies; - } - - private static List crawlPage(String url, int startRank) throws Exception { - String html = fetchHtml(url); - List movies = new ArrayList<>(); - - Pattern moviePattern = Pattern.compile("
(.*?)
\s*", Pattern.DOTALL); - Matcher movieMatcher = moviePattern.matcher(html); - - int rank = startRank; - while (movieMatcher.find()) { - String movieHtml = movieMatcher.group(1); - Movie movie = parseMovie(movieHtml, rank++); - if (movie != null) { - movies.add(movie); - System.out.printf(" [%d] %s - %.1f分\n", movie.getRank(), movie.getTitle(), movie.getRating()); - } - } - - return movies; - } - - private static String fetchHtml(String url) throws Exception { - URL obj = new URL(url); - HttpURLConnection con = (HttpURLConnection) obj.openConnection(); - con.setRequestMethod("GET"); - con.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"); - con.setConnectTimeout(10000); - con.setReadTimeout(10000); - - int responseCode = con.getResponseCode(); - if (responseCode != HttpURLConnection.HTTP_OK) { - throw new Exception("HTTP error code: " + responseCode); - } - - BufferedReader in = new BufferedReader(new InputStreamReader(con.getInputStream(), StandardCharsets.UTF_8)); - String inputLine; - StringBuilder content = new StringBuilder(); - - while ((inputLine = in.readLine()) != null) { - content.append(inputLine).append("\n"); - } - - in.close(); - con.disconnect(); - - return content.toString(); - } - - private static Movie parseMovie(String html, int rank) { - Movie movie = new Movie(); - movie.setRank(rank); - - // 解析标题 - Pattern titlePattern = Pattern.compile("(.*?)"); - Matcher titleMatcher = titlePattern.matcher(html); - if (titleMatcher.find()) { - movie.setTitle(cleanText(titleMatcher.group(1))); - } - - // 解析评分 - Pattern ratingPattern = Pattern.compile("(.*?)"); - Matcher ratingMatcher = ratingPattern.matcher(html); - if (ratingMatcher.find()) { - try { - movie.setRating(Double.parseDouble(cleanText(ratingMatcher.group(1)))); - } catch (NumberFormatException e) { - movie.setRating(0.0); - } - } - - // 解析评价人数 - Pattern ratingPeoplePattern = Pattern.compile("(\d+)人评价"); - Matcher ratingPeopleMatcher = ratingPeoplePattern.matcher(html); - if (ratingPeopleMatcher.find()) { - try { - movie.setRatingPeople(Integer.parseInt(ratingPeopleMatcher.group(1))); - } catch (NumberFormatException e) { - movie.setRatingPeople(0); - } - } - - // 解析详细信息 - Pattern infoPattern = Pattern.compile("
\s*

(.*?)

", Pattern.DOTALL); - Matcher infoMatcher = infoPattern.matcher(html); - if (infoMatcher.find()) { - String info = cleanText(infoMatcher.group(1)); - movie.setInfo(info); - movie.setYear(extractYear(info)); - movie.setDirector(extractDirector(info)); - } - - // 解析推荐语 - Pattern quotePattern = Pattern.compile("(.*?)"); - Matcher quoteMatcher = quotePattern.matcher(html); - if (quoteMatcher.find()) { - movie.setQuote(cleanText(quoteMatcher.group(1))); - } else { - movie.setQuote("暂无推荐语"); - } - - return movie; - } - - private static String cleanText(String text) { - if (text == null || text.isEmpty()) { - return ""; - } - return text.replaceAll("<.*?>", "").replaceAll("\\s+", " ").trim(); - } - - private static String extractYear(String text) { - if (text == null || text.isEmpty()) { - return "未知"; - } - Pattern pattern = Pattern.compile("(\\d{4})"); - Matcher matcher = pattern.matcher(text); - if (matcher.find()) { - return matcher.group(1); - } - return "未知"; - } - - private static String extractDirector(String info) { - if (info == null || info.isEmpty()) { - return "未知"; - } - String cleaned = info.trim(); - if (cleaned.startsWith("导演:")) { - int endIndex = cleaned.indexOf("主演:"); - if (endIndex == -1) { - endIndex = cleaned.indexOf("类型:"); - } - if (endIndex == -1) { - endIndex = cleaned.length(); - } - String directorPart = cleaned.substring(3, endIndex).trim(); - String[] directors = directorPart.split("/"); - if (directors.length > 0) { - return directors[0].trim(); - } - } - return "未知"; - } - - private static void analyzeMovies(List movies) { - System.out.println("\n" + "=".repeat(80)); - System.out.println("豆瓣电影Top250数据分析报告"); - System.out.println("=".repeat(80)); - - analyzeRatingDistribution(movies); - analyzeYearDistribution(movies); - analyzeDirectorRanking(movies); - analyzeTopRatedMovies(movies); - analyzeMostPopularMovies(movies); - } - - private static void analyzeRatingDistribution(List movies) { - System.out.println("\n【评分分布统计】"); - System.out.println("-".repeat(80)); - - Map ratingDistribution = movies.stream() - .collect(Collectors.groupingBy( - movie -> Math.round(movie.getRating() * 10.0) / 10.0, - TreeMap::new, - Collectors.counting() - )); - - System.out.printf("%-10s %-15s %-15s\n", "评分区间", "电影数量", "占比"); - System.out.println("-".repeat(80)); - - int total = movies.size(); - for (Map.Entry entry : ratingDistribution.entrySet()) { - double percentage = (entry.getValue() * 100.0) / total; - System.out.printf("%-10.1f %-15d %-14.2f%%\n", entry.getKey(), entry.getValue(), percentage); - } - - double avgRating = movies.stream() - .mapToDouble(Movie::getRating) - .average() - .orElse(0.0); - System.out.printf("\n平均评分: %.2f\n", avgRating); - } - - private static void analyzeYearDistribution(List movies) { - System.out.println("\n【年份分布统计】"); - System.out.println("-".repeat(80)); - - Map yearDistribution = movies.stream() - .filter(movie -> !"未知".equals(movie.getYear())) - .collect(Collectors.groupingBy( - Movie::getYear, - TreeMap::new, - Collectors.counting() - )); - - System.out.printf("%-10s %-15s\n", "年份", "电影数量"); - System.out.println("-".repeat(80)); - - yearDistribution.entrySet().stream() - .sorted(Map.Entry.comparingByValue().reversed()) - .limit(10) - .forEach(entry -> System.out.printf("%-10s %-15d\n", entry.getKey(), entry.getValue())); - - String mostProductiveYear = yearDistribution.entrySet().stream() - .max(Map.Entry.comparingByValue()) - .map(Map.Entry::getKey) - .orElse("未知"); - - System.out.printf("\n电影产量最高的年份: %s\n", mostProductiveYear); - } - - private static void analyzeDirectorRanking(List movies) { - System.out.println("\n【导演作品排行】"); - System.out.println("-".repeat(80)); - - Map directorCount = movies.stream() - .filter(movie -> !"未知".equals(movie.getDirector())) - .collect(Collectors.groupingBy( - Movie::getDirector, - Collectors.counting() - )); - - List> topDirectors = directorCount.entrySet().stream() - .sorted(Map.Entry.comparingByValue().reversed()) - .limit(10) - .collect(Collectors.toList()); - - System.out.printf("%-5s %-30s %-15s\n", "排名", "导演", "作品数量"); - System.out.println("-".repeat(80)); - - for (int i = 0; i < topDirectors.size(); i++) { - Map.Entry entry = topDirectors.get(i); - System.out.printf("%-5d %-30s %-15d\n", i + 1, entry.getKey(), entry.getValue()); - } - } - - private static void analyzeTopRatedMovies(List movies) { - System.out.println("\n【高分电影TOP10】"); - System.out.println("-".repeat(80)); - - System.out.printf("%-5s %-40s %-10s %-10s\n", "排名", "电影名称", "评分", "年份"); - System.out.println("-".repeat(80)); - - movies.stream() - .sorted(Comparator.comparing(Movie::getRating).reversed()) - .limit(10) - .forEach(movie -> System.out.printf("%-5d %-40s %-10.1f %-10s\n", - movie.getRank(), movie.getTitle(), movie.getRating(), movie.getYear())); - } - - private static void analyzeMostPopularMovies(List movies) { - System.out.println("\n【最受关注电影TOP10】"); - System.out.println("-".repeat(80)); - - System.out.printf("%-5s %-40s %-15s %-10s\n", "排名", "电影名称", "评价人数", "评分"); - System.out.println("-".repeat(80)); - - movies.stream() - .sorted(Comparator.comparing(Movie::getRatingPeople).reversed()) - .limit(10) - .forEach(movie -> System.out.printf("%-5d %-40s %-15d %-10.1f\n", - movie.getRank(), movie.getTitle(), movie.getRatingPeople(), movie.getRating())); - } - - private static void saveData(List movies) { - saveToCSV(movies, OUTPUT_DIR + "douban_movies.csv"); - saveToJSON(movies, OUTPUT_DIR + "douban_movies.json"); - saveToText(movies, OUTPUT_DIR + "douban_movies.txt"); - } - - private static void saveToCSV(List movies, String filename) { - try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename, StandardCharsets.UTF_8))) { - writer.write("排名,电影名称,评分,评价人数,年份,导演,推荐语\n"); - - for (Movie movie : movies) { - String line = String.format("%d,%s,%.1f,%d,%s,%s,%s\n", - movie.getRank(), - escapeCSV(movie.getTitle()), - movie.getRating(), - movie.getRatingPeople(), - movie.getYear(), - escapeCSV(movie.getDirector()), - escapeCSV(movie.getQuote())); - writer.write(line); - } - - System.out.println("CSV数据已保存到: " + filename); - } catch (IOException e) { - System.err.println("保存CSV文件失败: " + e.getMessage()); - } - } - - private static String escapeCSV(String text) { - if (text == null) { - return ""; - } - if (text.contains(",") || text.contains("\"") || text.contains("\n")) { - return "\"" + text.replace("\"", "\"\"") + "\""; - } - return text; - } - - private static void saveToJSON(List movies, String filename) { - try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename, StandardCharsets.UTF_8))) { - StringBuilder json = new StringBuilder(); - json.append("[\n"); - - for (int i = 0; i < movies.size(); i++) { - Movie movie = movies.get(i); - json.append(" {\n"); - json.append(" \"rank\": ").append(movie.getRank()).append(",\n"); - json.append(" \"title\": \"").append(escapeJSON(movie.getTitle())).append("\",\n"); - json.append(" \"rating\": ").append(movie.getRating()).append(",\n"); - json.append(" \"ratingPeople\": ").append(movie.getRatingPeople()).append(",\n"); - json.append(" \"year\": \"").append(movie.getYear()).append("\",\n"); - json.append(" \"director\": \"").append(escapeJSON(movie.getDirector())).append("\",\n"); - json.append(" \"quote\": \"").append(escapeJSON(movie.getQuote())).append("\"\n"); - json.append(" }"); - - if (i < movies.size() - 1) { - json.append(","); - } - json.append("\n"); - } - - json.append("]"); - writer.write(json.toString()); - System.out.println("JSON数据已保存到: " + filename); - } catch (IOException e) { - System.err.println("保存JSON文件失败: " + e.getMessage()); - } - } - - private static String escapeJSON(String text) { - if (text == null) { - return ""; - } - return text.replace("\\", "\\\\") - .replace("\"", "\\\"") - .replace("\n", "\\n") - .replace("\r", "\\r") - .replace("\t", "\\t"); - } - - private static void saveToText(List movies, String filename) { - try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename, StandardCharsets.UTF_8))) { - writer.write("豆瓣电影Top250评分数据\n"); - writer.write("爬取时间: " + LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")) + "\n"); - writer.write("=".repeat(80) + "\n\n"); - - for (Movie movie : movies) { - writer.write(String.format("排名: %d\n", movie.getRank())); - writer.write(String.format("电影名称: %s\n", movie.getTitle())); - writer.write(String.format("评分: %.1f\n", movie.getRating())); - writer.write(String.format("评价人数: %d人\n", movie.getRatingPeople())); - writer.write(String.format("年份: %s\n", movie.getYear())); - writer.write(String.format("导演: %s\n", movie.getDirector())); - writer.write(String.format("详细信息: %s\n", movie.getInfo())); - writer.write(String.format("推荐语: %s\n", movie.getQuote())); - writer.write("-".repeat(80) + "\n\n"); - } - - System.out.println("文本数据已保存到: " + filename); - } catch (IOException e) { - System.err.println("保存文本文件失败: " + e.getMessage()); - } - } - - static class Movie { - private int rank; - private String title; - private double rating; - private int ratingPeople; - private String year; - private String director; - private String info; - private String quote; - - public int getRank() { - return rank; - } - - public void setRank(int rank) { - this.rank = rank; - } - - public String getTitle() { - return title; - } - - public void setTitle(String title) { - this.title = title; - } - - public double getRating() { - return rating; - } - - public void setRating(double rating) { - this.rating = rating; - } - - public int getRatingPeople() { - return ratingPeople; - } - - public void setRatingPeople(int ratingPeople) { - this.ratingPeople = ratingPeople; - } - - public String getYear() { - return year; - } - - public void setYear(String year) { - this.year = year; - } - - public String getDirector() { - return director; - } - - public void setDirector(String director) { - this.director = director; - } - - public String getInfo() { - return info; - } - - public void setInfo(String info) { - this.info = info; - } - - public String getQuote() { - return quote; - } - - public void setQuote(String quote) { - this.quote = quote; - } - } -} \ No newline at end of file