import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.function.Function;
import java.util.regex.Pattern;

/**
 * Minimal crawler that downloads the IMDb "Top" chart page, extracts title, year and
 * rating for the first {@link #MAX_MOVIES} table rows, writes them to a CSV file and
 * prints simple rating/year distributions.
 *
 * <p>HTML is parsed with plain string searches. A real HTML parser (for example Jsoup)
 * is recommended for anything beyond a demo.
 */
public class SimpleMovieCrawler {

    /** Page that {@link #crawlMovies()} downloads. */
    private static final String CHART_URL = "https://www.imdb.com/chart/top/";

    /** Maximum number of table rows that are inspected. */
    private static final int MAX_MOVIES = 20;

    /** Connect and read timeout so a stalled server cannot hang the program forever. */
    private static final int TIMEOUT_MILLIS = 10_000;

    // NOTE(review): the marker values below are assumptions based on the legacy IMDb
    // chart table markup (tbody.lister-list / td.titleColumn / span.secondaryInfo /
    // strong). Verify them against the page that is actually served before relying on
    // the crawl results.
    private static final String TABLE_START = "<tbody class=\"lister-list\">";
    private static final String TABLE_END = "</tbody>";
    /** Opening of a row tag; deliberately without '>' so rows with attributes match too. */
    private static final String ROW_OPEN = "<tr";
    private static final String TITLE_CELL_START = "<td class=\"titleColumn\">";
    private static final String ANCHOR_END = "</a>";
    private static final String YEAR_START = "<span class=\"secondaryInfo\">";
    private static final String YEAR_END = "</span>";
    /** Opening of the rating tag; deliberately without '>' so attributes are tolerated. */
    private static final String RATING_OPEN = "<strong";
    private static final String RATING_END = "</strong>";

    /** Splits the table body into rows; the marker is quoted so it is matched literally. */
    private static final Pattern ROW_SPLITTER = Pattern.compile(Pattern.quote(ROW_OPEN));

    /** Matches the parentheses that surround the year text. */
    private static final Pattern PARENTHESES = Pattern.compile("[()]");

    /**
     * Runs the three steps in order: crawl, save to {@code movies.txt}, analyze.
     * An {@link IOException} from crawling or saving is reported on standard error.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            // 1. Fetch the movie data.
            List<Movie> movies = crawlMovies();
            System.out.println("爬取完成,共获取 " + movies.size() + " 部电影数据");

            // 2. Save it to a file.
            saveToFile(movies, "movies.txt");

            // 3. Analyze it.
            analyzeData(movies);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the chart page and parses it.
     *
     * @return the parsed movies, possibly empty, never {@code null}
     * @throws IOException if the page cannot be downloaded
     */
    public static List<Movie> crawlMovies() throws IOException {
        return parseMovies(fetchHtml(CHART_URL));
    }

    /**
     * Performs a GET request and returns the response body as one string.
     * Line terminators are dropped, as lines are appended without a separator.
     *
     * @param url address to download
     * @return the response body
     * @throws IOException if connecting or reading fails
     */
    private static String fetchHtml(String url) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        try {
            connection.setRequestMethod("GET");
            connection.setRequestProperty("User-Agent", "Mozilla/5.0");
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            StringBuilder content = new StringBuilder();
            // NOTE(review): assumes the page is UTF-8 encoded - confirm against the
            // response Content-Type header.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    content.append(line);
                }
            }
            return content.toString();
        } finally {
            // Release the connection even when reading fails.
            connection.disconnect();
        }
    }

    /**
     * Extracts movies from chart HTML. Only the first {@link #MAX_MOVIES} rows of the
     * table body are inspected; rows without a title are skipped.
     *
     * @param html page source; {@code null} is treated like an empty page
     * @return the parsed movies in page order, possibly empty, never {@code null}
     */
    static List<Movie> parseMovies(String html) {
        List<Movie> movies = new ArrayList<>();
        if (html == null) {
            return movies;
        }
        int tableStart = html.indexOf(TABLE_START);
        if (tableStart == -1) {
            return movies;
        }
        int tableEnd = html.indexOf(TABLE_END, tableStart);
        if (tableEnd == -1) {
            return movies;
        }

        String[] rows = ROW_SPLITTER.split(html.substring(tableStart, tableEnd));
        // Index 0 is the text before the first row (the table-body tag itself), so rows
        // start at index 1.
        int lastRow = Math.min(rows.length - 1, MAX_MOVIES);
        for (int i = 1; i <= lastRow; i++) {
            Movie movie = parseRow(rows[i]);
            if (movie != null) {
                movies.add(movie);
            }
        }
        return movies;
    }

    /**
     * Builds a movie from one table row.
     *
     * @return the movie, or {@code null} when the row has no title
     */
    private static Movie parseRow(String row) {
        String title = extractTitle(row);
        if (title == null) {
            return null;
        }
        Movie movie = new Movie();
        movie.setTitle(title);
        movie.setYear(extractYear(row));
        movie.setRating(extractRating(row));
        return movie;
    }

    /** Returns the text of the first link inside the title cell, or {@code null}. */
    private static String extractTitle(String row) {
        int cellStart = row.indexOf(TITLE_CELL_START);
        if (cellStart == -1) {
            return null;
        }
        int anchorEnd = row.indexOf(ANCHOR_END, cellStart);
        if (anchorEnd == -1) {
            return null;
        }
        String segment = row.substring(cellStart, anchorEnd);
        // The link text is whatever follows the last '>' before the closing anchor tag;
        // this skips the rank text and the attributes of the opening anchor tag.
        String title = segment.substring(segment.lastIndexOf('>') + 1).trim();
        return title.isEmpty() ? null : title;
    }

    /** Returns the year text with its surrounding parentheses removed, or {@code null}. */
    private static String extractYear(String row) {
        String raw = textBetween(row, YEAR_START, YEAR_END);
        return raw == null ? null : PARENTHESES.matcher(raw).replaceAll("").trim();
    }

    /** Returns the trimmed text inside the first rating tag, or {@code null}. */
    private static String extractRating(String row) {
        int tagStart = row.indexOf(RATING_OPEN);
        if (tagStart == -1) {
            return null;
        }
        // Skip any attributes of the opening tag.
        int tagClose = row.indexOf('>', tagStart);
        if (tagClose == -1) {
            return null;
        }
        int end = row.indexOf(RATING_END, tagClose);
        if (end == -1) {
            return null;
        }
        return row.substring(tagClose + 1, end).trim();
    }

    /**
     * Returns the text between the first {@code startMarker} and the next
     * {@code endMarker} after it, or {@code null} when either marker is missing.
     */
    private static String textBetween(String text, String startMarker, String endMarker) {
        int start = text.indexOf(startMarker);
        if (start == -1) {
            return null;
        }
        int from = start + startMarker.length();
        // Search from the end of the start marker so the result range is never negative.
        int end = text.indexOf(endMarker, from);
        if (end == -1) {
            return null;
        }
        return text.substring(from, end);
    }

    /**
     * Writes the movies as UTF-8 CSV with the header {@code Title,Rating,Year}.
     * An existing file is overwritten.
     *
     * @param movies movies to write
     * @param fileName path of the file to create
     * @throws IOException if the file cannot be created or written
     */
    public static void saveToFile(List<Movie> movies, String fileName) throws IOException {
        try (Writer writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8))) {
            writer.write("Title,Rating,Year\n");
            for (Movie movie : movies) {
                writer.write(toCsvLine(movie));
                writer.write('\n');
            }
        }
        System.out.println("数据已保存到: " + fileName);
    }

    /**
     * Formats one movie as a CSV record (title, rating, year) without a line terminator.
     * Missing values become empty fields.
     */
    static String toCsvLine(Movie movie) {
        return csvField(movie.getTitle()) + ","
                + csvField(movie.getRating()) + ","
                + csvField(movie.getYear());
    }

    /**
     * Quotes a value when it contains a comma, a double quote or a line break, doubling
     * embedded double quotes; {@code null} becomes the empty string.
     */
    private static String csvField(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuoting = value.indexOf(',') >= 0
                || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0
                || value.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return value;
        }
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }

    /**
     * Prints the rating distribution and the ten most frequent years to standard output.
     * Movies without a rating (or year) are left out of the respective distribution.
     *
     * @param movies movies to analyze
     */
    public static void analyzeData(List<Movie> movies) {
        System.out.println("\n=== 电影数据分析 ===");

        // Rating distribution.
        Map<String, Integer> ratingDist = countBy(movies, Movie::getRating);
        System.out.println("\n1. 评分分布:");
        for (Map.Entry<String, Integer> entry : ratingDist.entrySet()) {
            System.out.println("评分 " + entry.getKey() + ": " + entry.getValue() + " 部");
        }

        // Year distribution, most frequent first, top ten only.
        Map<String, Integer> yearDist = countBy(movies, Movie::getYear);
        System.out.println("\n2. 年份分布:");
        yearDist.entrySet().stream()
                .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
                .limit(10)
                .forEach(entry ->
                        System.out.println(entry.getKey() + "年: " + entry.getValue() + " 部"));
    }

    /**
     * Counts how many movies share each key. Movies whose key is {@code null} are
     * ignored. The result is ordered by key text so printed output is deterministic.
     *
     * @param movies movies to count
     * @param keyOf extracts the grouping key from a movie
     * @return key to number of movies with that key
     */
    static Map<String, Integer> countBy(List<Movie> movies, Function<Movie, String> keyOf) {
        Map<String, Integer> counts = new TreeMap<>();
        for (Movie movie : movies) {
            String key = keyOf.apply(movie);
            if (key != null) {
                counts.merge(key, 1, Integer::sum);
            }
        }
        return counts;
    }

    /** Movie model: title, rating and year, all kept as text and {@code null} when unknown. */
    static class Movie {
        private String title;
        private String rating;
        private String year;

        public String getTitle() {
            return title;
        }

        public void setTitle(String title) {
            this.title = title;
        }

        public String getRating() {
            return rating;
        }

        public void setRating(String rating) {
            this.rating = rating;
        }

        public String getYear() {
            return year;
        }

        public void setYear(String year) {
            this.year = year;
        }
    }
}