You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

155 lines
5.8 KiB

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Minimal movie crawler: downloads one chart page, extracts title, year and rating
 * from its table rows with plain string searches, writes the result as CSV and prints
 * simple rating/year distributions to standard output.
 *
 * <p>The HTML handling is deliberately naive (no real parser); a library such as Jsoup
 * is the better choice for anything beyond a demo.
 */
public class SimpleMovieCrawler {

    /** Page that {@link #crawlMovies()} downloads. */
    private static final String CHART_URL = "https://www.imdb.com/chart/top/";

    /** Opening tag of the table body that holds one {@code <tr>} per movie. */
    private static final String LIST_OPEN = "<tbody class=\"lister-list\">";

    /** Maximum number of table rows turned into movies. */
    private static final int MAX_MOVIES = 20;

    /** Maximum number of entries printed in the year distribution. */
    private static final int TOP_YEARS = 10;

    /** Network timeouts so a stalled server cannot block the program forever. */
    private static final int CONNECT_TIMEOUT_MS = 10_000;
    private static final int READ_TIMEOUT_MS = 15_000;

    /**
     * Runs the whole pipeline: crawl, save to {@code movies.txt}, analyze.
     * An {@link IOException} from crawling or saving is reported via its stack trace.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // 1. Fetch and parse the movie data.
            List<Movie> movies = crawlMovies();
            System.out.println("爬取完成,共获取 " + movies.size() + " 部电影数据");
            // 2. Persist it.
            saveToFile(movies, "movies.txt");
            // 3. Print the statistics.
            analyzeData(movies);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the chart page and parses up to {@value #MAX_MOVIES} movies from it.
     *
     * @return the parsed movies; empty when the expected table is not in the page
     * @throws IOException if the page cannot be downloaded (including timeouts)
     */
    public static List<Movie> crawlMovies() throws IOException {
        return parseMovies(fetchPage(CHART_URL));
    }

    /**
     * Performs a GET request and returns the response body as one string.
     * Line terminators are dropped and lines are joined without a separator.
     *
     * @param url address to download
     * @return the response body
     * @throws IOException on any connection or read failure
     */
    private static String fetchPage(String url) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        try {
            connection.setRequestMethod("GET");
            connection.setRequestProperty("User-Agent", "Mozilla/5.0");
            connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
            connection.setReadTimeout(READ_TIMEOUT_MS);

            StringBuilder content = new StringBuilder();
            // Decode explicitly instead of relying on the platform default charset.
            // NOTE(review): assumes the page is served as UTF-8 - confirm.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    content.append(line);
                }
            }
            return content.toString();
        } finally {
            // Runs even when reading fails, so the connection is never leaked.
            connection.disconnect();
        }
    }

    /**
     * Extracts movies from the rows of the {@code lister-list} table body.
     * Rows without a readable title link are skipped; year and rating stay
     * {@code null} when their markup is absent from the row.
     *
     * @param html full page markup
     * @return at most {@value #MAX_MOVIES} movies, in page order; never {@code null}
     */
    static List<Movie> parseMovies(String html) {
        List<Movie> movies = new ArrayList<>();

        int start = html.indexOf(LIST_OPEN);
        if (start == -1) {
            return movies;
        }
        int end = html.indexOf("</tbody>", start);
        if (end == -1) {
            return movies;
        }

        String[] rows = html.substring(start, end).split("<tr>");
        // rows[0] is the text before the first <tr>, hence the loop starts at 1.
        int limit = Math.min(rows.length, MAX_MOVIES + 1);
        for (int i = 1; i < limit; i++) {
            String row = rows[i];

            String title = innerText(row, "<a href=", "</a>");
            if (title == null) {
                continue;
            }
            Movie movie = new Movie();
            movie.setTitle(title);

            String year = innerText(row, "<span class=\"secondaryInfo\">", "</span>");
            if (year != null) {
                // The year is rendered in parentheses, e.g. "(1994)".
                movie.setYear(year.replaceAll("[()]", "").trim());
            }

            String rating = innerText(row, "<strong>", "</strong>");
            if (rating != null) {
                movie.setRating(rating);
            }

            movies.add(movie);
        }
        return movies;
    }

    /**
     * Returns the trimmed text between the end of the first tag starting with
     * {@code openTag} and the following {@code closeTag}.
     *
     * <p>The start of the text is located by searching for the tag's closing
     * {@code '>'} rather than by adding a hand-counted length, which avoids
     * off-by-one errors when the marker string changes.
     *
     * @param row      markup to search
     * @param openTag  beginning of the opening tag (may or may not include its {@code '>'})
     * @param closeTag closing tag that ends the text
     * @return the enclosed text, or {@code null} if the markup is missing or malformed
     */
    private static String innerText(String row, String openTag, String closeTag) {
        int open = row.indexOf(openTag);
        if (open == -1) {
            return null;
        }
        int close = row.indexOf(closeTag, open);
        if (close == -1) {
            return null;
        }
        int tagEnd = row.indexOf('>', open);
        // tagEnd >= close means the only '>' found belongs to the closing tag.
        if (tagEnd == -1 || tagEnd >= close) {
            return null;
        }
        return row.substring(tagEnd + 1, close).trim();
    }

    /**
     * Writes the movies as UTF-8 CSV with the header {@code Title,Rating,Year},
     * replacing any existing file. Missing values become empty fields and fields
     * containing a comma, quote or line break are quoted.
     *
     * @param movies   movies to write
     * @param fileName path of the output file
     * @throws IOException if the file cannot be created or written
     */
    public static void saveToFile(List<Movie> movies, String fileName) throws IOException {
        try (BufferedWriter writer =
                Files.newBufferedWriter(Paths.get(fileName), StandardCharsets.UTF_8)) {
            writer.write("Title,Rating,Year\n");
            for (Movie movie : movies) {
                writer.write(csvField(movie.getTitle()) + ","
                        + csvField(movie.getRating()) + ","
                        + csvField(movie.getYear()) + "\n");
            }
        }
        System.out.println("数据已保存到: " + fileName);
    }

    /**
     * Encodes one CSV field: {@code null} becomes an empty field, and a value
     * containing a comma, double quote or line break is wrapped in double quotes
     * with embedded quotes doubled.
     */
    private static String csvField(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuoting = value.indexOf(',') >= 0
                || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0
                || value.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return value;
        }
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }

    /**
     * Prints how many movies share each rating (ordered by rating text) and the
     * {@value #TOP_YEARS} most frequent years (ties ordered by year text).
     * Movies without a rating or year are left out of the respective count.
     *
     * @param movies movies to summarize
     */
    public static void analyzeData(List<Movie> movies) {
        System.out.println("\n=== 电影数据分析 ===");

        Map<String, Integer> ratingDist = new HashMap<>();
        Map<String, Integer> yearDist = new HashMap<>();
        for (Movie movie : movies) {
            String rating = movie.getRating();
            if (rating != null) {
                ratingDist.merge(rating, 1, Integer::sum);
            }
            String year = movie.getYear();
            if (year != null) {
                yearDist.merge(year, 1, Integer::sum);
            }
        }

        // Rating distribution; sorted so the output does not depend on hash order.
        System.out.println("\n1. 评分分布:");
        ratingDist.entrySet().stream()
                .sorted(Map.Entry.<String, Integer>comparingByKey())
                .forEach(entry -> System.out.println(
                        "评分 " + entry.getKey() + ": " + entry.getValue() + " 部"));

        // Year distribution: most frequent first, ties broken by year for stable output.
        System.out.println("\n2. 年份分布:");
        yearDist.entrySet().stream()
                .sorted(Map.Entry.<String, Integer>comparingByValue().reversed()
                        .thenComparing(Map.Entry.<String, Integer>comparingByKey()))
                .limit(TOP_YEARS)
                .forEach(entry -> System.out.println(
                        entry.getKey() + "年: " + entry.getValue() + " 部"));
    }

    /** Movie model: title, rating and year kept as the text scraped from the page. */
    static class Movie {
        private String title;
        private String rating;
        private String year;

        public String getTitle() { return title; }
        public void setTitle(String title) { this.title = title; }
        public String getRating() { return rating; }
        public void setRating(String rating) { this.rating = rating; }
        public String getYear() { return year; }
        public void setYear(String year) { this.year = year; }
    }
}