// Repository web-UI residue that was captured together with this file (not part of the program):
//   You can not select more than 25 topics
//   Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
//   155 lines, 5.8 KiB
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

/**
 * Minimal crawler that downloads the IMDb Top chart page, extracts title, year and rating
 * for the first rows of the chart, writes them to a CSV file and prints two simple
 * distributions (by rating and by year).
 *
 * <p>Parsing uses plain string searches rather than an HTML parser (a real project should
 * use a library such as Jsoup). NOTE(review): the parser only recognises a
 * {@code <tbody class="lister-list">} table whose rows start with a bare {@code <tr>} tag;
 * confirm that the live page still uses this markup.
 */
public class SimpleMovieCrawler {

    /** Page that is downloaded by {@link #crawlMovies()}. */
    private static final String TOP_CHART_URL = "https://www.imdb.com/chart/top/";

    /** Only this many leading table rows are inspected. */
    private static final int MAX_MOVIES = 20;

    /** Connect/read timeout so that a stalled server cannot block the program forever. */
    private static final int TIMEOUT_MILLIS = 15_000;

    private static final String TABLE_OPEN = "<tbody class=\"lister-list\">";
    private static final String TABLE_CLOSE = "</tbody>";
    private static final String YEAR_OPEN = "<span class=\"secondaryInfo\">";
    private static final String YEAR_CLOSE = "</span>";
    private static final String RATING_OPEN = "<strong>";
    private static final String RATING_CLOSE = "</strong>";

    /**
     * Runs the three steps in order: crawl, save to {@code movies.txt}, analyze.
     * An {@link IOException} from crawling or saving is reported via its stack trace.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // 1. Fetch the movie data
            List<Movie> movies = crawlMovies();
            System.out.println("爬取完成,共获取 " + movies.size() + " 部电影数据");

            // 2. Save to a file
            saveToFile(movies, "movies.txt");

            // 3. Analyze the data
            analyzeData(movies);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the chart page and parses it with {@link #parseMovies(String)}.
     *
     * @return the parsed movies; empty when the expected table is not present in the page
     * @throws IOException if the connection cannot be opened or the response cannot be read
     */
    public static List<Movie> crawlMovies() throws IOException {
        HttpURLConnection connection =
                (HttpURLConnection) new URL(TOP_CHART_URL).openConnection();
        connection.setRequestMethod("GET");
        connection.setRequestProperty("User-Agent", "Mozilla/5.0");
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);

        StringBuilder content = new StringBuilder();
        // try-with-resources closes the reader even when reading fails; the charset is
        // explicit so decoding does not depend on the platform default.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Keep the line break: gluing lines together could fuse adjacent text.
                content.append(line).append('\n');
            }
        } finally {
            connection.disconnect();
        }

        return parseMovies(content.toString());
    }

    /**
     * Extracts movies from the HTML of the chart page. Package-private so that the parsing
     * logic can be exercised without network access.
     *
     * <p>Only the first {@value #MAX_MOVIES} rows of the table are inspected, and rows from
     * which no non-empty title can be extracted are skipped.
     *
     * @param html full page source; may be {@code null}
     * @return a mutable list of parsed movies, never {@code null}
     */
    static List<Movie> parseMovies(String html) {
        List<Movie> movies = new ArrayList<>();
        if (html == null) {
            return movies;
        }
        int tableStart = html.indexOf(TABLE_OPEN);
        if (tableStart == -1) {
            return movies;
        }
        int tableEnd = html.indexOf(TABLE_CLOSE, tableStart);
        if (tableEnd == -1) {
            return movies;
        }

        // rows[0] is the text before the first <tr>, so real rows start at index 1.
        String[] rows = html.substring(tableStart, tableEnd).split("<tr>");
        int limit = Math.min(rows.length, MAX_MOVIES + 1);
        for (int i = 1; i < limit; i++) {
            Movie movie = parseRow(rows[i]);
            if (movie.getTitle() != null) {
                movies.add(movie);
            }
        }
        return movies;
    }

    /** Builds a {@link Movie} from one table row; fields that cannot be found stay {@code null}. */
    private static Movie parseRow(String row) {
        Movie movie = new Movie();

        // Title: text of the first anchor in the row.
        // NOTE(review): assumes the first <a href=...> of a row is the title link - confirm.
        int anchorStart = row.indexOf("<a href=");
        int anchorEnd = anchorStart == -1 ? -1 : row.indexOf("</a>", anchorStart);
        if (anchorStart != -1 && anchorEnd != -1) {
            String anchor = row.substring(anchorStart, anchorEnd);
            // Test the raw index before adding 1; otherwise a missing '>' goes unnoticed.
            int openingTagEnd = anchor.indexOf('>');
            if (openingTagEnd != -1) {
                String title = anchor.substring(openingTagEnd + 1).trim();
                if (!title.isEmpty()) {
                    movie.setTitle(title);
                }
            }
        }

        // Year: content of the secondaryInfo span with the parentheses stripped.
        String year = textBetween(row, YEAR_OPEN, YEAR_CLOSE);
        if (year != null) {
            movie.setYear(year.replaceAll("[()]", "").trim());
        }

        // Rating: content of the first <strong> element.
        String rating = textBetween(row, RATING_OPEN, RATING_CLOSE);
        if (rating != null) {
            movie.setRating(rating.trim());
        }

        return movie;
    }

    /**
     * Returns the text between the first {@code open} marker and the next {@code close}
     * marker, or {@code null} when either marker is missing. The offset is derived from the
     * marker's length so it cannot drift out of sync with the marker text.
     */
    private static String textBetween(String source, String open, String close) {
        int openAt = source.indexOf(open);
        if (openAt == -1) {
            return null;
        }
        int contentStart = openAt + open.length();
        int closeAt = source.indexOf(close, contentStart);
        return closeAt == -1 ? null : source.substring(contentStart, closeAt);
    }

    /**
     * Writes the movies as UTF-8 CSV with the header {@code Title,Rating,Year}, replacing
     * any existing file. Missing values are written as empty fields, and fields containing
     * commas, quotes or line breaks are quoted.
     *
     * @param movies   movies to write
     * @param fileName path of the output file
     * @throws IOException if the file cannot be created or written
     */
    public static void saveToFile(List<Movie> movies, String fileName) throws IOException {
        try (BufferedWriter writer =
                Files.newBufferedWriter(Paths.get(fileName), StandardCharsets.UTF_8)) {
            writer.write("Title,Rating,Year\n");
            for (Movie movie : movies) {
                writer.write(csvField(movie.getTitle()) + ","
                        + csvField(movie.getRating()) + ","
                        + csvField(movie.getYear()) + "\n");
            }
        }
        System.out.println("数据已保存到: " + fileName);
    }

    /** Renders one CSV field: {@code null} becomes empty, special characters force quoting. */
    private static String csvField(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuoting = value.contains(",") || value.contains("\"")
                || value.contains("\n") || value.contains("\r");
        if (!needsQuoting) {
            return value;
        }
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }

    /**
     * Prints the number of movies per rating and the ten most frequent years to standard
     * output. Movies without a rating or year are left out of the respective distribution.
     *
     * @param movies movies to analyze
     */
    public static void analyzeData(List<Movie> movies) {
        System.out.println("\n=== 电影数据分析 ===");

        // Rating distribution; a TreeMap keeps the printed order stable between runs.
        Map<String, Integer> ratingDist = new TreeMap<>();
        for (Movie movie : movies) {
            String rating = movie.getRating();
            if (rating != null) {
                ratingDist.merge(rating, 1, Integer::sum);
            }
        }

        System.out.println("\n1. 评分分布:");
        for (Map.Entry<String, Integer> entry : ratingDist.entrySet()) {
            System.out.println("评分 " + entry.getKey() + ": " + entry.getValue() + " 部");
        }

        // Year distribution
        Map<String, Integer> yearDist = new HashMap<>();
        for (Movie movie : movies) {
            String year = movie.getYear();
            if (year != null) {
                yearDist.merge(year, 1, Integer::sum);
            }
        }

        System.out.println("\n2. 年份分布:");
        yearDist.entrySet().stream()
                // Most frequent first; ties are ordered by year so the output is deterministic.
                .sorted(Map.Entry.<String, Integer>comparingByValue().reversed()
                        .thenComparing(Map.Entry.<String, Integer>comparingByKey()))
                .limit(10)
                .forEach(entry ->
                        System.out.println(entry.getKey() + "年: " + entry.getValue() + " 部"));
    }

    /** Movie model: plain mutable holder; every field is {@code null} until set. */
    static class Movie {
        private String title;
        private String rating;
        private String year;

        public String getTitle() { return title; }

        public void setTitle(String title) { this.title = title; }

        public String getRating() { return rating; }

        public void setRating(String rating) { this.rating = rating; }

        public String getYear() { return year; }

        public void setYear(String year) { this.year = year; }
    }
}