5 changed files with 771 additions and 0 deletions
@ -0,0 +1,520 @@ |
|||||
|
import java.io.*; |
||||
|
import java.net.*; |
||||
|
import java.nio.charset.StandardCharsets; |
||||
|
import java.time.LocalDateTime; |
||||
|
import java.time.format.DateTimeFormatter; |
||||
|
import java.util.*; |
||||
|
import java.util.regex.Matcher; |
||||
|
import java.util.regex.Pattern; |
||||
|
import java.util.stream.Collectors; |
||||
|
|
||||
|
/**
 * Crawls the Douban Top 250 movie list, prints a set of summary statistics to
 * standard output and saves the scraped records as CSV, JSON and plain text
 * files under {@code OUTPUT_DIR}.
 *
 * <p>Parsing is regex based and therefore tied to the markup the patterns below
 * expect; an item fragment without a title is skipped rather than stored as an
 * empty record.
 */
public class MovieCrawler {

    private static final String BASE_URL = "https://movie.douban.com/top250";
    private static final int MAX_PAGES = 10;
    /** Number of list entries requested per page; used to compute the {@code start} offset. */
    private static final int PAGE_SIZE = 25;
    /** Pause between two page requests, in milliseconds. */
    private static final long PAGE_DELAY_MILLIS = 2000;
    private static final String OUTPUT_DIR = "D:/";

    /** Placeholder stored when a year or director cannot be extracted. */
    private static final String UNKNOWN = "未知";
    /** Label that introduces the director names in the info text. */
    private static final String DIRECTOR_LABEL = "导演:";

    // All patterns are compiled once. Note the doubled backslashes: a single
    // backslash followed by 's' inside a Java string literal is a plain space
    // (Java 15+), and followed by 'd' it does not compile at all.
    private static final Pattern ITEM_PATTERN =
            Pattern.compile("<div class=\"item\">(.*?)</div>\\s*</div>", Pattern.DOTALL);
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<span class=\"title\">(.*?)</span>");
    private static final Pattern RATING_PATTERN =
            Pattern.compile("<span class=\"rating_num\" property=\"v:average\">(.*?)</span>");
    private static final Pattern RATING_PEOPLE_PATTERN =
            Pattern.compile("<span>(\\d+)人评价</span>");
    private static final Pattern INFO_PATTERN =
            Pattern.compile("<div class=\"bd\">\\s*<p class=\"\">(.*?)</p>", Pattern.DOTALL);
    private static final Pattern QUOTE_PATTERN =
            Pattern.compile("<span class=\"inq\">(.*?)</span>");
    private static final Pattern YEAR_PATTERN = Pattern.compile("(\\d{4})");
    /** Matches one HTML tag, including tags that span several lines. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

    private static final DateTimeFormatter TIMESTAMP_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    /**
     * Program entry point: crawl, analyze, save, and report the elapsed time.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String banner = "=".repeat(80);
        System.out.println(banner);
        System.out.println("豆瓣电影Top250爬虫与数据分析系统");
        System.out.println(banner);

        long startTime = System.currentTimeMillis();

        try {
            List<Movie> movies = crawlMovies();

            if (movies.isEmpty()) {
                System.err.println("未能爬取到任何电影数据,请检查网络连接或稍后重试。");
                return;
            }

            System.out.println("\n数据爬取完成,共获取 " + movies.size() + " 部电影数据");
            System.out.println("开始数据分析...");

            analyzeMovies(movies);

            System.out.println("\n开始保存数据...");

            saveData(movies);

            long duration = (System.currentTimeMillis() - startTime) / 1000;

            System.out.println("\n" + banner);
            System.out.println("所有任务完成!");
            System.out.println("总耗时: " + duration + " 秒");
            System.out.println("输出文件位置: " + OUTPUT_DIR);
            System.out.println(banner);

        } catch (InterruptedException e) {
            // Restore the interrupt flag so the JVM / caller can observe it.
            Thread.currentThread().interrupt();
            System.err.println("程序执行出错: " + e.getMessage());
        } catch (Exception e) {
            System.err.println("程序执行出错: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Fetches every list page in turn and collects the parsed movies.
     *
     * <p>A page that fails to download or parse is reported on stderr and
     * skipped; the remaining pages are still attempted.
     *
     * @return all movies parsed from the pages that could be fetched
     * @throws InterruptedException if the thread is interrupted while pausing between pages
     */
    private static List<Movie> crawlMovies() throws InterruptedException {
        List<Movie> movies = new ArrayList<>();

        for (int page = 0; page < MAX_PAGES; page++) {
            int start = page * PAGE_SIZE;
            String url = BASE_URL + "?start=" + start;

            System.out.println("正在爬取第 " + (page + 1) + " 页...");
            try {
                movies.addAll(crawlPage(url, start + 1));
            } catch (IOException | RuntimeException e) {
                System.err.println("爬取第 " + (page + 1) + " 页失败: " + e.getMessage());
            }

            // The pause lives outside the try block so that an interrupt is
            // propagated instead of being reported as a failed page, and so
            // that a failed request is also followed by a delay.
            if (page < MAX_PAGES - 1) {
                Thread.sleep(PAGE_DELAY_MILLIS);
            }
        }

        return movies;
    }

    /**
     * Downloads one list page and parses every item block found in it.
     *
     * @param url       page URL
     * @param startRank rank assigned to the first item on the page; later items count up from it
     * @return the movies parsed from the page, in page order
     * @throws IOException if the page cannot be downloaded
     */
    private static List<Movie> crawlPage(String url, int startRank) throws IOException {
        String html = fetchHtml(url);
        List<Movie> movies = new ArrayList<>();

        Matcher itemMatcher = ITEM_PATTERN.matcher(html);
        int rank = startRank;
        while (itemMatcher.find()) {
            // The rank is positional, so it advances even when an item is skipped.
            Movie movie = parseMovie(itemMatcher.group(1), rank++);
            if (movie != null) {
                movies.add(movie);
                System.out.printf(" [%d] %s - %.1f分\n", movie.getRank(), movie.getTitle(), movie.getRating());
            }
        }

        return movies;
    }

    /**
     * Performs an HTTP GET and returns the response body decoded as UTF-8.
     *
     * @param url URL to fetch
     * @return the response body, each line terminated with {@code '\n'}
     * @throws IOException on connection problems or any response status other than 200
     */
    private static String fetchHtml(String url) throws IOException {
        HttpURLConnection con = (HttpURLConnection) new URL(url).openConnection();
        try {
            con.setRequestMethod("GET");
            con.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
            con.setConnectTimeout(10000);
            con.setReadTimeout(10000);

            int responseCode = con.getResponseCode();
            if (responseCode != HttpURLConnection.HTTP_OK) {
                throw new IOException("HTTP error code: " + responseCode);
            }

            StringBuilder content = new StringBuilder();
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(con.getInputStream(), StandardCharsets.UTF_8))) {
                String inputLine;
                while ((inputLine = in.readLine()) != null) {
                    content.append(inputLine).append("\n");
                }
            }
            return content.toString();
        } finally {
            // Release the connection on every path, including errors.
            con.disconnect();
        }
    }

    /**
     * Builds a {@link Movie} from the HTML fragment of a single list item.
     *
     * <p>Package-private so it can be unit-tested without network access.
     *
     * @param html HTML fragment of one item
     * @param rank rank to assign to the movie
     * @return the parsed movie, or {@code null} when the fragment is {@code null}
     *         or contains no title
     */
    static Movie parseMovie(String html, int rank) {
        if (html == null) {
            return null;
        }

        // Title: without one the fragment is not a usable record.
        Matcher titleMatcher = TITLE_PATTERN.matcher(html);
        if (!titleMatcher.find()) {
            return null;
        }

        Movie movie = new Movie();
        movie.setRank(rank);
        movie.setTitle(cleanText(titleMatcher.group(1)));

        // Rating (stays 0.0 when absent or not numeric).
        Matcher ratingMatcher = RATING_PATTERN.matcher(html);
        if (ratingMatcher.find()) {
            try {
                movie.setRating(Double.parseDouble(cleanText(ratingMatcher.group(1))));
            } catch (NumberFormatException e) {
                movie.setRating(0.0);
            }
        }

        // Number of ratings (stays 0 when absent or out of int range).
        Matcher ratingPeopleMatcher = RATING_PEOPLE_PATTERN.matcher(html);
        if (ratingPeopleMatcher.find()) {
            try {
                movie.setRatingPeople(Integer.parseInt(ratingPeopleMatcher.group(1)));
            } catch (NumberFormatException e) {
                movie.setRatingPeople(0);
            }
        }

        // Detail text, from which year and director are derived. Defaults are
        // set first so that the grouping collectors never see a null key.
        movie.setInfo("");
        movie.setYear(UNKNOWN);
        movie.setDirector(UNKNOWN);
        Matcher infoMatcher = INFO_PATTERN.matcher(html);
        if (infoMatcher.find()) {
            String info = cleanText(infoMatcher.group(1));
            movie.setInfo(info);
            movie.setYear(extractYear(info));
            movie.setDirector(extractDirector(info));
        }

        // One-line recommendation quote.
        Matcher quoteMatcher = QUOTE_PATTERN.matcher(html);
        if (quoteMatcher.find()) {
            movie.setQuote(cleanText(quoteMatcher.group(1)));
        } else {
            movie.setQuote("暂无推荐语");
        }

        return movie;
    }

    /**
     * Turns an HTML fragment into plain text: removes tags, decodes the common
     * character entities, collapses whitespace runs to one space and trims.
     *
     * @param text HTML fragment, may be {@code null}
     * @return the plain text, or an empty string for {@code null} / empty input
     */
    static String cleanText(String text) {
        if (text == null || text.isEmpty()) {
            return "";
        }
        String withoutTags = TAG_PATTERN.matcher(text).replaceAll("");
        // "&amp;" is decoded last so that e.g. "&amp;lt;" yields "&lt;", not "<".
        String decoded = withoutTags
                .replace("&nbsp;", " ")
                .replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&quot;", "\"")
                .replace("&#39;", "'")
                .replace("&apos;", "'")
                .replace("&amp;", "&");
        return WHITESPACE_PATTERN.matcher(decoded).replaceAll(" ").trim();
    }

    /**
     * Returns the first run of four digits found in the text.
     *
     * @param text text to search, may be {@code null}
     * @return the four digits, or the "unknown" placeholder when none are found
     */
    static String extractYear(String text) {
        if (text == null || text.isEmpty()) {
            return UNKNOWN;
        }
        Matcher matcher = YEAR_PATTERN.matcher(text);
        return matcher.find() ? matcher.group(1) : UNKNOWN;
    }

    /**
     * Extracts the first director name from info text that starts with the
     * director label. The director section ends at the cast label, else at the
     * genre label, else at the end of the text; several names are separated by
     * {@code '/'} and only the first is returned.
     *
     * @param info cleaned info text, may be {@code null}
     * @return the first director name, or the "unknown" placeholder when the
     *         label is missing or no name follows it
     */
    static String extractDirector(String info) {
        if (info == null || info.isEmpty()) {
            return UNKNOWN;
        }
        String cleaned = info.trim();
        if (!cleaned.startsWith(DIRECTOR_LABEL)) {
            return UNKNOWN;
        }

        int endIndex = cleaned.indexOf("主演:");
        if (endIndex == -1) {
            endIndex = cleaned.indexOf("类型:");
        }
        if (endIndex == -1) {
            endIndex = cleaned.length();
        }

        String directorPart = cleaned.substring(DIRECTOR_LABEL.length(), endIndex).trim();
        String[] directors = directorPart.split("/");
        if (directors.length > 0) {
            String first = directors[0].trim();
            if (!first.isEmpty()) {
                return first;
            }
        }
        return UNKNOWN;
    }

    /** Returns true when the value is neither null nor the "unknown" placeholder. */
    private static boolean isKnown(String value) {
        return value != null && !UNKNOWN.equals(value);
    }

    /**
     * Prints the full analysis report (rating, year, director, top rated and
     * most rated sections) to standard output.
     *
     * @param movies movies to analyze
     */
    private static void analyzeMovies(List<Movie> movies) {
        System.out.println("\n" + "=".repeat(80));
        System.out.println("豆瓣电影Top250数据分析报告");
        System.out.println("=".repeat(80));

        analyzeRatingDistribution(movies);
        analyzeYearDistribution(movies);
        analyzeDirectorRanking(movies);
        analyzeTopRatedMovies(movies);
        analyzeMostPopularMovies(movies);
    }

    /** Prints the number and share of movies per rating (one decimal) and the mean rating. */
    private static void analyzeRatingDistribution(List<Movie> movies) {
        System.out.println("\n【评分分布统计】");
        System.out.println("-".repeat(80));

        // TreeMap keeps the ratings in ascending order for printing.
        Map<Double, Long> ratingDistribution = movies.stream()
                .collect(Collectors.groupingBy(
                        movie -> Math.round(movie.getRating() * 10.0) / 10.0,
                        TreeMap::new,
                        Collectors.counting()));

        System.out.printf("%-10s %-15s %-15s\n", "评分区间", "电影数量", "占比");
        System.out.println("-".repeat(80));

        int total = movies.size();
        for (Map.Entry<Double, Long> entry : ratingDistribution.entrySet()) {
            double percentage = (entry.getValue() * 100.0) / total;
            System.out.printf("%-10.1f %-15d %-14.2f%%\n", entry.getKey(), entry.getValue(), percentage);
        }

        double avgRating = movies.stream()
                .mapToDouble(Movie::getRating)
                .average()
                .orElse(0.0);
        System.out.printf("\n平均评分: %.2f\n", avgRating);
    }

    /** Prints the ten years with the most movies and the single most frequent year. */
    private static void analyzeYearDistribution(List<Movie> movies) {
        System.out.println("\n【年份分布统计】");
        System.out.println("-".repeat(80));

        Map<String, Long> yearDistribution = movies.stream()
                .filter(movie -> isKnown(movie.getYear()))
                .collect(Collectors.groupingBy(
                        Movie::getYear,
                        TreeMap::new,
                        Collectors.counting()));

        System.out.printf("%-10s %-15s\n", "年份", "电影数量");
        System.out.println("-".repeat(80));

        yearDistribution.entrySet().stream()
                .sorted(Map.Entry.<String, Long>comparingByValue().reversed())
                .limit(10)
                .forEach(entry -> System.out.printf("%-10s %-15d\n", entry.getKey(), entry.getValue()));

        String mostProductiveYear = yearDistribution.entrySet().stream()
                .max(Map.Entry.comparingByValue())
                .map(Map.Entry::getKey)
                .orElse(UNKNOWN);

        System.out.printf("\n电影产量最高的年份: %s\n", mostProductiveYear);
    }

    /** Prints the ten directors with the most movies in the list. */
    private static void analyzeDirectorRanking(List<Movie> movies) {
        System.out.println("\n【导演作品排行】");
        System.out.println("-".repeat(80));

        Map<String, Long> directorCount = movies.stream()
                .filter(movie -> isKnown(movie.getDirector()))
                .collect(Collectors.groupingBy(
                        Movie::getDirector,
                        Collectors.counting()));

        List<Map.Entry<String, Long>> topDirectors = directorCount.entrySet().stream()
                .sorted(Map.Entry.<String, Long>comparingByValue().reversed())
                .limit(10)
                .collect(Collectors.toList());

        System.out.printf("%-5s %-30s %-15s\n", "排名", "导演", "作品数量");
        System.out.println("-".repeat(80));

        for (int i = 0; i < topDirectors.size(); i++) {
            Map.Entry<String, Long> entry = topDirectors.get(i);
            System.out.printf("%-5d %-30s %-15d\n", i + 1, entry.getKey(), entry.getValue());
        }
    }

    /** Prints the ten movies with the highest rating. */
    private static void analyzeTopRatedMovies(List<Movie> movies) {
        System.out.println("\n【高分电影TOP10】");
        System.out.println("-".repeat(80));

        System.out.printf("%-5s %-40s %-10s %-10s\n", "排名", "电影名称", "评分", "年份");
        System.out.println("-".repeat(80));

        movies.stream()
                .sorted(Comparator.comparingDouble(Movie::getRating).reversed())
                .limit(10)
                .forEach(movie -> System.out.printf("%-5d %-40s %-10.1f %-10s\n",
                        movie.getRank(), movie.getTitle(), movie.getRating(), movie.getYear()));
    }

    /** Prints the ten movies with the largest number of ratings. */
    private static void analyzeMostPopularMovies(List<Movie> movies) {
        System.out.println("\n【最受关注电影TOP10】");
        System.out.println("-".repeat(80));

        System.out.printf("%-5s %-40s %-15s %-10s\n", "排名", "电影名称", "评价人数", "评分");
        System.out.println("-".repeat(80));

        movies.stream()
                .sorted(Comparator.comparingInt(Movie::getRatingPeople).reversed())
                .limit(10)
                .forEach(movie -> System.out.printf("%-5d %-40s %-15d %-10.1f\n",
                        movie.getRank(), movie.getTitle(), movie.getRatingPeople(), movie.getRating()));
    }

    /** Writes the movies to the CSV, JSON and text files under {@code OUTPUT_DIR}. */
    private static void saveData(List<Movie> movies) {
        saveToCSV(movies, OUTPUT_DIR + "douban_movies.csv");
        saveToJSON(movies, OUTPUT_DIR + "douban_movies.json");
        saveToText(movies, OUTPUT_DIR + "douban_movies.txt");
    }

    /**
     * Writes the movies as UTF-8 CSV with a header row. I/O failures are
     * reported on stderr and do not propagate.
     */
    private static void saveToCSV(List<Movie> movies, String filename) {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename, StandardCharsets.UTF_8))) {
            writer.write("排名,电影名称,评分,评价人数,年份,导演,推荐语\n");

            for (Movie movie : movies) {
                writer.write(toCsvLine(movie));
            }

            System.out.println("CSV数据已保存到: " + filename);
        } catch (IOException e) {
            System.err.println("保存CSV文件失败: " + e.getMessage());
        }
    }

    /**
     * Formats one movie as a CSV record terminated by {@code '\n'}.
     *
     * <p>Numbers are formatted with {@link Locale#ROOT}: with a default locale
     * that uses a decimal comma the rating would otherwise split into two columns.
     *
     * @param movie movie to format
     * @return the CSV record for the movie
     */
    static String toCsvLine(Movie movie) {
        return String.format(Locale.ROOT, "%d,%s,%.1f,%d,%s,%s,%s\n",
                movie.getRank(),
                escapeCSV(movie.getTitle()),
                movie.getRating(),
                movie.getRatingPeople(),
                escapeCSV(movie.getYear()),
                escapeCSV(movie.getDirector()),
                escapeCSV(movie.getQuote()));
    }

    /**
     * Quotes a CSV field when it contains a comma, a double quote or a line
     * break; embedded double quotes are doubled.
     *
     * @param text field value, may be {@code null}
     * @return the field ready to be written, or an empty string for {@code null}
     */
    static String escapeCSV(String text) {
        if (text == null) {
            return "";
        }
        boolean needsQuoting = text.contains(",") || text.contains("\"")
                || text.contains("\n") || text.contains("\r");
        if (needsQuoting) {
            return "\"" + text.replace("\"", "\"\"") + "\"";
        }
        return text;
    }

    /**
     * Writes the movies as a UTF-8 JSON array of objects. I/O failures are
     * reported on stderr and do not propagate.
     */
    private static void saveToJSON(List<Movie> movies, String filename) {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename, StandardCharsets.UTF_8))) {
            StringBuilder json = new StringBuilder();
            json.append("[\n");

            for (int i = 0; i < movies.size(); i++) {
                Movie movie = movies.get(i);
                json.append(" {\n");
                json.append(" \"rank\": ").append(movie.getRank()).append(",\n");
                json.append(" \"title\": \"").append(escapeJSON(movie.getTitle())).append("\",\n");
                json.append(" \"rating\": ").append(movie.getRating()).append(",\n");
                json.append(" \"ratingPeople\": ").append(movie.getRatingPeople()).append(",\n");
                json.append(" \"year\": \"").append(escapeJSON(movie.getYear())).append("\",\n");
                json.append(" \"director\": \"").append(escapeJSON(movie.getDirector())).append("\",\n");
                json.append(" \"quote\": \"").append(escapeJSON(movie.getQuote())).append("\"\n");
                json.append(" }");

                // No trailing comma after the last element.
                if (i < movies.size() - 1) {
                    json.append(",");
                }
                json.append("\n");
            }

            json.append("]");
            writer.write(json.toString());
            System.out.println("JSON数据已保存到: " + filename);
        } catch (IOException e) {
            System.err.println("保存JSON文件失败: " + e.getMessage());
        }
    }

    /**
     * Escapes a value for use inside a JSON string literal: backslash, double
     * quote and every control character below 0x20.
     *
     * @param text value to escape, may be {@code null}
     * @return the escaped text without surrounding quotes, or an empty string for {@code null}
     */
    static String escapeJSON(String text) {
        if (text == null) {
            return "";
        }
        StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '\\':
                    escaped.append("\\\\");
                    break;
                case '"':
                    escaped.append("\\\"");
                    break;
                case '\n':
                    escaped.append("\\n");
                    break;
                case '\r':
                    escaped.append("\\r");
                    break;
                case '\t':
                    escaped.append("\\t");
                    break;
                case '\b':
                    escaped.append("\\b");
                    break;
                case '\f':
                    escaped.append("\\f");
                    break;
                default:
                    if (c < 0x20) {
                        // Remaining control characters need the six-character hex form.
                        escaped.append(String.format("\\u%04x", (int) c));
                    } else {
                        escaped.append(c);
                    }
            }
        }
        return escaped.toString();
    }

    /**
     * Writes the movies as a human-readable UTF-8 text report with a timestamp
     * header. I/O failures are reported on stderr and do not propagate.
     */
    private static void saveToText(List<Movie> movies, String filename) {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename, StandardCharsets.UTF_8))) {
            writer.write("豆瓣电影Top250评分数据\n");
            writer.write("爬取时间: " + LocalDateTime.now().format(TIMESTAMP_FORMAT) + "\n");
            writer.write("=".repeat(80) + "\n\n");

            for (Movie movie : movies) {
                writer.write(String.format("排名: %d\n", movie.getRank()));
                writer.write(String.format("电影名称: %s\n", movie.getTitle()));
                writer.write(String.format("评分: %.1f\n", movie.getRating()));
                writer.write(String.format("评价人数: %d人\n", movie.getRatingPeople()));
                writer.write(String.format("年份: %s\n", movie.getYear()));
                writer.write(String.format("导演: %s\n", movie.getDirector()));
                writer.write(String.format("详细信息: %s\n", movie.getInfo()));
                writer.write(String.format("推荐语: %s\n", movie.getQuote()));
                writer.write("-".repeat(80) + "\n\n");
            }

            System.out.println("文本数据已保存到: " + filename);
        } catch (IOException e) {
            System.err.println("保存文本文件失败: " + e.getMessage());
        }
    }

    /** Mutable value holder for one scraped movie record. */
    static class Movie {
        /** Position in the list, as assigned by the crawler. */
        private int rank;
        private String title;
        private double rating;
        /** Number of users who rated the movie. */
        private int ratingPeople;
        private String year;
        private String director;
        /** Cleaned detail text from which year and director are derived. */
        private String info;
        private String quote;

        public int getRank() {
            return rank;
        }

        public void setRank(int rank) {
            this.rank = rank;
        }

        public String getTitle() {
            return title;
        }

        public void setTitle(String title) {
            this.title = title;
        }

        public double getRating() {
            return rating;
        }

        public void setRating(double rating) {
            this.rating = rating;
        }

        public int getRatingPeople() {
            return ratingPeople;
        }

        public void setRatingPeople(int ratingPeople) {
            this.ratingPeople = ratingPeople;
        }

        public String getYear() {
            return year;
        }

        public void setYear(String year) {
            this.year = year;
        }

        public String getDirector() {
            return director;
        }

        public void setDirector(String director) {
            this.director = director;
        }

        public String getInfo() {
            return info;
        }

        public void setInfo(String info) {
            this.info = info;
        }

        public String getQuote() {
            return quote;
        }

        public void setQuote(String quote) {
            this.quote = quote;
        }
    }
}
||||
|
|
After Width: | Height: | Size: 713 KiB |
|
After Width: | Height: | Size: 932 KiB |
|
After Width: | Height: | Size: 891 KiB |
Loading…
Reference in new issue