Browse Source

project3

master
WangYangyang 1 month ago
parent
commit
a5ccf00277
  1. 41
      crawl_project_extension_2/pom.xml
  2. 128
      crawl_project_extension_2/src/main/java/com/example/ChartGenerator.java
  3. 37
      crawl_project_extension_2/src/main/java/com/example/CsvExporter.java
  4. 36
      crawl_project_extension_2/src/main/java/com/example/DataAnalyzer.java
  5. 101
      crawl_project_extension_2/src/main/java/com/example/DoubanCrawler.java
  6. 103
      crawl_project_extension_2/src/main/java/com/example/M1905Crawler.java
  7. 53
      crawl_project_extension_2/src/main/java/com/example/Main.java
  8. 75
      crawl_project_extension_2/src/main/java/com/example/Movie.java
  9. 8
      crawl_project_extension_2/src/main/java/com/example/MovieAnalyzer.java
  10. 8
      crawl_project_extension_2/src/main/java/com/example/MovieCrawler.java
  11. 13
      crawl_project_extension_2/src/main/java/org/example/App.java
  12. 38
      crawl_project_extension_2/src/test/java/org/example/AppTest.java
  13. BIN
      crawl_project_extension_2/target/classes/com/example/ChartGenerator.class
  14. BIN
      crawl_project_extension_2/target/classes/com/example/CsvExporter.class
  15. BIN
      crawl_project_extension_2/target/classes/com/example/DataAnalyzer.class
  16. BIN
      crawl_project_extension_2/target/classes/com/example/DoubanCrawler.class
  17. BIN
      crawl_project_extension_2/target/classes/com/example/Main.class
  18. BIN
      crawl_project_extension_2/target/classes/com/example/Movie.class
  19. BIN
      crawl_project_extension_2/target/classes/com/example/MovieAnalyzer.class
  20. BIN
      crawl_project_extension_2/target/classes/com/example/MovieCrawler.class
  21. BIN
      crawl_project_extension_2/target/classes/org/example/App.class

41
crawl_project_extension_2/pom.xml

@ -0,0 +1,41 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Build descriptor for the movie crawler project (crawl, analyze, CSV export, charts). -->
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<artifactId>crawl_project_extension_2</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
<name>crawl_project_extension_2</name>
<url>http://maven.apache.org</url>
<properties>
<!-- Source files contain Chinese string literals, so they must be compiled as UTF-8. -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- Test-only: AppTest extends junit.framework.TestCase (JUnit 3 API). -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<!-- HTML fetching and parsing; used by DoubanCrawler and M1905Crawler. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>
<!-- NOTE(review): CsvExporter writes CSV by hand and no source file in this commit
     imports com.opencsv; confirm whether this dependency is still needed. -->
<dependency>
<groupId>com.opencsv</groupId>
<artifactId>opencsv</artifactId>
<version>5.9</version>
</dependency>
<!-- Chart rendering and PNG export; used by ChartGenerator. -->
<dependency>
<groupId>org.knowm.xchart</groupId>
<artifactId>xchart</artifactId>
<version>3.8.7</version>
</dependency>
</dependencies>
</project>

128
crawl_project_extension_2/src/main/java/com/example/ChartGenerator.java

@ -0,0 +1,128 @@
package com.example;
import org.knowm.xchart.*;
import org.knowm.xchart.style.Styler;
import java.awt.*;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.stream.Collectors;
/**
 * Renders summary charts for a list of movies and saves each one as a PNG file in the
 * current working directory.
 *
 * <p>The bar and line charts only consider movies released after 1980 and plot at most
 * the 15 earliest remaining years. When there is nothing to plot, a chart is skipped with
 * a warning, because XChart's {@code addSeries} rejects empty Y data with an
 * {@code IllegalArgumentException}.
 */
public class ChartGenerator {
    /** Movies released in or before this year are left out of the per-year charts. */
    private static final int YEAR_LOWER_BOUND = 1980;

    /** Maximum number of distinct years plotted; the earliest years are kept. */
    private static final int MAX_YEARS = 15;

    /**
     * Saves a bar chart of the number of movies per release year to
     * {@code ./年份电影数量_柱状图.png}.
     *
     * @param movies movies to chart; {@code null} or a list without any movie released
     *               after 1980 skips the chart
     */
    public static void saveBarChart(List<Movie> movies) {
        Map<Integer, Long> yearMap = recentMovies(movies).stream()
                .collect(Collectors.groupingBy(Movie::getYear, Collectors.counting()));
        List<Entry<Integer, Long>> sortedList = earliestYears(yearMap);
        if (sortedList.isEmpty()) {
            System.err.println("⚠️ 没有可用的年份数据,跳过柱状图生成");
            return;
        }

        // Category charts take the year as a label, hence the String x values.
        List<String> xData = new ArrayList<>();
        List<Long> yData = new ArrayList<>();
        for (Entry<Integer, Long> entry : sortedList) {
            xData.add(entry.getKey().toString());
            yData.add(entry.getValue());
        }

        CategoryChart chart = new CategoryChartBuilder()
                .width(1000)
                .height(600)
                .title("豆瓣Top250 - 各年份电影数量柱状图")
                .xAxisTitle("年份")
                .yAxisTitle("电影数量")
                .theme(Styler.ChartTheme.Matlab)
                .build();
        chart.getStyler().setLegendVisible(false);
        chart.getStyler().setLabelsVisible(true);
        chart.getStyler().setXAxisLabelRotation(45);
        chart.getStyler().setChartBackgroundColor(Color.WHITE);
        chart.addSeries("电影数量", xData, yData);

        try {
            BitmapEncoder.saveBitmap(chart, "./年份电影数量_柱状图", BitmapEncoder.BitmapFormat.PNG);
            System.out.println("✅ 柱状图已保存:年份电影数量_柱状图.png");
        } catch (IOException e) {
            System.err.println("柱状图保存失败:" + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Saves a line chart of the average rating per release year to
     * {@code ./历年平均评分_折线图.png}.
     *
     * @param movies movies to chart; {@code null} or a list without any movie released
     *               after 1980 skips the chart
     */
    public static void saveLineChart(List<Movie> movies) {
        Map<Integer, Double> avgRatingMap = recentMovies(movies).stream()
                .collect(Collectors.groupingBy(Movie::getYear, Collectors.averagingDouble(Movie::getRating)));
        List<Entry<Integer, Double>> sortedList = earliestYears(avgRatingMap);
        if (sortedList.isEmpty()) {
            System.err.println("⚠️ 没有可用的年份数据,跳过折线图生成");
            return;
        }

        // XY charts need numeric x values, so the year stays an Integer here.
        List<Integer> xData = new ArrayList<>();
        List<Double> yData = new ArrayList<>();
        for (Entry<Integer, Double> entry : sortedList) {
            xData.add(entry.getKey());
            yData.add(entry.getValue());
        }

        XYChart chart = new XYChartBuilder()
                .width(1000)
                .height(600)
                .title("豆瓣Top250 - 历年平均评分趋势")
                .xAxisTitle("年份")
                .yAxisTitle("平均评分")
                .theme(Styler.ChartTheme.Matlab)
                .build();
        chart.getStyler().setMarkerSize(6);
        chart.getStyler().setChartBackgroundColor(Color.WHITE);
        chart.addSeries("平均评分", xData, yData);

        try {
            BitmapEncoder.saveBitmap(chart, "./历年平均评分_折线图", BitmapEncoder.BitmapFormat.PNG);
            System.out.println("✅ 折线图已保存!");
        } catch (IOException e) {
            System.err.println("折线图保存失败:" + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Saves a pie chart of the rating distribution (9.5 and above, 9.0 up to 9.5,
     * below 9.0) to {@code ./评分分布_饼图.png}.
     *
     * @param movies movies to chart; {@code null} or an empty list skips the chart
     */
    public static void savePieChart(List<Movie> movies) {
        if (movies == null || movies.isEmpty()) {
            System.err.println("⚠️ 没有电影数据,跳过饼图生成");
            return;
        }

        // Single pass over the list instead of one stream per bucket.
        long gao = 0;
        long zhong = 0;
        long di = 0;
        for (Movie movie : movies) {
            double rating = movie.getRating();
            if (rating >= 9.5) {
                gao++;
            } else if (rating >= 9.0) {
                zhong++;
            } else if (rating < 9.0) {
                // Explicit comparison so a NaN rating lands in no bucket, as before.
                di++;
            }
        }

        PieChart chart = new PieChartBuilder()
                .width(700)
                .height(700)
                .title("豆瓣Top250 - 评分分布饼图")
                .theme(Styler.ChartTheme.Matlab)
                .build();
        chart.addSeries("9.5分及以上", gao);
        chart.addSeries("9.0-9.5分", zhong);
        chart.addSeries("9.0分以下", di);
        chart.getStyler().setChartBackgroundColor(Color.WHITE);
        chart.getStyler().setLegendVisible(true);

        try {
            BitmapEncoder.saveBitmap(chart, "./评分分布_饼图", BitmapEncoder.BitmapFormat.PNG);
            System.out.println("✅ 饼图已保存:评分分布_饼图.png");
        } catch (IOException e) {
            System.err.println("饼图保存失败:" + e.getMessage());
            e.printStackTrace();
        }
    }

    /** Returns the movies released after {@link #YEAR_LOWER_BOUND}; {@code null} counts as empty. */
    private static List<Movie> recentMovies(List<Movie> movies) {
        if (movies == null) {
            return new ArrayList<>();
        }
        return movies.stream()
                .filter(m -> m.getYear() > YEAR_LOWER_BOUND)
                .collect(Collectors.toList());
    }

    /** Sorts the per-year entries by year ascending and keeps at most {@link #MAX_YEARS} of them. */
    private static <V> List<Entry<Integer, V>> earliestYears(Map<Integer, V> byYear) {
        List<Entry<Integer, V>> sorted = new ArrayList<>(byYear.entrySet());
        sorted.sort(Entry.comparingByKey());
        if (sorted.size() > MAX_YEARS) {
            return sorted.subList(0, MAX_YEARS);
        }
        return sorted;
    }
}

37
crawl_project_extension_2/src/main/java/com/example/CsvExporter.java

@ -0,0 +1,37 @@
package com.example;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Locale;
/**
 * Writes a list of movies to a comma-separated file.
 */
public class CsvExporter {
    /**
     * Exports the movies to {@code filePath} as UTF-8 CSV with the columns
     * title, director, year, rating, review count (in that order, matching the header).
     *
     * <p>The file is always encoded as UTF-8 rather than the platform charset so the
     * Chinese header and titles are not garbled, and numbers are formatted with
     * {@link Locale#ROOT} so the rating always uses {@code '.'} as decimal separator
     * (a locale that prints {@code 9,5} would add a spurious column).
     *
     * @param movies   movies to write, one row each
     * @param filePath destination file; overwritten if it exists
     */
    public static void exportToCsv(List<Movie> movies, String filePath) {
        try (Writer writer = new OutputStreamWriter(new FileOutputStream(filePath), StandardCharsets.UTF_8)) {
            // Header: the row format below must keep exactly this column order.
            writer.write("电影名称,导演,上映年份,豆瓣评分,评价人数\n");
            for (Movie movie : movies) {
                String line = String.format(Locale.ROOT, "%s,%s,%d,%.1f,%d\n",
                        escapeCsv(movie.getTitle()),
                        escapeCsv(movie.getDirector()),
                        movie.getYear(),
                        movie.getRating(),
                        movie.getReviewCount());
                writer.write(line);
            }
            System.out.println("\nCSV文件导出成功!路径:" + filePath);
            System.out.println("提示:评价人数在第5列,已显示真实数据!");
        } catch (IOException e) {
            System.err.println("CSV文件导出失败:" + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Escapes one CSV field: a value containing a comma, double quote, or line break
     * (LF or CR) is wrapped in double quotes with embedded quotes doubled.
     *
     * @param value raw field value; {@code null} becomes an empty field
     * @return the value, quoted only when necessary
     */
    private static String escapeCsv(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuoting = value.contains(",")
                || value.contains("\"")
                || value.contains("\n")
                || value.contains("\r");
        if (needsQuoting) {
            return "\"" + value.replace("\"", "\"\"") + "\"";
        }
        return value;
    }
}

36
crawl_project_extension_2/src/main/java/com/example/DataAnalyzer.java

@ -0,0 +1,36 @@
package com.example;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * {@link MovieAnalyzer} implementation that prints its findings to standard output.
 */
public class DataAnalyzer implements MovieAnalyzer {

    /**
     * Prints the ten highest-rated movies, followed by the number of movies per release
     * year in ascending year order. Movies whose year is 0 are excluded from the
     * per-year count.
     *
     * @param movies movies to analyze
     */
    @Override
    public void analyzeByDimension(List<Movie> movies) {
        System.out.println("\n===== 评分最高Top10电影 =====");
        // Stable sort, rating descending; only the first ten are kept.
        List<Movie> topRated = movies.stream()
                .sorted((left, right) -> Double.compare(right.getRating(), left.getRating()))
                .limit(10)
                .collect(Collectors.toList());
        for (Movie movie : topRated) {
            System.out.printf("%-25s 评分: %.1f 年份: %d%n",
                    movie.getTitle(), movie.getRating(), movie.getYear());
        }

        System.out.println("\n===== 各年份电影数量统计 =====");
        Map<Integer, Long> perYear = movies.stream()
                .filter(movie -> movie.getYear() != 0)
                .collect(Collectors.groupingBy(Movie::getYear, Collectors.counting()));
        List<Map.Entry<Integer, Long>> ascending = perYear.entrySet().stream()
                .sorted(Map.Entry.comparingByKey())
                .collect(Collectors.toList());
        for (Map.Entry<Integer, Long> yearCount : ascending) {
            System.out.printf("年份: %-4d 数量: %d 部%n", yearCount.getKey(), yearCount.getValue());
        }
    }

    /**
     * Prints the total number of movies and their average rating (0 when the list is
     * empty).
     *
     * @param movies movies to summarize
     */
    @Override
    public void analyzeTotal(List<Movie> movies) {
        System.out.println("\n===== 数据总览 =====");
        int total = movies.size();
        System.out.println("电影总数:" + total);
        double meanRating = movies.stream()
                .mapToDouble(movie -> movie.getRating())
                .average()
                .orElse(0);
        System.out.printf("平均评分:%.2f%n", meanRating);
    }
}

101
crawl_project_extension_2/src/main/java/com/example/DoubanCrawler.java

@ -0,0 +1,101 @@
package com.example;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * {@link MovieCrawler} for the Douban Top 250 list: fetches 10 pages of 25 entries and
 * extracts title, rating, review count, director and year from each entry.
 */
public class DoubanCrawler implements MovieCrawler {
    /** First run of four digits in the info text is taken as the release year. */
    private static final Pattern YEAR_PATTERN = Pattern.compile("(\\d{4})");

    /** Review count as it appears in the entry text; compiled once instead of per item. */
    private static final Pattern REVIEW_COUNT_PATTERN = Pattern.compile("(\\d+)人评价");

    private static final String BASE_URL = "https://movie.douban.com/top250?start=";
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36";
    private static final String DIRECTOR_MARKER = "导演:";
    private static final String UNKNOWN_DIRECTOR = "未知";
    private static final int PAGE_SIZE = 25;
    private static final int TOTAL_ENTRIES = 250;

    /**
     * Crawls all pages and returns the movies parsed so far. An I/O failure or an
     * interrupt stops the crawl early and the partial result is returned; an entry
     * that cannot be parsed is skipped with a warning instead of aborting the crawl.
     *
     * @return the parsed movies, possibly empty, never {@code null}
     */
    @Override
    public List<Movie> crawl() {
        List<Movie> movies = new ArrayList<>();
        try {
            for (int i = 0; i < TOTAL_ENTRIES; i += PAGE_SIZE) {
                String url = BASE_URL + i;
                System.out.println("正在爬取:" + url);
                Document doc = Jsoup.connect(url)
                        .header("User-Agent", USER_AGENT)
                        .timeout(8000)
                        .get();
                Elements items = doc.select(".item");
                for (Element item : items) {
                    Movie movie = parseItem(item);
                    if (movie != null) {
                        movies.add(movie);
                    }
                }
                // Polite crawling: random 1-3 second pause between pages.
                Thread.sleep((long) (Math.random() * 2000 + 1000));
            }
            System.out.println("爬取完成!共获取 " + movies.size() + " 部电影");
        } catch (IOException e) {
            System.err.println("爬取失败:" + e.getMessage());
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            System.err.println("爬取被中断,已获取 " + movies.size() + " 部电影");
        }
        return movies;
    }

    /** Returns the display name of this crawler. */
    @Override
    public String getCrawlerName() {
        return "豆瓣top250";
    }

    /**
     * Builds a movie from one {@code .item} element, or returns {@code null} (after
     * logging a warning) when the title or rating is missing or malformed.
     */
    private Movie parseItem(Element item) {
        Element titleElement = item.selectFirst(".title");
        if (titleElement == null) {
            System.err.println("跳过无法解析的条目:缺少电影名");
            return null;
        }
        String title = titleElement.text();

        double rating;
        String ratingText = item.select(".rating_num").text().trim();
        try {
            rating = Double.parseDouble(ratingText);
        } catch (NumberFormatException e) {
            System.err.println("跳过无法解析的条目:" + title + "(评分: \"" + ratingText + "\")");
            return null;
        }

        // The info paragraph holds both the director and the year; tolerate its absence.
        Element infoElement = item.selectFirst(".bd p");
        String info = infoElement != null ? infoElement.text() : "";

        Movie movie = new Movie();
        movie.setTitle(title);
        movie.setRating(rating);
        movie.setReviewCount(parseReviewCount(item.text()));
        movie.setDirector(cleanDirector(info));
        movie.setYear(cleanYear(info));
        return movie;
    }

    /** Extracts the review count from the entry's full text; 0 when absent or out of int range. */
    private int parseReviewCount(String text) {
        Matcher matcher = REVIEW_COUNT_PATTERN.matcher(text);
        if (!matcher.find()) {
            return 0;
        }
        try {
            return Integer.parseInt(matcher.group(1));
        } catch (NumberFormatException ignored) {
            // Digits matched but exceed int range; treat like a missing count.
            return 0;
        }
    }

    /**
     * Extracts the director from the info text: the text after the director marker up
     * to the next space (searched from two characters past the marker, which skips the
     * separator that follows it). Returns "未知" when the marker is missing or nothing
     * follows it.
     */
    private String cleanDirector(String info) {
        int markerIndex = info.indexOf(DIRECTOR_MARKER);
        if (markerIndex < 0) {
            return UNKNOWN_DIRECTOR;
        }
        int start = markerIndex + DIRECTOR_MARKER.length();
        int end = info.indexOf(" ", start + 2);
        if (end == -1) {
            end = info.length();
        }
        String director = info.substring(start, end).trim();
        return director.isEmpty() ? UNKNOWN_DIRECTOR : director;
    }

    /**
     * Extracts the release year: the first four-digit run in the info text, or 0 when
     * there is none.
     */
    private int cleanYear(String info) {
        Matcher matcher = YEAR_PATTERN.matcher(info);
        if (matcher.find()) {
            return Integer.parseInt(matcher.group(1));
        }
        return 0;
    }
}

103
crawl_project_extension_2/src/main/java/com/example/M1905Crawler.java

@ -0,0 +1,103 @@
package com.example;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * {@link MovieCrawler} for the 1905.com play-count ranking table.
 *
 * <p>The ranking page offers no director, rating or year, so the lead actors are stored
 * in the director field, the play count in the review-count field, and rating and year
 * are left at 0.
 */
public class M1905Crawler implements MovieCrawler {
    /** URL of the 1905.com ranking page. */
    private static final String RANK_URL = "https://www.1905.com/vod/rank/ta99o3.shtml";

    /**
     * Fetches the ranking page and converts every data row (at least five cells, no
     * header cell) into a movie. On an I/O failure the error is reported and whatever
     * was collected so far is returned.
     *
     * @return the parsed movies, possibly empty
     */
    @Override
    public List<Movie> crawl() {
        List<Movie> collected = new ArrayList<>();
        try {
            System.out.println("正在爬取:" + RANK_URL);
            Document page = Jsoup.connect(RANK_URL)
                    .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
                    .header("Referer", "https://www.1905.com/")
                    .timeout(15000)
                    .get();

            Elements tableRows = page.select("table tr");
            System.out.println("找到行数:" + tableRows.size());

            int position = 0;
            for (Element tableRow : tableRows) {
                // Header rows carry <th> cells and hold no movie data.
                if (!tableRow.select("th").isEmpty()) {
                    continue;
                }
                Elements columns = tableRow.select("td");
                if (columns.size() < 5) {
                    continue;
                }
                Movie entry = toMovie(columns);
                collected.add(entry);
                position++;
                System.out.printf("已解析 [%d] %s | 主演: %s | 播放: %d%n",
                        position, entry.getTitle(), entry.getDirector(), entry.getReviewCount());
            }
            System.out.println("爬取完成!共获取 " + collected.size() + " 部电影");
        } catch (IOException e) {
            System.err.println("爬取失败:" + e.getMessage());
            e.printStackTrace();
        }
        return collected;
    }

    /** Returns the display name of this crawler. */
    @Override
    public String getCrawlerName() {
        return "1905电影网播放排行榜";
    }

    /**
     * Maps one table row to a movie. Columns: 0 rank and 1 trend (both ignored),
     * 2 title, 3 lead actors, 4 play count.
     */
    private Movie toMovie(Elements columns) {
        Element titleCell = columns.get(2);
        Element titleLink = titleCell.selectFirst("a");
        String title = (titleLink == null ? titleCell.text() : titleLink.text()).trim();

        // Detail-page link; currently unused. Kept for the optional second pass
        // ("plan two") that would fetch the full information from the detail page.
        String detailUrl = titleLink == null ? "" : titleLink.absUrl("href");

        String leadActors = columns.get(3).text().trim();
        int plays = parsePlayCount(columns.get(4).text());

        // Director field carries the actors, review count carries the plays;
        // year and rating are unavailable on the list page.
        return new Movie(title, leadActors, 0, 0.0, plays);
    }

    /** Parses a play count with optional thousands separators; 0 when it is not an int. */
    private int parsePlayCount(String cellText) {
        String digits = cellText.trim().replace(",", "");
        try {
            return Integer.parseInt(digits);
        } catch (NumberFormatException e) {
            // Deliberately tolerated: an unparseable count is recorded as 0.
            return 0;
        }
    }
}

53
crawl_project_extension_2/src/main/java/com/example/Main.java

@ -0,0 +1,53 @@
package com.example;
import java.util.List;
/**
 * Command-line entry point: crawl, analyze, export to CSV and (for Douban data) draw
 * charts.
 */
public class Main {
    /**
     * Runs the pipeline for the source named by the first argument (case-insensitive).
     * {@code "1905"} selects the 1905.com crawler; any other value selects Douban.
     * When no argument is given the source is {@code "1905"}.
     *
     * @param args optional: the data source name
     */
    public static void main(String[] args) {
        String source = args.length > 0 ? args[0].toLowerCase() : "1905";
        boolean from1905 = "1905".equals(source);

        MovieCrawler crawler;
        String csvName;
        if (from1905) {
            crawler = new M1905Crawler();
            csvName = "1905_rank.csv";
        } else {
            // "douban" and every unrecognized value end up here.
            crawler = new DoubanCrawler();
            csvName = "douban_top250.csv";
        }

        System.out.println("使用爬虫:" + crawler.getCrawlerName());
        List<Movie> movies = crawler.crawl();
        if (movies.isEmpty()) {
            System.err.println("未获取到任何电影数据!");
            return;
        }

        // Console analysis.
        MovieAnalyzer analyzer = new DataAnalyzer();
        analyzer.analyzeTotal(movies);
        analyzer.analyzeByDimension(movies);

        // CSV export.
        CsvExporter.exportToCsv(movies, csvName);

        // Charts need rating and year, which the 1905 data does not provide.
        if (from1905) {
            System.out.println("⚠️ 1905数据缺少评分/年份,跳过图表生成");
        } else {
            ChartGenerator.saveBarChart(movies);
            ChartGenerator.saveLineChart(movies);
            ChartGenerator.savePieChart(movies);
        }
    }
}

75
crawl_project_extension_2/src/main/java/com/example/Movie.java

@ -0,0 +1,75 @@
package com.example;
/**
 * Mutable data holder for one movie record.
 */
public class Movie {
    /** Movie title. */
    private String title;
    /** Director. */
    private String director;
    /** Release year. */
    private int year;
    /** Rating. */
    private double rating;
    /** Number of reviews. */
    private int reviewCount;

    /** Creates a movie with every field at its Java default value. */
    public Movie() {
    }

    /**
     * Creates a fully populated movie.
     *
     * @param title       movie title
     * @param director    director
     * @param year        release year
     * @param rating      rating
     * @param reviewCount number of reviews
     */
    public Movie(String title, String director, int year, double rating, int reviewCount) {
        this.title = title;
        this.director = director;
        this.year = year;
        this.rating = rating;
        this.reviewCount = reviewCount;
    }

    /** @return the movie title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the movie title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the director */
    public String getDirector() {
        return this.director;
    }

    /** @param director the director */
    public void setDirector(String director) {
        this.director = director;
    }

    /** @return the release year */
    public int getYear() {
        return this.year;
    }

    /** @param year the release year */
    public void setYear(int year) {
        this.year = year;
    }

    /** @return the rating */
    public double getRating() {
        return this.rating;
    }

    /** @param rating the rating */
    public void setRating(double rating) {
        this.rating = rating;
    }

    /** @return the number of reviews */
    public int getReviewCount() {
        return this.reviewCount;
    }

    /** @param reviewCount the number of reviews */
    public void setReviewCount(int reviewCount) {
        this.reviewCount = reviewCount;
    }

    /** Returns a single-line description listing every field. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Movie{");
        text.append("片名='").append(title).append('\'');
        text.append(", 导演='").append(director).append('\'');
        text.append(", 年份=").append(year);
        text.append(", 评分=").append(rating);
        text.append(", 评价人数=").append(reviewCount);
        text.append('}');
        return text.toString();
    }
}

8
crawl_project_extension_2/src/main/java/com/example/MovieAnalyzer.java

@ -0,0 +1,8 @@
package com.example;
import java.util.List;
/**
 * Analysis operations over a list of movies. Both methods return nothing; the
 * DataAnalyzer implementation in this project reports its results on standard output.
 */
public interface MovieAnalyzer {
/**
 * Overview analysis of the whole list.
 *
 * @param movies movies to analyze
 */
void analyzeTotal(List<Movie> movies);
/**
 * Analysis by dimension (top-N, release year, etc.).
 *
 * @param movies movies to analyze
 */
void analyzeByDimension(List<Movie> movies);
}

8
crawl_project_extension_2/src/main/java/com/example/MovieCrawler.java

@ -0,0 +1,8 @@
package com.example;
import java.util.List;
/**
 * A source of movie data obtained by crawling.
 */
public interface MovieCrawler {
/**
 * Crawls the source and returns the list of movies.
 *
 * @return the crawled movies
 */
List<Movie> crawl();
/**
 * Returns the crawler's name (e.g. "豆瓣Top250", "IMDB Top100").
 *
 * @return the crawler name
 */
String getCrawlerName();
}

13
crawl_project_extension_2/src/main/java/org/example/App.java

@ -0,0 +1,13 @@
package org.example;
/**
 * Minimal sample application that prints a greeting.
 */
public class App {
    /** Text written to standard output by {@link #main(String[])}. */
    private static final String GREETING = "Hello World!";

    /**
     * Prints the greeting followed by a line break.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println(GREETING);
    }
}

38
crawl_project_extension_2/src/test/java/org/example/AppTest.java

@ -0,0 +1,38 @@
package org.example;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
/**
 * Unit test for the simple App class, written against the JUnit 3 API
 * ({@code junit.framework}).
 */
public class AppTest
extends TestCase
{
/**
 * Creates the test case.
 *
 * @param testName name of the test case
 */
public AppTest( String testName )
{
super( testName );
}
/**
 * @return the suite of tests being tested, built from this class
 */
public static Test suite()
{
return new TestSuite( AppTest.class );
}
/**
 * Placeholder test: asserts a constant {@code true}, so it always passes and does
 * not exercise App.
 */
public void testApp()
{
assertTrue( true );
}
}

BIN
crawl_project_extension_2/target/classes/com/example/ChartGenerator.class

Binary file not shown.

BIN
crawl_project_extension_2/target/classes/com/example/CsvExporter.class

Binary file not shown.

BIN
crawl_project_extension_2/target/classes/com/example/DataAnalyzer.class

Binary file not shown.

BIN
crawl_project_extension_2/target/classes/com/example/DoubanCrawler.class

Binary file not shown.

BIN
crawl_project_extension_2/target/classes/com/example/Main.class

Binary file not shown.

BIN
crawl_project_extension_2/target/classes/com/example/Movie.class

Binary file not shown.

BIN
crawl_project_extension_2/target/classes/com/example/MovieAnalyzer.class

Binary file not shown.

BIN
crawl_project_extension_2/target/classes/com/example/MovieCrawler.class

Binary file not shown.

BIN
crawl_project_extension_2/target/classes/org/example/App.class

Binary file not shown.
Loading…
Cancel
Save