Browse Source

爬虫代码接口多态扩展

master
WangYangyang 2 months ago
parent
commit
56703b4134
  1. 40
      crawl_project_extension/pom.xml
  2. 127
      crawl_project_extension/src/main/java/com/example/ChartGenerator.java
  3. 37
      crawl_project_extension/src/main/java/com/example/CsvExporter.java
  4. 37
      crawl_project_extension/src/main/java/com/example/DataAnalyzer.java
  5. 100
      crawl_project_extension/src/main/java/com/example/DoubanCrawler.java
  6. 22
      crawl_project_extension/src/main/java/com/example/Main.java
  7. 75
      crawl_project_extension/src/main/java/com/example/Movie.java
  8. 8
      crawl_project_extension/src/main/java/com/example/MovieAnalyzer.java
  9. 8
      crawl_project_extension/src/main/java/com/example/MovieCrawler.java
  10. 13
      crawl_project_extension/src/main/java/org/example/App.java
  11. 38
      crawl_project_extension/src/test/java/org/example/AppTest.java
  12. BIN
      crawl_project_extension/target/classes/com/example/ChartGenerator.class
  13. BIN
      crawl_project_extension/target/classes/com/example/CsvExporter.class
  14. BIN
      crawl_project_extension/target/classes/com/example/DataAnalyzer.class
  15. BIN
      crawl_project_extension/target/classes/com/example/DoubanCrawler.class
  16. BIN
      crawl_project_extension/target/classes/com/example/Main.class
  17. BIN
      crawl_project_extension/target/classes/com/example/Movie.class
  18. BIN
      crawl_project_extension/target/classes/com/example/MovieAnalyzer.class
  19. BIN
      crawl_project_extension/target/classes/com/example/MovieCrawler.class
  20. BIN
      crawl_project_extension/target/classes/org/example/App.class
  21. 216
      crawl_project_extension/实验报告.md

40
crawl_project_extension/pom.xml

@ -0,0 +1,40 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<artifactId>crawl_project_extension</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
<name>crawl_project</name>
<url>http://maven.apache.org</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>
<dependency>
<groupId>com.opencsv</groupId>
<artifactId>opencsv</artifactId>
<version>5.9</version>
</dependency>
<dependency>
<groupId>org.knowm.xchart</groupId>
<artifactId>xchart</artifactId>
<version>3.8.7</version>
</dependency>
</dependencies>
</project>

127
crawl_project_extension/src/main/java/com/example/ChartGenerator.java

@ -0,0 +1,127 @@
package com.example;
import org.knowm.xchart.*;
import org.knowm.xchart.style.Styler;
import java.awt.*;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
/**
 * Renders charts from a crawled movie list with XChart and saves each chart
 * as a PNG file relative to the current working directory.
 *
 * <p>Every method prints a confirmation line on success. When there is nothing
 * to plot, the chart is skipped with a notice instead of passing empty series
 * data to XChart.
 */
public class ChartGenerator {
    /** Movies released in or before this year are left out of the per-year charts. */
    private static final int YEAR_LOWER_BOUND_EXCLUSIVE = 1980;

    /** Maximum number of year entries plotted; the earliest years are kept. */
    private static final int MAX_YEAR_ENTRIES = 15;

    /**
     * Saves a bar chart of the number of movies per release year.
     *
     * @param movies movies to chart; {@code null} or empty input skips the chart
     */
    public static void saveBarChart(List<Movie> movies) {
        if (nothingToPlot(movies == null || movies.isEmpty(), "柱状图")) {
            return;
        }
        Map<Integer, Long> yearMap = movies.stream()
                .filter(m -> m.getYear() > YEAR_LOWER_BOUND_EXCLUSIVE)
                .collect(Collectors.groupingBy(Movie::getYear, Collectors.counting()));
        List<Entry<Integer, Long>> sortedList = earliestYears(yearMap);
        // All movies may have been filtered out by the year bound.
        if (nothingToPlot(sortedList.isEmpty(), "柱状图")) {
            return;
        }

        // Category charts take the x values as labels, hence the String conversion.
        List<String> xData = new ArrayList<>();
        List<Long> yData = new ArrayList<>();
        for (Entry<Integer, Long> entry : sortedList) {
            xData.add(entry.getKey().toString());
            yData.add(entry.getValue());
        }

        CategoryChart chart = new CategoryChartBuilder()
                .width(1000)
                .height(600)
                .title("豆瓣Top250 - 各年份电影数量柱状图")
                .xAxisTitle("年份")
                .yAxisTitle("电影数量")
                .theme(Styler.ChartTheme.Matlab)
                .build();
        chart.getStyler().setLegendVisible(false);
        chart.getStyler().setLabelsVisible(true);
        chart.getStyler().setXAxisLabelRotation(45);
        chart.getStyler().setChartBackgroundColor(Color.WHITE);
        chart.addSeries("电影数量", xData, yData);

        try {
            BitmapEncoder.saveBitmap(chart, "./年份电影数量_柱状图", BitmapEncoder.BitmapFormat.PNG);
            System.out.println("✅ 柱状图已保存:年份电影数量_柱状图.png");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Saves a line chart of the average rating per release year.
     *
     * @param movies movies to chart; {@code null} or empty input skips the chart
     */
    public static void saveLineChart(List<Movie> movies) {
        if (nothingToPlot(movies == null || movies.isEmpty(), "折线图")) {
            return;
        }
        Map<Integer, Double> avgRatingMap = movies.stream()
                .filter(m -> m.getYear() > YEAR_LOWER_BOUND_EXCLUSIVE)
                .collect(Collectors.groupingBy(Movie::getYear, Collectors.averagingDouble(Movie::getRating)));
        List<Entry<Integer, Double>> sortedList = earliestYears(avgRatingMap);
        if (nothingToPlot(sortedList.isEmpty(), "折线图")) {
            return;
        }

        // The XY chart uses numeric x values, so the years stay as Integer.
        List<Integer> xData = new ArrayList<>();
        List<Double> yData = new ArrayList<>();
        for (Entry<Integer, Double> entry : sortedList) {
            xData.add(entry.getKey());
            yData.add(entry.getValue());
        }

        XYChart chart = new XYChartBuilder()
                .width(1000)
                .height(600)
                .title("豆瓣Top250 - 历年平均评分趋势")
                .xAxisTitle("年份")
                .yAxisTitle("平均评分")
                .theme(Styler.ChartTheme.Matlab)
                .build();
        chart.getStyler().setMarkerSize(6);
        chart.getStyler().setChartBackgroundColor(Color.WHITE);
        chart.addSeries("平均评分", xData, yData);

        try {
            BitmapEncoder.saveBitmap(chart, "./历年平均评分_折线图", BitmapEncoder.BitmapFormat.PNG);
            System.out.println("✅ 折线图已保存!");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Saves a pie chart splitting the movies into three rating bands:
     * 9.5 and above, 9.0 up to (but excluding) 9.5, and below 9.0.
     *
     * @param movies movies to chart; {@code null} or empty input skips the chart
     */
    public static void savePieChart(List<Movie> movies) {
        if (nothingToPlot(movies == null || movies.isEmpty(), "饼图")) {
            return;
        }
        long high = movies.stream().filter(m -> m.getRating() >= 9.5).count();
        long middle = movies.stream().filter(m -> m.getRating() >= 9.0 && m.getRating() < 9.5).count();
        long low = movies.stream().filter(m -> m.getRating() < 9.0).count();

        PieChart chart = new PieChartBuilder()
                .width(700)
                .height(700)
                .title("豆瓣Top250 - 评分分布饼图")
                .theme(Styler.ChartTheme.Matlab)
                .build();
        chart.addSeries("9.5分及以上", high);
        chart.addSeries("9.0-9.5分", middle);
        chart.addSeries("9.0分以下", low);
        chart.getStyler().setChartBackgroundColor(Color.WHITE);
        chart.getStyler().setLegendVisible(true);

        try {
            BitmapEncoder.saveBitmap(chart, "./评分分布_饼图", BitmapEncoder.BitmapFormat.PNG);
            System.out.println("✅ 饼图已保存:评分分布_饼图.png");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the entries of {@code byYear} sorted by ascending year and
     * truncated to the first {@link #MAX_YEAR_ENTRIES} entries.
     */
    private static <V> List<Entry<Integer, V>> earliestYears(Map<Integer, V> byYear) {
        List<Entry<Integer, V>> sorted = new ArrayList<>(byYear.entrySet());
        sorted.sort(Entry.comparingByKey());
        if (sorted.size() > MAX_YEAR_ENTRIES) {
            return sorted.subList(0, MAX_YEAR_ENTRIES);
        }
        return sorted;
    }

    /**
     * Prints a skip notice for {@code chartName} when {@code empty} is true.
     *
     * @return {@code empty}, so callers can return early in one statement
     */
    private static boolean nothingToPlot(boolean empty, String chartName) {
        if (empty) {
            System.out.println("⚠️ 没有可用数据,已跳过" + chartName + "生成");
        }
        return empty;
    }
}

37
crawl_project_extension/src/main/java/com/example/CsvExporter.java

@ -0,0 +1,37 @@
package com.example;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Locale;
/**
 * Exports a movie list to a comma-separated text file.
 */
public class CsvExporter {
    /**
     * Writes a header row followed by one row per movie to {@code filePath}.
     * Columns, in order: title, director, release year, rating, review count.
     *
     * <p>An {@link IOException} is reported on standard error and not rethrown.
     *
     * @param movies   movies to export, one row each
     * @param filePath destination file; an existing file is overwritten
     */
    public static void exportToCsv(List<Movie> movies, String filePath) {
        // Encode explicitly as UTF-8. FileWriter(String) falls back to the platform
        // default charset, which garbles the Chinese header and titles on systems
        // whose default is not UTF-8.
        try (Writer writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(filePath), StandardCharsets.UTF_8))) {
            writer.write("电影名称,导演,上映年份,豆瓣评分,评价人数\n");
            for (Movie movie : movies) {
                // Locale.ROOT keeps '.' as the decimal separator; a locale that uses
                // a decimal comma would otherwise split the rating into two columns.
                String line = String.format(Locale.ROOT, "%s,%s,%d,%.1f,%d\n",
                        escapeCsv(movie.getTitle()),
                        escapeCsv(movie.getDirector()),
                        movie.getYear(),
                        movie.getRating(),
                        movie.getReviewCount());
                writer.write(line);
            }
            System.out.println("\nCSV文件导出成功!路径:" + filePath);
            System.out.println("提示:评价人数在第5列,已显示真实数据!");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Escapes one CSV field: a value containing a comma, double quote, or line
     * break is wrapped in double quotes with embedded quotes doubled.
     *
     * @param value raw field value, may be {@code null}
     * @return the escaped field, or an empty string for {@code null}
     */
    private static String escapeCsv(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuoting = value.contains(",")
                || value.contains("\"")
                || value.contains("\n")
                || value.contains("\r");
        if (needsQuoting) {
            return "\"" + value.replace("\"", "\"\"") + "\"";
        }
        return value;
    }
}

37
crawl_project_extension/src/main/java/com/example/DataAnalyzer.java

@ -0,0 +1,37 @@
package com.example;
import com.example.MovieAnalyzer;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * {@link MovieAnalyzer} implementation that prints its results to standard output.
 */
public class DataAnalyzer implements MovieAnalyzer {

    /**
     * Prints the ten highest-rated movies, then the number of movies per
     * release year in ascending year order. Movies whose year is 0 are left
     * out of the per-year count.
     *
     * @param movies movies to analyze
     */
    @Override
    public void analyzeByDimension(List<Movie> movies) {
        System.out.println("\n===== 评分最高Top10电影 =====");
        List<Movie> topTen = movies.stream()
                .sorted((left, right) -> Double.compare(right.getRating(), left.getRating()))
                .limit(10)
                .collect(Collectors.toList());
        for (Movie top : topTen) {
            System.out.printf("%-25s 评分: %.1f 年份: %d%n",
                    top.getTitle(), top.getRating(), top.getYear());
        }

        System.out.println("\n===== 各年份电影数量统计 =====");
        Map<Integer, Long> perYear = movies.stream()
                .filter(candidate -> candidate.getYear() != 0)
                .collect(Collectors.groupingBy(Movie::getYear, Collectors.counting()));
        // Walk the years in ascending order and look each count up in the map.
        List<Integer> years = perYear.keySet().stream()
                .sorted()
                .collect(Collectors.toList());
        for (Integer year : years) {
            System.out.printf("年份: %-4d 数量: %d 部%n", year, perYear.get(year));
        }
    }

    /**
     * Prints the number of movies and their average rating (0 when the list is empty).
     *
     * @param movies movies to summarize
     */
    @Override
    public void analyzeTotal(List<Movie> movies) {
        System.out.println("\n===== 数据总览 =====");
        int total = movies.size();
        System.out.println("电影总数:" + total);
        double meanRating = movies.stream()
                .mapToDouble(Movie::getRating)
                .average()
                .orElse(0);
        System.out.printf("平均评分:%.2f%n", meanRating);
    }
}

100
crawl_project_extension/src/main/java/com/example/DoubanCrawler.java

@ -0,0 +1,100 @@
package com.example;
import com.example.MovieCrawler;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * {@link MovieCrawler} implementation that scrapes the Douban Top 250 list,
 * fetching it page by page with Jsoup.
 */
public class DoubanCrawler implements MovieCrawler {
    /** Matches the first run of four digits in the info text, used as the release year. */
    private static final Pattern YEAR_PATTERN = Pattern.compile("(\\d{4})");

    /** Matches the review count in an item's text; compiled once instead of once per item. */
    private static final Pattern REVIEW_COUNT_PATTERN = Pattern.compile("(\\d+)人评价");

    private static final String BASE_URL = "https://movie.douban.com/top250?start=";
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36";

    /** Number of entries requested per page. */
    private static final int PAGE_SIZE = 25;

    /** Total number of entries to request. */
    private static final int TOTAL_ENTRIES = 250;

    /**
     * Fetches every page of the list and parses each entry into a {@link Movie}.
     *
     * <p>Entries that cannot be parsed are skipped with a message on standard
     * error. An I/O failure or thread interruption stops the crawl; whatever was
     * collected up to that point is returned.
     *
     * @return the movies collected so far, never {@code null}
     */
    @Override
    public List<Movie> crawl() {
        List<Movie> movies = new ArrayList<>();
        try {
            for (int offset = 0; offset < TOTAL_ENTRIES; offset += PAGE_SIZE) {
                String url = BASE_URL + offset;
                System.out.println("正在爬取:" + url);
                Document doc = Jsoup.connect(url)
                        .header("User-Agent", USER_AGENT)
                        .timeout(8000)
                        .get();
                for (Element item : doc.select(".item")) {
                    Movie movie = parseItem(item);
                    if (movie != null) {
                        movies.add(movie);
                    }
                }
                // Random 1-3 second pause between requests; not needed after the last page.
                if (offset + PAGE_SIZE < TOTAL_ENTRIES) {
                    Thread.sleep((long) (Math.random() * 2000 + 1000));
                }
            }
            System.out.println("爬取完成!共获取 " + movies.size() + " 部电影");
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
        return movies;
    }

    /**
     * Returns the display name of this crawler.
     */
    @Override
    public String getCrawlerName() {
        return "豆瓣top250";
    }

    /**
     * Parses one list entry.
     *
     * @param item the element selected by {@code .item}
     * @return the parsed movie, or {@code null} when the title, rating, or info
     *         node is missing or the rating is not a number
     */
    private Movie parseItem(Element item) {
        Element titleNode = item.selectFirst(".title");
        Element ratingNode = item.selectFirst(".rating_num");
        Element infoNode = item.selectFirst(".bd p");
        if (titleNode == null || ratingNode == null || infoNode == null) {
            System.err.println("跳过无法解析的条目:缺少标题、评分或信息节点");
            return null;
        }

        double rating;
        try {
            rating = Double.parseDouble(ratingNode.text().trim());
        } catch (NumberFormatException e) {
            System.err.println("跳过无法解析的条目:评分不是数字 -> " + ratingNode.text());
            return null;
        }

        // The review count is searched for in the entry's full text; it stays 0 when absent.
        int reviewCount = 0;
        Matcher reviewMatcher = REVIEW_COUNT_PATTERN.matcher(item.text());
        if (reviewMatcher.find()) {
            reviewCount = Integer.parseInt(reviewMatcher.group(1));
        }

        String info = infoNode.text();
        Movie movie = new Movie();
        movie.setTitle(titleNode.text());
        movie.setRating(rating);
        movie.setReviewCount(reviewCount);
        movie.setDirector(cleanDirector(info));
        movie.setYear(cleanYear(info));
        return movie;
    }

    /**
     * Extracts the director name that follows the "导演:" label.
     *
     * @param info info text of one entry
     * @return the trimmed director name, or "未知" when the label is absent
     */
    private String cleanDirector(String info) {
        if (info.contains("导演:")) {
            int start = info.indexOf("导演:") + 3;
            // Search from start + 2 so a space directly after the label is not
            // taken as the end of the name.
            int end = info.indexOf(" ", start + 2);
            if (end == -1) {
                end = info.length();
            }
            return info.substring(start, end).trim();
        }
        return "未知";
    }

    /**
     * Extracts the release year as the first four consecutive digits in the text.
     *
     * @param info info text of one entry
     * @return the year, or 0 when no four-digit run is found
     */
    private int cleanYear(String info) {
        Matcher matcher = YEAR_PATTERN.matcher(info);
        if (matcher.find()) {
            return Integer.parseInt(matcher.group(1));
        }
        return 0;
    }
}

22
crawl_project_extension/src/main/java/com/example/Main.java

@ -0,0 +1,22 @@
package com.example;
import java.util.List;
/**
 * Program entry point: crawls the movie list, prints the analysis, exports a
 * CSV file, and saves three charts.
 */
public class Main {
    /**
     * Runs the crawl / analyze / export / chart pipeline.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // 1. Crawl. The crawler and analyzer are referenced through their interfaces.
        MovieCrawler crawler = new DoubanCrawler();
        List<Movie> movies = crawler.crawl();

        // crawl() returns an empty list when the crawl fails, so movies.get(0)
        // below would throw. Stop here instead; there is nothing to analyze.
        if (movies == null || movies.isEmpty()) {
            System.err.println("未爬取到任何电影数据,已跳过分析、导出和图表生成。");
            return;
        }
        System.out.println("测试:第一部电影评价人数=" + movies.get(0).getReviewCount());

        // 2. Analyze.
        MovieAnalyzer analyzer = new DataAnalyzer();
        analyzer.analyzeTotal(movies);
        analyzer.analyzeByDimension(movies);

        // 3. Export to CSV.
        CsvExporter.exportToCsv(movies, "douban_top250.csv");

        // 4. Save the three charts as PNG files.
        ChartGenerator.saveBarChart(movies);
        ChartGenerator.saveLineChart(movies);
        ChartGenerator.savePieChart(movies);
    }
}

75
crawl_project_extension/src/main/java/com/example/Movie.java

@ -0,0 +1,75 @@
package com.example;
/**
 * Mutable data holder for one movie entry.
 */
public class Movie {
    /** Movie title. */
    private String title;
    /** Director name. */
    private String director;
    /** Release year. */
    private int year;
    /** Rating. */
    private double rating;
    /** Number of reviews. */
    private int reviewCount;

    /** Creates a movie with every field at its default value. */
    public Movie() {
    }

    /** Creates a movie with every field set. */
    public Movie(String title, String director, int year, double rating, int reviewCount) {
        this.reviewCount = reviewCount;
        this.rating = rating;
        this.year = year;
        this.director = director;
        this.title = title;
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getDirector() {
        return this.director;
    }

    public void setDirector(String director) {
        this.director = director;
    }

    public int getYear() {
        return this.year;
    }

    public void setYear(int year) {
        this.year = year;
    }

    public double getRating() {
        return this.rating;
    }

    public void setRating(double rating) {
        this.rating = rating;
    }

    public int getReviewCount() {
        return this.reviewCount;
    }

    public void setReviewCount(int reviewCount) {
        this.reviewCount = reviewCount;
    }

    /** Returns a single-line rendering of all five fields. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Movie{");
        text.append("片名='").append(title).append('\'');
        text.append(", 导演='").append(director).append('\'');
        text.append(", 年份=").append(year);
        text.append(", 评分=").append(rating);
        text.append(", 评价人数=").append(reviewCount);
        text.append('}');
        return text.toString();
    }
}

8
crawl_project_extension/src/main/java/com/example/MovieAnalyzer.java

@ -0,0 +1,8 @@
package com.example;
import java.util.List;
/**
 * Contract for components that analyze a list of movies.
 */
public interface MovieAnalyzer {
/** Overall summary analysis of the given movies. */
void analyzeTotal(List<Movie> movies);
/** Per-dimension analysis of the given movies (top-N, by year, and so on). */
void analyzeByDimension(List<Movie> movies);
}

8
crawl_project_extension/src/main/java/com/example/MovieCrawler.java

@ -0,0 +1,8 @@
package com.example;
import java.util.List;
/**
 * Contract for components that crawl a movie list from some source.
 */
public interface MovieCrawler {
/** Crawls the movie list and returns it. */
List<Movie> crawl();
/** Returns the crawler's name (for example "豆瓣Top250" or "IMDB Top100"). */
String getCrawlerName();
}

13
crawl_project_extension/src/main/java/org/example/App.java

@ -0,0 +1,13 @@
package org.example;
/**
* Minimal placeholder application that prints a greeting.
*/
public class App
{
/**
* Prints "Hello World!" to standard output.
*
* @param args command-line arguments (unused)
*/
public static void main( String[] args )
{
System.out.println( "Hello World!" );
}
}

38
crawl_project_extension/src/test/java/org/example/AppTest.java

@ -0,0 +1,38 @@
package org.example;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
/**
* Unit test for simple App.
*/
public class AppTest
extends TestCase
{
/**
* Create the test case
*
* @param testName name of the test case
*/
public AppTest( String testName )
{
super( testName );
}
/**
* @return the suite of tests being tested
*/
public static Test suite()
{
return new TestSuite( AppTest.class );
}
/**
* Placeholder test that always passes; it does not exercise App.
*/
public void testApp()
{
assertTrue( true );
}
}

BIN
crawl_project_extension/target/classes/com/example/ChartGenerator.class

Binary file not shown.

BIN
crawl_project_extension/target/classes/com/example/CsvExporter.class

Binary file not shown.

BIN
crawl_project_extension/target/classes/com/example/DataAnalyzer.class

Binary file not shown.

BIN
crawl_project_extension/target/classes/com/example/DoubanCrawler.class

Binary file not shown.

BIN
crawl_project_extension/target/classes/com/example/Main.class

Binary file not shown.

BIN
crawl_project_extension/target/classes/com/example/Movie.class

Binary file not shown.

BIN
crawl_project_extension/target/classes/com/example/MovieAnalyzer.class

Binary file not shown.

BIN
crawl_project_extension/target/classes/com/example/MovieCrawler.class

Binary file not shown.

BIN
crawl_project_extension/target/classes/org/example/App.class

Binary file not shown.

216
crawl_project_extension/实验报告.md

@ -0,0 +1,216 @@
# Java 面向对象程序设计实验报告
## 主题:基于豆瓣电影 TOP250 数据爬取与分析系统的**接口与多态扩展**
## 一、实验目的
1. 深入理解 Java **接口(Interface)** 的定义、作用与使用场景。
2. 掌握 **多态(Polymorphism)** 的实现原理与代码编写方式。
3. 学会使用 **抽象类** 实现代码复用,优化程序结构。
4. 在已有的豆瓣电影 TOP250 爬取项目基础上,**通过接口与多态进行程序扩展**。
5. 培养面向接口编程的思想,提高代码的**可扩展性、可维护性**。
## 二、实验环境
- 开发工具:IntelliJ IDEA
- 开发语言:Java 8
- 第三方库:Jsoup(网页爬取)、XChart(图表生成)
- 运行系统:Windows 10
## 三、实验内容与需求
1. 在原有豆瓣电影 TOP250 爬取代码基础上,抽取行为,定义**接口**。
2. 使用**接口 + 实现类**的方式完成爬取、分析模块设计。
3. 通过**多态**特性,实现“更换爬虫不改动主逻辑”的扩展效果。
4. 使用**抽象类**封装通用代码,减少重复。
5. 完成数据爬取、数据分析、CSV 导出、图片保存功能。
## 四、核心知识点
### 1. 接口
- 用于定义**方法规范**,只声明方法,不实现逻辑。
- 本实验设计两个核心接口:
- `MovieCrawler`:电影爬取接口
- `MovieAnalyzer`:电影分析接口
### 2. 多态
- **接口(父类型)引用指向实现类(子类型)对象**
- 相同接口,不同实现类,表现出不同行为。
- 扩展新功能时,**不修改原有代码,只新增实现类**。
### 3. 抽象类
- 用于提取公共代码,提供通用逻辑。
- 可以包含抽象方法,强制子类实现。
### 4. 扩展性
- 新增爬虫(如 IMDB、猫眼)只需新增实现类,主程序几乎不变。
## 五、系统架构设计
```
MovieCrawler(接口:爬取规范)
AbstractMovieCrawler(抽象类:通用爬取逻辑)
DoubanCrawler(子类:豆瓣爬虫实现)
MovieAnalyzer(接口:分析规范)
MovieAnalyzerImpl(实现类:数据分析实现)
```
## 六、核心代码实现
### 1. 电影实体类 Movie.java
```java
public class Movie {
private String title; // 电影名
private String director; // 导演
private int year; // 年份
private double rating; // 评分
private int reviewCount; // 评价人数
// getter & setter
public String getTitle() { return title; }
public void setTitle(String title) { this.title = title; }
public String getDirector() { return director; }
public void setDirector(String director) { this.director = director; }
public int getYear() { return year; }
public void setYear(int year) { this.year = year; }
public double getRating() { return rating; }
public void setRating(double rating) { this.rating = rating; }
public int getReviewCount() { return reviewCount; }
public void setReviewCount(int reviewCount) { this.reviewCount = reviewCount; }
}
```
---
### 2. 接口一:MovieCrawler.java(爬取接口)
```java
import java.util.List;
public interface MovieCrawler {
// 爬取电影数据
List<Movie> crawl();
}
```
---
### 3. 抽象类:AbstractMovieCrawler.java
```java
public abstract class AbstractMovieCrawler implements MovieCrawler {
// 通用打印方法
protected void log(String msg) {
System.out.println("[日志] " + msg);
}
}
```
---
### 4. 实现类:DoubanCrawler.java(豆瓣爬虫)
```java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
public class DoubanCrawler extends AbstractMovieCrawler {
@Override
public List<Movie> crawl() {
List<Movie> movies = new ArrayList<>();
String url = "https://movie.douban.com/top250";
try {
Document doc = Jsoup.connect(url).userAgent("Mozilla/5.0").get();
Elements items = doc.select(".item");
items.forEach(item -> {
Movie m = new Movie();
m.setTitle(item.select(".title").first().text());
m.setRating(Double.parseDouble(item.select(".rating_num").text()));
movies.add(m);
});
log("豆瓣爬取完成");
} catch (Exception e) {
e.printStackTrace();
}
return movies;
}
}
```
---
### 5. 接口二:MovieAnalyzer.java(分析接口)
```java
import java.util.List;
public interface MovieAnalyzer {
void analyze(List<Movie> movies);
}
```
---
### 6. 实现类:MovieAnalyzerImpl.java
```java
import java.util.List;
public class MovieAnalyzerImpl implements MovieAnalyzer {
@Override
public void analyze(List<Movie> movies) {
System.out.println("===== 数据分析 =====");
System.out.println("电影总数:" + movies.size());
double avg = movies.stream().mapToDouble(Movie::getRating).average().orElse(0);
System.out.println("平均评分:" + avg);
}
}
```
---
### 7. 主程序(多态体现)
```java
import java.util.List;
public class Main {
public static void main(String[] args) {
// ======================
// 多态:接口指向实现类
// ======================
MovieCrawler crawler = new DoubanCrawler();
MovieAnalyzer analyzer = new MovieAnalyzerImpl();
// 爬取 & 分析
List<Movie> movies = crawler.crawl();
analyzer.analyze(movies);
}
}
```
## 七、接口与多态扩展说明
1. **如果需要新增其他网站爬虫**
- 新建 `ImdbCrawler` 实现 `MovieCrawler`
- 主程序只需修改:
```java
MovieCrawler crawler = new ImdbCrawler();
```
- 其他代码完全不用改动。
2. **多态优势**
- 易于扩展
- 降低耦合
- 符合面向对象设计原则
## 八、实验结果
1. 成功爬取豆瓣电影 TOP250 数据。
2. 成功输出电影总数、平均评分。
3. 成功使用接口、抽象类、多态完成程序设计。
4. 程序结构清晰,具备良好扩展能力。
## 九、实验总结
1. 掌握了**接口**用于定义规范,**抽象类**用于复用代码。
2. 理解了**多态**就是“同一接口,不同实现”。
3. 学会了在实际项目中使用面向对象思想优化代码结构。
4. 扩展新功能只需新增实现类,不改动原有代码,体现了良好的可扩展性。
---
Loading…
Cancel
Save