diff --git a/project1/MovieMain.java b/project1/MovieMain.java new file mode 100644 index 0000000..2d88253 --- /dev/null +++ b/project1/MovieMain.java @@ -0,0 +1,59 @@ +package com.crawler; + +import com.crawler.chart.ChartGenerator; +import com.crawler.chart.ChartManager; +import com.crawler.chart.impl.GenreDistributionChartGenerator; +import com.crawler.chart.impl.RatingDistributionChartGenerator; +import com.crawler.chart.impl.YearDistributionChartGenerator; +import com.crawler.chart.impl.YearRatingChartGenerator; +import com.crawler.model.Movie; +import com.crawler.spider.DoubanSpider; +import com.crawler.utils.DataUtils; +import com.crawler.ui.MovieResultDisplay; + +import java.util.List; + +public class MovieMain { + public static void main(String[] args) { + try { + System.out.println("开始爬取豆瓣电影Top250数据..."); + + // 1. 启动爬虫 + DoubanSpider spider = new DoubanSpider(); + List movieList = spider.crawlMovies(); + + // 2. 清洗数据 + List cleanedMovies = movieList.stream() + .map(DataUtils::cleanMovie) + .filter(movie -> movie != null) + .toList(); + + // 3. 保存数据到CSV文件 + DataUtils.writeMovieToCSV(cleanedMovies, "douban_movies.csv"); + System.out.println("数据已保存到 douban_movies.csv"); + + // 4. 展示结果 + MovieResultDisplay.displayResults(cleanedMovies); + + // 5. 
使用多态生成图表 + ChartManager chartManager = new ChartManager(); + + ChartGenerator ratingChart = new RatingDistributionChartGenerator(); + ChartGenerator yearChart = new YearDistributionChartGenerator(); + ChartGenerator genreChart = new GenreDistributionChartGenerator(); + ChartGenerator yearRatingChart = new YearRatingChartGenerator(); + + chartManager.addChartGenerator(ratingChart); + chartManager.addChartGenerator(yearChart); + chartManager.addChartGenerator(genreChart); + chartManager.addChartGenerator(yearRatingChart); + + chartManager.generateAllCharts(cleanedMovies); + + System.out.println("\n爬虫任务完成!"); + + } catch (Exception e) { + e.printStackTrace(); + } + } +} \ No newline at end of file diff --git a/project1/README.md b/project1/README.md new file mode 100644 index 0000000..1d27041 --- /dev/null +++ b/project1/README.md @@ -0,0 +1,136 @@ +# 电影爬虫项目 - 继承与多态实现说明 + +## 项目简介 + +本项目是一个Java电影爬虫,从豆瓣电影Top250抓取数据,进行清洗、存储、分析,并生成多种图表展示结果。项目重点展示了面向对象编程中**继承**和**多态**的实现。 + +## 继承与多态实现 + +### 1. 接口继承 + +#### 1.1 核心接口定义 + +**文件**: `src/main/java/com/crawler/chart/ChartGenerator.java` + +```java +public interface ChartGenerator { + void generateChart(Movie[] movies); + String getChartName(); +} +``` + +#### 1.2 实现类继承 + +| 实现类 | 文件位置 | 继承关系 | +|-------|---------|----------| +| `RatingDistributionChartGenerator` | `src/main/java/com/crawler/chart/impl/RatingDistributionChartGenerator.java` | 实现 `ChartGenerator` 接口 | +| `YearDistributionChartGenerator` | `src/main/java/com/crawler/chart/impl/YearDistributionChartGenerator.java` | 实现 `ChartGenerator` 接口 | +| `GenreDistributionChartGenerator` | `src/main/java/com/crawler/chart/impl/GenreDistributionChartGenerator.java` | 实现 `ChartGenerator` 接口 | +| `YearRatingChartGenerator` | `src/main/java/com/crawler/chart/impl/YearRatingChartGenerator.java` | 实现 `ChartGenerator` 接口 | + +### 2. 
多态实现 + +#### 2.1 向上转型(接口引用指向实现类) + +**文件**: `src/main/java/com/crawler/MovieMain.java` (第41-44行) + +```java +ChartGenerator ratingChart = new RatingDistributionChartGenerator(); +ChartGenerator yearChart = new YearDistributionChartGenerator(); +ChartGenerator genreChart = new GenreDistributionChartGenerator(); +ChartGenerator yearRatingChart = new YearRatingChartGenerator(); +``` + +#### 2.2 方法参数多态 + +**文件**: `src/main/java/com/crawler/chart/ChartManager.java` (第19-21行) + +```java +public void addChartGenerator(ChartGenerator generator) { + chartGenerators.add(generator); +} +``` + +#### 2.3 运行时多态(动态绑定) + +**文件**: `src/main/java/com/crawler/chart/ChartManager.java` (第23-29行) + +```java +public void generateAllCharts(List<Movie> movies) { + Movie[] movieArray = movies.toArray(new Movie[0]); + for (ChartGenerator generator : chartGenerators) { + System.out.println("生成图表: " + generator.getChartName()); + generator.generateChart(movieArray); // 运行时根据实际类型调用对应方法 + } +} +``` + +#### 2.4 统一调用接口 + +**文件**: `src/main/java/com/crawler/MovieMain.java` (第46-51行) + +```java +chartManager.addChartGenerator(ratingChart); +chartManager.addChartGenerator(yearChart); +chartManager.addChartGenerator(genreChart); +chartManager.addChartGenerator(yearRatingChart); + +chartManager.generateAllCharts(cleanedMovies); +``` + +## 继承与多态的优势 + +1. **代码复用**:所有图表生成器共享相同的接口方法 +2. **可扩展性**:新增图表类型只需实现接口,无需修改现有代码 +3. **统一管理**:`ChartManager` 可以统一管理不同类型的图表生成器 +4. **灵活性**:通过接口引用可以操作不同的实现类对象 +5. 
**可维护性**:代码结构清晰,职责分明 + +## 项目结构 + +``` +src/ +└── main/ + └── java/ + └── com/ + └── crawler/ + ├── MovieMain.java # 主入口文件 + ├── model/ + │ └── Movie.java # 电影数据模型 + ├── spider/ + │ └── DoubanSpider.java # 豆瓣爬虫实现 + ├── analysis/ + │ └── MovieAnalyzer.java # 数据分析工具 + ├── ui/ + │ └── MovieResultDisplay.java # 结果显示和图表生成 + ├── utils/ + │ └── DataUtils.java # 数据工具类 + └── chart/ + ├── ChartGenerator.java # 图表生成器接口 + ├── ChartManager.java # 图表管理器 + └── impl/ + ├── RatingDistributionChartGenerator.java # 评分分布图表 + ├── YearDistributionChartGenerator.java # 年份分布图表 + ├── GenreDistributionChartGenerator.java # 类型分布图表 + └── YearRatingChartGenerator.java # 年份评分相关性图表 +``` + +## 运行说明 + +1. **直接运行**:在IDE中直接运行 `MovieMain.java` +2. **依赖要求**:需要Jsoup和JFreeChart库 +3. **运行结果**: + - 控制台输出爬取进度和图表生成信息 + - 生成的CSV数据文件保存在项目目录 + - 生成的图表以PNG格式保存在项目目录 + +## 技术栈 + +- Java 8+ +- Jsoup (网页解析) +- JFreeChart (图表生成) +- Maven (依赖管理) + +## 总结 + +本项目通过图表生成器接口及其实现类,充分展示了面向对象编程中**继承**和**多态**的核心概念。接口定义了统一的方法规范,实现类提供了具体的实现逻辑,通过接口引用和运行时动态绑定,实现了代码的灵活性和可扩展性。 \ No newline at end of file diff --git a/project1/analysis/MovieAnalyzer.java b/project1/analysis/MovieAnalyzer.java new file mode 100644 index 0000000..7120e4d --- /dev/null +++ b/project1/analysis/MovieAnalyzer.java @@ -0,0 +1,119 @@ +package com.crawler.analysis; + +import com.crawler.model.Movie; + +import java.util.*; +import java.util.stream.Collectors; + +public class MovieAnalyzer { + // 统计电影评分分布 + public static Map analyzeRatingDistribution(List movieList) { + Map ratingMap = new TreeMap<>(); + + for (Movie movie : movieList) { + if (movie != null) { + double rating = movie.getRating(); + ratingMap.put(rating, ratingMap.getOrDefault(rating, 0) + 1); + } + } + + return ratingMap; + } + + // 统计电影年份分布 + public static Map analyzeYearDistribution(List movieList) { + Map yearMap = new TreeMap<>(); + + for (Movie movie : movieList) { + if (movie != null && movie.getYear() != null) { + String year = movie.getYear(); + yearMap.put(year, 
yearMap.getOrDefault(year, 0) + 1); + } + } + + return yearMap; + } + + // 统计电影类型分布 + public static Map analyzeGenreDistribution(List movieList) { + Map genreMap = new HashMap<>(); + + for (Movie movie : movieList) { + if (movie != null && movie.getGenre() != null) { + String genre = movie.getGenre(); + genreMap.put(genre, genreMap.getOrDefault(genre, 0) + 1); + } + } + + return genreMap; + } + + // 统计电影国家/地区分布 + public static Map analyzeCountryDistribution(List movieList) { + Map countryMap = new HashMap<>(); + + for (Movie movie : movieList) { + if (movie != null && movie.getCountry() != null) { + String country = movie.getCountry(); + countryMap.put(country, countryMap.getOrDefault(country, 0) + 1); + } + } + + return countryMap; + } + + // 分析导演作品数量排行 + public static Map analyzeDirectorWorks(List movieList) { + Map directorMap = new HashMap<>(); + + for (Movie movie : movieList) { + if (movie != null && movie.getDirector() != null) { + String director = movie.getDirector(); + directorMap.put(director, directorMap.getOrDefault(director, 0) + 1); + } + } + + // 按作品数量排序 + return directorMap.entrySet().stream() + .sorted(Map.Entry.comparingByValue().reversed()) + .collect(Collectors.toMap( + Map.Entry::getKey, + Map.Entry::getValue, + (e1, e2) -> e1, + LinkedHashMap::new + )); + } + + // 计算平均评分 + public static double calculateAverageRating(List movieList) { + return movieList.stream() + .filter(Objects::nonNull) + .mapToDouble(Movie::getRating) + .average() + .orElse(0.0); + } + + // 计算评分与年份的相关性(简单计算) + public static Map analyzeYearRatingCorrelation(List movieList) { + Map> yearRatingsMap = new TreeMap<>(); + + for (Movie movie : movieList) { + if (movie != null && movie.getYear() != null) { + String year = movie.getYear(); + double rating = movie.getRating(); + yearRatingsMap.computeIfAbsent(year, k -> new ArrayList<>()).add(rating); + } + } + + // 计算每年的平均评分 + Map yearAverageRatingMap = new TreeMap<>(); + for (Map.Entry> entry : yearRatingsMap.entrySet()) { + 
String year = entry.getKey(); + List ratings = entry.getValue(); + double average = ratings.stream().mapToDouble(Double::doubleValue).average().orElse(0.0); + yearAverageRatingMap.put(year, average); + } + + return yearAverageRatingMap; + } +} \ No newline at end of file diff --git a/project1/chart/ChartGenerator.java b/project1/chart/ChartGenerator.java new file mode 100644 index 0000000..8bd5375 --- /dev/null +++ b/project1/chart/ChartGenerator.java @@ -0,0 +1,8 @@ +package com.crawler.chart; + +import com.crawler.model.Movie; + +public interface ChartGenerator { + void generateChart(Movie[] movies); + String getChartName(); +} \ No newline at end of file diff --git a/project1/chart/ChartManager.java b/project1/chart/ChartManager.java new file mode 100644 index 0000000..6e23306 --- /dev/null +++ b/project1/chart/ChartManager.java @@ -0,0 +1,30 @@ +package com.crawler.chart; + +import com.crawler.chart.impl.GenreDistributionChartGenerator; +import com.crawler.chart.impl.RatingDistributionChartGenerator; +import com.crawler.chart.impl.YearDistributionChartGenerator; +import com.crawler.chart.impl.YearRatingChartGenerator; +import com.crawler.model.Movie; + +import java.util.ArrayList; +import java.util.List; + +public class ChartManager { + private List chartGenerators; + + public ChartManager() { + chartGenerators = new ArrayList<>(); + } + + public void addChartGenerator(ChartGenerator generator) { + chartGenerators.add(generator); + } + + public void generateAllCharts(List movies) { + Movie[] movieArray = movies.toArray(new Movie[0]); + for (ChartGenerator generator : chartGenerators) { + System.out.println("生成图表: " + generator.getChartName()); + generator.generateChart(movieArray); + } + } +} \ No newline at end of file diff --git a/project1/chart/impl/GenreDistributionChartGenerator.java b/project1/chart/impl/GenreDistributionChartGenerator.java new file mode 100644 index 0000000..c8534ae --- /dev/null +++ 
b/project1/chart/impl/GenreDistributionChartGenerator.java @@ -0,0 +1,25 @@ +package com.crawler.chart.impl; + +import com.crawler.chart.ChartGenerator; +import com.crawler.model.Movie; +import com.crawler.ui.MovieResultDisplay; + +import java.io.IOException; +import java.util.List; + +public class GenreDistributionChartGenerator implements ChartGenerator { + @Override + public void generateChart(Movie[] movies) { + List movieList = List.of(movies); + try { + MovieResultDisplay.generateGenreDistributionChart(movieList); + } catch (IOException e) { + e.printStackTrace(); + } + } + + @Override + public String getChartName() { + return "Genre Distribution Chart"; + } +} \ No newline at end of file diff --git a/project1/chart/impl/RatingDistributionChartGenerator.java b/project1/chart/impl/RatingDistributionChartGenerator.java new file mode 100644 index 0000000..6d69b4d --- /dev/null +++ b/project1/chart/impl/RatingDistributionChartGenerator.java @@ -0,0 +1,27 @@ +package com.crawler.chart.impl; + +import com.crawler.chart.ChartGenerator; +import com.crawler.model.Movie; +import com.crawler.ui.MovieResultDisplay; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class RatingDistributionChartGenerator implements ChartGenerator { + @Override + public void generateChart(Movie[] movies) { + List movieList = List.of(movies); + try { + MovieResultDisplay.generateRatingDistributionChart(movieList); + } catch (IOException e) { + e.printStackTrace(); + } + } + + @Override + public String getChartName() { + return "Rating Distribution Chart"; + } +} \ No newline at end of file diff --git a/project1/chart/impl/YearDistributionChartGenerator.java b/project1/chart/impl/YearDistributionChartGenerator.java new file mode 100644 index 0000000..471635a --- /dev/null +++ b/project1/chart/impl/YearDistributionChartGenerator.java @@ -0,0 +1,25 @@ +package com.crawler.chart.impl; + +import 
/**
 * Mutable data holder for one entry of the movie ranking.
 *
 * <p>All fields start at their Java defaults ({@code 0}, {@code 0.0} or
 * {@code null}) and are filled in through the setters.
 */
public class Movie {
    // Numeric attributes.
    private int rank;
    private double rating;
    private int ratingPeople;

    // Textual attributes.
    private String title;
    private String director;
    private String actors;
    private String year;
    private String country;
    private String genre;
    private String quote;

    /** @return position in the ranking */
    public int getRank() {
        return this.rank;
    }

    /** @param rank position in the ranking */
    public void setRank(int rank) {
        this.rank = rank;
    }

    /** @return movie title */
    public String getTitle() {
        return this.title;
    }

    /** @param title movie title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return rating score */
    public double getRating() {
        return this.rating;
    }

    /** @param rating rating score */
    public void setRating(double rating) {
        this.rating = rating;
    }

    /** @return number of people who rated the movie */
    public int getRatingPeople() {
        return this.ratingPeople;
    }

    /** @param ratingPeople number of people who rated the movie */
    public void setRatingPeople(int ratingPeople) {
        this.ratingPeople = ratingPeople;
    }

    /** @return director text */
    public String getDirector() {
        return this.director;
    }

    /** @param director director text */
    public void setDirector(String director) {
        this.director = director;
    }

    /** @return actors text */
    public String getActors() {
        return this.actors;
    }

    /** @param actors actors text */
    public void setActors(String actors) {
        this.actors = actors;
    }

    /** @return release year as text */
    public String getYear() {
        return this.year;
    }

    /** @param year release year as text */
    public void setYear(String year) {
        this.year = year;
    }

    /** @return country or region */
    public String getCountry() {
        return this.country;
    }

    /** @param country country or region */
    public void setCountry(String country) {
        this.country = country;
    }

    /** @return genre */
    public String getGenre() {
        return this.genre;
    }

    /** @param genre genre */
    public void setGenre(String genre) {
        this.genre = genre;
    }

    /** @return short quote attached to the movie */
    public String getQuote() {
        return this.quote;
    }

    /** @param quote short quote attached to the movie */
    public void setQuote(String quote) {
        this.quote = quote;
    }

    /**
     * Renders rank, title, rating, rating count, director, year and genre.
     * Actors, country and quote are not part of the text.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Movie{");
        text.append("rank=").append(rank);
        text.append(", title='").append(title).append('\'');
        text.append(", rating=").append(rating);
        text.append(", ratingPeople=").append(ratingPeople);
        text.append(", director='").append(director).append('\'');
        text.append(", year='").append(year).append('\'');
        text.append(", genre='").append(genre).append('\'');
        text.append('}');
        return text.toString();
    }
}
${maven.compiler.target} + + + + + \ No newline at end of file diff --git a/project1/spider/DoubanSpider.java b/project1/spider/DoubanSpider.java new file mode 100644 index 0000000..a85e66a --- /dev/null +++ b/project1/spider/DoubanSpider.java @@ -0,0 +1,206 @@ +package com.crawler.spider; + +import com.crawler.model.Movie; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.*; + +public class DoubanSpider { + private static final String BASE_URL = "https://movie.douban.com/top250"; + private static final int MAX_PAGES = 10; + private static final int THREAD_POOL_SIZE = 3; + private static final int REQUEST_DELAY = 1000; + + public List crawlMovies() { + List movieList = new ArrayList<>(); + ExecutorService executorService = Executors.newFixedThreadPool(THREAD_POOL_SIZE); + List>> futures = new ArrayList<>(); + + try { + for (int page = 0; page < MAX_PAGES; page++) { + final int currentPage = page; + futures.add(executorService.submit(() -> { + try { + Thread.sleep(REQUEST_DELAY); + return crawlPage(currentPage); + } catch (Exception e) { + e.printStackTrace(); + return new ArrayList<>(); + } + })); + } + + for (Future> future : futures) { + try { + movieList.addAll(future.get()); + } catch (Exception e) { + e.printStackTrace(); + } + } + } finally { + executorService.shutdown(); + } + + return movieList; + } + + private List crawlPage(int page) throws IOException { + List movieList = new ArrayList<>(); + String url = BASE_URL + "?start=" + (page * 25); + System.out.println("爬取页面: " + url); + + Document document = Jsoup.connect(url) + .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") + .timeout(10000) + .get(); + + System.out.println("页面标题: " + document.title()); + + // 选择电影条目 + Elements movieItems = 
document.select(".grid_view li"); + System.out.println("找到电影条目数: " + movieItems.size()); + + for (Element item : movieItems) { + Movie movie = parseMovie(item); + if (movie != null) { + movieList.add(movie); + } + } + + System.out.println("页面" + (page + 1) + "爬取成功,获取电影数: " + movieList.size()); + return movieList; + } + + private Movie parseMovie(Element item) { + Movie movie = new Movie(); + + try { + // 排名 + Element rankElement = item.selectFirst(".pic em"); + if (rankElement != null) { + movie.setRank(Integer.parseInt(rankElement.text().trim())); + } + + // 标题 + Element titleElement = item.selectFirst(".title"); + if (titleElement != null) { + movie.setTitle(titleElement.text().trim()); + } + + // 评分 + Element ratingElement = item.selectFirst(".rating_num"); + if (ratingElement != null) { + movie.setRating(Double.parseDouble(ratingElement.text().trim())); + } + + // 评价人数 + Element ratingPeopleElement = item.selectFirst(".star span:nth-child(4)"); + if (ratingPeopleElement != null) { + String ratingPeople = ratingPeopleElement.text().trim(); + movie.setRatingPeople(Integer.parseInt(ratingPeople.replaceAll("[^0-9]", ""))); + } + + // 导演和演员 + Element infoElement = item.selectFirst(".bd p:first-child"); + if (infoElement != null) { + String info = infoElement.text().trim(); + + // 提取导演 + if (info.contains("导演:")) { + int directorStart = info.indexOf("导演:") + 3; + int directorEnd = info.indexOf("主演:"); + if (directorEnd == -1) { + directorEnd = info.indexOf(" "); + // 找到第一个数字年份的位置 + for (int i = 0; i < info.length(); i++) { + if (Character.isDigit(info.charAt(i))) { + directorEnd = i; + break; + } + } + } + if (directorEnd != -1) { + movie.setDirector(info.substring(directorStart, directorEnd).trim()); + } + } + + // 提取主演 + if (info.contains("主演:")) { + int actorsStart = info.indexOf("主演:") + 3; + int actorsEnd = info.length(); + // 找到第一个数字年份的位置 + for (int i = actorsStart; i < info.length(); i++) { + if (Character.isDigit(info.charAt(i))) { + actorsEnd = i; + break; + 
} + } + movie.setActors(info.substring(actorsStart, actorsEnd).trim()); + } + + // 提取年份、国家/地区和类型 + // 找到年份的开始位置(第一个数字) + int yearStart = -1; + for (int i = 0; i < info.length(); i++) { + if (Character.isDigit(info.charAt(i))) { + yearStart = i; + break; + } + } + + if (yearStart != -1) { + // 提取年份(4位数字) + if (yearStart + 4 <= info.length()) { + String year = info.substring(yearStart, yearStart + 4); + if (year.matches("\\d{4}")) { + movie.setYear(year); + } + } + + // 提取国家/地区和类型 + int slashIndex = info.indexOf("/", yearStart); + if (slashIndex != -1) { + // 提取国家/地区 + int nextSlashIndex = info.indexOf("/", slashIndex + 1); + if (nextSlashIndex != -1) { + String country = info.substring(slashIndex + 1, nextSlashIndex).trim(); + movie.setCountry(country); + + // 提取类型 + String genre = info.substring(nextSlashIndex + 1).trim(); + // 取第一个类型 + if (!genre.isEmpty()) { + String[] genres = genre.split(" "); + if (genres.length > 0) { + movie.setGenre(genres[0]); + } + } + } + } + } + } + + // 简介 + Element quoteElement = item.selectFirst(".inq"); + if (quoteElement != null) { + movie.setQuote(quoteElement.text().trim()); + } + + // 过滤无效电影 + if (movie.getTitle() == null || movie.getTitle().isEmpty()) { + return null; + } + + return movie; + } catch (Exception e) { + e.printStackTrace(); + return null; + } + } +} \ No newline at end of file diff --git a/project1/ui/MovieResultDisplay.java b/project1/ui/MovieResultDisplay.java new file mode 100644 index 0000000..a26a246 --- /dev/null +++ b/project1/ui/MovieResultDisplay.java @@ -0,0 +1,216 @@ +package com.crawler.ui; + +import com.crawler.analysis.MovieAnalyzer; +import com.crawler.model.Movie; +import org.jfree.chart.ChartFactory; +import org.jfree.chart.ChartUtils; +import org.jfree.chart.JFreeChart; +import org.jfree.chart.plot.PlotOrientation; +import org.jfree.data.category.DefaultCategoryDataset; +import org.jfree.data.general.DefaultPieDataset; +import org.jfree.data.statistics.HistogramDataset; +import 
org.jfree.chart.plot.PiePlot; +import org.jfree.chart.labels.StandardPieSectionLabelGenerator; +import java.text.DecimalFormat; +import java.text.NumberFormat; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import java.util.Map; + +public class MovieResultDisplay { + // 控制台输出统计结果 + public static void displayResults(List movieList) { + System.out.println("\n=== 电影数据统计结果 ==="); + System.out.println("爬取电影总数: " + movieList.size()); + + // 平均评分 + double averageRating = MovieAnalyzer.calculateAverageRating(movieList); + System.out.printf("平均评分: %.2f\n", averageRating); + + // 电影评分分布 + System.out.println("\n=== 电影评分分布 ==="); + Map ratingDistribution = MovieAnalyzer.analyzeRatingDistribution(movieList); + for (Map.Entry entry : ratingDistribution.entrySet()) { + System.out.printf("评分 %.1f: %d部\n", entry.getKey(), entry.getValue()); + } + + // 电影年份分布(最近20年) + System.out.println("\n=== 电影年份分布(最近20年)==="); + Map yearDistribution = MovieAnalyzer.analyzeYearDistribution(movieList); + int count = 0; + for (Map.Entry entry : yearDistribution.entrySet()) { + if (count >= yearDistribution.size() - 20) { // 只显示最近20年 + System.out.printf("%s年: %d部\n", entry.getKey(), entry.getValue()); + } + count++; + } + + // 电影类型分布 + System.out.println("\n=== 电影类型分布 ==="); + Map genreDistribution = MovieAnalyzer.analyzeGenreDistribution(movieList); + genreDistribution.entrySet().stream() + .sorted(Map.Entry.comparingByValue().reversed()) + .limit(10) // 只显示前10种类型 + .forEach(entry -> System.out.printf("%-10s: %d部\n", entry.getKey(), entry.getValue())); + + // 导演作品数量排行 + System.out.println("\n=== 导演作品数量排行 ==="); + Map directorWorks = MovieAnalyzer.analyzeDirectorWorks(movieList); + count = 0; + for (Map.Entry entry : directorWorks.entrySet()) { + if (count < 10) { // 只显示前10位导演 + System.out.printf("%-20s: %d部\n", entry.getKey(), entry.getValue()); + count++; + } else { + break; + } + } + + // 评分与年份相关性 + System.out.println("\n=== 评分与年份相关性 ==="); + Map 
yearRatingCorrelation = MovieAnalyzer.analyzeYearRatingCorrelation(movieList); + for (Map.Entry entry : yearRatingCorrelation.entrySet()) { + System.out.printf("%s年: 平均评分 %.2f\n", entry.getKey(), entry.getValue()); + } + } + + // 生成电影评分分布直方图 + public static void generateRatingDistributionChart(List movieList) throws IOException { + Map ratingDistribution = MovieAnalyzer.analyzeRatingDistribution(movieList); + DefaultCategoryDataset dataset = new DefaultCategoryDataset(); + + for (Map.Entry entry : ratingDistribution.entrySet()) { + dataset.addValue(entry.getValue(), "Count", entry.getKey().toString()); + } + + JFreeChart chart = ChartFactory.createBarChart( + "Movie Rating Distribution", + "Rating", + "Count", + dataset, + PlotOrientation.VERTICAL, + true, + true, + false + ); + + ChartUtils.saveChartAsPNG(new File("movie_rating_distribution.png"), chart, 800, 600); + System.out.println("电影评分分布图表已保存为 movie_rating_distribution.png"); + } + + // 生成电影年份分布折线图 + public static void generateYearDistributionChart(List movieList) throws IOException { + Map yearDistribution = MovieAnalyzer.analyzeYearDistribution(movieList); + DefaultCategoryDataset dataset = new DefaultCategoryDataset(); + + System.out.println("年份分布数据:"); + for (Map.Entry entry : yearDistribution.entrySet()) { + System.out.println("年份: '" + entry.getKey() + "', 数量: " + entry.getValue()); + // 尝试提取年份数字 + String year = entry.getKey(); + // 提取4位数字作为年份 + String yearMatch = year.replaceAll("[^0-9]", ""); + if (yearMatch.length() >= 4) { + yearMatch = yearMatch.substring(0, 4); + dataset.addValue(entry.getValue(), "Count", yearMatch); + } + } + + JFreeChart chart = ChartFactory.createLineChart( + "Movie Year Distribution", + "Year", + "Count", + dataset, + PlotOrientation.VERTICAL, + true, + true, + false + ); + + ChartUtils.saveChartAsPNG(new File("movie_year_distribution.png"), chart, 800, 600); + System.out.println("电影年份分布图表已保存为 movie_year_distribution.png"); + } + + // 生成电影类型分布饼图 + public static void 
generateGenreDistributionChart(List movieList) throws IOException { + Map genreDistribution = MovieAnalyzer.analyzeGenreDistribution(movieList); + DefaultPieDataset dataset = new DefaultPieDataset(); + + // 只显示前10种类型 + genreDistribution.entrySet().stream() + .sorted(Map.Entry.comparingByValue().reversed()) + .limit(10) + .forEach(entry -> { + // 使用英文标签避免中文显示问题 + String englishLabel = getEnglishGenre(entry.getKey()) + " (" + entry.getValue() + ")"; + dataset.setValue(englishLabel, entry.getValue()); + }); + + JFreeChart chart = ChartFactory.createPieChart( + "Movie Genre Distribution", // 使用英文标题 + dataset, + true, // 显示图例 + true, // 显示工具提示 + false // 不显示URL + ); + + ChartUtils.saveChartAsPNG(new File("movie_genre_distribution.png"), chart, 800, 600); + System.out.println("电影类型分布图表已保存为 movie_genre_distribution.png"); + } + + // 将中文类型转换为英文 + private static String getEnglishGenre(String chineseGenre) { + switch (chineseGenre) { + case "冒险": return "Adventure"; + case "奇幻": return "Fantasy"; + case "爱情": return "Romance"; + case "惊悚": return "Thriller"; + case "动画": return "Animation"; + case "悬疑": return "Mystery"; + case "家庭": return "Family"; + case "犯罪": return "Crime"; + case "同性": return "LGBTQ+"; + case "历史": return "History"; + case "剧情": return "Drama"; + case "动作": return "Action"; + case "喜剧": return "Comedy"; + case "科幻": return "Sci-Fi"; + default: return chineseGenre; + } + } + + // 生成评分与年份相关性图表 + public static void generateYearRatingChart(List movieList) throws IOException { + Map yearRatingCorrelation = MovieAnalyzer.analyzeYearRatingCorrelation(movieList); + DefaultCategoryDataset dataset = new DefaultCategoryDataset(); + + System.out.println("评分与年份相关性数据:"); + for (Map.Entry entry : yearRatingCorrelation.entrySet()) { + System.out.println("年份: '" + entry.getKey() + "', 平均评分: " + entry.getValue()); + // 尝试提取年份数字 + String year = entry.getKey(); + // 提取4位数字作为年份 + String yearMatch = year.replaceAll("[^0-9]", ""); + if (yearMatch.length() >= 4) { + yearMatch 
= yearMatch.substring(0, 4); + dataset.addValue(entry.getValue(), "Avg Rating", yearMatch); + } + } + + JFreeChart chart = ChartFactory.createLineChart( + "Year vs Rating Correlation", + "Year", + "Average Rating", + dataset, + PlotOrientation.VERTICAL, + true, + true, + false + ); + + ChartUtils.saveChartAsPNG(new File("movie_year_rating.png"), chart, 800, 600); + System.out.println("评分与年份相关性图表已保存为 movie_year_rating.png"); + } +} \ No newline at end of file diff --git a/project1/utils/DataUtils.java b/project1/utils/DataUtils.java new file mode 100644 index 0000000..5893a8f --- /dev/null +++ b/project1/utils/DataUtils.java @@ -0,0 +1,91 @@ +package com.crawler.utils; + +import com.crawler.model.Movie; + +import java.io.FileWriter; +import java.io.IOException; +import java.util.List; + +public class DataUtils { + // 清洗电影数据 + public static Movie cleanMovie(Movie movie) { + if (movie == null) return null; + + // 清洗标题 + if (movie.getTitle() != null) { + movie.setTitle(movie.getTitle().trim().replaceAll("\\s+", " ")); + } + + // 清洗导演 + if (movie.getDirector() != null) { + movie.setDirector(movie.getDirector().trim()); + } + + // 清洗演员 + if (movie.getActors() != null) { + movie.setActors(movie.getActors().trim()); + } + + // 清洗年份 + if (movie.getYear() != null) { + movie.setYear(movie.getYear().trim()); + } + + // 清洗国家/地区 + if (movie.getCountry() != null) { + movie.setCountry(movie.getCountry().trim()); + } + + // 清洗类型 + if (movie.getGenre() != null) { + movie.setGenre(movie.getGenre().trim()); + } + + // 清洗简介 + if (movie.getQuote() != null) { + movie.setQuote(movie.getQuote().trim().replaceAll("\\s+", " ")); + } + + return movie; + } + + // 写入电影数据到CSV文件 + public static void writeMovieToCSV(List movieList, String filePath) throws IOException { + // 添加时间戳避免文件冲突 + String timestamp = String.valueOf(System.currentTimeMillis()); + String actualFilePath = filePath.replace(".csv", "_" + timestamp + ".csv"); + + FileWriter writer = new FileWriter(actualFilePath); + // 写入表头 + 
writer.write("排名,标题,评分,评价人数,导演,演员,年份,国家/地区,类型,简介\n"); + + // 写入数据 + for (Movie movie : movieList) { + if (movie != null) { + writer.write(movie.getRank() + ","); + writer.write(escapeCsv(movie.getTitle()) + ","); + writer.write(movie.getRating() + ","); + writer.write(movie.getRatingPeople() + ","); + writer.write(escapeCsv(movie.getDirector()) + ","); + writer.write(escapeCsv(movie.getActors()) + ","); + writer.write(escapeCsv(movie.getYear()) + ","); + writer.write(escapeCsv(movie.getCountry()) + ","); + writer.write(escapeCsv(movie.getGenre()) + ","); + writer.write(escapeCsv(movie.getQuote()) + "\n"); + } + } + + writer.close(); + System.out.println("数据已保存到 " + actualFilePath); + } + + // 转义CSV特殊字符 + private static String escapeCsv(String value) { + if (value == null) return ""; + if (value.contains(",") || value.contains("\"")) { + value = value.replaceAll("\"", "\"\""); + return "\"" + value + "\""; + } + return value; + } +} \ No newline at end of file