diff --git a/crawl_project_extension_2/pom.xml b/crawl_project_extension_2/pom.xml new file mode 100644 index 0000000..cf0832e --- /dev/null +++ b/crawl_project_extension_2/pom.xml @@ -0,0 +1,41 @@ + + 4.0.0 + + org.example + crawl_project_extension_2 + 1.0-SNAPSHOT + jar + + crawl_project_extension_2 + http://maven.apache.org + + + UTF-8 + + + + + junit + junit + 3.8.1 + test + + + org.jsoup + jsoup + 1.17.2 + + + com.opencsv + opencsv + 5.9 + + + org.knowm.xchart + xchart + 3.8.7 + + + + diff --git a/crawl_project_extension_2/src/main/java/com/example/ChartGenerator.java b/crawl_project_extension_2/src/main/java/com/example/ChartGenerator.java new file mode 100644 index 0000000..a091c3a --- /dev/null +++ b/crawl_project_extension_2/src/main/java/com/example/ChartGenerator.java @@ -0,0 +1,128 @@ +package com.example; + +import org.knowm.xchart.*; +import org.knowm.xchart.style.Styler; + +import java.awt.*; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.stream.Collectors; +public class ChartGenerator { + + // 1. 绘制【年份电影数量 - 柱状图】 + public static void saveBarChart(List movies) { + Map yearMap = movies.stream() + .filter(m -> m.getYear() > 1980) + .collect(Collectors.groupingBy(Movie::getYear, Collectors.counting())); + + List> sortedList = new ArrayList<>(yearMap.entrySet()); + sortedList.sort(Entry.comparingByKey()); + + if (sortedList.size() > 15) { + sortedList = sortedList.subList(0, 15); + } + + List xData = new ArrayList<>(); + List yData = new ArrayList<>(); + for (Entry entry : sortedList) { + xData.add(entry.getKey().toString()); + yData.add(entry.getValue()); + } + + CategoryChart chart = new CategoryChartBuilder() + .width(1000) + .height(600) + .title("豆瓣Top250 - 各年份电影数量柱状图") + .xAxisTitle("年份") + .yAxisTitle("电影数量") + .theme(Styler.ChartTheme.Matlab) + .build(); + + chart.getStyler().setLegendVisible(false); + chart.getStyler().setLabelsVisible(true); + chart.getStyler().setXAxisLabelRotation(45); + chart.getStyler().setChartBackgroundColor(Color.WHITE); + + chart.addSeries("电影数量", xData, yData); + + try { + BitmapEncoder.saveBitmap(chart, "./年份电影数量_柱状图", BitmapEncoder.BitmapFormat.PNG); + System.out.println("✅ 柱状图已保存:年份电影数量_柱状图.png"); + } catch (IOException e) { + e.printStackTrace(); + } + } + + // 2. 绘制【评分趋势 - 折线图】 + public static void saveLineChart(List movies) { + Map avgRatingMap = movies.stream() + .filter(m -> m.getYear() > 1980) + .collect(Collectors.groupingBy(Movie::getYear, Collectors.averagingDouble(Movie::getRating))); + + List> sortedList = new ArrayList<>(avgRatingMap.entrySet()); + sortedList.sort(Entry.comparingByKey()); + + if (sortedList.size() > 15) { + sortedList = sortedList.subList(0, 15); + } + + // ✅ 修复:X轴使用数字类型 Integer,不再用字符串 + List xData = new ArrayList<>(); + List yData = new ArrayList<>(); + for (Entry entry : sortedList) { + xData.add(entry.getKey()); + yData.add(entry.getValue()); + } + + XYChart chart = new XYChartBuilder() + .width(1000) + .height(600) + .title("豆瓣Top250 - 历年平均评分趋势") + .xAxisTitle("年份") + .yAxisTitle("平均评分") + .theme(Styler.ChartTheme.Matlab) + .build(); + + chart.getStyler().setMarkerSize(6); + chart.getStyler().setChartBackgroundColor(Color.WHITE); + chart.addSeries("平均评分", xData, yData); + + try { + BitmapEncoder.saveBitmap(chart, "./历年平均评分_折线图", BitmapEncoder.BitmapFormat.PNG); + System.out.println("✅ 折线图已保存!"); + } catch (IOException e) { + e.printStackTrace(); + } + } + + // 3. 绘制【高分电影占比 - 饼图】 + public static void savePieChart(List movies) { + long gao = movies.stream().filter(m -> m.getRating() >= 9.5).count(); + long zhong = movies.stream().filter(m -> m.getRating() >= 9.0 && m.getRating() < 9.5).count(); + long di = movies.stream().filter(m -> m.getRating() < 9.0).count(); + + PieChart chart = new PieChartBuilder() + .width(700) + .height(700) + .title("豆瓣Top250 - 评分分布饼图") + .theme(Styler.ChartTheme.Matlab) + .build(); + + chart.addSeries("9.5分及以上", gao); + chart.addSeries("9.0-9.5分", zhong); + chart.addSeries("9.0分以下", di); + + chart.getStyler().setChartBackgroundColor(Color.WHITE); + chart.getStyler().setLegendVisible(true); + + try { + BitmapEncoder.saveBitmap(chart, "./评分分布_饼图", BitmapEncoder.BitmapFormat.PNG); + System.out.println("✅ 饼图已保存:评分分布_饼图.png"); + } catch (IOException e) { + e.printStackTrace(); + } + } +} \ No newline at end of file diff --git a/crawl_project_extension_2/src/main/java/com/example/CsvExporter.java b/crawl_project_extension_2/src/main/java/com/example/CsvExporter.java new file mode 100644 index 0000000..662f4c0 --- /dev/null +++ b/crawl_project_extension_2/src/main/java/com/example/CsvExporter.java @@ -0,0 +1,37 @@ +package com.example; +import java.io.FileWriter; +import java.io.IOException; +import java.util.List; +public class CsvExporter{ + public static void exportToCsv(List movies, String filePath) { + try (FileWriter writer = new FileWriter(filePath)) { + // 1. 表头:确保顺序是【电影名称,导演,上映年份,豆瓣评分,评价人数】 + writer.write("电影名称,导演,上映年份,豆瓣评分,评价人数\n"); + + // 2. 写入数据:字段顺序必须和表头完全对应! + for (Movie movie : movies) { + String line = String.format("%s,%s,%d,%.1f,%d\n", + escapeCsv(movie.getTitle()), // 1.电影名称 + escapeCsv(movie.getDirector()), // 2.导演 + movie.getYear(), // 3.上映年份 + movie.getRating(), // 4.豆瓣评分 + movie.getReviewCount() // 5.评价人数(这里之前写反了!) + ); + writer.write(line); + } + System.out.println("\nCSV文件导出成功!路径:" + filePath); + System.out.println("提示:评价人数在第5列,已显示真实数据!"); + } catch (IOException e) { + e.printStackTrace(); + } + } + // CSV 特殊字符转义(避免逗号/引号导致格式错乱) + private static String escapeCsv(String value) { + if (value == null) return ""; + // 包含逗号、引号或换行时,用双引号包裹 + if (value.contains(",") || value.contains("\"") || value.contains("\n")) { + return "\"" + value.replace("\"", "\"\"") + "\""; + } + return value; + } +} diff --git a/crawl_project_extension_2/src/main/java/com/example/DataAnalyzer.java b/crawl_project_extension_2/src/main/java/com/example/DataAnalyzer.java new file mode 100644 index 0000000..477203d --- /dev/null +++ b/crawl_project_extension_2/src/main/java/com/example/DataAnalyzer.java @@ -0,0 +1,36 @@ +package com.example; + +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +public class DataAnalyzer implements MovieAnalyzer { + + @Override + public void analyzeByDimension(List movies) { + System.out.println("\n===== 评分最高Top10电影 ====="); + movies.stream() + .sorted((m1, m2) -> Double.compare(m2.getRating(), m1.getRating())) + .limit(10) + .forEach(m -> System.out.printf("%-25s 评分: %.1f 年份: %d%n", + m.getTitle(), m.getRating(), m.getYear())); + System.out.println("\n===== 各年份电影数量统计 ====="); + Map countByYear = movies.stream() + .filter(m -> m.getYear() != 0) + .collect(Collectors.groupingBy(Movie::getYear, Collectors.counting())); + + // 按年份排序输出 + countByYear.entrySet().stream() + .sorted(Map.Entry.comparingByKey()) + .forEach(entry -> + System.out.printf("年份: %-4d 数量: %d 部%n", entry.getKey(), entry.getValue())); + } + + // 统计总数据 + @Override + public void analyzeTotal(List movies){ + System.out.println("\n===== 数据总览 ====="); + System.out.println("电影总数:" + movies.size()); + double avgRating = movies.stream().mapToDouble(Movie::getRating).average().orElse(0); + System.out.printf("平均评分:%.2f%n", avgRating); + } +} diff --git a/crawl_project_extension_2/src/main/java/com/example/DoubanCrawler.java b/crawl_project_extension_2/src/main/java/com/example/DoubanCrawler.java new file mode 100644 index 0000000..fd91e92 --- /dev/null +++ b/crawl_project_extension_2/src/main/java/com/example/DoubanCrawler.java @@ -0,0 +1,101 @@ +package com.example; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +public class DoubanCrawler implements MovieCrawler { + // 编译年份正则(提取4位数字年份) + private static final Pattern YEAR_PATTERN = Pattern.compile("(\\d{4})"); + @Override + public List crawl() { + List movies = new ArrayList<>(); + String baseUrl = "https://movie.douban.com/top250?start="; + + try { + // 10页,每页25条 + for (int i = 0; i < 250; i += 25) { + String url = baseUrl + i; + System.out.println("正在爬取:" + url); + + Document doc = Jsoup.connect(url) + .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36") + .timeout(8000) + .get(); + + Elements items = doc.select(".item"); + for (Element item : items) { + Movie movie = new Movie(); + + // 1. 电影名 + movie.setTitle(item.select(".title").first().text()); + + // 2. 评分 + movie.setRating(Double.parseDouble(item.select(".rating_num").text())); + + // 3. 评价人数 + int reviewCount = 0; + String allText = item.text(); // 直接拿整个区块的文字 + Pattern pattern = Pattern.compile("(\\d+)人评价"); + Matcher matcher = pattern.matcher(allText); + if (matcher.find()) { + reviewCount = Integer.parseInt(matcher.group(1)); + } + movie.setReviewCount(reviewCount); + movie.setReviewCount(reviewCount); + // 4. 电影信息(导演 + 年份) + String info = item.select(".bd p").first().text(); + + // 清洗导演 + movie.setDirector(cleanDirector(info)); + // 清洗年份 + movie.setYear(cleanYear(info)); + + movies.add(movie); + } + + // 文明爬虫,随机延迟 + Thread.sleep((long) (Math.random() * 2000 + 1000)); + } + System.out.println("爬取完成!共获取 " + movies.size() + " 部电影"); + } catch (IOException | InterruptedException e) { + e.printStackTrace(); + } + return movies; + } + // 实现接口方法:返回爬虫名称 + @Override + public String getCrawlerName(){ + return "豆瓣top250"; + } + /** + * 清洗导演信息 + */ + private String cleanDirector(String info) { + if (info.contains("导演:")) { + int start = info.indexOf("导演:") + 3; + int end = info.indexOf(" ", start + 2); + if (end == -1) end = info.length(); + return info.substring(start, end).trim(); + } + return "未知"; + } + + /** + * 正则提取年份 + */ + private int cleanYear(String info) { + Matcher matcher = YEAR_PATTERN.matcher(info); + if (matcher.find()) { + return Integer.parseInt(matcher.group(1)); + } + return 0; + } +} + diff --git a/crawl_project_extension_2/src/main/java/com/example/M1905Crawler.java b/crawl_project_extension_2/src/main/java/com/example/M1905Crawler.java new file mode 100644 index 0000000..5833090 --- /dev/null +++ b/crawl_project_extension_2/src/main/java/com/example/M1905Crawler.java @@ -0,0 +1,103 @@ +package com.example; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +public class M1905Crawler implements MovieCrawler { + + // 1905电影网排行榜URL + private static final String RANK_URL = "https://www.1905.com/vod/rank/ta99o3.shtml"; + + @Override + public List crawl() { + List movies = new ArrayList<>(); + + try { + System.out.println("正在爬取:" + RANK_URL); + + Document doc = Jsoup.connect(RANK_URL) + .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") + .header("Referer", "https://www.1905.com/") + .timeout(15000) + .get(); + + // 表格结构:选择表格行(跳过表头) + Elements rows = doc.select("table tr"); + + System.out.println("找到行数:" + rows.size()); + + int rank = 1; + for (Element row : rows) { + // 跳过表头行 + if (row.select("th").size() > 0) continue; + + Elements cells = row.select("td"); + if (cells.size() < 5) continue; + + Movie movie = new Movie(); + + // 1. 排行(第1列) + // cells.get(0).text() -> "1", "2"... + + // 2. 趋势(第2列)- 忽略 + + // 3. 电影名称(第3列) + Element nameCell = cells.get(2); + Element nameLink = nameCell.selectFirst("a"); + String title = nameLink != null ? nameLink.text().trim() : nameCell.text().trim(); + movie.setTitle(title); + + // 获取详情页链接(用于方案二) + String detailUrl = nameLink != null ? nameLink.absUrl("href") : ""; + + // 4. 主演(第4列)- 作为导演字段存储(页面无导演信息) + String actors = cells.get(3).text().trim(); + movie.setDirector(actors); // 复用director字段存主演 + + // 5. 播放次数(第5列)- 作为reviewCount存储 + String playCountStr = cells.get(4).text().trim().replace(",", ""); + int playCount = 0; + try { + playCount = Integer.parseInt(playCountStr); + } catch (NumberFormatException e) { + // 忽略解析错误 + } + movie.setReviewCount(playCount); // 复用reviewCount字段存播放次数 + + // 评分和年份:列表页没有,设为默认值 + movie.setRating(0.0); // 无评分数据 + movie.setYear(0); // 无年份数据 + + // 方案二:进入详情页获取完整信息(取消下面注释启用) + // if (!detailUrl.isEmpty()) { + // fillDetailInfo(movie, detailUrl); + // Thread.sleep(1000); // 礼貌延迟 + // } + + movies.add(movie); + System.out.printf("已解析 [%d] %s | 主演: %s | 播放: %d%n", + rank, title, actors, playCount); + rank++; + } + + System.out.println("爬取完成!共获取 " + movies.size() + " 部电影"); + + } catch (IOException e) { + System.err.println("爬取失败:" + e.getMessage()); + e.printStackTrace(); + } + + return movies; + + } + @Override + public String getCrawlerName () { + return "1905电影网播放排行榜"; + } +} \ No newline at end of file diff --git a/crawl_project_extension_2/src/main/java/com/example/Main.java b/crawl_project_extension_2/src/main/java/com/example/Main.java new file mode 100644 index 0000000..38d06ac --- /dev/null +++ b/crawl_project_extension_2/src/main/java/com/example/Main.java @@ -0,0 +1,53 @@ +package com.example; + +import java.util.List; + +public class Main { + public static void main(String[] args) { + // 通过参数切换爬虫:1905 或 douban(默认) + String source = "1905"; + if (args.length > 0) { + source = args[0].toLowerCase(); + } + + MovieCrawler crawler; + String csvName; + + switch (source) { + case "1905": + crawler = new M1905Crawler(); + csvName = "1905_rank.csv"; + break; + case "douban": + default: + crawler = new DoubanCrawler(); + csvName = "douban_top250.csv"; + break; + } + + System.out.println("使用爬虫:" + crawler.getCrawlerName()); + List movies = crawler.crawl(); + + if (movies.isEmpty()) { + System.err.println("未获取到任何电影数据!"); + return; + } + + // 数据分析 + MovieAnalyzer analyzer = new DataAnalyzer(); + analyzer.analyzeTotal(movies); + analyzer.analyzeByDimension(movies); + + // 导出CSV + CsvExporter.exportToCsv(movies, csvName); + + // 生成图表(1905数据缺少评分/年份,图表可能为空或需调整) + if (!"1905".equals(source)) { + ChartGenerator.saveBarChart(movies); + ChartGenerator.saveLineChart(movies); + ChartGenerator.savePieChart(movies); + } else { + System.out.println("⚠️ 1905数据缺少评分/年份,跳过图表生成"); + } + } +} \ No newline at end of file diff --git a/crawl_project_extension_2/src/main/java/com/example/Movie.java b/crawl_project_extension_2/src/main/java/com/example/Movie.java new file mode 100644 index 0000000..3308675 --- /dev/null +++ b/crawl_project_extension_2/src/main/java/com/example/Movie.java @@ -0,0 +1,75 @@ +package com.example; + +public class Movie { + private String title; // 电影名称 + private String director; // 导演 + private int year; // 上映年份 + private double rating; // 评分 + private int reviewCount; // 评价人数 + + // 无参构造 + public Movie() {} + + // 全参构造 + public Movie(String title, String director, int year, double rating, int reviewCount) { + this.title = title; + this.director = director; + this.year = year; + this.rating = rating; + this.reviewCount = reviewCount; + } + + // Getter & Setter + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getDirector() { + return director; + } + + public void setDirector(String director) { + this.director = director; + } + + public int getYear() { + return year; + } + + public void setYear(int year) { + this.year = year; + } + + public double getRating() { + return rating; + } + + public void setRating(double rating) { + this.rating = rating; + } + + public int getReviewCount() { + return reviewCount; + } + + public void setReviewCount(int reviewCount) { + this.reviewCount = reviewCount; + } + + // 打印输出 + @Override + public String toString() { + return "Movie{" + + "片名='" + title + '\'' + + ", 导演='" + director + '\'' + + ", 年份=" + year + + ", 评分=" + rating + + ", 评价人数=" + reviewCount + + '}'; + } +} + diff --git a/crawl_project_extension_2/src/main/java/com/example/MovieAnalyzer.java b/crawl_project_extension_2/src/main/java/com/example/MovieAnalyzer.java new file mode 100644 index 0000000..d8edf81 --- /dev/null +++ b/crawl_project_extension_2/src/main/java/com/example/MovieAnalyzer.java @@ -0,0 +1,8 @@ +package com.example; +import java.util.List; +public interface MovieAnalyzer { + // 总览分析 + void analyzeTotal(List movies); + // 按维度分析(TopN、年份等) + void analyzeByDimension(List movies); +} diff --git a/crawl_project_extension_2/src/main/java/com/example/MovieCrawler.java b/crawl_project_extension_2/src/main/java/com/example/MovieCrawler.java new file mode 100644 index 0000000..f9589bd --- /dev/null +++ b/crawl_project_extension_2/src/main/java/com/example/MovieCrawler.java @@ -0,0 +1,8 @@ +package com.example; +import java.util.List; +public interface MovieCrawler { + // 爬取电影列表 + List crawl(); + // 获取爬虫名称(如"豆瓣Top250"、"IMDB Top100") + String getCrawlerName(); +} diff --git a/crawl_project_extension_2/src/main/java/org/example/App.java b/crawl_project_extension_2/src/main/java/org/example/App.java new file mode 100644 index 0000000..5f21d2e --- /dev/null +++ b/crawl_project_extension_2/src/main/java/org/example/App.java @@ -0,0 +1,13 @@ +package org.example; + +/** + * Hello world! + * + */ +public class App +{ + public static void main( String[] args ) + { + System.out.println( "Hello World!" ); + } +} diff --git a/crawl_project_extension_2/src/test/java/org/example/AppTest.java b/crawl_project_extension_2/src/test/java/org/example/AppTest.java new file mode 100644 index 0000000..d5f435d --- /dev/null +++ b/crawl_project_extension_2/src/test/java/org/example/AppTest.java @@ -0,0 +1,38 @@ +package org.example; + +import junit.framework.Test; +import junit.framework.TestCase; +import junit.framework.TestSuite; + +/** + * Unit test for simple App. + */ +public class AppTest + extends TestCase +{ + /** + * Create the test case + * + * @param testName name of the test case + */ + public AppTest( String testName ) + { + super( testName ); + } + + /** + * @return the suite of tests being tested + */ + public static Test suite() + { + return new TestSuite( AppTest.class ); + } + + /** + * Rigourous Test :-) + */ + public void testApp() + { + assertTrue( true ); + } +} diff --git a/crawl_project_extension_2/target/classes/com/example/ChartGenerator.class b/crawl_project_extension_2/target/classes/com/example/ChartGenerator.class new file mode 100644 index 0000000..a7b1194 Binary files /dev/null and b/crawl_project_extension_2/target/classes/com/example/ChartGenerator.class differ diff --git a/crawl_project_extension_2/target/classes/com/example/CsvExporter.class b/crawl_project_extension_2/target/classes/com/example/CsvExporter.class new file mode 100644 index 0000000..867bc56 Binary files /dev/null and b/crawl_project_extension_2/target/classes/com/example/CsvExporter.class differ diff --git a/crawl_project_extension_2/target/classes/com/example/DataAnalyzer.class b/crawl_project_extension_2/target/classes/com/example/DataAnalyzer.class new file mode 100644 index 0000000..ce1645e Binary files /dev/null and b/crawl_project_extension_2/target/classes/com/example/DataAnalyzer.class differ diff --git a/crawl_project_extension_2/target/classes/com/example/DoubanCrawler.class b/crawl_project_extension_2/target/classes/com/example/DoubanCrawler.class new file mode 100644 index 0000000..448bf17 Binary files /dev/null and b/crawl_project_extension_2/target/classes/com/example/DoubanCrawler.class differ diff --git a/crawl_project_extension_2/target/classes/com/example/Main.class b/crawl_project_extension_2/target/classes/com/example/Main.class new file mode 100644 index 0000000..b3ba57d Binary files /dev/null and b/crawl_project_extension_2/target/classes/com/example/Main.class differ diff --git a/crawl_project_extension_2/target/classes/com/example/Movie.class b/crawl_project_extension_2/target/classes/com/example/Movie.class new file mode 100644 index 0000000..b132f6b Binary files /dev/null and b/crawl_project_extension_2/target/classes/com/example/Movie.class differ diff --git a/crawl_project_extension_2/target/classes/com/example/MovieAnalyzer.class b/crawl_project_extension_2/target/classes/com/example/MovieAnalyzer.class new file mode 100644 index 0000000..3306c2e Binary files /dev/null and b/crawl_project_extension_2/target/classes/com/example/MovieAnalyzer.class differ diff --git a/crawl_project_extension_2/target/classes/com/example/MovieCrawler.class b/crawl_project_extension_2/target/classes/com/example/MovieCrawler.class new file mode 100644 index 0000000..78d4622 Binary files /dev/null and b/crawl_project_extension_2/target/classes/com/example/MovieCrawler.class differ diff --git a/crawl_project_extension_2/target/classes/org/example/App.class b/crawl_project_extension_2/target/classes/org/example/App.class new file mode 100644 index 0000000..4240a01 Binary files /dev/null and b/crawl_project_extension_2/target/classes/org/example/App.class differ