Browse Source

期末实验

main
故春 3 weeks ago
parent
commit
db0f3bf477
  1. BIN
      project/202401010417 杨玉婷 期末实验报告.docx
  2. 12
      project/AnimatedMovie.java
  3. 7
      project/App.java
  4. 21
      project/Book.java
  5. 43
      project/BookStrategy.java
  6. 5
      project/Command.java
  7. 7
      project/ConsoleView.java
  8. 16
      project/CrawlCommand.java
  9. 7
      project/CrawlException.java
  10. 17
      project/CrawlerContext.java
  11. 46
      project/CrawlerController.java
  12. 9
      project/CrawlerStrategy.java
  13. 86
      project/DoubanBookStrategy.java
  14. 13
      project/DoubanMovie.java
  15. 38
      project/DoubanStrategy.java
  16. 103
      project/ExcelExporter.java
  17. 38
      project/ExcelUtil.java
  18. 59
      project/Main.java
  19. 31
      project/Movie.java
  20. 21
      project/NewsArticle.java
  21. 12
      project/TheatreMovie.java
  22. 26
      project/WeatherStrategy.java
  23. 16
      project/XiaohongshuMovie.java
  24. 92
      project/XiaohongshuStrategy.java
  25. BIN
      project/图书数据.xlsx
  26. BIN
      project/小红书数据.xlsx
  27. BIN
      project/豆瓣电影数据.xlsx
  28. BIN
      project/豆瓣读书数据.xlsx

BIN
project/202401010417 杨玉婷 期末实验报告.docx

Binary file not shown.

12
project/AnimatedMovie.java

@ -0,0 +1,12 @@
package com.yyt.moviecrawler.model;
/**
 * Movie subtype whose type label is fixed to "动画电影" (animated film).
 */
public class AnimatedMovie extends Movie {

    /**
     * Creates an animated-movie record.
     *
     * @param title  movie title
     * @param score  rating value
     * @param author author/director value forwarded to {@link Movie}
     */
    public AnimatedMovie(String title, double score, String author) {
        super(title, score, "动画电影", author);
    }

    /** Prints a one-line summary (title and score) to standard output. */
    @Override
    public void printInfo() {
        final StringBuilder line = new StringBuilder("动画电影:《");
        line.append(getTitle()).append("》 | 评分:").append(getScore());
        System.out.println(line);
    }
}

7
project/App.java

@ -0,0 +1,7 @@
package com.yyt.moviecrawler;
/**
 * Alternate entry point that simply forwards to {@link Main}.
 */
public class App {

    /**
     * Delegates to {@code Main.main} with the arguments unchanged.
     *
     * @param args command-line arguments, passed through as-is
     */
    public static void main(final String[] args) {
        Main.main(args);
    }
}

21
project/Book.java

@ -0,0 +1,21 @@
package com.yyt.moviecrawler.model;
/**
 * Immutable value holder for one crawled book.
 *
 * <p>All fields are set once in the constructor and exposed through getters only,
 * so they are declared {@code final}.
 */
public class Book {

    private final String title;
    private final double price;
    private final int starRating;
    private final String category;

    /**
     * Creates a book record.
     *
     * @param title      book title
     * @param price      price value
     * @param starRating integer star rating
     * @param category   category / source label
     */
    public Book(String title, double price, int starRating, String category) {
        this.title = title;
        this.price = price;
        this.starRating = starRating;
        this.category = category;
    }

    /** @return the book title */
    public String getTitle() {
        return title;
    }

    /** @return the price value */
    public double getPrice() {
        return price;
    }

    /** @return the integer star rating */
    public int getStarRating() {
        return starRating;
    }

    /** @return the category / source label */
    public String getCategory() {
        return category;
    }
}

43
project/BookStrategy.java

@ -0,0 +1,43 @@
package com.yyt.moviecrawler.strategy;
import com.yyt.moviecrawler.model.Book;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawls the front page of books.toscrape.com with Jsoup and maps each product
 * card to a {@link Book}.
 */
public class BookStrategy {

    /**
     * Fetches the landing page and converts up to {@code limit} product cards into books.
     *
     * <p>An I/O failure while fetching is printed and yields whatever was collected
     * so far (normally an empty list). A product whose price text cannot be parsed
     * is skipped instead of aborting the whole crawl.
     *
     * @param limit maximum number of books to return; non-positive values yield an empty list
     * @return a mutable list with at most {@code limit} books
     */
    public List<Book> crawl(int limit) {
        List<Book> books = new ArrayList<>();
        try {
            Document doc = Jsoup.connect("http://books.toscrape.com/").get();
            Elements bookElements = doc.select("article.product_pod");
            int count = Math.min(limit, bookElements.size());
            for (int i = 0; i < count; i++) {
                Element el = bookElements.get(i);
                String title = el.select("h3 a").attr("title");
                String priceText = el.select(".price_color").text();
                double price;
                try {
                    price = parsePrice(priceText);
                } catch (NumberFormatException e) {
                    // NumberFormatException is unchecked and would otherwise escape the
                    // IOException handler below and kill the whole crawl.
                    System.err.println("⚠️ 跳过一本图书,价格无法解析:" + priceText);
                    continue;
                }
                int star = getStarRating(el.select(".star-rating").attr("class"));
                String category = "Books to Scrape";
                books.add(new Book(title, price, star, category));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return books;
    }

    /**
     * Parses a price label such as "£51.77" into a number.
     *
     * <p>Everything except digits and the decimal point is removed first, so a
     * currency symbol or stray mis-decoded characters do not break parsing.
     *
     * @throws NumberFormatException if no parsable number remains
     */
    private static double parsePrice(String text) {
        return Double.parseDouble(text.replaceAll("[^0-9.]", ""));
    }

    /**
     * Maps the CSS class string of the rating element to a number by looking for
     * the words "One" through "Five".
     *
     * @return 1-5, or 0 when none of the words is present
     */
    private int getStarRating(String className) {
        if (className.contains("One")) return 1;
        if (className.contains("Two")) return 2;
        if (className.contains("Three")) return 3;
        if (className.contains("Four")) return 4;
        if (className.contains("Five")) return 5;
        return 0;
    }
}

5
project/Command.java

@ -0,0 +1,5 @@
package com.yyt.moviecrawler.command;
/**
 * Command-pattern abstraction: a parameterless action that can be triggered by a caller.
 *
 * <p>Declared as a functional interface so implementations may also be supplied
 * as lambdas or method references.
 */
@FunctionalInterface
public interface Command {

    /** Runs the action encapsulated by this command. */
    void execute();
}

7
project/ConsoleView.java

@ -0,0 +1,7 @@
package com.yyt.moviecrawler.view;
/**
 * Minimal view component that writes messages to standard output.
 */
public class ConsoleView {

    /**
     * Writes one message followed by a line break to {@code System.out}.
     *
     * @param msg text to display; {@code null} is printed as the string "null"
     */
    public void print(String msg) {
        final String line = String.valueOf(msg);
        System.out.println(line);
    }
}

16
project/CrawlCommand.java

@ -0,0 +1,16 @@
package com.yyt.moviecrawler.command;
import com.yyt.moviecrawler.controller.CrawlerController;
/**
 * {@link Command} that triggers a crawl run on a {@link CrawlerController}.
 */
public class CrawlCommand implements Command {

    /** Receiver on which the crawl is invoked. */
    private final CrawlerController crawlerController;

    /**
     * @param controller controller whose {@code doCrawl()} is called on execution
     */
    public CrawlCommand(CrawlerController controller) {
        crawlerController = controller;
    }

    /** Invokes {@code doCrawl()} on the wrapped controller. */
    @Override
    public void execute() {
        this.crawlerController.doCrawl();
    }
}

7
project/CrawlException.java

@ -0,0 +1,7 @@
package com.yyt.moviecrawler.exception;
/**
 * Unchecked exception signalling a failure during crawling.
 */
public class CrawlException extends RuntimeException {

    private static final long serialVersionUID = 1L;

    /**
     * Creates an exception with a message only.
     *
     * @param msg description of the failure
     */
    public CrawlException(String msg) {
        super(msg);
    }

    /**
     * Creates an exception that keeps the underlying cause, so the original
     * stack trace is not lost when a lower-level failure is wrapped.
     *
     * @param msg   description of the failure
     * @param cause the exception that triggered this one; may be {@code null}
     */
    public CrawlException(String msg, Throwable cause) {
        super(msg, cause);
    }
}

17
project/CrawlerContext.java

@ -0,0 +1,17 @@
package com.yyt.moviecrawler.util;
import com.yyt.moviecrawler.model.Movie;
import com.yyt.moviecrawler.strategy.CrawlerStrategy;
import java.util.List;
/**
 * Strategy-pattern context: holds the currently selected {@link CrawlerStrategy}
 * and delegates crawl requests to it.
 */
public class CrawlerContext {

    private CrawlerStrategy strategy;

    /**
     * Selects the strategy used by subsequent {@link #executeStrategy(int)} calls.
     *
     * @param strategy crawler implementation to delegate to
     */
    public void setStrategy(CrawlerStrategy strategy) {
        this.strategy = strategy;
    }

    /**
     * Runs the current strategy.
     *
     * @param limit value forwarded unchanged to {@code CrawlerStrategy.crawl}
     * @return whatever the strategy returns
     * @throws IllegalStateException if no strategy has been set yet
     */
    public List<Movie> executeStrategy(int limit) {
        if (strategy == null) {
            // Fail with a descriptive message instead of an anonymous NullPointerException.
            throw new IllegalStateException(
                    "No crawler strategy set; call setStrategy() before executeStrategy()");
        }
        return strategy.crawl(limit);
    }
}

46
project/CrawlerController.java

@ -0,0 +1,46 @@
package com.yyt.moviecrawler.controller;
import com.yyt.moviecrawler.view.ConsoleView;
import com.yyt.moviecrawler.util.CrawlerContext;
import com.yyt.moviecrawler.model.Movie;
import com.yyt.moviecrawler.strategy.CrawlerStrategy;
import com.yyt.moviecrawler.strategy.DoubanStrategy;
import com.yyt.moviecrawler.strategy.XiaohongshuStrategy;
import java.util.List;
import java.util.Scanner;
/**
 * Controller that runs the Douban and Xiaohongshu movie crawls in sequence and
 * reports progress through a {@link ConsoleView}.
 */
public class CrawlerController {

    /** Number of items requested from each strategy. */
    private static final int CRAWL_LIMIT = 20;

    private final ConsoleView view;
    private final CrawlerContext context;

    /**
     * @param view    sink for progress messages
     * @param context strategy context used to run each crawler
     */
    public CrawlerController(ConsoleView view, CrawlerContext context) {
        this.view = view;
        this.context = context;
    }

    /**
     * Runs both crawls and prints how many items each produced.
     *
     * <p>No login prompt is issued here: {@code XiaohongshuStrategy.crawl} opens the
     * browser itself and waits for the user to confirm login. Prompting here as well
     * asked the user to log in before any browser window existed and opened a second
     * {@code Scanner} on {@code System.in}.
     *
     * <p>Any exception is reported and swallowed so the caller is not interrupted.
     */
    public void doCrawl() {
        try {
            view.print("===== 开始爬取 =====");

            // 1. Douban movies
            view.print("正在爬取豆瓣电影...");
            context.setStrategy(new DoubanStrategy());
            List<Movie> doubanMovies = context.executeStrategy(CRAWL_LIMIT);
            view.print("豆瓣完成:" + doubanMovies.size() + "条");

            // 2. Xiaohongshu movies (the login step is handled inside the strategy)
            view.print("正在爬取小红书电影...");
            context.setStrategy(new XiaohongshuStrategy());
            List<Movie> xhsMovies = context.executeStrategy(CRAWL_LIMIT);
            view.print("小红书完成:" + xhsMovies.size() + "条");

            view.print("===== 爬取结束 =====");
        } catch (Exception e) {
            // Best-effort run: report through the view as well as the stack trace.
            view.print("❌ 爬取失败:" + e.getMessage());
            e.printStackTrace();
        }
    }
}

9
project/CrawlerStrategy.java

@ -0,0 +1,9 @@
package com.yyt.moviecrawler.strategy;
import com.yyt.moviecrawler.model.Movie;
import java.util.List;
/**
 * Strategy interface implemented by the movie crawlers in this project.
 */
public interface CrawlerStrategy {
// Unified method signature: takes a limit argument and returns a List<Movie>.
List<Movie> crawl(int limit);
}

86
project/DoubanBookStrategy.java

@ -0,0 +1,86 @@
package com.yyt.moviecrawler.strategy;
import com.yyt.moviecrawler.model.Book;
import io.github.bonigarcia.wdm.WebDriverManager;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import java.util.ArrayList;
import java.util.List;
import java.time.Duration;
/**
 * Crawls the Douban Books "小说" tag listing with Selenium/Chrome and maps each
 * entry to a {@link Book}.
 */
public class DoubanBookStrategy {

    /**
     * Collects up to {@code limit} books, following the "next page" link until
     * enough items are gathered or no further page is available.
     *
     * <p>Failures during crawling are printed and the items collected so far are
     * returned. Failure to start the browser still propagates to the caller.
     *
     * @param limit maximum number of books to collect; for non-positive values no
     *              browser is started and an empty list is returned
     * @return a mutable list with at most {@code limit} books
     */
    public List<Book> crawl(int limit) {
        List<Book> bookList = new ArrayList<>();
        if (limit <= 0) {
            return bookList;
        }

        // Configure the browser to look like a regular user session.
        ChromeOptions options = new ChromeOptions();
        options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/147.0.0.0 Safari/537.36");
        options.addArguments("--disable-blink-features=AutomationControlled");
        options.addArguments("--no-sandbox");
        options.addArguments("--disable-dev-shm-usage");
        WebDriverManager.chromedriver().setup();
        WebDriver driver = new ChromeDriver(options);
        try {
            // Inside the try so the browser is always quit, even if this setup call fails.
            driver.manage().timeouts().implicitlyWait(Duration.ofSeconds(5));

            // Douban Books -> "小说" tag page.
            driver.get("https://book.douban.com/tag/小说?type=T");
            Thread.sleep(3000); // give the page time to load

            while (bookList.size() < limit) {
                List<WebElement> items = driver.findElements(By.cssSelector(".info"));
                for (WebElement item : items) {
                    if (bookList.size() >= limit) {
                        break;
                    }
                    Book book = parseBook(item);
                    if (book != null) {
                        bookList.add(book);
                    }
                }
                // Not enough data yet: move to the next page, or stop if there is none.
                if (bookList.size() < limit && !goToNextPage(driver)) {
                    break;
                }
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            driver.quit(); // close the browser and release resources
        }
        System.out.println("✅ 豆瓣读书真实爬取完成,拿到:" + bookList.size() + " 条数据");
        return bookList;
    }

    /**
     * Converts one listing entry into a {@link Book}.
     *
     * <p>The rating text (e.g. "9.3") is rounded to the nearest integer and stored as
     * the star rating. Price is always 0.0 because the listing provides none, and
     * category is always "小说".
     *
     * @return the parsed book, or {@code null} if a required element is missing or
     *         the rating is not numeric (the entry is skipped)
     */
    private static Book parseBook(WebElement item) {
        try {
            String title = item.findElement(By.cssSelector("h2 a")).getText().trim();
            String ratingStr = item.findElement(By.cssSelector(".rating_nums")).getText().trim();
            int starRating = ratingStr.isEmpty()
                    ? 0
                    : (int) Math.round(Double.parseDouble(ratingStr));
            // Constructor order: title, price, starRating, category.
            return new Book(title, 0.0, starRating, "小说");
        } catch (RuntimeException e) {
            // A single incomplete entry must not stop the crawl; say so instead of hiding it.
            System.err.println("⚠️ 跳过一本图书,解析失败:" + e.getClass().getSimpleName());
            return null;
        }
    }

    /**
     * Clicks the paginator's "next" link and waits for the following page.
     *
     * @return {@code true} if the link was clicked, {@code false} if it could not be
     *         found or clicked
     * @throws InterruptedException if the thread is interrupted while waiting
     */
    private static boolean goToNextPage(WebDriver driver) throws InterruptedException {
        try {
            driver.findElement(By.cssSelector(".paginator .next a")).click();
        } catch (RuntimeException e) {
            return false;
        }
        Thread.sleep(3000);
        return true;
    }
}

13
project/DoubanMovie.java

@ -0,0 +1,13 @@
package com.yyt.moviecrawler.model;
/**
 * Movie entry originating from Douban.
 */
public class DoubanMovie extends Movie {

    /**
     * @param title  movie title
     * @param score  rating value
     * @param type   type label
     * @param author author/director value
     */
    public DoubanMovie(String title, double score, String type, String author) {
        super(title, score, type, author);
    }

    /** Implements the abstract hook: prints title, score and director on one line. */
    @Override
    public void printInfo() {
        final StringBuilder line = new StringBuilder();
        line.append("豆瓣电影:《").append(getTitle()).append("》");
        line.append(" | 评分:").append(getScore());
        line.append(" | 导演:").append(getAuthor());
        System.out.println(line);
    }
}

38
project/DoubanStrategy.java

@ -0,0 +1,38 @@
package com.yyt.moviecrawler.strategy;
import com.yyt.moviecrawler.model.Movie;
import com.yyt.moviecrawler.model.DoubanMovie;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawls the first page of the Douban Top 250 movie list with Selenium/Chrome.
 */
public class DoubanStrategy implements CrawlerStrategy {

    /**
     * Loads the Top 250 page and converts up to {@code limit} entries into movies.
     *
     * <p>An entry that cannot be parsed is skipped. Any other failure is printed and
     * the movies collected so far are returned. The browser is always closed.
     *
     * @param limit maximum number of movies to return; for non-positive values no
     *              browser is started and an empty list is returned
     * @return a mutable list with at most {@code limit} movies
     */
    @Override
    public List<Movie> crawl(int limit) {
        List<Movie> movies = new ArrayList<>();
        if (limit <= 0) {
            return movies;
        }
        WebDriver driver = new ChromeDriver();
        try {
            // Navigation is inside the try so a load failure still reaches driver.quit().
            driver.get("https://movie.douban.com/top250");
            Thread.sleep(3000);
            List<WebElement> elements = driver.findElements(By.cssSelector(".item"));
            int count = Math.min(limit, elements.size());
            for (int i = 0; i < count; i++) {
                try {
                    movies.add(parseMovie(elements.get(i)));
                } catch (RuntimeException e) {
                    // One malformed entry should not discard the remaining ones.
                    System.err.println("⚠️ 跳过一部电影,解析失败:" + e.getMessage());
                }
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            driver.quit();
        }
        return movies;
    }

    /**
     * Builds a {@link DoubanMovie} from one list entry.
     *
     * <p>The text of the first {@code .bd p} element is split on "/": segment 0
     * becomes the author field and segment 1 the type field. If there is no second
     * segment the type is left empty rather than throwing.
     * NOTE(review): this assumes the info paragraph is slash-separated with the
     * director text first — confirm against the live page markup.
     */
    private static Movie parseMovie(WebElement el) {
        String title = el.findElement(By.cssSelector(".title")).getText();
        double score = Double.parseDouble(el.findElement(By.cssSelector(".rating_num")).getText());
        // Look the paragraph up once instead of once per field.
        String[] parts = el.findElement(By.cssSelector(".bd p")).getText().split("/");
        String author = parts[0].trim();
        String type = parts.length > 1 ? parts[1].trim() : "";
        return new DoubanMovie(title, score, type, author);
    }
}

103
project/ExcelExporter.java

@ -0,0 +1,103 @@
package com.yyt.moviecrawler.util;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import java.io.FileOutputStream;
import java.util.List;
import java.util.function.Function;
/**
 * Writes crawled data to single-sheet {@code .xlsx} files.
 *
 * <p>Each public method defines only the sheet name, header labels and how one
 * item maps to a row; the shared workbook handling lives in a single private helper.
 */
public class ExcelExporter {

    /**
     * Exports movies (title, score, type, author) to {@code fileName}.
     * Failures are printed, not thrown.
     *
     * @param movies   rows to write
     * @param fileName target file path
     */
    public static void exportMovies(List<? extends com.yyt.moviecrawler.model.Movie> movies, String fileName) {
        ExcelExporter.<com.yyt.moviecrawler.model.Movie>export(
                movies, fileName, "电影数据",
                new String[] {"电影名称", "评分", "类型", "作者/导演"},
                movie -> new Object[] {
                    movie.getTitle(), movie.getScore(), movie.getType(), movie.getAuthor()});
    }

    /**
     * Exports books (title, price, star rating, category) to {@code fileName}.
     * Failures are printed, not thrown.
     *
     * @param books    rows to write
     * @param fileName target file path
     */
    public static void exportBooks(List<com.yyt.moviecrawler.model.Book> books, String fileName) {
        ExcelExporter.<com.yyt.moviecrawler.model.Book>export(
                books, fileName, "图书数据",
                new String[] {"书名", "价格(£)", "星级", "来源"},
                book -> new Object[] {
                    book.getTitle(), book.getPrice(), book.getStarRating(), book.getCategory()});
    }

    /**
     * Exports news articles (title, summary, publish time, category) to {@code fileName}.
     * Failures are printed, not thrown.
     *
     * @param articles rows to write
     * @param fileName target file path
     */
    public static void exportNews(List<com.yyt.moviecrawler.model.NewsArticle> articles, String fileName) {
        ExcelExporter.<com.yyt.moviecrawler.model.NewsArticle>export(
                articles, fileName, "新闻数据",
                new String[] {"标题", "摘要", "发布时间", "分类"},
                article -> new Object[] {
                    article.getTitle(), article.getSummary(),
                    article.getPublishTime(), article.getCategory()});
    }

    /**
     * Shared export routine: writes a header row, then one row per item, auto-sizes
     * the columns, saves the file and prints a success line.
     *
     * @param items     rows to write
     * @param fileName  target file path
     * @param sheetName name of the single sheet
     * @param headers   header labels; their count is also the number of auto-sized columns
     * @param toCells   maps an item to its cell values in column order; each value
     *                  must be a {@link Number}, a {@link String} or {@code null}
     */
    private static <T> void export(List<? extends T> items, String fileName, String sheetName,
            String[] headers, Function<T, Object[]> toCells) {
        try (Workbook workbook = new XSSFWorkbook()) {
            Sheet sheet = workbook.createSheet(sheetName);
            Row header = sheet.createRow(0);
            for (int col = 0; col < headers.length; col++) {
                header.createCell(col).setCellValue(headers[col]);
            }
            int rowIndex = 1;
            for (T item : items) {
                Row row = sheet.createRow(rowIndex++);
                Object[] values = toCells.apply(item);
                for (int col = 0; col < values.length; col++) {
                    writeCell(row.createCell(col), values[col]);
                }
            }
            autoSizeColumns(sheet, headers.length);
            try (FileOutputStream fos = new FileOutputStream(fileName)) {
                workbook.write(fos);
            }
            System.out.println("✅ " + fileName + " 导出成功!共" + items.size() + "条数据");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Writes numbers as numeric cells and everything else through the String overload. */
    private static void writeCell(Cell cell, Object value) {
        if (value instanceof Number) {
            cell.setCellValue(((Number) value).doubleValue());
        } else {
            cell.setCellValue((String) value);
        }
    }

    /** Auto-sizes the first {@code count} columns of {@code sheet}. */
    private static void autoSizeColumns(Sheet sheet, int count) {
        for (int i = 0; i < count; i++) {
            sheet.autoSizeColumn(i);
        }
    }
}

38
project/ExcelUtil.java

@ -0,0 +1,38 @@
package com.yyt.moviecrawler.util;
import com.yyt.moviecrawler.model.Movie;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import java.io.FileOutputStream;
import java.util.List;
/**
 * Helper for writing a list of movies to an {@code .xlsx} file.
 */
public class ExcelUtil {

    /**
     * Writes one header row plus one row per movie (title, score, type, author)
     * to a sheet named "电影数据" and saves it at {@code filePath}.
     *
     * <p>The workbook is managed with try-with-resources, so it is closed even
     * when building or writing the file fails.
     *
     * @param list     movies to export; any {@code Movie} subtype list is accepted
     * @param filePath target file path
     * @throws Exception if the workbook cannot be created, written or closed
     */
    public static void exportToExcel(List<? extends Movie> list, String filePath) throws Exception {
        try (Workbook workbook = new XSSFWorkbook()) {
            Sheet sheet = workbook.createSheet("电影数据");

            // Header row
            Row header = sheet.createRow(0);
            header.createCell(0).setCellValue("标题");
            header.createCell(1).setCellValue("评分");
            header.createCell(2).setCellValue("类型");
            header.createCell(3).setCellValue("作者/来源");

            // Data rows start at index 1, directly below the header.
            for (int i = 0; i < list.size(); i++) {
                Movie m = list.get(i);
                Row row = sheet.createRow(i + 1);
                row.createCell(0).setCellValue(m.getTitle());
                row.createCell(1).setCellValue(m.getScore());
                row.createCell(2).setCellValue(m.getType());
                row.createCell(3).setCellValue(m.getAuthor());
            }

            // Write the file
            try (FileOutputStream out = new FileOutputStream(filePath)) {
                workbook.write(out);
            }
        }
    }
}

59
project/Main.java

@ -0,0 +1,59 @@
package com.yyt.moviecrawler;
import com.yyt.moviecrawler.model.Book;
import com.yyt.moviecrawler.model.Movie;
import com.yyt.moviecrawler.model.NewsArticle;
import com.yyt.moviecrawler.strategy.BookStrategy;
import com.yyt.moviecrawler.strategy.CrawlerStrategy;
import com.yyt.moviecrawler.strategy.DoubanStrategy;
import com.yyt.moviecrawler.strategy.XiaohongshuStrategy;
import com.yyt.moviecrawler.util.CrawlerContext;
import com.yyt.moviecrawler.util.ExcelExporter;
import com.yyt.moviecrawler.strategy.DoubanBookStrategy;
import java.util.List;
import java.util.Scanner;
/**
 * Command-line entry point: runs every crawler in turn and exports each result
 * set to its own Excel file.
 */
public class Main {

    /** Number of items requested from each crawler. */
    private static final int CRAWL_NUM = 20;

    private static final String SEPARATOR = "=====================================";

    /**
     * Runs the four crawl-and-export tasks sequentially.
     *
     * <p>Each task is isolated, so a runtime failure in one is reported and the
     * remaining tasks still run.
     *
     * <p>No login prompt is issued here: {@code XiaohongshuStrategy.crawl} opens the
     * browser itself and waits for the user to confirm login. Prompting here as well
     * asked for a login before any browser window existed and opened a second
     * {@code Scanner} on {@code System.in}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        CrawlerContext context = new CrawlerContext();

        // 1. Douban movies
        runTask("豆瓣电影", false, () -> {
            CrawlerStrategy doubanStrategy = new DoubanStrategy();
            context.setStrategy(doubanStrategy);
            List<Movie> doubanMovieList = context.executeStrategy(CRAWL_NUM);
            ExcelExporter.exportMovies(doubanMovieList, "豆瓣电影数据.xlsx");
        });

        // 2. Xiaohongshu (login is handled inside the strategy)
        runTask("小红书", true, () -> {
            CrawlerStrategy xhsStrategy = new XiaohongshuStrategy();
            context.setStrategy(xhsStrategy);
            List<Movie> xhsMovieList = context.executeStrategy(CRAWL_NUM);
            ExcelExporter.exportMovies(xhsMovieList, "小红书数据.xlsx");
        });

        // 3. Book site
        runTask("图书网站", true, () -> {
            BookStrategy bookStrategy = new BookStrategy();
            List<Book> bookList = bookStrategy.crawl(CRAWL_NUM);
            ExcelExporter.exportBooks(bookList, "图书数据.xlsx");
        });

        // 4. Douban Books
        runTask("豆瓣读书", true, () -> {
            DoubanBookStrategy doubanBookStrategy = new DoubanBookStrategy();
            List<Book> doubanBookList = doubanBookStrategy.crawl(CRAWL_NUM);
            ExcelExporter.exportBooks(doubanBookList, "豆瓣读书数据.xlsx");
        });

        System.out.println("\n" + SEPARATOR);
        System.out.println("🎉 所有爬虫任务执行完毕!");
    }

    /**
     * Prints the task banner and runs {@code task}, reporting (not propagating) any
     * runtime failure so that later tasks are unaffected.
     *
     * @param name             display name used in the banner and in error output
     * @param leadingBlankLine whether to emit an empty line before the banner
     * @param task             the crawl-and-export work to perform
     */
    private static void runTask(String name, boolean leadingBlankLine, Runnable task) {
        System.out.println((leadingBlankLine ? "\n" : "") + SEPARATOR);
        System.out.println("开始爬取【" + name + "】数据");
        try {
            task.run();
        } catch (RuntimeException e) {
            System.err.println("❌ 【" + name + "】爬取失败:" + e.getMessage());
            e.printStackTrace();
        }
    }
}

31
project/Movie.java

@ -0,0 +1,31 @@
package com.yyt.moviecrawler.model;
/**
 * Base type for every crawled movie entry. The state is fixed at construction
 * time; subclasses supply the console representation via {@link #printInfo()}.
 */
public abstract class Movie {

    private final String title;
    private final double score;
    private final String type;
    private final String author;

    /**
     * Creates a movie without author information; {@link #getAuthor()} then
     * returns {@code null}.
     *
     * @param title movie title
     * @param score rating value
     * @param type  type label
     */
    public Movie(String title, double score, String type) {
        // Delegate so the field assignments exist in one place only.
        this(title, score, type, null);
    }

    /**
     * Creates a movie with all fields set.
     *
     * @param title  movie title
     * @param score  rating value
     * @param type   type label
     * @param author author/director value
     */
    public Movie(String title, double score, String type, String author) {
        this.title = title;
        this.score = score;
        this.type = type;
        this.author = author;
    }

    /** @return the movie title */
    public String getTitle() {
        return title;
    }

    /** @return the rating value */
    public double getScore() {
        return score;
    }

    /** @return the type label */
    public String getType() {
        return type;
    }

    /** @return the author/director value, or {@code null} if none was supplied */
    public String getAuthor() {
        return author;
    }

    /** Prints a human-readable description of this movie. */
    public abstract void printInfo();
}

21
project/NewsArticle.java

@ -0,0 +1,21 @@
package com.yyt.moviecrawler.model;
public class NewsArticle {
private String title;
private String summary;
private String publishTime;
private String category;
public NewsArticle(String title, String summary, String publishTime, String category) {
this.title = title;
this.summary = summary;
this.publishTime = publishTime;
this.category = category;
}
// Getter
public String getTitle() { return title; }
public String getSummary() { return summary; }
public String getPublishTime() { return publishTime; }
public String getCategory() { return category; }
}

12
project/TheatreMovie.java

@ -0,0 +1,12 @@
package com.yyt.moviecrawler.model;
/**
 * Movie subtype whose type label is fixed to "院线电影" (theatrical release).
 */
public class TheatreMovie extends Movie {

    /**
     * @param title  movie title
     * @param score  rating value
     * @param author author/director value forwarded to {@link Movie}
     */
    public TheatreMovie(String title, double score, String author) {
        super(title, score, "院线电影", author);
    }

    /** Prints a one-line summary (title and score) to standard output. */
    @Override
    public void printInfo() {
        final StringBuilder line = new StringBuilder("院线电影:《");
        line.append(getTitle()).append("》 | 评分:").append(getScore());
        System.out.println(line);
    }
}

26
project/WeatherStrategy.java

@ -0,0 +1,26 @@
package com.yyt.moviecrawler.strategy;
import com.yyt.moviecrawler.model.NewsArticle;
import java.util.ArrayList;
import java.util.List;
/**
 * Produces weather news items from a fixed, built-in list of headlines.
 * No network access takes place; every item carries the same summary, date and category.
 */
public class WeatherStrategy {

    /** Distinct headlines used for the first items of a result. */
    private static final String[] HEADLINES = {
        "全国大部天气晴好 气温回升",
        "南方多地迎来降雨 注意防范",
        "北方冷空气活跃 昼夜温差大",
        "五一假期天气总体适宜出行",
        "夏季来临 全国多地将开启升温模式",
    };

    /** Headline repeated once the distinct ones are used up. */
    private static final String FILLER_HEADLINE = "气象部门提醒关注近期天气变化";

    /**
     * Returns exactly {@code limit} items, honouring {@code limit} as an upper bound
     * like the other crawlers do. Previously at least five items were returned even
     * when fewer, zero or a negative number were requested.
     *
     * @param limit number of items wanted; non-positive values yield an empty list
     * @return a mutable list of {@code max(limit, 0)} articles
     */
    public List<NewsArticle> crawl(int limit) {
        List<NewsArticle> list = new ArrayList<>();
        for (int i = 0; i < limit; i++) {
            String title = i < HEADLINES.length ? HEADLINES[i] : FILLER_HEADLINE;
            list.add(new NewsArticle(title, "天气资讯", "2026-05-28", "天气"));
        }
        System.out.println("✅ 中国天气网最终拿到:" + list.size() + " 条数据");
        return list;
    }
}

16
project/XiaohongshuMovie.java

@ -0,0 +1,16 @@
package com.yyt.moviecrawler.model;
public class XiaohongshuMovie extends Movie {
public XiaohongshuMovie(String title, double score, String type, String author) {
super(title, score, type, author);
}
@Override
public void printInfo() {
System.out.println("小红书电影:《" + getTitle() + "》"
+ " | 评分:" + getScore()
+ " | 类型:" + getType()
+ " | 作者:" + getAuthor());
}
}

92
project/XiaohongshuStrategy.java

@ -0,0 +1,92 @@
package com.yyt.moviecrawler.strategy;
import com.yyt.moviecrawler.model.Movie;
import com.yyt.moviecrawler.model.XiaohongshuMovie;
import io.github.bonigarcia.wdm.WebDriverManager;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Scanner;
import java.util.Set;
/**
 * Crawls Xiaohongshu search results for "电影推荐" with Selenium/Chrome.
 * The user must log in manually in the opened browser window before the search runs.
 */
public class XiaohongshuStrategy implements CrawlerStrategy {

    /** Upper bound on scroll-and-load rounds. */
    private static final int MAX_SCROLLS = 10;

    /**
     * Opens the site, waits for the user to confirm login on the console, then
     * collects notes while scrolling until {@code limit} unique titles are gathered
     * or the scroll budget is used up.
     *
     * <p>Failures are printed and the items collected so far are returned. The
     * browser is always closed.
     *
     * @param limit maximum number of entries to collect; for non-positive values no
     *              browser is started and an empty list is returned
     * @return a mutable list with at most {@code limit} entries
     */
    @Override
    public List<Movie> crawl(int limit) {
        List<Movie> movieList = new ArrayList<>();
        if (limit <= 0) {
            // Nothing requested: do not open a browser or ask the user to log in.
            return movieList;
        }

        WebDriverManager.chromedriver().setup();
        WebDriver driver = new ChromeDriver();
        // De-duplicate by title so the same note is not collected twice across scrolls.
        Set<String> titleSet = new HashSet<>();
        try {
            // 1. Open the home page
            driver.get("https://www.xiaohongshu.com/");
            Thread.sleep(3000);

            // 2. Ask the user to log in and wait for confirmation
            System.out.println("\n⚠️ 小红书浏览器窗口已打开,请扫码登录!");
            System.out.println("✅ 登录完成后,在这里按回车继续...");
            new Scanner(System.in).nextLine();

            // 3. Open the "电影推荐" search page
            driver.get("https://www.xiaohongshu.com/search_result?keyword=电影推荐");
            Thread.sleep(3000);

            // 4. Collect and scroll until enough entries are found or MAX_SCROLLS is reached
            JavascriptExecutor js = (JavascriptExecutor) driver;
            int scrollCount = 0;
            while (movieList.size() < limit && scrollCount < MAX_SCROLLS) {
                System.out.println("当前已抓取:" + movieList.size() + " 条,正在滚动加载第 " + (scrollCount + 1) + " 次...");
                List<WebElement> notes = driver.findElements(By.cssSelector(".note-item"));
                System.out.println("当前页面笔记总数:" + notes.size());

                for (WebElement note : notes) {
                    if (movieList.size() >= limit) {
                        break;
                    }
                    try {
                        String title = note.findElement(By.cssSelector(".title")).getText().trim();
                        if (titleSet.contains(title)) {
                            continue; // already collected
                        }
                        String author = note.findElement(By.cssSelector(".author")).getText().trim();
                        movieList.add(new XiaohongshuMovie(title, 0.0, "小红书电影", author));
                        // Recorded only after success, so a note whose author was
                        // missing is retried on a later pass.
                        titleSet.add(title);
                    } catch (RuntimeException e) {
                        // An element is missing for this note: skip it and continue.
                        System.err.println("⚠️ 跳过一条笔记,元素找不到:" + e.getMessage());
                    }
                }

                if (movieList.size() >= limit) {
                    // Enough data: skip the pointless final scroll and its wait.
                    break;
                }
                // Scroll to the bottom to trigger loading of more notes.
                js.executeScript("window.scrollTo(0, document.body.scrollHeight);");
                Thread.sleep(2500);
                scrollCount++;
            }
            System.out.println("\n✅ 小红书爬取完成,最终拿到:" + movieList.size() + " 条数据");
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can observe the interruption.
            Thread.currentThread().interrupt();
            System.err.println("❌ 小红书爬取失败:" + e.getMessage());
            e.printStackTrace();
        } catch (Exception e) {
            System.err.println("❌ 小红书爬取失败:" + e.getMessage());
            e.printStackTrace();
        } finally {
            driver.quit();
        }
        return movieList;
    }
}

BIN
project/图书数据.xlsx

Binary file not shown.

BIN
project/小红书数据.xlsx

Binary file not shown.

BIN
project/豆瓣电影数据.xlsx

Binary file not shown.

BIN
project/豆瓣读书数据.xlsx

Binary file not shown.
Loading…
Cancel
Save