You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

92 lines
3.7 KiB

package com.yyt.moviecrawler.strategy;
import com.yyt.moviecrawler.model.Movie;
import com.yyt.moviecrawler.model.XiaohongshuMovie;
import io.github.bonigarcia.wdm.WebDriverManager;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Scanner;
import java.util.Set;
/**
 * Crawler strategy that drives a Chrome browser through Selenium to collect
 * note cards from the Xiaohongshu search-result page for the keyword
 * "电影推荐".
 *
 * <p>The crawl is interactive: after the home page opens, the method blocks on
 * standard input until the operator presses Enter (the console prompt asks the
 * operator to log in by scanning a QR code first).
 */
public class XiaohongshuStrategy implements CrawlerStrategy {

    /** Page opened first, before the operator is prompted to log in. */
    private static final String HOME_URL = "https://www.xiaohongshu.com/";

    /** Search-result page that the notes are read from. */
    private static final String SEARCH_URL = "https://www.xiaohongshu.com/search_result?keyword=电影推荐";

    /** Fixed pause after each page navigation, in milliseconds. */
    private static final long PAGE_LOAD_WAIT_MS = 3000L;

    /** Fixed pause after each scroll-to-bottom, in milliseconds. */
    private static final long SCROLL_WAIT_MS = 2500L;

    /** Upper bound on scroll rounds, so the crawl ends even if too few notes load. */
    private static final int MAX_SCROLLS = 10;

    /**
     * Collects up to {@code limit} notes, de-duplicated by title.
     *
     * <p>Failures are reported on standard error and never propagated; whatever
     * was collected before the failure is still returned. If the thread is
     * interrupted while waiting, the crawl stops and the interrupt status is
     * restored for the caller.
     *
     * @param limit maximum number of entries to collect; values {@code <= 0}
     *              yield an empty list without starting a browser
     * @return a mutable list with at most {@code limit} entries, never {@code null}
     */
    @Override
    public List<Movie> crawl(int limit) {
        List<Movie> movieList = new ArrayList<>();
        // Nothing can be collected, so do not launch Chrome or block on stdin.
        if (limit <= 0) {
            return movieList;
        }

        WebDriverManager.chromedriver().setup();
        WebDriver driver = new ChromeDriver();
        // Titles already stored, so a note seen again after scrolling is not added twice.
        Set<String> seenTitles = new HashSet<>();
        boolean interrupted = false;
        try {
            // 1. Open the home page.
            driver.get(HOME_URL);
            Thread.sleep(PAGE_LOAD_WAIT_MS);

            // 2. Let the operator log in manually.
            waitForManualLogin();

            // 3. Open the search-result page.
            driver.get(SEARCH_URL);
            Thread.sleep(PAGE_LOAD_WAIT_MS);

            // 4. Harvest and scroll until `limit` notes are collected or MAX_SCROLLS is reached.
            JavascriptExecutor js = (JavascriptExecutor) driver;
            int scrollCount = 0;
            while (movieList.size() < limit && scrollCount < MAX_SCROLLS) {
                System.out.println("当前已抓取:" + movieList.size() + " 条,正在滚动加载第 " + (scrollCount + 1) + " 次...");
                collectVisibleNotes(driver, movieList, seenTitles, limit);
                // Enough notes collected: skip the pointless extra scroll and wait.
                if (movieList.size() >= limit) {
                    break;
                }
                // Scroll to the bottom of the page to load more notes.
                js.executeScript("window.scrollTo(0, document.body.scrollHeight);");
                Thread.sleep(SCROLL_WAIT_MS);
                scrollCount++;
            }
            System.out.println("\n✅ 小红书爬取完成,最终拿到:" + movieList.size() + " 条数据");
        } catch (InterruptedException e) {
            // Remember the interrupt; it is restored only after the browser is
            // closed so that driver.quit() does not run with the flag set.
            interrupted = true;
            System.err.println("❌ 小红书爬取失败:" + e.getMessage());
        } catch (Exception e) {
            System.err.println("❌ 小红书爬取失败:" + e.getMessage());
            e.printStackTrace();
        } finally {
            try {
                driver.quit();
            } finally {
                if (interrupted) {
                    Thread.currentThread().interrupt();
                }
            }
        }
        return movieList;
    }

    /** Prints the login prompt and blocks until a line is read from standard input. */
    private void waitForManualLogin() {
        System.out.println("\n⚠️ 小红书浏览器窗口已打开,请扫码登录!");
        System.out.println("✅ 登录完成后,在这里按回车继续...");
        // The Scanner is deliberately not closed: closing it would close System.in.
        new Scanner(System.in).nextLine();
    }

    /**
     * Reads every {@code .note-item} element currently in the page and appends
     * the ones whose title has not been stored yet, stopping at {@code limit}.
     * A note whose fields cannot be read is reported on standard error and skipped.
     */
    private void collectVisibleNotes(WebDriver driver, List<Movie> movieList, Set<String> seenTitles, int limit) {
        List<WebElement> notes = driver.findElements(By.cssSelector(".note-item"));
        System.out.println("当前页面笔记总数:" + notes.size());
        for (WebElement note : notes) {
            if (movieList.size() >= limit) {
                break;
            }
            try {
                // The title doubles as the de-duplication key.
                String title = note.findElement(By.cssSelector(".title")).getText().trim();
                if (seenTitles.contains(title)) {
                    continue;
                }
                String author = note.findElement(By.cssSelector(".author")).getText().trim();
                // NOTE(review): 0.0 is presumably a placeholder score and "小红书电影" a
                // category label; confirm against the XiaohongshuMovie constructor.
                movieList.add(new XiaohongshuMovie(title, 0.0, "小红书电影", author));
                // Record the title only after the note is stored, so a note that
                // failed part-way can be retried on a later pass.
                seenTitles.add(title);
            } catch (Exception e) {
                // Best effort: one unreadable card must not abort the whole crawl.
                System.err.println("⚠️ 跳过一条笔记,元素找不到:" + e.getMessage());
            }
        }
    }
}