package com.yyt.moviecrawler.strategy;

import com.yyt.moviecrawler.model.Movie;
import com.yyt.moviecrawler.model.XiaohongshuMovie;
import io.github.bonigarcia.wdm.WebDriverManager;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Scanner;
import java.util.Set;

/**
 * Crawler strategy that collects notes from the Xiaohongshu search page for the
 * keyword "电影推荐" by driving a real Chrome window through Selenium.
 *
 * <p>The crawl is interactive: after the home page opens, the operator has to log in
 * (QR code) in the browser window and then press Enter on the console before the
 * search page is visited.
 */
public class XiaohongshuStrategy implements CrawlerStrategy {

    /** Landing page opened first so the operator can log in. */
    private static final String HOME_URL = "https://www.xiaohongshu.com/";

    /** Search result page for the keyword "电影推荐". */
    private static final String SEARCH_URL = "https://www.xiaohongshu.com/search_result?keyword=电影推荐";

    /** Fixed pause after each page navigation, in milliseconds. */
    private static final long PAGE_LOAD_WAIT_MS = 3000;

    /** Fixed pause after each scroll so lazily loaded notes can render, in milliseconds. */
    private static final long SCROLL_WAIT_MS = 2500;

    /** Maximum number of harvest passes over the search page. */
    private static final int MAX_SCROLLS = 10;

    /**
     * Opens Chrome, waits for a manual login, then harvests notes from the search page,
     * scrolling to the bottom between passes, until {@code limit} notes were collected
     * or {@link #MAX_SCROLLS} passes were made.
     *
     * <p>Failures during the crawl are reported on {@code System.err} and do not
     * propagate; whatever was collected up to that point is returned. The browser is
     * always closed before returning.
     *
     * @param limit maximum number of notes to collect; for values {@code <= 0} an empty
     *              list is returned without opening a browser
     * @return the collected notes, de-duplicated by title, in page order; never {@code null}
     */
    @Override
    public List<Movie> crawl(int limit) {
        List<Movie> movieList = new ArrayList<>();
        if (limit <= 0) {
            // Nothing can be collected, so do not launch a browser or block on the login prompt.
            return movieList;
        }

        WebDriverManager.chromedriver().setup();
        WebDriver driver = new ChromeDriver();
        // Titles already collected; the same note shows up again on every pass after scrolling.
        Set<String> seenTitles = new HashSet<>();

        try {
            // 1. Open the home page.
            driver.get(HOME_URL);
            Thread.sleep(PAGE_LOAD_WAIT_MS);

            // 2. Ask the operator to log in and block until Enter is pressed.
            System.out.println("\n⚠️ 小红书浏览器窗口已打开,请扫码登录!");
            System.out.println("✅ 登录完成后,在这里按回车继续...");
            // The Scanner is intentionally not closed: closing it would close System.in.
            new Scanner(System.in).nextLine();

            // 3. Open the search page.
            driver.get(SEARCH_URL);
            Thread.sleep(PAGE_LOAD_WAIT_MS);

            // 4. Harvest, then scroll for more, until `limit` notes were collected
            //    or MAX_SCROLLS passes were made.
            JavascriptExecutor js = (JavascriptExecutor) driver;
            for (int pass = 0; pass < MAX_SCROLLS; pass++) {
                System.out.println("当前已抓取:" + movieList.size() + " 条,正在滚动加载第 " + (pass + 1) + " 次...");

                collectNotes(driver, limit, movieList, seenTitles);

                boolean lastPass = pass == MAX_SCROLLS - 1;
                if (movieList.size() >= limit || lastPass) {
                    // No further harvest follows, so scrolling and waiting would be wasted.
                    break;
                }

                // Scroll to the bottom to trigger loading of more notes.
                js.executeScript("window.scrollTo(0, document.body.scrollHeight);");
                Thread.sleep(SCROLL_WAIT_MS);
            }

            System.out.println("\n✅ 小红书爬取完成,最终拿到:" + movieList.size() + " 条数据");
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            System.err.println("❌ 小红书爬取失败:" + e.getMessage());
            e.printStackTrace();
        } catch (Exception e) {
            System.err.println("❌ 小红书爬取失败:" + e.getMessage());
            e.printStackTrace();
        } finally {
            driver.quit();
        }

        return movieList;
    }

    /**
     * Reads every {@code .note-item} element currently on the page and appends the ones
     * whose title was not seen before to {@code movieList}, stopping at {@code limit}.
     *
     * <p>A note whose title or author element cannot be read is skipped with a warning on
     * {@code System.err}; its title is not recorded, so it is retried on the next pass.
     *
     * @param driver     driver positioned on the search result page
     * @param limit      maximum total size of {@code movieList}
     * @param movieList  accumulator the new notes are appended to
     * @param seenTitles titles already collected; updated for every note that is added
     */
    private static void collectNotes(WebDriver driver, int limit,
                                     List<Movie> movieList, Set<String> seenTitles) {
        List<WebElement> notes = driver.findElements(By.cssSelector(".note-item"));
        System.out.println("当前页面笔记总数:" + notes.size());

        for (WebElement note : notes) {
            if (movieList.size() >= limit) {
                break;
            }
            try {
                // The title doubles as the de-duplication key.
                String title = note.findElement(By.cssSelector(".title")).getText().trim();
                if (seenTitles.contains(title)) {
                    continue;
                }

                String author = note.findElement(By.cssSelector(".author")).getText().trim();

                // NOTE(review): 0.0 and "小红书电影" are fixed placeholder arguments; their meaning
                // is defined by the XiaohongshuMovie constructor — confirm there.
                movieList.add(new XiaohongshuMovie(title, 0.0, "小红书电影", author));
                seenTitles.add(title);
            } catch (RuntimeException e) {
                // Best effort: a note with missing or stale elements must not abort the crawl.
                System.err.println("⚠️ 跳过一条笔记,元素找不到:" + e.getMessage());
            }
        }
    }
}