You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
92 lines
3.7 KiB
92 lines
3.7 KiB
package com.yyt.moviecrawler.strategy;
|
|
|
|
import com.yyt.moviecrawler.model.Movie;
|
|
import com.yyt.moviecrawler.model.XiaohongshuMovie;
|
|
import io.github.bonigarcia.wdm.WebDriverManager;
|
|
import org.openqa.selenium.By;
|
|
import org.openqa.selenium.JavascriptExecutor;
|
|
import org.openqa.selenium.WebDriver;
|
|
import org.openqa.selenium.WebElement;
|
|
import org.openqa.selenium.chrome.ChromeDriver;
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.HashSet;
|
|
import java.util.List;
|
|
import java.util.Scanner;
|
|
import java.util.Set;
|
|
|
|
public class XiaohongshuStrategy implements CrawlerStrategy {
|
|
|
|
@Override
|
|
public List<Movie> crawl(int limit) {
|
|
WebDriverManager.chromedriver().setup();
|
|
WebDriver driver = new ChromeDriver();
|
|
List<Movie> movieList = new ArrayList<>();
|
|
// 用Set去重,防止同一条笔记被重复抓取
|
|
Set<String> titleSet = new HashSet<>();
|
|
|
|
try {
|
|
// 1. 访问小红书首页
|
|
driver.get("https://www.xiaohongshu.com/");
|
|
Thread.sleep(3000);
|
|
|
|
// 2. 提示登录
|
|
System.out.println("\n⚠️ 小红书浏览器窗口已打开,请扫码登录!");
|
|
System.out.println("✅ 登录完成后,在这里按回车继续...");
|
|
new Scanner(System.in).nextLine();
|
|
|
|
// 3. 访问“电影推荐”搜索页
|
|
driver.get("https://www.xiaohongshu.com/search_result?keyword=电影推荐");
|
|
Thread.sleep(3000);
|
|
|
|
// 4. 循环滚动,直到凑够20条或滚10次
|
|
JavascriptExecutor js = (JavascriptExecutor) driver;
|
|
int scrollCount = 0;
|
|
int maxScroll = 10;
|
|
while (movieList.size() < limit && scrollCount < maxScroll) {
|
|
// 打印当前进度
|
|
System.out.println("当前已抓取:" + movieList.size() + " 条,正在滚动加载第 " + (scrollCount + 1) + " 次...");
|
|
|
|
// 获取当前页面所有笔记
|
|
List<WebElement> notes = driver.findElements(By.cssSelector(".note-item"));
|
|
System.out.println("当前页面笔记总数:" + notes.size());
|
|
|
|
// 遍历笔记,遇到错误跳过,继续抓下一条
|
|
for (WebElement note : notes) {
|
|
if (movieList.size() >= limit) break;
|
|
|
|
try {
|
|
// 提取标题(用于去重)
|
|
String title = note.findElement(By.cssSelector(".title")).getText().trim();
|
|
if (titleSet.contains(title)) continue; // 跳过重复笔记
|
|
|
|
// 提取其他字段
|
|
String author = note.findElement(By.cssSelector(".author")).getText().trim();
|
|
|
|
// 创建对象并加入列表
|
|
movieList.add(new XiaohongshuMovie(title, 0.0, "小红书电影", author));
|
|
titleSet.add(title);
|
|
} catch (Exception e) {
|
|
// 遇到元素找不到的错误,跳过当前笔记,继续抓下一条
|
|
System.err.println("⚠️ 跳过一条笔记,元素找不到:" + e.getMessage());
|
|
}
|
|
}
|
|
|
|
// 滚动到底部加载更多
|
|
js.executeScript("window.scrollTo(0, document.body.scrollHeight);");
|
|
Thread.sleep(2500);
|
|
scrollCount++;
|
|
}
|
|
|
|
System.out.println("\n✅ 小红书爬取完成,最终拿到:" + movieList.size() + " 条数据");
|
|
|
|
} catch (Exception e) {
|
|
System.err.println("❌ 小红书爬取失败:" + e.getMessage());
|
|
e.printStackTrace();
|
|
} finally {
|
|
driver.quit();
|
|
}
|
|
|
|
return movieList;
|
|
}
|
|
}
|