You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

86 lines
3.5 KiB

package com.yyt.moviecrawler.strategy;
import com.yyt.moviecrawler.model.Book;
import io.github.bonigarcia.wdm.WebDriverManager;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import java.util.ArrayList;
import java.util.List;
import java.time.Duration;
/**
 * Crawls book entries from the Douban Books "小说" tag listing by driving a
 * Chrome browser through Selenium.
 *
 * <p>Each call to {@link #crawl(int)} starts its own browser session and always
 * shuts it down before returning.
 */
public class DoubanBookStrategy {

    /** Listing page the crawl starts from. */
    private static final String START_URL = "https://book.douban.com/tag/小说?type=T";

    /** Category assigned to every crawled book (the tag being crawled). */
    private static final String CATEGORY = "小说";

    /** Placeholder price; the listing is not scraped for a price. */
    private static final double DEFAULT_PRICE = 0.0;

    /** Fixed pause after each navigation so the page can finish loading. */
    private static final long PAGE_SETTLE_MILLIS = 3000L;

    /** Implicit wait applied to element lookups. */
    private static final Duration IMPLICIT_WAIT = Duration.ofSeconds(5);

    /**
     * Collects up to {@code limit} books, following the "next page" link until
     * enough books were gathered or no further page exists.
     *
     * <p>Failures while crawling (including interruption of the calling thread)
     * do not propagate: the books collected so far are returned. If the thread
     * is interrupted, its interrupt flag is restored before returning.
     *
     * @param limit maximum number of books to return; values {@code <= 0}
     *              yield an empty list without starting a browser
     * @return a mutable list with at most {@code limit} books, never {@code null}
     */
    public List<Book> crawl(int limit) {
        List<Book> bookList = new ArrayList<>();
        // Nothing to fetch: skip the expensive driver download / browser launch.
        if (limit > 0) {
            WebDriverManager.chromedriver().setup();
            WebDriver driver = new ChromeDriver(buildOptions());
            try {
                // Configured inside the try so the browser is closed even if this fails.
                driver.manage().timeouts().implicitlyWait(IMPLICIT_WAIT);
                collectBooks(driver, limit, bookList);
            } catch (InterruptedException e) {
                // Restore the flag so callers can observe the interruption.
                Thread.currentThread().interrupt();
                System.err.println("Douban crawl interrupted after " + bookList.size() + " books");
            } catch (RuntimeException e) {
                System.err.println("Douban crawl aborted after " + bookList.size() + " books");
                e.printStackTrace();
            } finally {
                driver.quit(); // always close the browser and release its resources
            }
        }
        System.out.println("✅ 豆瓣读书真实爬取完成,拿到:" + bookList.size() + " 条数据");
        return bookList;
    }

    /** Builds Chrome options that make the session look like a regular desktop user. */
    private ChromeOptions buildOptions() {
        ChromeOptions options = new ChromeOptions();
        options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/147.0.0.0 Safari/537.36");
        options.addArguments("--disable-blink-features=AutomationControlled");
        options.addArguments("--no-sandbox");
        options.addArguments("--disable-dev-shm-usage");
        return options;
    }

    /** Loads the start page and appends parsed books to {@code bookList} page by page. */
    private void collectBooks(WebDriver driver, int limit, List<Book> bookList)
            throws InterruptedException {
        driver.get(START_URL);
        Thread.sleep(PAGE_SETTLE_MILLIS);

        while (bookList.size() < limit) {
            // Elements are looked up again on every page; references from the
            // previous page are not reused after navigation.
            List<WebElement> items = driver.findElements(By.cssSelector(".info"));
            for (WebElement item : items) {
                if (bookList.size() >= limit) {
                    break;
                }
                try {
                    bookList.add(parseBook(item));
                } catch (RuntimeException e) {
                    // Best effort: an entry with a missing or malformed field is
                    // skipped so it does not abort the whole crawl.
                    System.err.println("Skipping unparsable book entry: "
                            + e.getClass().getSimpleName());
                }
            }
            if (bookList.size() >= limit || !goToNextPage(driver)) {
                break;
            }
        }
    }

    /**
     * Converts one listing entry into a {@code Book}.
     *
     * <p>The rating text (e.g. {@code "9.3"}) is rounded to the nearest integer;
     * an empty rating text becomes {@code 0}. Lookup or number-format failures
     * surface as runtime exceptions for the caller to handle.
     */
    private Book parseBook(WebElement item) {
        String title = item.findElement(By.cssSelector("h2 a")).getText().trim();
        String ratingStr = item.findElement(By.cssSelector(".rating_nums")).getText().trim();
        int starRating = ratingStr.isEmpty()
                ? 0
                : (int) Math.round(Double.parseDouble(ratingStr));
        // Constructor argument order: title, price, starRating, category.
        return new Book(title, DEFAULT_PRICE, starRating, CATEGORY);
    }

    /**
     * Clicks the paginator's "next" link and waits for the page to settle.
     *
     * @return {@code true} if navigation was triggered, {@code false} if there
     *         is no next link or it could not be clicked
     */
    private boolean goToNextPage(WebDriver driver) throws InterruptedException {
        // findElements returns an empty list instead of throwing when nothing matches.
        List<WebElement> nextLinks = driver.findElements(By.cssSelector(".paginator .next a"));
        if (nextLinks.isEmpty()) {
            return false;
        }
        try {
            nextLinks.get(0).click();
        } catch (RuntimeException e) {
            System.err.println("Could not open next page, stopping: "
                    + e.getClass().getSimpleName());
            return false;
        }
        Thread.sleep(PAGE_SETTLE_MILLIS);
        return true;
    }
}