You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
86 lines
3.5 KiB
86 lines
3.5 KiB
package com.yyt.moviecrawler.strategy;
|
|
|
|
import com.yyt.moviecrawler.model.Book;
|
|
import io.github.bonigarcia.wdm.WebDriverManager;
|
|
import org.openqa.selenium.By;
|
|
import org.openqa.selenium.WebDriver;
|
|
import org.openqa.selenium.WebElement;
|
|
import org.openqa.selenium.chrome.ChromeDriver;
|
|
import org.openqa.selenium.chrome.ChromeOptions;
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
import java.time.Duration;
|
|
|
|
public class DoubanBookStrategy {
|
|
public List<Book> crawl(int limit) {
|
|
List<Book> bookList = new ArrayList<>();
|
|
|
|
// 配置浏览器,伪装成真实用户
|
|
ChromeOptions options = new ChromeOptions();
|
|
options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/147.0.0.0 Safari/537.36");
|
|
options.addArguments("--disable-blink-features=AutomationControlled");
|
|
options.addArguments("--no-sandbox");
|
|
options.addArguments("--disable-dev-shm-usage");
|
|
|
|
WebDriverManager.chromedriver().setup();
|
|
WebDriver driver = new ChromeDriver(options);
|
|
driver.manage().timeouts().implicitlyWait(Duration.ofSeconds(5));
|
|
|
|
try {
|
|
// 豆瓣读书 → 小说分类页面
|
|
String url = "https://book.douban.com/tag/小说?type=T";
|
|
driver.get(url);
|
|
Thread.sleep(3000); // 等待页面加载完成
|
|
|
|
// 循环爬取,直到拿到limit条数据
|
|
while (bookList.size() < limit) {
|
|
List<WebElement> items = driver.findElements(By.cssSelector(".info"));
|
|
|
|
for (WebElement item : items) {
|
|
if (bookList.size() >= limit) break;
|
|
|
|
try {
|
|
// 提取书名
|
|
String title = item.findElement(By.cssSelector("h2 a")).getText().trim();
|
|
// 提取评分(String类型,如"9.3")
|
|
String ratingStr = item.findElement(By.cssSelector(".rating_nums")).getText().trim();
|
|
|
|
// 转换数据,匹配Book类构造器
|
|
double price = 0.0; // 豆瓣无价格,用默认值
|
|
int starRating = 0;
|
|
if (!ratingStr.isEmpty()) {
|
|
starRating = (int) Math.round(Double.parseDouble(ratingStr));
|
|
}
|
|
String category = "小说";
|
|
|
|
// 按构造器顺序调用:title, price, starRating, category
|
|
bookList.add(new Book(title, price, starRating, category));
|
|
} catch (Exception e) {
|
|
// 个别元素缺失直接跳过,不影响整体爬取
|
|
}
|
|
}
|
|
|
|
// 如果数据不够,点击下一页继续爬取
|
|
if (bookList.size() < limit) {
|
|
try {
|
|
WebElement nextBtn = driver.findElement(By.cssSelector(".paginator .next a"));
|
|
nextBtn.click();
|
|
Thread.sleep(3000);
|
|
} catch (Exception e) {
|
|
// 没有下一页则退出循环
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
} catch (Exception e) {
|
|
e.printStackTrace();
|
|
} finally {
|
|
driver.quit(); // 关闭浏览器,释放资源
|
|
}
|
|
|
|
System.out.println("✅ 豆瓣读书真实爬取完成,拿到:" + bookList.size() + " 条数据");
|
|
return bookList;
|
|
}
|
|
}
|