package com.rental.crawler;

import com.rental.crawler.model.Book;
import com.rental.crawler.util.HttpUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Crawls the Douban book listing for a fixed tag and converts each
 * {@code .subject-item} entry of the listing pages into a {@link Book}.
 */
public class DoubanCrawler {

    /** Listing URL of the tag being crawled; pages are selected with {@code ?start=<offset>}. */
    private static final String BASE_URL = "https://book.douban.com/tag/论文";

    /** Step added to the {@code start} offset for each successive page. */
    private static final int PAGE_SIZE = 20;

    /** Pause between two consecutive page requests, in milliseconds. */
    private static final long REQUEST_DELAY_MILLIS = 1000L;

    /** Number of fields read from the END of the publication line: publisher, date, price. */
    private static final int TRAILING_PUB_FIELDS = 3;

    /**
     * Fetches {@code pageCount} listing pages and parses every book entry found on them.
     *
     * <p>If the calling thread is interrupted while waiting between requests, the interrupt
     * flag is restored and the books collected so far are returned.
     *
     * @param pageCount number of listing pages to fetch; values {@code <= 0} fetch nothing
     * @return the parsed books in page order; never {@code null}, possibly empty
     * @throws IOException if fetching a page fails (presumably raised by
     *         {@code HttpUtil.getDocument} - confirm against that helper)
     */
    public List<Book> crawl(int pageCount) throws IOException {
        List<Book> books = new ArrayList<>();
        for (int page = 0; page < pageCount; page++) {
            // Computed in long so a very large page index cannot overflow the offset.
            String url = BASE_URL + "?start=" + ((long) page * PAGE_SIZE);
            System.out.println("正在爬取: " + url);

            Document doc = HttpUtil.getDocument(url);
            Elements bookElements = doc.select(".subject-item");
            for (Element bookElement : bookElements) {
                Book book = parseBook(bookElement);
                if (book != null) {
                    books.add(book);
                }
            }

            // No point in waiting after the final page has been fetched.
            boolean hasMorePages = page < pageCount - 1;
            if (hasMorePages && !pauseBetweenRequests()) {
                break;
            }
        }
        return books;
    }

    /**
     * Sleeps for {@link #REQUEST_DELAY_MILLIS} between two page requests.
     *
     * @return {@code true} if the pause completed, {@code false} if the thread was
     *         interrupted (the interrupt flag is restored before returning)
     */
    private boolean pauseBetweenRequests() {
        try {
            Thread.sleep(REQUEST_DELAY_MILLIS);
            return true;
        } catch (InterruptedException e) {
            // Restore the flag so callers further up the stack can observe the interrupt.
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /**
     * Builds a {@link Book} from one {@code .subject-item} element.
     *
     * @param bookElement the listing entry to parse
     * @return the parsed book, or {@code null} when the entry has no title link
     */
    private Book parseBook(Element bookElement) {
        Element titleElement = bookElement.selectFirst(".info h2 a");
        if (titleElement == null) {
            // Without a title link there is neither a title nor a URL to identify the book.
            return null;
        }

        Book book = new Book();
        book.setTitle(titleElement.text().trim());
        book.setUrl(titleElement.attr("href"));

        Element infoElement = bookElement.selectFirst(".info .pub");
        if (infoElement != null) {
            applyPublicationInfo(book, infoElement.text().trim());
        }

        Element ratingElement = bookElement.selectFirst(".info .rating_nums");
        if (ratingElement != null) {
            try {
                book.setRating(Double.parseDouble(ratingElement.text().trim()));
            } catch (NumberFormatException e) {
                // Rating text is not a number: fall back to 0.0.
                book.setRating(0.0);
            }
        }

        Element summaryElement = bookElement.selectFirst(".info p");
        if (summaryElement != null) {
            book.setSummary(summaryElement.text().trim());
        }
        return book;
    }

    /**
     * Splits a {@code /}-separated publication line and stores its fields on {@code book}.
     *
     * <p>The last three fields are taken as publisher, publish date and price; every field
     * before them is joined with {@code " / "} and stored as the authors. Reading from the end
     * keeps publisher/date/price aligned when the line carries extra leading fields
     * (NOTE(review): e.g. translators - confirm against live listing data). Lines with fewer
     * than four fields are ignored.
     *
     * @param book the book to populate
     * @param info the trimmed text of the {@code .pub} element
     */
    private void applyPublicationInfo(Book book, String info) {
        String[] parts = info.split("/");
        if (parts.length < TRAILING_PUB_FIELDS + 1) {
            return;
        }

        int firstTrailing = parts.length - TRAILING_PUB_FIELDS;
        StringBuilder authors = new StringBuilder();
        for (int i = 0; i < firstTrailing; i++) {
            if (i > 0) {
                authors.append(" / ");
            }
            authors.append(parts[i].trim());
        }

        book.setAuthors(authors.toString());
        book.setPublisher(parts[firstTrailing].trim());
        book.setPublishDate(parts[firstTrailing + 1].trim());
        book.setPrice(parts[firstTrailing + 2].trim());
    }
}