You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

80 lines
2.5 KiB

package com.rental.crawler;
import com.rental.crawler.model.Book;
import com.rental.crawler.util.HttpUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawls the Douban book listing for a fixed tag and converts each listed
 * entry into a {@link Book}.
 *
 * <p>Pages are fetched sequentially through {@link HttpUtil#getDocument}, with a
 * fixed pause between consecutive requests.
 */
public class DoubanCrawler {
    private static final String BASE_URL = "https://book.douban.com/tag/论文";

    /** Step applied to the {@code start} query parameter from one page to the next. */
    private static final int PAGE_SIZE = 20;

    /** Pause between two consecutive page requests, in milliseconds. */
    private static final long REQUEST_DELAY_MILLIS = 1000L;

    /** Number of fields taken from the end of the publication line: publisher, date, price. */
    private static final int TRAILING_PUB_FIELDS = 3;

    /**
     * Fetches {@code pageCount} listing pages and returns every book parsed from them.
     *
     * <p>If the calling thread is interrupted while pausing between requests, the
     * interrupt flag is restored and the books collected so far are returned.
     *
     * @param pageCount number of pages to fetch; zero or a negative value fetches nothing
     * @return the parsed books in page order; never {@code null}
     * @throws IOException if {@link HttpUtil#getDocument} fails for any page
     */
    public List<Book> crawl(int pageCount) throws IOException {
        List<Book> books = new ArrayList<>();
        for (int page = 0; page < pageCount; page++) {
            // Widen to long so a very large page index cannot overflow the offset.
            String url = BASE_URL + "?start=" + ((long) page * PAGE_SIZE);
            System.out.println("正在爬取: " + url);
            Document doc = HttpUtil.getDocument(url);
            for (Element bookElement : doc.select(".subject-item")) {
                Book book = parseBook(bookElement);
                if (book != null) {
                    books.add(book);
                }
            }
            // Only pause when another request follows; stop early on interruption.
            boolean hasNextPage = page + 1 < pageCount;
            if (hasNextPage && !pauseBetweenRequests()) {
                break;
            }
        }
        return books;
    }

    /**
     * Sleeps for the configured delay between requests.
     *
     * @return {@code true} if the pause completed, {@code false} if the thread was
     *         interrupted (the interrupt flag is set again before returning)
     */
    private boolean pauseBetweenRequests() {
        try {
            Thread.sleep(REQUEST_DELAY_MILLIS);
            return true;
        } catch (InterruptedException e) {
            // Preserve the interruption so callers up the stack can observe it.
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /**
     * Builds a {@link Book} from one {@code .subject-item} element.
     *
     * @param bookElement the listing element for a single book
     * @return the populated book, or {@code null} when the element has no title link
     */
    private Book parseBook(Element bookElement) {
        Element titleElement = bookElement.selectFirst(".info h2 a");
        if (titleElement == null) {
            // Without a title link there is neither a title nor a URL to record.
            return null;
        }

        Book book = new Book();
        book.setTitle(titleElement.text().trim());
        book.setUrl(titleElement.attr("href"));

        Element infoElement = bookElement.selectFirst(".info .pub");
        if (infoElement != null) {
            applyPublicationInfo(book, infoElement.text().trim());
        }

        Element ratingElement = bookElement.selectFirst(".info .rating_nums");
        if (ratingElement != null) {
            try {
                book.setRating(Double.parseDouble(ratingElement.text().trim()));
            } catch (NumberFormatException e) {
                // Non-numeric rating text is recorded as 0.0.
                book.setRating(0.0);
            }
        }

        Element summaryElement = bookElement.selectFirst(".info p");
        if (summaryElement != null) {
            book.setSummary(summaryElement.text().trim());
        }
        return book;
    }

    /**
     * Splits the slash-separated publication line and stores its fields on {@code book}.
     *
     * <p>The last three fields are read as publisher, publish date and price; every
     * field before them is joined with {@code " / "} and stored as the authors. Lines
     * with fewer than four fields are ignored.
     *
     * <p>NOTE(review): this assumes the line looks like
     * {@code author / [translator /] publisher / date / price}, i.e. that any extra
     * fields appear before the publisher — confirm against the live page markup.
     *
     * @param book the book to populate
     * @param info the trimmed text of the {@code .pub} element
     */
    private void applyPublicationInfo(Book book, String info) {
        String[] parts = info.split("/");
        if (parts.length < TRAILING_PUB_FIELDS + 1) {
            return;
        }

        int publisherIndex = parts.length - TRAILING_PUB_FIELDS;
        StringBuilder authors = new StringBuilder();
        for (int i = 0; i < publisherIndex; i++) {
            if (i > 0) {
                authors.append(" / ");
            }
            authors.append(parts[i].trim());
        }

        book.setAuthors(authors.toString());
        book.setPublisher(parts[publisherIndex].trim());
        book.setPublishDate(parts[publisherIndex + 1].trim());
        book.setPrice(parts[publisherIndex + 2].trim());
    }
}