You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
80 lines
2.5 KiB
80 lines
2.5 KiB
package com.rental.crawler;
|
|
|
|
import com.rental.crawler.model.Book;
|
|
import com.rental.crawler.util.HttpUtil;
|
|
import org.jsoup.nodes.Document;
|
|
import org.jsoup.nodes.Element;
|
|
import org.jsoup.select.Elements;
|
|
|
|
import java.io.IOException;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
|
|
public class DoubanCrawler {
|
|
private static final String BASE_URL = "https://book.douban.com/tag/论文";
|
|
|
|
public List<Book> crawl(int pageCount) throws IOException {
|
|
List<Book> books = new ArrayList<>();
|
|
|
|
for (int page = 0; page < pageCount; page++) {
|
|
String url = BASE_URL + "?start=" + (page * 20);
|
|
System.out.println("正在爬取: " + url);
|
|
|
|
Document doc = HttpUtil.getDocument(url);
|
|
|
|
Elements bookElements = doc.select(".subject-item");
|
|
for (Element bookElement : bookElements) {
|
|
Book book = parseBook(bookElement);
|
|
if (book != null) {
|
|
books.add(book);
|
|
}
|
|
}
|
|
|
|
try {
|
|
Thread.sleep(1000);
|
|
} catch (InterruptedException e) {
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
|
|
return books;
|
|
}
|
|
|
|
private Book parseBook(Element bookElement) {
|
|
Book book = new Book();
|
|
|
|
Element titleElement = bookElement.selectFirst(".info h2 a");
|
|
if (titleElement != null) {
|
|
book.setTitle(titleElement.text().trim());
|
|
book.setUrl(titleElement.attr("href"));
|
|
}
|
|
|
|
Element infoElement = bookElement.selectFirst(".info .pub");
|
|
if (infoElement != null) {
|
|
String info = infoElement.text().trim();
|
|
String[] parts = info.split("/");
|
|
if (parts.length >= 4) {
|
|
book.setAuthors(parts[0].trim());
|
|
book.setPublisher(parts[1].trim());
|
|
book.setPublishDate(parts[2].trim());
|
|
book.setPrice(parts[3].trim());
|
|
}
|
|
}
|
|
|
|
Element ratingElement = bookElement.selectFirst(".info .rating_nums");
|
|
if (ratingElement != null) {
|
|
try {
|
|
book.setRating(Double.parseDouble(ratingElement.text().trim()));
|
|
} catch (NumberFormatException e) {
|
|
book.setRating(0.0);
|
|
}
|
|
}
|
|
|
|
Element summaryElement = bookElement.selectFirst(".info p");
|
|
if (summaryElement != null) {
|
|
book.setSummary(summaryElement.text().trim());
|
|
}
|
|
|
|
return book;
|
|
}
|
|
}
|
|
|