You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
184 lines
5.8 KiB
184 lines
5.8 KiB
package strategy;
|
|
|
|
import model.CrawlResult;
|
|
import org.jsoup.nodes.Document;
|
|
import org.jsoup.nodes.Element;
|
|
import org.jsoup.select.Elements;
|
|
import exception.ParseException;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
import java.io.IOException;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
|
|
public class DangDangStrategy extends AbstractCrawlStrategy {
|
|
private static final Logger logger = LoggerFactory.getLogger(DangDangStrategy.class);
|
|
|
|
private static final String BASE_URL = "http://category.dangdang.com/cp01.01.02.00.00.00.html?page_index=%d";
|
|
private static final String SITE_NAME = "当当网图书搜索";
|
|
|
|
@Override
|
|
public String getBaseUrl() {
|
|
return "http://search.dangdang.com/?key=%B6%C1%CA%E9&act=input&page_index=1";
|
|
}
|
|
|
|
@Override
|
|
public String getSiteName() {
|
|
return SITE_NAME;
|
|
}
|
|
|
|
@Override
|
|
public List<CrawlResult> crawlPage(int page) throws IOException, ParseException {
|
|
List<CrawlResult> results = new ArrayList<>();
|
|
String url = String.format(BASE_URL, page);
|
|
logger.info("正在爬取当当网第 {} 页: {}", page, url);
|
|
Document doc = fetchDocument(url);
|
|
|
|
if (doc == null) {
|
|
throw new IOException("无法获取页面: " + url);
|
|
}
|
|
|
|
Elements items = doc.select("ul.bigimg li");
|
|
if (items.isEmpty()) {
|
|
items = doc.select(".search_booklist li");
|
|
}
|
|
if (items.isEmpty()) {
|
|
items = doc.select("li[class*=item]");
|
|
}
|
|
|
|
if (items.isEmpty()) {
|
|
logger.warn("当当网第 {} 页未找到任何图书列表元素", page);
|
|
}
|
|
|
|
for (Element e : items) {
|
|
try {
|
|
CrawlResult result = parseDangDangItem(e);
|
|
if (result != null) {
|
|
results.add(result);
|
|
}
|
|
} catch (Exception ex) {
|
|
logger.debug("解析当当网图书项失败: {}", ex.getMessage());
|
|
}
|
|
}
|
|
|
|
logger.info("当当网第 {} 页解析完成,获取 {} 条数据", page, results.size());
|
|
return results;
|
|
}
|
|
|
|
private CrawlResult parseDangDangItem(Element e) {
|
|
String title = "";
|
|
String priceText = "";
|
|
String originalPriceText = "";
|
|
String imageUrl = "";
|
|
String author = "当当网";
|
|
|
|
Element titleElem = e.selectFirst("a[title]");
|
|
if (titleElem == null) {
|
|
titleElem = e.selectFirst(".title a");
|
|
}
|
|
if (titleElem == null) {
|
|
titleElem = e.selectFirst(".name a");
|
|
}
|
|
if (titleElem == null) {
|
|
titleElem = e.selectFirst("h4 a");
|
|
}
|
|
if (titleElem == null) {
|
|
titleElem = e.selectFirst(".book-title a");
|
|
}
|
|
|
|
if (titleElem != null) {
|
|
title = titleElem.attr("title");
|
|
if (title.isEmpty()) {
|
|
title = titleElem.text();
|
|
}
|
|
}
|
|
|
|
if (title == null || title.isEmpty() || title.length() < 3) {
|
|
return null;
|
|
}
|
|
|
|
if (title.contains("登录") || title.contains("注册") ||
|
|
title.contains("购物车") || title.contains("帮助") ||
|
|
title.contains("支付") || title.contains("商家") ||
|
|
title.contains("中心") || title.contains("客服") ||
|
|
title.contains("意见") || title.contains("反馈") ||
|
|
title.contains("投诉") || title.contains("我的")) {
|
|
return null;
|
|
}
|
|
|
|
Element priceN = e.selectFirst(".price_n");
|
|
if (priceN == null) {
|
|
priceN = e.selectFirst(".price");
|
|
}
|
|
if (priceN == null) {
|
|
priceN = e.selectFirst("span.price");
|
|
}
|
|
if (priceN != null) {
|
|
priceText = priceN.text();
|
|
}
|
|
priceText = priceText.replace("¥", "").replace("元", "").replace("¥", "").trim();
|
|
|
|
Element priceO = e.selectFirst(".price_o");
|
|
if (priceO == null) {
|
|
priceO = e.selectFirst(".original-price");
|
|
}
|
|
if (priceO != null) {
|
|
originalPriceText = priceO.text();
|
|
}
|
|
originalPriceText = originalPriceText.replace("¥", "").replace("元", "").replace("¥", "").replace("定价", "").trim();
|
|
|
|
Element img = e.selectFirst("img");
|
|
if (img != null) {
|
|
imageUrl = img.attr("src");
|
|
if (imageUrl.isEmpty()) {
|
|
imageUrl = img.attr("data-original");
|
|
}
|
|
if (imageUrl.isEmpty()) {
|
|
imageUrl = img.attr("data-lazy-src");
|
|
}
|
|
}
|
|
|
|
Element authorElem = e.selectFirst(".search_book_author a");
|
|
if (authorElem != null) {
|
|
author = authorElem.text();
|
|
}
|
|
if (author.isEmpty()) {
|
|
Element publisherElem = e.selectFirst(".search_book_publisher");
|
|
if (publisherElem != null) {
|
|
author = publisherElem.text();
|
|
}
|
|
}
|
|
if (author.isEmpty() || author.length() > 100) {
|
|
author = "当当网";
|
|
}
|
|
|
|
double price = parsePrice(priceText);
|
|
if (price == 0) {
|
|
try {
|
|
String numStr = priceText.replaceAll("[^0-9.]", "");
|
|
if (!numStr.isEmpty()) {
|
|
price = Double.parseDouble(numStr);
|
|
}
|
|
} catch (NumberFormatException ex) {
|
|
price = 0;
|
|
}
|
|
}
|
|
|
|
double originalPrice = parsePrice(originalPriceText);
|
|
if (originalPrice == 0) originalPrice = price * 1.3;
|
|
double discount = parseDiscount(price, originalPrice);
|
|
|
|
return new CrawlResult(title, price, originalPrice, discount, imageUrl, author);
|
|
}
|
|
|
|
@Override
|
|
public CrawlResult parseItem(Element element) throws ParseException {
|
|
return parseDangDangItem(element);
|
|
}
|
|
|
|
@Override
|
|
public int getPageSize() {
|
|
return 30;
|
|
}
|
|
}
|