You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

184 lines
5.8 KiB

package strategy;
import model.CrawlResult;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import exception.ParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawl strategy that scrapes book entries from DangDang listing pages.
 *
 * <p>Each page is fetched through {@code fetchDocument}, the list items are located with a
 * series of fallback CSS selectors, and every item is converted into a {@link CrawlResult}
 * (title, price, original price, discount, image URL, author). Items that cannot be parsed or
 * that look like site navigation links are skipped.
 *
 * <p>{@code fetchDocument}, {@code parsePrice} and {@code parseDiscount} are not defined in this
 * class; they are presumably inherited from {@code AbstractCrawlStrategy}.
 */
public class DangDangStrategy extends AbstractCrawlStrategy {
    private static final Logger logger = LoggerFactory.getLogger(DangDangStrategy.class);

    /** Listing URL template; the single {@code %d} placeholder receives the page index. */
    private static final String BASE_URL = "http://category.dangdang.com/cp01.01.02.00.00.00.html?page_index=%d";
    private static final String SITE_NAME = "当当网图书搜索";

    /** Author value used when no usable author or publisher text is found. */
    private static final String DEFAULT_AUTHOR = "当当网";
    /** Titles shorter than this are treated as noise and dropped. */
    private static final int MIN_TITLE_LENGTH = 3;
    /** Author strings longer than this are replaced by {@link #DEFAULT_AUTHOR}. */
    private static final int MAX_AUTHOR_LENGTH = 100;
    /** Multiplier applied to the price to estimate an original price when none is listed. */
    private static final double FALLBACK_ORIGINAL_PRICE_FACTOR = 1.3;
    private static final int PAGE_SIZE = 30;

    /** Selectors for the book list items, tried in order until one yields elements. */
    private static final String[] ITEM_SELECTORS = {
        "ul.bigimg li",
        ".search_booklist li",
        "li[class*=item]"
    };

    /** Selectors for the title link inside one list item, tried in order. */
    private static final String[] TITLE_SELECTORS = {
        "a[title]", ".title a", ".name a", "h4 a", ".book-title a"
    };

    /** Image attributes in lookup order: lazy-load attributes first, plain {@code src} last. */
    private static final String[] IMAGE_ATTRIBUTES = {"data-original", "data-lazy-src", "src"};

    /**
     * Currency decorations removed from price text: half-width yen sign (U+00A5), full-width
     * yen sign (U+FFE5) and the "元" unit. Written as escapes so the two yen signs stay distinct
     * regardless of source-file encoding or normalization.
     */
    private static final String[] CURRENCY_MARKERS = {"\u00A5", "\uFFE5", "元"};

    /** A title containing any of these words is treated as a navigation link, not a book. */
    private static final String[] NAVIGATION_KEYWORDS = {
        "登录", "注册", "购物车", "帮助", "支付", "商家",
        "中心", "客服", "意见", "反馈", "投诉", "我的"
    };

    /**
     * Returns the site's base URL.
     *
     * <p>NOTE(review): this is a search URL and differs from the category template
     * ({@link #BASE_URL}) that {@link #crawlPage(int)} actually requests. Confirm which one
     * callers expect before changing either.
     */
    @Override
    public String getBaseUrl() {
        return "http://search.dangdang.com/?key=%B6%C1%CA%E9&act=input&page_index=1";
    }

    /** Returns the display name of this site. */
    @Override
    public String getSiteName() {
        return SITE_NAME;
    }

    /**
     * Fetches one listing page and parses every book item found on it.
     *
     * @param page page index substituted into the listing URL template
     * @return the successfully parsed items; empty when no list elements were found
     * @throws IOException if {@code fetchDocument} returns {@code null} for the page URL
     * @throws ParseException declared by the overridden method; not thrown directly here
     */
    @Override
    public List<CrawlResult> crawlPage(int page) throws IOException, ParseException {
        String url = String.format(BASE_URL, page);
        logger.info("正在爬取当当网第 {} 页: {}", page, url);

        Document doc = fetchDocument(url);
        if (doc == null) {
            throw new IOException("无法获取页面: " + url);
        }

        Elements items = selectItems(doc);
        if (items.isEmpty()) {
            logger.warn("当当网第 {} 页未找到任何图书列表元素", page);
        }

        List<CrawlResult> results = new ArrayList<>(items.size());
        for (Element item : items) {
            try {
                CrawlResult result = parseDangDangItem(item);
                if (result != null) {
                    results.add(result);
                }
            } catch (Exception ex) {
                // Best effort: one malformed item must not abort the page. The throwable is
                // passed as the last argument so the stack trace is kept in the debug log.
                logger.debug("解析当当网图书项失败: {}", ex.getMessage(), ex);
            }
        }

        logger.info("当当网第 {} 页解析完成,获取 {} 条数据", page, results.size());
        return results;
    }

    /**
     * Converts one list item into a {@link CrawlResult}.
     *
     * @param e the list item element; may be {@code null}
     * @return the parsed result, or {@code null} when the element is {@code null}, has no usable
     *     title, or its title looks like a navigation link
     */
    private CrawlResult parseDangDangItem(Element e) {
        if (e == null) {
            return null;
        }

        String title = extractTitle(e);
        if (title.length() < MIN_TITLE_LENGTH || isNavigationTitle(title)) {
            return null;
        }

        // ".price" already matches any "span.price", so no narrower fallback is needed.
        String priceText = stripMarkers(textOf(firstMatch(e, ".price_n", ".price")));
        String originalPriceText =
            stripMarkers(textOf(firstMatch(e, ".price_o", ".original-price")), "定价");

        String imageUrl = extractImageUrl(e);
        String author = extractAuthor(e);

        double price = parsePrice(priceText);
        if (price == 0) {
            // Second attempt: keep only digits and dots, then parse whatever remains.
            price = parseDigitsOnly(priceText);
        }

        double originalPrice = parsePrice(originalPriceText);
        if (originalPrice == 0) {
            // No listed original price: estimate one from the current price.
            originalPrice = price * FALLBACK_ORIGINAL_PRICE_FACTOR;
        }

        double discount = parseDiscount(price, originalPrice);
        return new CrawlResult(title, price, originalPrice, discount, imageUrl, author);
    }

    /**
     * Parses a single list item; delegates to the same logic {@link #crawlPage(int)} uses.
     *
     * @param element the list item element
     * @return the parsed result, or {@code null} when the item is not a usable book entry
     * @throws ParseException declared by the overridden method; not thrown directly here
     */
    @Override
    public CrawlResult parseItem(Element element) throws ParseException {
        return parseDangDangItem(element);
    }

    /** Returns the number of items this strategy reports per page. */
    @Override
    public int getPageSize() {
        return PAGE_SIZE;
    }

    /** Returns the elements of the first item selector that matches anything, else an empty set. */
    private static Elements selectItems(Document doc) {
        Elements items = new Elements();
        for (String selector : ITEM_SELECTORS) {
            items = doc.select(selector);
            if (!items.isEmpty()) {
                return items;
            }
        }
        return items;
    }

    /** Returns the first element matched by any of the selectors, in order, or {@code null}. */
    private static Element firstMatch(Element root, String... selectors) {
        for (String selector : selectors) {
            Element match = root.selectFirst(selector);
            if (match != null) {
                return match;
            }
        }
        return null;
    }

    /** Returns the element's text, or an empty string for a {@code null} element. */
    private static String textOf(Element element) {
        return element == null ? "" : element.text();
    }

    /** Returns the trimmed title attribute of the title link, falling back to its text. */
    private static String extractTitle(Element item) {
        Element link = firstMatch(item, TITLE_SELECTORS);
        if (link == null) {
            return "";
        }
        // Trim so a whitespace-only attribute neither passes the length filter nor blocks
        // the fallback to the link text.
        String title = link.attr("title").trim();
        return title.isEmpty() ? link.text().trim() : title;
    }

    /** Returns whether the title contains any navigation keyword. */
    private static boolean isNavigationTitle(String title) {
        for (String keyword : NAVIGATION_KEYWORDS) {
            if (title.contains(keyword)) {
                return true;
            }
        }
        return false;
    }

    /** Removes currency markers plus any extra markers from the text and trims the result. */
    private static String stripMarkers(String text, String... extraMarkers) {
        String cleaned = text;
        for (String marker : CURRENCY_MARKERS) {
            cleaned = cleaned.replace(marker, "");
        }
        for (String marker : extraMarkers) {
            cleaned = cleaned.replace(marker, "");
        }
        return cleaned.trim();
    }

    /** Returns the first non-empty image attribute of the item's first image, or "". */
    private static String extractImageUrl(Element item) {
        Element img = item.selectFirst("img");
        if (img == null) {
            return "";
        }
        // Lazy-load attributes are checked before src: on lazy-loaded pages src usually holds
        // a placeholder while the real URL sits in the data attribute.
        // NOTE(review): based on the common lazy-load convention; verify against live markup.
        for (String attribute : IMAGE_ATTRIBUTES) {
            String value = img.attr(attribute);
            if (!value.isEmpty()) {
                return value;
            }
        }
        return "";
    }

    /** Returns the author text, then publisher text, then {@link #DEFAULT_AUTHOR}. */
    private static String extractAuthor(Element item) {
        String author = DEFAULT_AUTHOR;
        Element authorElem = item.selectFirst(".search_book_author a");
        if (authorElem != null) {
            author = authorElem.text();
        }
        if (author.isEmpty()) {
            author = textOf(item.selectFirst(".search_book_publisher"));
        }
        if (author.isEmpty() || author.length() > MAX_AUTHOR_LENGTH) {
            return DEFAULT_AUTHOR;
        }
        return author;
    }

    /** Parses the digits and dots of the text as a double; returns 0 when that fails. */
    private static double parseDigitsOnly(String text) {
        String digits = text.replaceAll("[^0-9.]", "");
        if (digits.isEmpty()) {
            return 0;
        }
        try {
            return Double.parseDouble(digits);
        } catch (NumberFormatException ignored) {
            // Leftovers such as "12.00.5" or "." are not a price; report "no price".
            return 0;
        }
    }
}