package strategy;

import model.CrawlResult;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import exception.ParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Crawl strategy for DangDang book listing pages.
 *
 * <p>Each page is fetched through {@code fetchDocument} (not defined in this class), the
 * book list items are located with a series of fallback CSS selectors, and every item is
 * converted into a {@link CrawlResult}.
 */
public class DangDangStrategy extends AbstractCrawlStrategy {

    private static final Logger logger = LoggerFactory.getLogger(DangDangStrategy.class);

    /** URL template used by {@link #crawlPage(int)}; {@code %d} is the page index. */
    private static final String BASE_URL =
            "http://category.dangdang.com/cp01.01.02.00.00.00.html?page_index=%d";

    private static final String SITE_NAME = "当当网图书搜索";

    /** Author value used when neither an author nor a publisher can be extracted. */
    private static final String DEFAULT_AUTHOR = "当当网";

    /** Authors longer than this are discarded in favour of {@link #DEFAULT_AUTHOR}. */
    private static final int MAX_AUTHOR_LENGTH = 100;

    /** Titles shorter than this are rejected. */
    private static final int MIN_TITLE_LENGTH = 3;

    /** Multiplier used to estimate the list price when the page does not provide one. */
    private static final double ORIGINAL_PRICE_FALLBACK_FACTOR = 1.3;

    /** Selectors tried in order to locate the list items on a page. */
    private static final String[] ITEM_SELECTORS = {
        "ul.bigimg li", ".search_booklist li", "li[class*=item]"
    };

    /** Selectors tried in order to locate the title link inside an item. */
    private static final String[] TITLE_SELECTORS = {
        "a[title]", ".title a", ".name a", "h4 a", ".book-title a"
    };

    /**
     * Selectors tried in order to locate the current price. {@code span.price} is not
     * listed because it can only match elements that {@code .price} already matches.
     */
    private static final String[] PRICE_SELECTORS = {".price_n", ".price"};

    /** Selectors tried in order to locate the original (list) price. */
    private static final String[] ORIGINAL_PRICE_SELECTORS = {".price_o", ".original-price"};

    /** Lazy-loading attributes consulted when {@code src} is empty. */
    private static final String[] IMAGE_FALLBACK_ATTRIBUTES = {"data-original", "data-lazy-src"};

    /**
     * Substrings that cause an item to be rejected as a non-book entry.
     *
     * <p>NOTE(review): this is a plain substring match, so a genuine book title that
     * happens to contain one of these words is dropped as well.
     */
    private static final String[] NAVIGATION_KEYWORDS = {
        "登录", "注册", "购物车", "帮助", "支付", "商家",
        "中心", "客服", "意见", "反馈", "投诉", "我的"
    };

    /**
     * Returns the fixed search URL for this site.
     *
     * <p>NOTE(review): this value differs from the {@code BASE_URL} template that
     * {@link #crawlPage(int)} actually fetches — confirm which one callers expect.
     */
    @Override
    public String getBaseUrl() {
        return "http://search.dangdang.com/?key=%B6%C1%CA%E9&act=input&page_index=1";
    }

    @Override
    public String getSiteName() {
        return SITE_NAME;
    }

    /**
     * Fetches one listing page and parses every book item found on it.
     *
     * <p>Items that fail to parse, or that are rejected by the title filter, are skipped;
     * a failure on one item never aborts the page.
     *
     * @param page page index substituted into the URL template
     * @return the successfully parsed items, possibly empty, never {@code null}
     * @throws IOException if the document cannot be obtained
     * @throws ParseException declared for the overridden contract; not thrown directly here
     */
    @Override
    public List<CrawlResult> crawlPage(int page) throws IOException, ParseException {
        List<CrawlResult> results = new ArrayList<>();
        String url = String.format(BASE_URL, page);
        logger.info("正在爬取当当网第 {} 页: {}", page, url);

        Document doc = fetchDocument(url);
        if (doc == null) {
            throw new IOException("无法获取页面: " + url);
        }

        Elements items = selectBookItems(doc);
        if (items.isEmpty()) {
            logger.warn("当当网第 {} 页未找到任何图书列表元素", page);
        }

        for (Element item : items) {
            try {
                CrawlResult result = parseDangDangItem(item);
                if (result != null) {
                    results.add(result);
                }
            } catch (Exception ex) {
                // Best effort: one malformed item must not fail the page. The throwable is
                // passed as the trailing argument so the stack trace is kept at debug level.
                logger.debug("解析当当网图书项失败: {}", ex.getMessage(), ex);
            }
        }

        logger.info("当当网第 {} 页解析完成,获取 {} 条数据", page, results.size());
        return results;
    }

    /**
     * Parses a single list item.
     *
     * @param element the list item element
     * @return the parsed result, or {@code null} when the item has no usable title
     * @throws ParseException declared for the overridden contract
     */
    @Override
    public CrawlResult parseItem(Element element) throws ParseException {
        return parseDangDangItem(element);
    }

    @Override
    public int getPageSize() {
        return 30;
    }

    /** Returns the first non-empty selection among {@link #ITEM_SELECTORS}. */
    private Elements selectBookItems(Document doc) {
        Elements items = new Elements();
        for (String selector : ITEM_SELECTORS) {
            items = doc.select(selector);
            if (!items.isEmpty()) {
                break;
            }
        }
        return items;
    }

    /**
     * Converts one list item into a {@link CrawlResult}.
     *
     * @return the result, or {@code null} if the title is missing, too short, or contains a
     *     navigation keyword
     */
    private CrawlResult parseDangDangItem(Element e) {
        String title = extractTitle(e);
        if (title.length() < MIN_TITLE_LENGTH || containsNavigationKeyword(title)) {
            return null;
        }

        String priceText = stripCurrencyMarkers(textOfFirst(e, PRICE_SELECTORS)).trim();
        String originalPriceText =
                stripCurrencyMarkers(textOfFirst(e, ORIGINAL_PRICE_SELECTORS))
                        .replace("定价", "")
                        .trim();

        String imageUrl = extractImageUrl(e);
        String author = extractAuthor(e);

        double price = resolvePrice(priceText);
        double originalPrice = parsePrice(originalPriceText);
        if (originalPrice == 0) {
            // No usable list price on the page: estimate one from the sale price.
            originalPrice = price * ORIGINAL_PRICE_FALLBACK_FACTOR;
        }
        double discount = parseDiscount(price, originalPrice);

        return new CrawlResult(title, price, originalPrice, discount, imageUrl, author);
    }

    /** Returns the first element matched by any of the selectors, or {@code null}. */
    private static Element selectFirstOf(Element root, String... selectors) {
        for (String selector : selectors) {
            Element match = root.selectFirst(selector);
            if (match != null) {
                return match;
            }
        }
        return null;
    }

    /** Returns the text of the first element matched by any selector, or {@code ""}. */
    private static String textOfFirst(Element root, String... selectors) {
        Element match = selectFirstOf(root, selectors);
        return match == null ? "" : match.text();
    }

    /** Returns the title link's {@code title} attribute, else its text, else {@code ""}. */
    private static String extractTitle(Element item) {
        Element titleElem = selectFirstOf(item, TITLE_SELECTORS);
        if (titleElem == null) {
            return "";
        }
        String title = titleElem.attr("title");
        return title.isEmpty() ? titleElem.text() : title;
    }

    /** Reports whether the title contains any of {@link #NAVIGATION_KEYWORDS}. */
    private static boolean containsNavigationKeyword(String title) {
        for (String keyword : NAVIGATION_KEYWORDS) {
            if (title.contains(keyword)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Removes currency markers from a price string.
     *
     * <p>Both the halfwidth yen sign and the fullwidth form (U+FFE5) are stripped; the
     * previous code removed the halfwidth sign twice and never the fullwidth one.
     */
    private static String stripCurrencyMarkers(String raw) {
        return raw.replace("¥", "").replace("\uFFE5", "").replace("元", "");
    }

    /** Returns the image {@code src}, falling back to lazy-loading attributes, else {@code ""}. */
    private static String extractImageUrl(Element item) {
        Element img = item.selectFirst("img");
        if (img == null) {
            return "";
        }
        String url = img.attr("src");
        for (String attribute : IMAGE_FALLBACK_ATTRIBUTES) {
            if (!url.isEmpty()) {
                break;
            }
            url = img.attr(attribute);
        }
        return url;
    }

    /**
     * Returns the author text, falling back to the publisher text and finally to
     * {@link #DEFAULT_AUTHOR}.
     *
     * <p>The value starts out empty so the publisher fallback is actually reachable when no
     * author link exists; previously it was pre-set to the default, which made the fallback
     * dead code in that case.
     */
    private static String extractAuthor(Element item) {
        String author = textOfFirst(item, ".search_book_author a");
        if (author.isEmpty()) {
            author = textOfFirst(item, ".search_book_publisher");
        }
        if (author.isEmpty() || author.length() > MAX_AUTHOR_LENGTH) {
            return DEFAULT_AUTHOR;
        }
        return author;
    }

    /**
     * Parses the sale price. When {@code parsePrice} (not defined in this class) yields 0,
     * retries after discarding every character except digits and dots.
     *
     * @return the parsed price, or 0 when nothing numeric can be extracted
     */
    private double resolvePrice(String priceText) {
        double price = parsePrice(priceText);
        if (price != 0) {
            return price;
        }
        String digits = priceText.replaceAll("[^0-9.]", "");
        if (digits.isEmpty()) {
            return 0;
        }
        try {
            return Double.parseDouble(digits);
        } catch (NumberFormatException ignored) {
            // Leftover text such as "1.2.3" is not a number; report "no price".
            return 0;
        }
    }
}