import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Crawl strategy that scrapes book entries from the DangDang bestseller listing pages.
 *
 * <p>Relies on {@code fetchDocument}, {@code parsePrice} and {@code parseDiscount}, which are
 * not defined in this class (presumably inherited from {@link AbstractCrawlStrategy} — confirm).
 */
public class DangDangCrawlStrategy extends AbstractCrawlStrategy {

    /** URL template for a listing page; the single {@code %d} placeholder is the page number. */
    private static final String BASE_URL = "http://bang.dangdang.com/books/bestsellers/%d";

    /**
     * Items whose title is shorter than this are discarded.
     * NOTE(review): presumably filters out non-book {@code <li>} elements picked up by the
     * broad {@code li} fallback selector — confirm the threshold is intentional.
     */
    private static final int MIN_TITLE_LENGTH = 10;

    /** Author value used when no author element is found in the item. */
    private static final String DEFAULT_AUTHOR = "DangDang";

    /**
     * Returns the URL template used by this strategy.
     *
     * @return the unformatted template, still containing the {@code %d} page placeholder
     */
    public String getBaseUrl() {
        return BASE_URL;
    }

    /**
     * Fetches one listing page and parses every list item on it.
     *
     * <p>The return type is intentionally left raw so the method stays compatible with the
     * superclass declaration, which is not visible from this file; every element of the
     * returned list is a {@code CrawlResult}.
     *
     * @param page page number substituted into the URL template
     * @return the successfully parsed items, possibly empty, never {@code null}
     * @throws IOException if fetching the page fails
     */
    public List crawlPage(int page) throws IOException {
        List<CrawlResult> results = new ArrayList<CrawlResult>();
        Document doc = fetchDocument(String.format(BASE_URL, page));

        Elements bookElements = doc.select("li.clearfix");
        if (bookElements.isEmpty()) {
            // Fall back to every list item; parseItem rejects the ones that are not books.
            bookElements = doc.select("li");
        }

        for (Element bookElement : bookElements) {
            CrawlResult result = parseItem(bookElement);
            if (result != null) {
                results.add(result);
            }
        }
        return results;
    }

    /**
     * Extracts a single book entry from a list-item element.
     *
     * @param element the list item to parse; may be {@code null}
     * @return the parsed entry, or {@code null} when {@code element} is {@code null} or no
     *         title of at least {@value #MIN_TITLE_LENGTH} characters can be found
     */
    public CrawlResult parseItem(Element element) {
        if (element == null) {
            return null;
        }

        // jsoup's attr()/text() return "" (never null) when nothing matches, so a length
        // check alone covers the "missing" case.
        String title = element.select("a[title]").attr("title");
        if (title.length() < MIN_TITLE_LENGTH) {
            title = element.select(".name a").text();
        }
        if (title.length() < MIN_TITLE_LENGTH) {
            return null;
        }

        // Elements.text() joins the text of ALL matches, which would hand parsePrice a string
        // holding several prices at once; take the first non-empty match instead.
        String priceText =
                firstNonEmptyText(element, "span.price_n", ".price", "[class*=price]");
        String originalPriceText = firstNonEmptyText(element, "span.price_r");
        if (originalPriceText.isEmpty()) {
            // No list price shown: treat the current price as the original price.
            originalPriceText = priceText;
        }

        // Try the plain src first, then the two alternative attributes.
        Elements images = element.select("img");
        String imageUrl = images.attr("src");
        if (imageUrl.isEmpty()) {
            imageUrl = images.attr("data-original");
        }
        if (imageUrl.isEmpty()) {
            imageUrl = images.attr("data-src");
        }

        String author = element.select("span.author").text();
        if (author.isEmpty()) {
            author = element.select("[class*=author]").text();
        }
        if (author.isEmpty()) {
            author = DEFAULT_AUTHOR;
        }

        double price = parsePrice(priceText);
        double originalPrice = parsePrice(originalPriceText);
        double discount = parseDiscount(price, originalPrice);

        return new CrawlResult(title, price, originalPrice, discount, imageUrl, author);
    }

    /**
     * Returns the text of the first element with non-empty text, trying each selector in order.
     *
     * @param root      element to search within
     * @param selectors CSS selectors, in priority order
     * @return the first non-empty text found, or {@code ""} when nothing matches
     */
    private static String firstNonEmptyText(Element root, String... selectors) {
        for (String selector : selectors) {
            for (Element match : root.select(selector)) {
                String text = match.text();
                if (!text.isEmpty()) {
                    return text;
                }
            }
        }
        return "";
    }
}