You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

66 lines
2.4 KiB

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawl strategy for the DangDang best-seller book pages.
 *
 * <p>A page is fetched from the URL template below with the page number substituted in, and each
 * list item found on it is converted to a {@link CrawlResult} by {@link #parseItem(Element)}.
 *
 * <p>{@code fetchDocument}, {@code parsePrice} and {@code parseDiscount} are not defined in this
 * class; presumably they are inherited from {@code AbstractCrawlStrategy} (confirm there).
 */
public class DangDangCrawlStrategy extends AbstractCrawlStrategy<CrawlResult> {

    /** URL template for one best-seller page; {@code %d} is replaced with the page number. */
    private static final String BASE_URL = "http://bang.dangdang.com/books/bestsellers/%d";

    /**
     * Titles shorter than this many characters are treated as "no usable title".
     * NOTE(review): this also rejects genuinely short book titles; confirm the threshold is
     * intended as a filter for non-book list items.
     */
    private static final int MIN_TITLE_LENGTH = 10;

    /** Author value used when the item exposes no author text. */
    private static final String DEFAULT_AUTHOR = "DangDang";

    /**
     * Returns the URL template used by this strategy.
     *
     * @return the template string, still containing the {@code %d} page placeholder
     */
    public String getBaseUrl() {
        return BASE_URL;
    }

    /**
     * Fetches one page and parses every list item on it.
     *
     * @param page page number substituted into the URL template
     * @return the successfully parsed items, in document order; empty if none could be parsed
     * @throws IOException if fetching the page fails (presumably raised by {@code fetchDocument})
     */
    public List<CrawlResult> crawlPage(int page) throws IOException {
        List<CrawlResult> results = new ArrayList<CrawlResult>();
        Document doc = fetchDocument(String.format(BASE_URL, page));

        Elements bookElements = doc.select("li.clearfix");
        if (bookElements.isEmpty()) {
            // No item carries the expected class: fall back to every <li> and let
            // parseItem discard the ones that do not look like a book.
            bookElements = doc.select("li");
        }

        for (Element bookElement : bookElements) {
            CrawlResult result = parseItem(bookElement);
            if (result != null) {
                results.add(result);
            }
        }
        return results;
    }

    /**
     * Converts one list item into a {@link CrawlResult}.
     *
     * <p>Text fields are taken from the first matching element that has non-empty text.
     * {@code Elements.text()} would instead concatenate the text of every match, which merges
     * several values (for example two price spans) into a single string.
     *
     * @param element the list item to parse; may be {@code null}
     * @return the parsed result, or {@code null} if {@code element} is {@code null} or no title of
     *     at least {@link #MIN_TITLE_LENGTH} characters is found
     */
    public CrawlResult parseItem(Element element) {
        if (element == null) {
            return null;
        }

        // Prefer the title attribute of a link; fall back to the visible link text.
        String title = element.select("a[title]").attr("title");
        if (isUnusableTitle(title)) {
            title = firstMatchText(element, ".name a");
        }
        if (isUnusableTitle(title)) {
            return null;
        }

        String priceText = firstMatchText(element, "span.price_n", ".price", "[class*=price]");
        String originalPriceText = firstMatchText(element, "span.price_r");
        if (originalPriceText.isEmpty()) {
            // No separate list price: treat the current price as the original one.
            originalPriceText = priceText;
        }

        String imageUrl = firstNonEmptyAttr(element, "img", "src", "data-original", "data-src");

        String author = firstMatchText(element, "span.author", "[class*=author]");
        if (author.isEmpty()) {
            author = DEFAULT_AUTHOR;
        }

        double price = parsePrice(priceText);
        double originalPrice = parsePrice(originalPriceText);
        double discount = parseDiscount(price, originalPrice);
        return new CrawlResult(title, price, originalPrice, discount, imageUrl, author);
    }

    /** Returns true when the title is missing or shorter than {@link #MIN_TITLE_LENGTH}. */
    private static boolean isUnusableTitle(String title) {
        return title == null || title.length() < MIN_TITLE_LENGTH;
    }

    /**
     * Returns the text of the first element with non-empty text, trying each CSS query in order;
     * returns the empty string if no query yields any text.
     */
    private static String firstMatchText(Element scope, String... cssQueries) {
        for (String cssQuery : cssQueries) {
            for (Element match : scope.select(cssQuery)) {
                String text = match.text();
                if (!text.isEmpty()) {
                    return text;
                }
            }
        }
        return "";
    }

    /**
     * Returns the first non-empty value among the given attributes on the elements matching the
     * CSS query, trying the attribute names in order; returns the empty string if none is set.
     */
    private static String firstNonEmptyAttr(Element scope, String cssQuery, String... attributeNames) {
        Elements matches = scope.select(cssQuery);
        for (String attributeName : attributeNames) {
            String value = matches.attr(attributeName);
            if (!value.isEmpty()) {
                return value;
            }
        }
        return "";
    }
}