You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
66 lines
2.4 KiB
66 lines
2.4 KiB
import org.jsoup.nodes.Document;
|
|
import org.jsoup.nodes.Element;
|
|
import org.jsoup.select.Elements;
|
|
|
|
import java.io.IOException;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
|
|
public class DangDangCrawlStrategy extends AbstractCrawlStrategy<CrawlResult> {
|
|
|
|
private static final String BASE_URL = "http://bang.dangdang.com/books/bestsellers/%d";
|
|
|
|
public String getBaseUrl() {
|
|
return BASE_URL;
|
|
}
|
|
|
|
public List<CrawlResult> crawlPage(int page) throws IOException {
|
|
List<CrawlResult> results = new ArrayList<CrawlResult>();
|
|
String url = String.format(BASE_URL, page);
|
|
Document doc = fetchDocument(url);
|
|
|
|
Elements bookElements = doc.select("li.clearfix");
|
|
if (bookElements.isEmpty()) {
|
|
bookElements = doc.select("li");
|
|
}
|
|
|
|
for (Element e : bookElements) {
|
|
CrawlResult result = parseItem(e);
|
|
if (result != null) {
|
|
results.add(result);
|
|
}
|
|
}
|
|
return results;
|
|
}
|
|
|
|
public CrawlResult parseItem(Element element) {
|
|
String title = element.select("a[title]").attr("title");
|
|
if (title == null || title.isEmpty() || title.length() < 10) {
|
|
title = element.select(".name a").text();
|
|
}
|
|
if (title == null || title.isEmpty() || title.length() < 10) {
|
|
return null;
|
|
}
|
|
|
|
String priceText = element.select("span.price_n").text();
|
|
if (priceText.isEmpty()) priceText = element.select(".price").text();
|
|
if (priceText.isEmpty()) priceText = element.select("[class*=price]").text();
|
|
|
|
String originalPriceText = element.select("span.price_r").text();
|
|
if (originalPriceText.isEmpty()) originalPriceText = priceText;
|
|
|
|
String imageUrl = element.select("img").attr("src");
|
|
if (imageUrl.isEmpty()) imageUrl = element.select("img").attr("data-original");
|
|
if (imageUrl.isEmpty()) imageUrl = element.select("img").attr("data-src");
|
|
|
|
String author = element.select("span.author").text();
|
|
if (author.isEmpty()) author = element.select("[class*=author]").text();
|
|
if (author.isEmpty()) author = "DangDang";
|
|
|
|
double price = parsePrice(priceText);
|
|
double originalPrice = parsePrice(originalPriceText);
|
|
double discount = parseDiscount(price, originalPrice);
|
|
|
|
return new CrawlResult(title, price, originalPrice, discount, imageUrl, author);
|
|
}
|
|
}
|