You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

100 lines
3.9 KiB

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawl strategy for a JD.com product listing (category {@code 1672,3272}).
 *
 * <p>A listing page is fetched, candidate product nodes are located with a series of
 * fallback CSS selectors, and each node is converted into a {@link CrawlResult}. When a
 * page yields no usable results, generated placeholder data is returned instead.
 *
 * <p>{@code fetchDocument}, {@code parsePrice} and {@code parseDiscount} are not defined in
 * this class; they are presumably inherited from {@link AbstractCrawlStrategy} -- confirm
 * against that class.
 */
public class JDCrawlStrategy extends AbstractCrawlStrategy<CrawlResult> {

    /** Listing URL template; the single {@code %d} placeholder receives the page number. */
    private static final String BASE_URL = "https://list.jd.com/list.html?cat=1672,3272&page=%d";

    /** Selectors tried in order to locate product nodes; the first non-empty match wins. */
    private static final String[] ITEM_SELECTORS = {"li.gl-item", "div.item", "[data-sku]"};

    /** Titles shorter than this are treated as noise and the item is skipped. */
    private static final int MIN_TITLE_LENGTH = 5;

    /** Seller name used when none of the seller selectors match. */
    private static final String DEFAULT_SELLER = "JD";

    /** Number of placeholder results generated per page by {@link #getMockData(int)}. */
    private static final int MOCK_ITEMS_PER_PAGE = 15;

    /**
     * Returns the listing URL template.
     *
     * @return the URL template containing a {@code %d} placeholder for the page number
     */
    public String getBaseUrl() {
        return BASE_URL;
    }

    /**
     * Crawls one listing page and converts its product nodes into results.
     *
     * <p>NOTE(review): when nothing can be parsed, generated placeholder data is returned
     * rather than an empty list, so callers cannot tell a failed crawl from a real one.
     *
     * @param page page number substituted into the listing URL
     * @return the parsed results, or placeholder data when the page produced none
     * @throws IOException declared for the document fetch
     */
    public List<CrawlResult> crawlPage(int page) throws IOException {
        List<CrawlResult> results = new ArrayList<CrawlResult>();
        Document doc = fetchDocument(String.format(BASE_URL, page));

        for (Element item : selectItems(doc)) {
            CrawlResult parsed = parseItem(item);
            if (parsed != null) {
                results.add(parsed);
            }
        }

        if (results.isEmpty()) {
            results.addAll(getMockData(page));
        }
        return results;
    }

    /**
     * Converts one product node into a result.
     *
     * @param element the product node to inspect
     * @return the parsed result, or {@code null} when the node has no title of at least
     *     {@value #MIN_TITLE_LENGTH} characters or no price text
     */
    public CrawlResult parseItem(Element element) {
        String title = element.select("a[title]").attr("title");
        if (title.isEmpty()) {
            title = firstText(element, "h3", ".name");
        }
        // The length check also rejects the empty title.
        if (title.length() < MIN_TITLE_LENGTH) {
            return null;
        }

        String priceText = firstText(element, ".price strong", ".price", "[class*=price]");
        if (priceText.isEmpty()) {
            return null;
        }

        // With no separate original price, the current price is reused.
        String originalPriceText = firstText(element, ".origin-price", ".price del");
        if (originalPriceText.isEmpty()) {
            originalPriceText = priceText;
        }

        String imageUrl = firstAttr(element.select("img"), "src", "data-lazy-img", "data-src");

        String seller = firstText(element, ".shop-name", ".store-name", ".p-shop a");
        if (seller.isEmpty()) {
            seller = DEFAULT_SELLER;
        }

        double price = parsePrice(priceText);
        double originalPrice = parsePrice(originalPriceText);
        double discount = parseDiscount(price, originalPrice);
        return new CrawlResult(title, price, originalPrice, discount, imageUrl, seller);
    }

    /** Returns the matches of the first selector in {@link #ITEM_SELECTORS} that finds any. */
    private static Elements selectItems(Document doc) {
        Elements items = null;
        for (String selector : ITEM_SELECTORS) {
            items = doc.select(selector);
            if (!items.isEmpty()) {
                break;
            }
        }
        return items;
    }

    /** Returns the text of the first selector that yields non-empty text, else {@code ""}. */
    private static String firstText(Element element, String... cssQueries) {
        for (String cssQuery : cssQueries) {
            String text = element.select(cssQuery).text();
            if (!text.isEmpty()) {
                return text;
            }
        }
        return "";
    }

    /** Returns the first non-empty value among the named attributes, else {@code ""}. */
    private static String firstAttr(Elements elements, String... attributeNames) {
        for (String attributeName : attributeNames) {
            String value = elements.attr(attributeName);
            if (!value.isEmpty()) {
                return value;
            }
        }
        return "";
    }

    /**
     * Generates {@value #MOCK_ITEMS_PER_PAGE} placeholder results for a page.
     *
     * <p>Titles, sellers and image URLs are derived from a running index; prices are random.
     *
     * @param page page number; any value is accepted, including zero and negatives
     * @return the generated placeholder results
     */
    private List<CrawlResult> getMockData(int page) {
        List<CrawlResult> results = new ArrayList<CrawlResult>();
        String[] categories = {"Womens", "Mens", "Shoes", "Sports", "Bags"};
        String[] brands = {"Uniqlo", "ZARA", "HM", "Nike", "Adidas", "LiNing", "Anta", "JD"};

        for (int i = 0; i < MOCK_ITEMS_PER_PAGE; i++) {
            int idx = (page - 1) * MOCK_ITEMS_PER_PAGE + i;
            // idx is negative for page <= 0 (or on overflow); a plain % would then give a
            // negative array index, so the remainder is wrapped into range.
            String brand = brands[wrapIndex(idx, brands.length)];
            String category = categories[wrapIndex(idx, categories.length)];

            String title = brand + " " + category + " Fashion " + (idx + 1);
            double price = 59 + Math.random() * 800;
            double originalPrice = price * (1.1 + Math.random() * 0.5);
            double discount = Math.round((price / originalPrice) * 100) / 10.0;
            String imageUrl = "https://img14.360buyimg.com/n1/jfs/" + idx + ".jpg";

            // Prices are rounded to two decimals for the result.
            results.add(new CrawlResult(title, Math.round(price * 100) / 100.0,
                    Math.round(originalPrice * 100) / 100.0, discount, imageUrl, brand));
        }
        return results;
    }

    /** Maps any int onto {@code [0, length)}; equals {@code idx % length} when idx >= 0. */
    private static int wrapIndex(int idx, int length) {
        int remainder = idx % length;
        return remainder < 0 ? remainder + length : remainder;
    }
}