You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

96 lines
3.8 KiB

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawl strategy that scrapes movie listings from the Maoyan landing page.
 *
 * <p>The page markup is probed with several alternative CSS selectors, tried in a
 * fixed order, so that the strategy keeps working across different page layouts.
 * When nothing can be parsed from the fetched document, a fixed list of sample
 * entries is returned instead (see {@link #crawlPage(int)}).
 */
public class MaoYanCrawlStrategy extends AbstractCrawlStrategy<CrawlResult> {

    private static final String BASE_URL = "https://www.maoyan.com/";

    /** Candidate selectors for the listing items, tried in order until one matches. */
    private static final String[] ITEM_SELECTORS = {
        ".movie-item", ".show-item", "div.item", ".movie-list dd"
    };

    /** Titles shorter than this many characters are rejected by {@link #parseItem(Element)}. */
    private static final int MIN_TITLE_LENGTH = 3;

    /** Factor applied to the scraped price to derive the reported original price. */
    private static final double ORIGINAL_PRICE_FACTOR = 1.2;

    /** Performer value used when none of the performer selectors yields any text. */
    private static final String DEFAULT_PERFORMER = "Maoyan";

    /**
     * Returns the URL this strategy crawls.
     *
     * @return the Maoyan base URL
     */
    @Override
    public String getBaseUrl() {
        return BASE_URL;
    }

    /**
     * Fetches the base URL and converts every recognisable listing item into a
     * {@link CrawlResult}.
     *
     * <p>NOTE(review): {@code page} is currently not used; every call fetches the same
     * base URL. Confirm with the callers whether pagination is expected here.
     *
     * <p>If no item could be parsed, the result is filled with the hard-coded sample
     * entries from {@link #getMockData()} rather than being returned empty.
     *
     * @param page requested page number (ignored by this implementation)
     * @return the parsed results, or the sample entries when nothing was parsed
     * @throws IOException propagated from {@code fetchDocument}, which is defined
     *         outside this class
     */
    @Override
    public List<CrawlResult> crawlPage(int page) throws IOException {
        List<CrawlResult> results = new ArrayList<CrawlResult>();
        Document doc = fetchDocument(BASE_URL);

        for (Element item : selectItems(doc)) {
            CrawlResult result = parseItem(item);
            if (result != null) {
                results.add(result);
            }
        }

        // Best-effort fallback: substitute sample data when the page yielded nothing.
        if (results.isEmpty()) {
            results.addAll(getMockData());
        }
        return results;
    }

    /**
     * Builds a {@link CrawlResult} from a single listing element.
     *
     * <p>The title is taken from the first of {@code h3}, {@code .title}, the
     * {@code title} attribute of {@code a[title]}, or {@code .movie-name} that is
     * non-empty; the price text from {@code .price}, {@code .ticket-price} or
     * {@code .movie-price}; the performer from {@code .actor}, {@code .tag} or
     * {@code .info}, defaulting to {@code "Maoyan"}.
     *
     * <p>The original price is not read from the page: it is computed as the scraped
     * price multiplied by {@code 1.2}.
     *
     * @param element the listing element to parse; may be {@code null}
     * @return the parsed result, or {@code null} when {@code element} is {@code null},
     *         the title is shorter than three characters, or no price text is found
     */
    @Override
    public CrawlResult parseItem(Element element) {
        if (element == null) {
            return null;
        }

        String title = firstNonEmptyText(element, "h3", ".title");
        if (title.isEmpty()) {
            // This fallback reads an attribute rather than element text.
            title = element.select("a[title]").attr("title");
        }
        if (title.isEmpty()) {
            title = firstNonEmptyText(element, ".movie-name");
        }
        // An empty title has length 0, so this single check also covers "not found".
        if (title.length() < MIN_TITLE_LENGTH) {
            return null;
        }

        String priceText = firstNonEmptyText(element, ".price", ".ticket-price", ".movie-price");
        if (priceText.isEmpty()) {
            return null;
        }

        Elements images = element.select("img");
        String imageUrl = images.attr("src");
        if (imageUrl.isEmpty()) {
            imageUrl = images.attr("data-src");
        }

        String performer = firstNonEmptyText(element, ".actor", ".tag", ".info");
        if (performer.isEmpty()) {
            performer = DEFAULT_PERFORMER;
        }

        double price = parsePrice(priceText);
        double originalPrice = price * ORIGINAL_PRICE_FACTOR;
        double discount = parseDiscount(price, originalPrice);
        return new CrawlResult(title, price, originalPrice, discount, imageUrl, performer);
    }

    /**
     * Selects the listing items using the first entry of {@link #ITEM_SELECTORS} that
     * matches anything; returns the (empty) result of the last selector otherwise.
     */
    private static Elements selectItems(Document doc) {
        Elements items = null;
        for (String selector : ITEM_SELECTORS) {
            items = doc.select(selector);
            if (!items.isEmpty()) {
                break;
            }
        }
        return items;
    }

    /**
     * Returns the text of the first selector that yields non-empty text within
     * {@code element}, or the empty string when none does.
     */
    private static String firstNonEmptyText(Element element, String... selectors) {
        for (String selector : selectors) {
            String text = element.select(selector).text();
            if (!text.isEmpty()) {
                return text;
            }
        }
        return "";
    }

    /**
     * Returns the hard-coded sample entries used when the live page yields no results.
     *
     * @return a new mutable list of sample results
     */
    private List<CrawlResult> getMockData() {
        List<CrawlResult> results = new ArrayList<CrawlResult>();
        results.add(new CrawlResult("Fast & Furious 10", 35.00, 45.00, 7.8, "https://example.com/fast10.jpg", "Universal Pictures"));
        results.add(new CrawlResult("Spider-Man: Across the Spider-Verse", 32.00, 42.00, 7.6, "https://example.com/spider.jpg", "Sony Pictures"));
        results.add(new CrawlResult("Transformers: Rise of the Beasts", 38.00, 48.00, 7.9, "https://example.com/transformers.jpg", "Paramount"));
        results.add(new CrawlResult("Guardians of the Galaxy 3", 36.00, 46.00, 7.8, "https://example.com/gotg3.jpg", "Marvel Studios"));
        results.add(new CrawlResult("Slam Dunk", 30.00, 40.00, 7.5, "https://example.com/slamdunk.jpg", "Toei Animation"));
        results.add(new CrawlResult("Lost in the Stars", 28.00, 38.00, 7.4, "https://example.com/missing.jpg", "Chen Sicheng"));
        results.add(new CrawlResult("Never Say Never", 25.00, 35.00, 7.1, "https://example.com/cage.jpg", "Wang Baoqiang"));
        results.add(new CrawlResult("No More Bets", 32.00, 42.00, 7.6, "https://example.com/gambling.jpg", "Shen Ao"));
        return results;
    }
}