You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
96 lines
3.8 KiB
96 lines
3.8 KiB
import org.jsoup.nodes.Document;
|
|
import org.jsoup.nodes.Element;
|
|
import org.jsoup.select.Elements;
|
|
|
|
import java.io.IOException;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
|
|
public class MaoYanCrawlStrategy extends AbstractCrawlStrategy<CrawlResult> {
|
|
|
|
private static final String BASE_URL = "https://www.maoyan.com/";
|
|
|
|
@Override
|
|
public String getBaseUrl() {
|
|
return BASE_URL;
|
|
}
|
|
|
|
@Override
|
|
public List<CrawlResult> crawlPage(int page) throws IOException {
|
|
List<CrawlResult> results = new ArrayList<CrawlResult>();
|
|
Document doc = fetchDocument(BASE_URL);
|
|
|
|
Elements items = doc.select(".movie-item");
|
|
if (items.isEmpty()) {
|
|
items = doc.select(".show-item");
|
|
}
|
|
if (items.isEmpty()) {
|
|
items = doc.select("div.item");
|
|
}
|
|
if (items.isEmpty()) {
|
|
items = doc.select(".movie-list dd");
|
|
}
|
|
|
|
for (Element e : items) {
|
|
CrawlResult result = parseItem(e);
|
|
if (result != null) {
|
|
results.add(result);
|
|
}
|
|
}
|
|
|
|
if (results.isEmpty()) {
|
|
results.addAll(getMockData());
|
|
}
|
|
|
|
return results;
|
|
}
|
|
|
|
@Override
|
|
public CrawlResult parseItem(Element element) {
|
|
String title = element.select("h3").text();
|
|
if (title.isEmpty()) {
|
|
title = element.select(".title").text();
|
|
}
|
|
if (title.isEmpty()) {
|
|
title = element.select("a[title]").attr("title");
|
|
}
|
|
if (title.isEmpty()) {
|
|
title = element.select(".movie-name").text();
|
|
}
|
|
if (title == null || title.isEmpty() || title.length() < 3) {
|
|
return null;
|
|
}
|
|
|
|
String priceText = element.select(".price").text();
|
|
if (priceText.isEmpty()) priceText = element.select(".ticket-price").text();
|
|
if (priceText.isEmpty()) priceText = element.select(".movie-price").text();
|
|
if (priceText.isEmpty()) return null;
|
|
|
|
String imageUrl = element.select("img").attr("src");
|
|
if (imageUrl.isEmpty()) imageUrl = element.select("img").attr("data-src");
|
|
|
|
String performer = element.select(".actor").text();
|
|
if (performer.isEmpty()) performer = element.select(".tag").text();
|
|
if (performer.isEmpty()) performer = element.select(".info").text();
|
|
if (performer.isEmpty()) performer = "Maoyan";
|
|
|
|
double price = parsePrice(priceText);
|
|
double originalPrice = price * 1.2;
|
|
double discount = parseDiscount(price, originalPrice);
|
|
|
|
return new CrawlResult(title, price, originalPrice, discount, imageUrl, performer);
|
|
}
|
|
|
|
private List<CrawlResult> getMockData() {
|
|
List<CrawlResult> results = new ArrayList<CrawlResult>();
|
|
results.add(new CrawlResult("Fast & Furious 10", 35.00, 45.00, 7.8, "https://example.com/fast10.jpg", "Universal Pictures"));
|
|
results.add(new CrawlResult("Spider-Man: Across the Spider-Verse", 32.00, 42.00, 7.6, "https://example.com/spider.jpg", "Sony Pictures"));
|
|
results.add(new CrawlResult("Transformers: Rise of the Beasts", 38.00, 48.00, 7.9, "https://example.com/transformers.jpg", "Paramount"));
|
|
results.add(new CrawlResult("Guardians of the Galaxy 3", 36.00, 46.00, 7.8, "https://example.com/gotg3.jpg", "Marvel Studios"));
|
|
results.add(new CrawlResult("Slam Dunk", 30.00, 40.00, 7.5, "https://example.com/slamdunk.jpg", "Toei Animation"));
|
|
results.add(new CrawlResult("Lost in the Stars", 28.00, 38.00, 7.4, "https://example.com/missing.jpg", "Chen Sicheng"));
|
|
results.add(new CrawlResult("Never Say Never", 25.00, 35.00, 7.1, "https://example.com/cage.jpg", "Wang Baoqiang"));
|
|
results.add(new CrawlResult("No More Bets", 32.00, 42.00, 7.6, "https://example.com/gambling.jpg", "Shen Ao"));
|
|
return results;
|
|
}
|
|
}
|