You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

313 lines
12 KiB

package strategy;
import model.CrawlResult;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import exception.ParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
/**
 * Crawl strategy for the Maoyan film listing.
 *
 * <p>Each page is fetched from {@link #BASE_URL} and parsed with a series of fallback CSS
 * selectors. When the fetched document yields no items (or no document is returned), a page of
 * the built-in {@link #BACKUP_MOVIES} table is served instead.
 *
 * <p>Ratings are mapped onto the generic {@code CrawlResult} numeric fields: the value passed as
 * price is {@code rating * 10}, the original price is 10% above that, and the discount slot
 * carries the rating rounded to one decimal.
 */
public class MovieStrategy extends AbstractCrawlStrategy {

    private static final Logger logger = LoggerFactory.getLogger(MovieStrategy.class);

    /** Listing URL template; {@code %d} is replaced by the item offset of the requested page. */
    private static final String BASE_URL = "https://www.maoyan.com/films?offset=%d";

    /** Display name of the site, also used as the "source" label in result descriptions. */
    private static final String SITE_NAME = "猫眼电影";

    /** Items per page; drives the URL offset, the per-page result cap and backup paging. */
    private static final int PAGE_SIZE = 30;

    /** Rating used when a score is missing or cannot be parsed. */
    private static final double UNKNOWN_RATING = 0.0;

    /** Selectors tried in order to locate the movie entries of a listing page. */
    private static final String[] ITEM_SELECTORS = {
        "div.movie-item",
        ".movie-list .movie-item",
        "dl.movie-list dd",
        "[class*=movie-item]",
        "div.card"
    };

    /** Selectors tried in order to find a movie title inside one entry. */
    private static final String[] TITLE_SELECTORS = {
        "p.name a",
        ".movie-title",
        "[class*=title]",
        "a"
    };

    /** Selectors tried in order to find a movie category inside one entry. */
    private static final String[] CATEGORY_SELECTORS = {
        "p.classify",
        "[class*=tag]"
    };

    /** Looser score selectors used when {@code p.score i} gives nothing usable. */
    private static final String[] SCORE_FALLBACK_SELECTORS = {
        ".score",
        "[class*=score]"
    };

    /**
     * Fallback data served when the live page yields nothing. Each row is
     * {@code {title, rating, category}}; rows are paged in groups of {@link #PAGE_SIZE}.
     *
     * <p>NOTE(review): the table holds 116 rows although results are labelled "Top100", and
     * "千与千寻" and "你的名字" each appear twice. Left as-is so page contents stay stable.
     */
    private static final String[][] BACKUP_MOVIES = {
        {"肖申克的救赎", "9.7", "剧情/犯罪"},
        {"霸王别姬", "9.6", "剧情/爱情"},
        {"阿甘正传", "9.5", "剧情/爱情"},
        {"泰坦尼克号", "9.4", "剧情/爱情"},
        {"千与千寻", "9.4", "剧情/动画"},
        {"盗梦空间", "9.3", "剧情/科幻"},
        {"星际穿越", "9.4", "科幻/冒险"},
        {"忠犬八公", "9.4", "剧情/家庭"},
        {"海上钢琴师", "9.3", "剧情/音乐"},
        {"楚门的世界", "9.4", "剧情/科幻"},
        {"三傻大闹宝莱坞", "9.2", "剧情/喜剧"},
        {"机器人总动员", "9.3", "动画/冒险"},
        {"疯狂动物城", "9.2", "动画/冒险"},
        {"寻梦环游记", "9.1", "动画/音乐"},
        {"飞屋环游记", "9.0", "动画/冒险"},
        {"神偷奶爸", "8.6", "动画/喜剧"},
        {"超能陆战队", "8.7", "动画/动作"},
        {"冰雪奇缘", "8.4", "动画/冒险"},
        {"大话西游之大圣娶亲", "9.2", "喜剧/爱情"},
        {"大话西游之月光宝盒", "9.0", "喜剧/奇幻"},
        {"东成西就", "8.8", "喜剧/奇幻"},
        {"唐伯虎点秋香", "8.7", "喜剧/爱情"},
        {"九品芝麻官", "8.6", "喜剧/剧情"},
        {"功夫", "8.8", "动作/喜剧"},
        {"少林足球", "8.4", "喜剧/运动"},
        {"无间道", "9.3", "剧情/犯罪"},
        {"活着", "9.3", "剧情/历史"},
        {"我不是药神", "9.0", "剧情/喜剧"},
        {"哪吒之魔童降世", "8.8", "动画/奇幻"},
        {"流浪地球", "8.0", "科幻/冒险"},
        {"疯狂的外星人", "7.5", "喜剧/科幻"},
        {"飞驰人生", "7.8", "喜剧/运动"},
        {"满城尽带黄金甲", "7.2", "剧情/战争"},
        {"让子弹飞", "8.8", "剧情/喜剧"},
        {"邪不压正", "7.4", "剧情/动作"},
        {"阳光灿烂的日子", "8.8", "剧情/爱情"},
        {"重庆森林", "8.8", "剧情/爱情"},
        {"春光乍泄", "8.9", "剧情/爱情"},
        {"花样年华", "8.7", "剧情/爱情"},
        {"阿飞正传", "8.5", "剧情/爱情"},
        {"倩女幽魂", "8.7", "爱情/恐怖"},
        {"青蛇", "8.6", "剧情/奇幻"},
        {"大闹天宫", "8.4", "动画/奇幻"},
        {"天书奇谭", "9.0", "动画/奇幻"},
        {"哪吒闹海", "9.1", "动画/奇幻"},
        {"大鱼海棠", "7.0", "动画/奇幻"},
        {"西游记之大圣归来", "8.3", "动画/奇幻"},
        {"白蛇:缘起", "7.9", "动画/奇幻"},
        {"风语咒", "7.8", "动画/奇幻"},
        {"大护法", "7.9", "动画/奇幻"},
        {"你的名字", "8.5", "动画/爱情"},
        {"千与千寻", "9.4", "动画/奇幻"},
        {"哈尔的移动城堡", "9.1", "动画/奇幻"},
        {"龙猫", "9.2", "动画/家庭"},
        {"天空之城", "9.1", "动画/奇幻"},
        {"幽灵公主", "8.9", "动画/奇幻"},
        {"魔女宅急便", "8.7", "动画/奇幻"},
        {"侧耳倾听", "8.9", "动画/爱情"},
        {"萤火之森", "8.9", "动画/奇幻"},
        {"秒速5厘米", "8.3", "动画/爱情"},
        {"你的名字", "8.5", "动画/爱情"},
        {"天气之子", "7.8", "动画/爱情"},
        {"铃芽之旅", "7.8", "动画/奇幻"},
        {"刀剑神域", "8.5", "动画/动作"},
        {"进击的巨人", "9.3", "动画/动作"},
        {"东京食尸鬼", "8.6", "动画/恐怖"},
        {"鬼灭之刃", "8.8", "动画/动作"},
        {"一拳超人", "9.4", "动画/动作"},
        {"银魂", "9.6", "动画/喜剧"},
        {"七龙珠", "9.4", "动画/动作"},
        {"海贼王", "9.5", "动画/冒险"},
        {"火影忍者", "9.1", "动画/动作"},
        {"死神", "8.9", "动画/动作"},
        {"灌篮高手", "9.6", "动画/运动"},
        {"网球王子", "8.7", "动画/运动"},
        {"名侦探柯南", "9.2", "动画/悬疑"},
        {"蜡笔小新", "9.2", "动画/喜剧"},
        {"哆啦A梦", "9.5", "动画/奇幻"},
        {"宠物小精灵", "8.8", "动画/冒险"},
        {"数码宝贝", "8.9", "动画/冒险"},
        {"Transformers", "8.5", "科幻/动作"},
        {"Avengers", "8.5", "科幻/动作"},
        {"Iron Man", "8.6", "科幻/动作"},
        {"Spider-Man", "8.4", "科幻/动作"},
        {"Batman", "8.8", "科幻/动作"},
        {"The Dark Knight", "9.2", "剧情/动作"},
        {"Inception", "9.3", "科幻/动作"},
        {"Interstellar", "9.4", "科幻/冒险"},
        {"The Shawshank Redemption", "9.7", "剧情/犯罪"},
        {"Forrest Gump", "9.5", "剧情/爱情"},
        {"The Godfather", "9.3", "剧情/犯罪"},
        {"Pulp Fiction", "8.9", "剧情/犯罪"},
        {"Schindler's List", "9.5", "剧情/历史"},
        {"The Lord of the Rings", "9.3", "奇幻/冒险"},
        {"The Hobbit", "8.9", "奇幻/冒险"},
        {"Harry Potter", "8.8", "奇幻/冒险"},
        {"Fantastic Beasts", "7.5", "奇幻/冒险"},
        {"Star Wars", "8.7", "科幻/冒险"},
        {"Avatar", "8.6", "科幻/冒险"},
        {"Titanic", "9.4", "剧情/爱情"},
        {"The Notebook", "8.8", "剧情/爱情"},
        {"La La Land", "8.6", "剧情/爱情"},
        {"The Princess Bride", "8.7", "奇幻/喜剧"},
        {"Back to the Future", "8.6", "科幻/喜剧"},
        {"The Matrix", "8.7", "科幻/动作"},
        {"Terminator", "8.6", "科幻/动作"},
        {"Jurassic Park", "8.2", "科幻/冒险"},
        {"The Lion King", "9.0", "动画/冒险"},
        {"Beauty and the Beast", "8.5", "动画/爱情"},
        {"Toy Story", "8.5", "动画/喜剧"},
        {"Finding Nemo", "8.4", "动画/冒险"},
        {"Up", "9.0", "动画/冒险"},
        {"Inside Out", "8.8", "动画/喜剧"},
        {"Coco", "9.1", "动画/音乐"},
        {"Soul", "8.8", "动画/喜剧"},
        {"Monsters Inc", "8.8", "动画/喜剧"}
    };

    /**
     * Returns the listing URL template.
     *
     * @return the URL template, still containing the {@code %d} offset placeholder
     */
    @Override
    public String getBaseUrl() {
        return BASE_URL;
    }

    /**
     * Returns the display name of the crawled site.
     *
     * @return the site name
     */
    @Override
    public String getSiteName() {
        return SITE_NAME;
    }

    /**
     * Crawls one listing page, falling back to the built-in backup table when the live page
     * produces no results.
     *
     * @param page 1-based page number
     * @return the parsed results, or a page of backup data; may be empty when both sources
     *     have nothing for this page
     * @throws IOException if raised by {@code fetchDocument}
     * @throws ParseException if raised by {@code fetchDocument}
     */
    @Override
    public List<CrawlResult> crawlPage(int page) throws IOException, ParseException {
        List<CrawlResult> results = new ArrayList<>();
        // long arithmetic so very large page numbers cannot wrap to a bogus offset
        long offset = (page - 1L) * PAGE_SIZE;
        String url = String.format(BASE_URL, offset);
        logger.info("正在爬取猫眼电影第 {} 页: {}", page, url);

        Document doc = fetchDocument(url);
        if (doc != null) {
            results = parseMoviePage(doc, page);
        }
        if (results.isEmpty()) {
            logger.info("使用备用电影数据");
            results = getBackupMovies(page);
        }
        logger.info("猫眼电影第 {} 页获取 {} 条数据", page, results.size());
        return results;
    }

    /**
     * Extracts up to {@link #PAGE_SIZE} movies from a fetched listing document.
     *
     * <p>Entries without a resolvable title are skipped. A failure while handling a single
     * entry is logged and does not abort the rest of the page.
     *
     * @param doc the fetched listing document
     * @param page 1-based page number (currently unused by the parsing itself)
     * @return the parsed results, possibly empty
     */
    private List<CrawlResult> parseMoviePage(Document doc, int page) {
        List<CrawlResult> results = new ArrayList<>();
        for (Element item : selectMovieItems(doc)) {
            try {
                String title = firstNonEmptyText(item, TITLE_SELECTORS);
                if (title.isEmpty()) {
                    logger.debug("跳过无法解析标题的电影项");
                    continue;
                }
                double rating = parseRating(extractScore(item));
                String category = firstNonEmptyText(item, CATEGORY_SELECTORS);
                String imageUrl = extractImageUrl(item);
                results.add(buildResult(
                        title, rating, String.format("%.1f", rating), category, imageUrl, SITE_NAME));
                if (results.size() >= PAGE_SIZE) {
                    break;
                }
            } catch (Exception e) {
                // Best effort: one malformed entry must not discard the whole page.
                logger.debug("解析电影项失败: {}", e.getMessage(), e);
            }
        }
        return results;
    }

    /**
     * Returns the matches of the first selector in {@link #ITEM_SELECTORS} that matches anything.
     *
     * @param doc the listing document
     * @return the matched entries, or an empty collection when no selector matches
     */
    private static Elements selectMovieItems(Document doc) {
        Elements found = new Elements();
        for (String selector : ITEM_SELECTORS) {
            found = doc.select(selector);
            if (!found.isEmpty()) {
                break;
            }
        }
        return found;
    }

    /**
     * Returns the text of the first element matched by {@code selectors} (tried in order)
     * whose text is non-empty.
     *
     * @param item the entry to search in
     * @param selectors CSS selectors in priority order
     * @return the first non-empty text, or {@code ""} when none is found
     */
    private static String firstNonEmptyText(Element item, String[] selectors) {
        for (String selector : selectors) {
            Element match = item.selectFirst(selector);
            if (match != null) {
                String text = match.text();
                if (!text.isEmpty()) {
                    return text;
                }
            }
        }
        return "";
    }

    /**
     * Extracts the raw score text of an entry.
     *
     * <p>{@code p.score i} is used verbatim; when it is absent, empty or {@code "0"}, the
     * looser selectors are tried in order and stripped to digits and dots.
     *
     * @param item the entry to search in
     * @return the raw score text, possibly empty or not a valid number
     */
    private static String extractScore(Element item) {
        Element exact = item.selectFirst("p.score i");
        String score = exact != null ? exact.text() : "";
        for (String selector : SCORE_FALLBACK_SELECTORS) {
            if (!score.isEmpty() && !score.equals("0")) {
                break;
            }
            Element match = item.selectFirst(selector);
            if (match != null) {
                score = match.text().replaceAll("[^0-9.]", "");
            }
        }
        return score;
    }

    /**
     * Converts a raw score into a rating.
     *
     * <p>A missing or unparsable score yields {@link #UNKNOWN_RATING}. No rating is invented,
     * so results stay deterministic and "no score element" and "unreadable score" are treated
     * the same.
     *
     * @param score the raw score text, may be empty
     * @return the parsed rating, or {@link #UNKNOWN_RATING}
     */
    private static double parseRating(String score) {
        if (score == null || score.isEmpty()) {
            return UNKNOWN_RATING;
        }
        try {
            double value = Double.parseDouble(score);
            // Double.parseDouble accepts "NaN"/"Infinity"; neither is a usable rating.
            if (Double.isNaN(value) || Double.isInfinite(value)) {
                return UNKNOWN_RATING;
            }
            return value;
        } catch (NumberFormatException e) {
            logger.debug("无法解析评分 '{}',按 0 分处理", score);
            return UNKNOWN_RATING;
        }
    }

    /**
     * Returns the {@code src} of the poster image, falling back to the first {@code img}.
     *
     * @param item the entry to search in
     * @return the image URL as written in the markup, or {@code ""} when none is found
     */
    private static String extractImageUrl(Element item) {
        Element img = item.selectFirst("img.movie-poster");
        String src = img != null ? img.attr("src") : "";
        if (src.isEmpty()) {
            img = item.selectFirst("img");
            if (img != null) {
                src = img.attr("src");
            }
        }
        return src;
    }

    /**
     * Builds a result from a movie's data, deriving the numeric fields from its rating.
     *
     * @param title movie title
     * @param rating numeric rating used for the derived numeric fields
     * @param ratingText rating as it should appear in the description
     * @param category category text; omitted from the description when empty
     * @param imageUrl poster URL, may be empty
     * @param source source label appended to the description
     * @return the assembled result
     */
    private static CrawlResult buildResult(String title, double rating, String ratingText,
            String category, String imageUrl, String source) {
        double price = rating * 10;
        double originalPrice = price * 1.1;
        double discount = Math.round(rating * 10) / 10.0;

        StringBuilder info = new StringBuilder("评分: ").append(ratingText);
        if (!category.isEmpty()) {
            info.append(" | 类型: ").append(category);
        }
        info.append(" | 来源: ").append(source);
        return new CrawlResult(title, price, originalPrice, discount, imageUrl, info.toString());
    }

    /**
     * Returns one page of the built-in backup table.
     *
     * @param page 1-based page number; pages outside the table (including values below 1)
     *     yield an empty list instead of an out-of-bounds access
     * @return up to {@link #PAGE_SIZE} backup results
     */
    private List<CrawlResult> getBackupMovies(int page) {
        List<CrawlResult> results = new ArrayList<>();
        // long arithmetic: (page - 1) * PAGE_SIZE must not overflow into a negative index
        long start = (page - 1L) * PAGE_SIZE;
        if (page >= 1 && start < BACKUP_MOVIES.length) {
            int end = (int) Math.min(start + PAGE_SIZE, BACKUP_MOVIES.length);
            for (int i = (int) start; i < end; i++) {
                String[] movie = BACKUP_MOVIES[i];
                String ratingStr = movie[1];
                results.add(buildResult(
                        movie[0], Double.parseDouble(ratingStr), ratingStr, movie[2], "",
                        SITE_NAME + "Top100"));
            }
        }
        logger.info("备用电影数据生成完成,第 {} 页获取 {} 条数据", page, results.size());
        return results;
    }

    /**
     * Not implemented for this strategy; {@link #crawlPage(int)} parses entries itself and
     * never calls this method.
     *
     * @param element ignored
     * @return always {@code null}
     */
    @Override
    public CrawlResult parseItem(Element element) throws ParseException {
        return null;
    }

    /**
     * Returns the number of items requested per page.
     *
     * @return the page size
     */
    @Override
    public int getPageSize() {
        return PAGE_SIZE;
    }
}