You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
59 lines
2.2 KiB
59 lines
2.2 KiB
package com.example.moviecli.strategy;
|
|
|
|
import com.example.moviecli.model.Movie;
|
|
import com.example.moviecli.exception.ParseFailedException;
|
|
import org.jsoup.nodes.Document;
|
|
import org.jsoup.nodes.Element;
|
|
import org.jsoup.select.Elements;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
|
|
public class DoubanTop250Strategy implements MovieCrawlStrategy {
|
|
@Override
|
|
public boolean supports(String url) {
|
|
return url.contains("movie.douban.com/top250");
|
|
}
|
|
|
|
@Override
|
|
public List<Movie> parse(Document doc) throws ParseFailedException {
|
|
try {
|
|
List<Movie> movies = new ArrayList<>();
|
|
Elements items = doc.select(".item");
|
|
for (Element item : items) {
|
|
String rankText = item.select(".pic em").text();
|
|
int rank = Integer.parseInt(rankText);
|
|
String title = item.select(".title").first().text();
|
|
String originalTitle = "";
|
|
Elements titles = item.select(".title");
|
|
if (titles.size() > 1) {
|
|
originalTitle = titles.get(1).text().replace("/", "").trim();
|
|
}
|
|
String score = item.select(".rating_num").text();
|
|
String info = item.select(".bd p").first().text();
|
|
String year = extractYear(info);
|
|
String director = extractDirector(info);
|
|
movies.add(new Movie(rank, title, originalTitle, score, year, director));
|
|
}
|
|
return movies;
|
|
} catch (Exception e) {
|
|
throw new ParseFailedException("豆瓣电影解析失败", e);
|
|
}
|
|
}
|
|
|
|
private String extractYear(String info) {
|
|
for (String part : info.split(" ")) {
|
|
if (part.matches("\\d{4}")) return part;
|
|
}
|
|
return "未知";
|
|
}
|
|
|
|
private String extractDirector(String info) {
|
|
if (info.contains("导演:")) {
|
|
int start = info.indexOf("导演:") + 3;
|
|
int end = info.indexOf(" ", start);
|
|
if (end == -1) end = info.length();
|
|
return info.substring(start, end).trim();
|
|
}
|
|
return "未知";
|
|
}
|
|
}
|