You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
109 lines
4.4 KiB
109 lines
4.4 KiB
import org.jsoup.Jsoup;
|
|
import org.jsoup.nodes.Document;
|
|
import org.jsoup.nodes.Element;
|
|
import org.jsoup.select.Elements;
|
|
import java.io.IOException;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
|
|
// 豆瓣电影爬虫:继承抽象父类,实现多态,修复403反爬
|
|
public class MovieCrawler extends BaseMovieCrawler {
|
|
private static final String BASE_URL = "https://movie.douban.com/top250";
|
|
// 延长请求间隔,降低反爬风险
|
|
private static final int DELAY_MS = 2000;
|
|
|
|
@Override
|
|
public List<Movie> crawl(int limit) throws IOException {
|
|
List<Movie> movies = new ArrayList<>();
|
|
int page = 0;
|
|
|
|
System.out.println("=== 开始爬取豆瓣Top250电影 ===");
|
|
|
|
while (movies.size() < limit) {
|
|
String url = BASE_URL + "?start=" + (page * 25);
|
|
System.out.println("正在爬取页面:" + url);
|
|
|
|
try {
|
|
// 🔴 核心修复:添加完整请求头,模拟真实浏览器,绕过403反爬
|
|
Document doc = Jsoup.connect(url)
|
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
|
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
|
|
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
|
|
.header("Referer", "https://movie.douban.com/top250")
|
|
.header("Cache-Control", "max-age=0")
|
|
.header("Upgrade-Insecure-Requests", "1")
|
|
.cookie("bid", "abc123xyz456")
|
|
.timeout(15000)
|
|
.get();
|
|
|
|
// 选择页面中所有电影条目
|
|
Elements movieElements = doc.select(".item");
|
|
System.out.println("当前页面找到 " + movieElements.size() + " 部电影");
|
|
|
|
// 遍历每个电影条目,解析并封装成Movie对象
|
|
for (Element element : movieElements) {
|
|
if (movies.size() >= limit) break;
|
|
Movie movie = parseMovie(element);
|
|
if (movie != null) {
|
|
movies.add(movie);
|
|
System.out.println("成功爬取:" + movie.getTitle());
|
|
}
|
|
// 控制请求间隔,防止被封
|
|
Thread.sleep(DELAY_MS);
|
|
}
|
|
|
|
page++;
|
|
} catch (Exception e) {
|
|
System.err.println("爬取页面异常:" + e.getMessage());
|
|
e.printStackTrace();
|
|
break;
|
|
}
|
|
}
|
|
|
|
System.out.println("=== 爬取完成,共获取 " + movies.size() + " 部电影 ===");
|
|
return movies;
|
|
}
|
|
|
|
@Override
|
|
protected Movie parseMovie(Element element) {
|
|
try {
|
|
// 1. 提取电影标题
|
|
Element titleElement = element.selectFirst(".hd .title");
|
|
if (titleElement == null) return null;
|
|
String title = titleElement.text();
|
|
|
|
// 2. 提取年份(正则匹配4位数字)
|
|
Element infoElement = element.selectFirst(".bd p:first-child");
|
|
if (infoElement == null) return null;
|
|
String infoText = infoElement.text();
|
|
int year = 0;
|
|
Matcher matcher = Pattern.compile("(\\d{4})").matcher(infoText);
|
|
if (matcher.find()) {
|
|
year = Integer.parseInt(matcher.group(1));
|
|
}
|
|
if (year == 0) return null;
|
|
|
|
// 3. 提取评分
|
|
Element ratingElement = element.selectFirst(".rating_num");
|
|
if (ratingElement == null) return null;
|
|
double rating = Double.parseDouble(ratingElement.text());
|
|
|
|
// 4. 提取电影类型
|
|
String genre = "";
|
|
if (infoText.contains("/")) {
|
|
String[] parts = infoText.split("/");
|
|
if (parts.length > 2) {
|
|
genre = parts[2].trim();
|
|
}
|
|
}
|
|
|
|
// 5. 封装成具体子类对象(院线电影示例)
|
|
return new TheatreMovie(title, year, rating, genre, 59.9);
|
|
} catch (Exception e) {
|
|
System.err.println("解析电影异常:" + e.getMessage());
|
|
return null;
|
|
}
|
|
}
|
|
}
|