import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; // 豆瓣电影爬虫:继承抽象父类,实现多态,修复403反爬 public class MovieCrawler extends BaseMovieCrawler { private static final String BASE_URL = "https://movie.douban.com/top250"; // 延长请求间隔,降低反爬风险 private static final int DELAY_MS = 2000; @Override public List crawl(int limit) throws IOException { List movies = new ArrayList<>(); int page = 0; System.out.println("=== 开始爬取豆瓣Top250电影 ==="); while (movies.size() < limit) { String url = BASE_URL + "?start=" + (page * 25); System.out.println("正在爬取页面:" + url); try { // 🔴 核心修复:添加完整请求头,模拟真实浏览器,绕过403反爬 Document doc = Jsoup.connect(url) .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9") .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8") .header("Referer", "https://movie.douban.com/top250") .header("Cache-Control", "max-age=0") .header("Upgrade-Insecure-Requests", "1") .cookie("bid", "abc123xyz456") .timeout(15000) .get(); // 选择页面中所有电影条目 Elements movieElements = doc.select(".item"); System.out.println("当前页面找到 " + movieElements.size() + " 部电影"); // 遍历每个电影条目,解析并封装成Movie对象 for (Element element : movieElements) { if (movies.size() >= limit) break; Movie movie = parseMovie(element); if (movie != null) { movies.add(movie); System.out.println("成功爬取:" + movie.getTitle()); } // 控制请求间隔,防止被封 Thread.sleep(DELAY_MS); } page++; } catch (Exception e) { System.err.println("爬取页面异常:" + e.getMessage()); e.printStackTrace(); break; } } System.out.println("=== 爬取完成,共获取 " + movies.size() + " 部电影 ==="); return movies; } @Override protected Movie parseMovie(Element element) { try { // 1. 提取电影标题 Element titleElement = element.selectFirst(".hd .title"); if (titleElement == null) return null; String title = titleElement.text(); // 2. 提取年份(正则匹配4位数字) Element infoElement = element.selectFirst(".bd p:first-child"); if (infoElement == null) return null; String infoText = infoElement.text(); int year = 0; Matcher matcher = Pattern.compile("(\\d{4})").matcher(infoText); if (matcher.find()) { year = Integer.parseInt(matcher.group(1)); } if (year == 0) return null; // 3. 提取评分 Element ratingElement = element.selectFirst(".rating_num"); if (ratingElement == null) return null; double rating = Double.parseDouble(ratingElement.text()); // 4. 提取电影类型 String genre = ""; if (infoText.contains("/")) { String[] parts = infoText.split("/"); if (parts.length > 2) { genre = parts[2].trim(); } } // 5. 封装成具体子类对象(院线电影示例) return new TheatreMovie(title, year, rating, genre, 59.9); } catch (Exception e) { System.err.println("解析电影异常:" + e.getMessage()); return null; } } }