diff --git a/实验二/MovieCrawler.java b/实验二/MovieCrawler.java new file mode 100644 index 0000000..64539d8 --- /dev/null +++ b/实验二/MovieCrawler.java @@ -0,0 +1,109 @@ +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +// 豆瓣电影爬虫:继承抽象父类,实现多态,修复403反爬 +public class MovieCrawler extends BaseMovieCrawler { + private static final String BASE_URL = "https://movie.douban.com/top250"; + // 延长请求间隔,降低反爬风险 + private static final int DELAY_MS = 2000; + + @Override + public List crawl(int limit) throws IOException { + List movies = new ArrayList<>(); + int page = 0; + + System.out.println("=== 开始爬取豆瓣Top250电影 ==="); + + while (movies.size() < limit) { + String url = BASE_URL + "?start=" + (page * 25); + System.out.println("正在爬取页面:" + url); + + try { + // 🔴 核心修复:添加完整请求头,模拟真实浏览器,绕过403反爬 + Document doc = Jsoup.connect(url) + .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") + .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9") + .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8") + .header("Referer", "https://movie.douban.com/top250") + .header("Cache-Control", "max-age=0") + .header("Upgrade-Insecure-Requests", "1") + .cookie("bid", "abc123xyz456") + .timeout(15000) + .get(); + + // 选择页面中所有电影条目 + Elements movieElements = doc.select(".item"); + System.out.println("当前页面找到 " + movieElements.size() + " 部电影"); + + // 遍历每个电影条目,解析并封装成Movie对象 + for (Element element : movieElements) { + if (movies.size() >= limit) break; + Movie movie = parseMovie(element); + if (movie != null) { + movies.add(movie); + System.out.println("成功爬取:" + movie.getTitle()); + } + // 控制请求间隔,防止被封 + Thread.sleep(DELAY_MS); + } + + page++; + } catch (Exception e) { + System.err.println("爬取页面异常:" + e.getMessage()); + e.printStackTrace(); + break; + } + } + + System.out.println("=== 爬取完成,共获取 " + movies.size() + " 部电影 ==="); + return movies; + } + + @Override + protected Movie parseMovie(Element element) { + try { + // 1. 提取电影标题 + Element titleElement = element.selectFirst(".hd .title"); + if (titleElement == null) return null; + String title = titleElement.text(); + + // 2. 提取年份(正则匹配4位数字) + Element infoElement = element.selectFirst(".bd p:first-child"); + if (infoElement == null) return null; + String infoText = infoElement.text(); + int year = 0; + Matcher matcher = Pattern.compile("(\\d{4})").matcher(infoText); + if (matcher.find()) { + year = Integer.parseInt(matcher.group(1)); + } + if (year == 0) return null; + + // 3. 提取评分 + Element ratingElement = element.selectFirst(".rating_num"); + if (ratingElement == null) return null; + double rating = Double.parseDouble(ratingElement.text()); + + // 4. 提取电影类型 + String genre = ""; + if (infoText.contains("/")) { + String[] parts = infoText.split("/"); + if (parts.length > 2) { + genre = parts[2].trim(); + } + } + + // 5. 封装成具体子类对象(院线电影示例) + return new TheatreMovie(title, year, rating, genre, 59.9); + } catch (Exception e) { + System.err.println("解析电影异常:" + e.getMessage()); + return null; + } + } +} \ No newline at end of file