import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.IOException; import java.util.ArrayList; import java.util.List; public class MovieCrawler { private static final String BASE_URL = "https://movie.douban.com/top250"; private static final int DELAY_MS = 1000; // 控制请求频率 public List crawlTopMovies(int limit) throws IOException { List movies = new ArrayList<>(); int page = 0; System.out.println("Starting to crawl movies..."); while (movies.size() < limit) { String url = BASE_URL + "?start=" + page * 25; System.out.println("Crawling page: " + url); try { Document doc = Jsoup.connect(url) .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") .timeout(10000) .get(); // 打印页面标题,确认是否成功获取页面 System.out.println("Page title: " + doc.title()); // 选择电影元素 Elements movieElements = doc.select(".item"); System.out.println("Found " + movieElements.size() + " movie elements"); for (Element element : movieElements) { if (movies.size() >= limit) break; int currentCount = movies.size() + 1; System.out.println("Processing movie " + currentCount + "..."); try { Movie movie = parseMovie(element); if (movie != null) { movies.add(movie); System.out.println("Added movie: " + movie.getTitle()); } else { System.out.println("Skipping movie, parsing failed"); } // 控制请求频率 Thread.sleep(DELAY_MS); } catch (Exception e) { System.err.println("Error parsing movie: " + e.getMessage()); e.printStackTrace(); } } page++; } catch (Exception e) { System.err.println("Error crawling page: " + e.getMessage()); e.printStackTrace(); break; } } System.out.println("Crawling finished. Found " + movies.size() + " movies."); return movies; } private Movie parseMovie(Element element) { try { // 提取标题 Element titleElement = element.selectFirst(".hd .title"); if (titleElement == null) { System.err.println("Title element not found"); return null; } String title = titleElement.text(); System.out.println("Title: " + title); // 提取年份 Element yearElement = element.selectFirst(".bd p:first-child"); if (yearElement == null) { System.err.println("Year element not found"); return null; } String yearText = yearElement.text().trim(); System.out.println("Year text: " + yearText); // 从字符串中提取年份 int year = 0; // 使用正则表达式提取年份 java.util.regex.Matcher matcher = java.util.regex.Pattern.compile("(\\d{4})").matcher(yearText); if (matcher.find()) { year = Integer.parseInt(matcher.group(1)); } if (year == 0) { System.err.println("Year not found in text: " + yearText); return null; } System.out.println("Year: " + year); // 提取评分 Element ratingElement = element.selectFirst(".rating_num"); if (ratingElement == null) { System.err.println("Rating element not found"); return null; } String ratingText = ratingElement.text(); System.out.println("Rating text: " + ratingText); double rating = Double.parseDouble(ratingText); System.out.println("Rating: " + rating); // 提取类型 String genre = ""; String infoText = yearElement.text(); if (infoText.contains("/")) { String[] parts = infoText.split("/"); if (parts.length > 2) { genre = parts[2].trim(); } } System.out.println("Genre: " + genre); // 简化处理,不进入详情页 String director = ""; String actors = ""; String synopsis = ""; System.out.println("Parsed movie: " + title + " (" + year + ") - " + rating + " - " + genre); return new Movie(title, year, rating, genre, director, actors, synopsis); } catch (Exception e) { System.err.println("Error parsing movie: " + e.getMessage()); e.printStackTrace(); return null; } } }