From 4b55d64bcf00237dcfbf38ef8206ea522252c02f Mon Sep 17 00:00:00 2001 From: YangYuting <3481369387@qq.com> Date: Tue, 24 Mar 2026 17:55:44 +0800 Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=96=87=E4=BB=B6=E8=87=B3?= =?UTF-8?q?=20'W3'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- W3/MovieCrawler.java | 141 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 W3/MovieCrawler.java diff --git a/W3/MovieCrawler.java b/W3/MovieCrawler.java new file mode 100644 index 0000000..851fbb0 --- /dev/null +++ b/W3/MovieCrawler.java @@ -0,0 +1,141 @@ +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +public class MovieCrawler { + private static final String BASE_URL = "https://movie.douban.com/top250"; + private static final int DELAY_MS = 1000; // 控制请求频率 + + public List crawlTopMovies(int limit) throws IOException { + List movies = new ArrayList<>(); + int page = 0; + + System.out.println("Starting to crawl movies..."); + + while (movies.size() < limit) { + String url = BASE_URL + "?start=" + page * 25; + System.out.println("Crawling page: " + url); + + try { + Document doc = Jsoup.connect(url) + .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") + .timeout(10000) + .get(); + + // 打印页面标题,确认是否成功获取页面 + System.out.println("Page title: " + doc.title()); + + // 选择电影元素 + Elements movieElements = doc.select(".item"); + System.out.println("Found " + movieElements.size() + " movie elements"); + + for (Element element : movieElements) { + if (movies.size() >= limit) break; + + int currentCount = movies.size() + 1; + System.out.println("Processing movie " + currentCount + "..."); + + try { + Movie movie = parseMovie(element); + if (movie != null) { + movies.add(movie); + System.out.println("Added movie: " + movie.getTitle()); + } else { + System.out.println("Skipping movie, parsing failed"); + } + + // 控制请求频率 + Thread.sleep(DELAY_MS); + } catch (Exception e) { + System.err.println("Error parsing movie: " + e.getMessage()); + e.printStackTrace(); + } + } + + page++; + } catch (Exception e) { + System.err.println("Error crawling page: " + e.getMessage()); + e.printStackTrace(); + break; + } + } + + System.out.println("Crawling finished. Found " + movies.size() + " movies."); + return movies; + } + + private Movie parseMovie(Element element) { + try { + // 提取标题 + Element titleElement = element.selectFirst(".hd .title"); + if (titleElement == null) { + System.err.println("Title element not found"); + return null; + } + String title = titleElement.text(); + System.out.println("Title: " + title); + + // 提取年份 + Element yearElement = element.selectFirst(".bd p:first-child"); + if (yearElement == null) { + System.err.println("Year element not found"); + return null; + } + String yearText = yearElement.text().trim(); + System.out.println("Year text: " + yearText); + + // 从字符串中提取年份 + int year = 0; + // 使用正则表达式提取年份 + java.util.regex.Matcher matcher = java.util.regex.Pattern.compile("(\\d{4})").matcher(yearText); + if (matcher.find()) { + year = Integer.parseInt(matcher.group(1)); + } + if (year == 0) { + System.err.println("Year not found in text: " + yearText); + return null; + } + System.out.println("Year: " + year); + + // 提取评分 + Element ratingElement = element.selectFirst(".rating_num"); + if (ratingElement == null) { + System.err.println("Rating element not found"); + return null; + } + String ratingText = ratingElement.text(); + System.out.println("Rating text: " + ratingText); + + double rating = Double.parseDouble(ratingText); + System.out.println("Rating: " + rating); + + // 提取类型 + String genre = ""; + String infoText = yearElement.text(); + if (infoText.contains("/")) { + String[] parts = infoText.split("/"); + if (parts.length > 2) { + genre = parts[2].trim(); + } + } + System.out.println("Genre: " + genre); + + // 简化处理,不进入详情页 + String director = ""; + String actors = ""; + String synopsis = ""; + + System.out.println("Parsed movie: " + title + " (" + year + ") - " + rating + " - " + genre); + return new Movie(title, year, rating, genre, director, actors, synopsis); + } catch (Exception e) { + System.err.println("Error parsing movie: " + e.getMessage()); + e.printStackTrace(); + return null; + } + } +}