package com.crawler.spider;

import com.crawler.model.Movie;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.*;

/**
 * Crawls the paginated movie listing at {@link #BASE_URL} and converts each
 * list entry into a {@link Movie}.
 *
 * <p>Pages are fetched concurrently on a fixed-size thread pool. Crawling is
 * best-effort: a page or entry that fails to download or parse is reported on
 * the console and skipped, it never aborts the whole crawl.
 *
 * <p>NOTE(review): the CSS selectors below assume the listing markup
 * ({@code .grid_view li}, {@code .pic em}, {@code .rating_num}, ...) — verify
 * them against the live page if results come back empty.
 */
public class DoubanSpider {

    private static final String BASE_URL = "https://movie.douban.com/top250";
    private static final int MAX_PAGES = 10;
    private static final int THREAD_POOL_SIZE = 3;
    /** Delay, in milliseconds, that every page task waits before issuing its request. */
    private static final int REQUEST_DELAY = 1000;
    /** Step of the {@code start} query parameter between consecutive pages. */
    private static final int PAGE_SIZE = 25;

    /** Label that precedes the director name(s) in an entry's info line. */
    private static final String DIRECTOR_LABEL = "导演:";
    /** Label that precedes the leading actors in an entry's info line. */
    private static final String ACTORS_LABEL = "主演:";

    /**
     * Crawls all {@link #MAX_PAGES} listing pages and collects the parsed movies.
     *
     * <p>Results are appended in page order because the futures are drained in
     * submission order. If the calling thread is interrupted while waiting, its
     * interrupt flag is restored, pending pages are abandoned and the movies
     * gathered so far are returned.
     *
     * @return the movies that could be crawled; never {@code null}, possibly empty
     */
    public List<Movie> crawlMovies() {
        List<Movie> movieList = new ArrayList<>();
        ExecutorService executorService = Executors.newFixedThreadPool(THREAD_POOL_SIZE);
        List<Future<List<Movie>>> futures = new ArrayList<>();
        try {
            for (int page = 0; page < MAX_PAGES; page++) {
                final int currentPage = page;
                Callable<List<Movie>> task = () -> {
                    try {
                        Thread.sleep(REQUEST_DELAY);
                        return crawlPage(currentPage);
                    } catch (InterruptedException e) {
                        // Keep the interrupt visible to the pool instead of swallowing it.
                        Thread.currentThread().interrupt();
                        return new ArrayList<>();
                    } catch (Exception e) {
                        // Best effort: a failed page contributes no movies.
                        e.printStackTrace();
                        return new ArrayList<>();
                    }
                };
                futures.add(executorService.submit(task));
            }
            for (Future<List<Movie>> future : futures) {
                try {
                    movieList.addAll(future.get());
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    break;
                } catch (ExecutionException e) {
                    e.printStackTrace();
                }
            }
        } finally {
            // After a normal run every task has finished, so this equals shutdown();
            // after an early exit it also drops the pages that are still queued.
            executorService.shutdownNow();
        }
        return movieList;
    }

    /**
     * Downloads one listing page and parses every movie entry found on it.
     *
     * @param page zero-based page index; translated to {@code ?start=page * 25}
     * @return the movies parsed from the page, in document order
     * @throws IOException if the page cannot be fetched
     */
    private List<Movie> crawlPage(int page) throws IOException {
        List<Movie> movieList = new ArrayList<>();
        String url = BASE_URL + "?start=" + (page * PAGE_SIZE);
        System.out.println("爬取页面: " + url);

        Document document = Jsoup.connect(url)
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
                .timeout(10000)
                .get();
        System.out.println("页面标题: " + document.title());

        // Select the movie entries.
        Elements movieItems = document.select(".grid_view li");
        System.out.println("找到电影条目数: " + movieItems.size());

        for (Element item : movieItems) {
            Movie movie = parseMovie(item);
            if (movie != null) {
                movieList.add(movie);
            }
        }
        System.out.println("页面" + (page + 1) + "爬取成功,获取电影数: " + movieList.size());
        return movieList;
    }

    /**
     * Converts one list entry into a {@link Movie}.
     *
     * @param item the {@code li} element of a single movie
     * @return the populated movie, or {@code null} when the entry has no title
     *         or an unexpected error occurs while parsing it
     */
    private Movie parseMovie(Element item) {
        Movie movie = new Movie();
        try {
            // Rank
            Element rankElement = item.selectFirst(".pic em");
            if (rankElement != null) {
                movie.setRank(Integer.parseInt(rankElement.text().trim()));
            }

            // Title
            Element titleElement = item.selectFirst(".title");
            if (titleElement != null) {
                movie.setTitle(titleElement.text().trim());
            }

            // Rating
            Element ratingElement = item.selectFirst(".rating_num");
            if (ratingElement != null) {
                movie.setRating(Double.parseDouble(ratingElement.text().trim()));
            }

            // Number of ratings: keep only the digits of the text.
            Element ratingPeopleElement = item.selectFirst(".star span:nth-child(4)");
            if (ratingPeopleElement != null) {
                String digits = ratingPeopleElement.text().replaceAll("[^0-9]", "");
                // Guard: a text without digits must not discard the whole movie.
                if (!digits.isEmpty()) {
                    movie.setRatingPeople(Integer.parseInt(digits));
                }
            }

            // Director, actors, year, country/region and genre share one text line.
            Element infoElement = item.selectFirst(".bd p:first-child");
            if (infoElement != null) {
                parseInfo(movie, infoElement.text().trim());
            }

            // Short quote
            Element quoteElement = item.selectFirst(".inq");
            if (quoteElement != null) {
                movie.setQuote(quoteElement.text().trim());
            }

            // Filter out entries without a usable title.
            if (movie.getTitle() == null || movie.getTitle().isEmpty()) {
                return null;
            }
            return movie;
        } catch (RuntimeException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Fills director, actors, year, country and genre from the info line, which
     * is expected to look like
     * {@code 导演: <names> 主演: <names> <year> / <country> / <genres>}.
     * Fields that cannot be located are left untouched.
     */
    private static void parseInfo(Movie movie, String info) {
        String director = extractDirector(info);
        if (director != null) {
            movie.setDirector(director);
        }

        String actors = extractActors(info);
        if (actors != null) {
            movie.setActors(actors);
        }

        // The year is taken to start at the first digit of the line.
        int yearStart = indexOfFirstDigit(info, 0);
        if (yearStart == -1) {
            return;
        }
        if (yearStart + 4 <= info.length()) {
            String year = info.substring(yearStart, yearStart + 4);
            if (year.matches("\\d{4}")) {
                movie.setYear(year);
            }
        }

        // Country and genre are the LAST two '/'-separated segments after the
        // year. Using the last two (rather than the first two) keeps entries
        // that list several release years, e.g. "1961(..) / 1964(..) / CN / ..",
        // from reporting a year as the country.
        int genreSlash = info.lastIndexOf('/');
        if (genreSlash <= yearStart) {
            return;
        }
        int countrySlash = info.lastIndexOf('/', genreSlash - 1);
        if (countrySlash <= yearStart) {
            return;
        }
        movie.setCountry(info.substring(countrySlash + 1, genreSlash).trim());

        // Only the first genre is kept.
        String genre = info.substring(genreSlash + 1).trim();
        if (!genre.isEmpty()) {
            String[] genres = genre.split(" ");
            if (genres.length > 0) {
                movie.setGenre(genres[0]);
            }
        }
    }

    /**
     * Returns the text between the director label and the actors label. When
     * there is no actors label the director runs up to the first digit (the
     * year), or to the end of the line if there is no digit either.
     *
     * @return the trimmed director text, or {@code null} if the label is absent
     */
    private static String extractDirector(String info) {
        int labelIndex = info.indexOf(DIRECTOR_LABEL);
        if (labelIndex == -1) {
            return null;
        }
        int start = labelIndex + DIRECTOR_LABEL.length();
        // Search from 'start' so the end can never precede the start.
        int end = info.indexOf(ACTORS_LABEL, start);
        if (end == -1) {
            end = indexOfFirstDigit(info, start);
        }
        if (end == -1) {
            end = info.length();
        }
        return info.substring(start, end).trim();
    }

    /**
     * Returns the text between the actors label and the first following digit
     * (the year), or the rest of the line when no digit follows.
     *
     * @return the trimmed actors text, or {@code null} if the label is absent
     */
    private static String extractActors(String info) {
        int labelIndex = info.indexOf(ACTORS_LABEL);
        if (labelIndex == -1) {
            return null;
        }
        int start = labelIndex + ACTORS_LABEL.length();
        int end = indexOfFirstDigit(info, start);
        if (end == -1) {
            end = info.length();
        }
        return info.substring(start, end).trim();
    }

    /**
     * Finds the first digit character in {@code text} at or after {@code from}.
     *
     * @return the index of that digit, or {@code -1} if there is none
     */
    private static int indexOfFirstDigit(String text, int from) {
        for (int i = Math.max(from, 0); i < text.length(); i++) {
            if (Character.isDigit(text.charAt(i))) {
                return i;
            }
        }
        return -1;
    }
}