You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

206 lines
7.7 KiB

package com.crawler.spider;
import com.crawler.model.Movie;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.*;
/**
 * Crawls the Douban "Top 250" movie list pages and converts every list entry into a
 * {@link Movie}.
 *
 * <p>Pages are requested concurrently on a fixed-size thread pool; each task pauses for
 * {@link #REQUEST_DELAY} milliseconds before sending its request. A page that cannot be
 * fetched, or an entry that cannot be parsed, is reported on standard error and skipped so
 * that the remaining results are still returned.
 */
public class DoubanSpider {
    /** Address of the list; individual pages are selected with a {@code start} offset. */
    private static final String BASE_URL = "https://movie.douban.com/top250";
    /** Number of list pages requested by {@link #crawlMovies()}. */
    private static final int MAX_PAGES = 10;
    /** Amount by which the {@code start} offset grows from one page to the next. */
    private static final int PAGE_SIZE = 25;
    /** Number of worker threads fetching pages in parallel. */
    private static final int THREAD_POOL_SIZE = 3;
    /** Pause, in milliseconds, taken by every task before it issues its request. */
    private static final int REQUEST_DELAY = 1000;
    /** Label that precedes the director name(s) in the credits text. */
    private static final String DIRECTOR_LABEL = "导演:";
    /** Label that precedes the actor name(s) in the credits text. */
    private static final String ACTORS_LABEL = "主演:";
    /** Number of digits that make up a year. */
    private static final int YEAR_LENGTH = 4;

    /**
     * Fetches all pages and collects the movies found on them.
     *
     * <p>If the calling thread is interrupted while waiting, its interrupt flag is restored,
     * the outstanding page tasks are cancelled and the movies gathered so far are returned.
     *
     * @return the parsed movies in page order; never {@code null}, possibly incomplete when
     *         pages failed or the crawl was interrupted
     */
    public List<Movie> crawlMovies() {
        List<Movie> movieList = new ArrayList<>();
        ExecutorService executorService = Executors.newFixedThreadPool(THREAD_POOL_SIZE);
        List<Future<List<Movie>>> futures = new ArrayList<>();
        try {
            for (int page = 0; page < MAX_PAGES; page++) {
                final int currentPage = page;
                Callable<List<Movie>> task = () -> crawlPageQuietly(currentPage);
                futures.add(executorService.submit(task));
            }
            for (Future<List<Movie>> future : futures) {
                try {
                    movieList.addAll(future.get());
                } catch (InterruptedException e) {
                    // Keep the interrupt visible to the caller and stop the workers
                    // instead of silently carrying on with the next future.
                    Thread.currentThread().interrupt();
                    executorService.shutdownNow();
                    break;
                } catch (ExecutionException e) {
                    e.printStackTrace();
                }
            }
        } finally {
            executorService.shutdown();
        }
        return movieList;
    }

    /**
     * Worker body: waits for the request delay, then crawls one page.
     *
     * @param page zero-based page index
     * @return the movies of that page, or an empty list when the page failed or the worker
     *         was interrupted
     */
    private List<Movie> crawlPageQuietly(int page) {
        try {
            Thread.sleep(REQUEST_DELAY);
            return crawlPage(page);
        } catch (InterruptedException e) {
            // Raised by shutdownNow(); restore the flag so the pool thread can wind down.
            Thread.currentThread().interrupt();
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
        }
        return new ArrayList<>();
    }

    /**
     * Downloads one list page and parses every entry on it.
     *
     * @param page zero-based page index; translated into the {@code start} query offset
     * @return the successfully parsed movies of the page
     * @throws IOException if the page cannot be retrieved
     */
    private List<Movie> crawlPage(int page) throws IOException {
        List<Movie> movieList = new ArrayList<>();
        String url = BASE_URL + "?start=" + (page * PAGE_SIZE);
        System.out.println("爬取页面: " + url);
        Document document = Jsoup.connect(url)
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
                .timeout(10000)
                .get();
        System.out.println("页面标题: " + document.title());
        // Select the movie entries
        Elements movieItems = document.select(".grid_view li");
        System.out.println("找到电影条目数: " + movieItems.size());
        for (Element item : movieItems) {
            Movie movie = parseMovie(item);
            if (movie != null) {
                movieList.add(movie);
            }
        }
        System.out.println("页面" + (page + 1) + "爬取成功,获取电影数: " + movieList.size());
        return movieList;
    }

    /**
     * Converts one list entry into a {@link Movie}.
     *
     * @param item the {@code li} element of the entry
     * @return the populated movie, or {@code null} when the entry has no title or reading it
     *         raised an exception
     */
    private Movie parseMovie(Element item) {
        Movie movie = new Movie();
        try {
            // Rank
            Element rankElement = item.selectFirst(".pic em");
            if (rankElement != null) {
                movie.setRank(Integer.parseInt(rankElement.text().trim()));
            }
            // Title
            Element titleElement = item.selectFirst(".title");
            if (titleElement != null) {
                movie.setTitle(titleElement.text().trim());
            }
            // Rating
            Element ratingElement = item.selectFirst(".rating_num");
            if (ratingElement != null) {
                movie.setRating(Double.parseDouble(ratingElement.text().trim()));
            }
            // Number of ratings: keep only the digits of the text
            Element ratingPeopleElement = item.selectFirst(".star span:nth-child(4)");
            if (ratingPeopleElement != null) {
                String ratingPeople = ratingPeopleElement.text().trim();
                movie.setRatingPeople(Integer.parseInt(ratingPeople.replaceAll("[^0-9]", "")));
            }
            // Director, actors, year, country/region and genre share one paragraph
            Element infoElement = item.selectFirst(".bd p:first-child");
            if (infoElement != null) {
                parseInfo(infoElement.text().trim(), movie);
            }
            // One-line quote
            Element quoteElement = item.selectFirst(".inq");
            if (quoteElement != null) {
                movie.setQuote(quoteElement.text().trim());
            }
            // Entries without a title are considered invalid
            if (movie.getTitle() == null || movie.getTitle().isEmpty()) {
                return null;
            }
            return movie;
        } catch (Exception e) {
            // A failure while reading one entry discards only that entry.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Extracts director, actors, year, country/region and first genre from the credits text
     * and stores whichever of them are present on {@code movie}.
     *
     * <p>The text is expected to look like
     * {@code 导演: <names> 主演: <names> <year> / <country> / <genres>}; every part is optional.
     *
     * @param info  trimmed text of the credits paragraph
     * @param movie movie to populate
     */
    private void parseInfo(String info, Movie movie) {
        int directorLabel = info.indexOf(DIRECTOR_LABEL);
        int actorsLabel = info.indexOf(ACTORS_LABEL);
        int directorStart = directorLabel == -1 ? -1 : directorLabel + DIRECTOR_LABEL.length();
        int actorsStart = actorsLabel == -1 ? -1 : actorsLabel + ACTORS_LABEL.length();

        // The year line follows the credits, so only look for it behind the last label.
        // Requiring a full four-digit run keeps a stray digit inside a name from being
        // mistaken for the start of the year.
        int tailStart = indexOfYear(info, Math.max(directorStart, actorsStart));

        if (directorStart != -1) {
            int directorEnd;
            if (actorsLabel >= directorStart) {
                directorEnd = actorsLabel;
            } else if (tailStart != -1) {
                directorEnd = tailStart;
            } else {
                directorEnd = info.length();
            }
            // directorEnd is never smaller than directorStart here, so substring cannot throw.
            movie.setDirector(info.substring(directorStart, directorEnd).trim());
        }

        if (actorsStart != -1) {
            int actorsEnd = tailStart != -1 ? tailStart : info.length();
            movie.setActors(info.substring(actorsStart, actorsEnd).trim());
        }

        if (tailStart == -1) {
            return;
        }
        movie.setYear(info.substring(tailStart, tailStart + YEAR_LENGTH));

        // Country/region and genres are the last two slash-separated fields. Counting from
        // the end keeps them correct when more than one field precedes them (for example
        // when several release years are listed). Limit -1 keeps trailing empty fields.
        String[] fields = info.substring(tailStart).split("/", -1);
        if (fields.length >= 3) {
            movie.setCountry(fields[fields.length - 2].trim());
            String genres = fields[fields.length - 1].trim();
            if (!genres.isEmpty()) {
                // Only the first genre is kept
                movie.setGenre(genres.split(" ")[0]);
            }
        }
    }

    /**
     * Finds the first run of four consecutive ASCII digits.
     *
     * @param text text to search
     * @param from index to start searching at; negative values are treated as 0
     * @return index of the first digit of the run, or -1 if there is none
     */
    private static int indexOfYear(String text, int from) {
        int run = 0;
        for (int i = Math.max(from, 0); i < text.length(); i++) {
            char c = text.charAt(i);
            run = (c >= '0' && c <= '9') ? run + 1 : 0;
            if (run == YEAR_LENGTH) {
                return i - YEAR_LENGTH + 1;
            }
        }
        return -1;
    }
}