You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
206 lines
7.7 KiB
206 lines
7.7 KiB
package com.crawler.spider;
|
|
|
|
import com.crawler.model.Movie;
|
|
import org.jsoup.Jsoup;
|
|
import org.jsoup.nodes.Document;
|
|
import org.jsoup.nodes.Element;
|
|
import org.jsoup.select.Elements;
|
|
|
|
import java.io.IOException;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
import java.util.concurrent.*;
|
|
|
|
public class DoubanSpider {
|
|
private static final String BASE_URL = "https://movie.douban.com/top250";
|
|
private static final int MAX_PAGES = 10;
|
|
private static final int THREAD_POOL_SIZE = 3;
|
|
private static final int REQUEST_DELAY = 1000;
|
|
|
|
public List<Movie> crawlMovies() {
|
|
List<Movie> movieList = new ArrayList<>();
|
|
ExecutorService executorService = Executors.newFixedThreadPool(THREAD_POOL_SIZE);
|
|
List<Future<List<Movie>>> futures = new ArrayList<>();
|
|
|
|
try {
|
|
for (int page = 0; page < MAX_PAGES; page++) {
|
|
final int currentPage = page;
|
|
futures.add(executorService.submit(() -> {
|
|
try {
|
|
Thread.sleep(REQUEST_DELAY);
|
|
return crawlPage(currentPage);
|
|
} catch (Exception e) {
|
|
e.printStackTrace();
|
|
return new ArrayList<>();
|
|
}
|
|
}));
|
|
}
|
|
|
|
for (Future<List<Movie>> future : futures) {
|
|
try {
|
|
movieList.addAll(future.get());
|
|
} catch (Exception e) {
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
} finally {
|
|
executorService.shutdown();
|
|
}
|
|
|
|
return movieList;
|
|
}
|
|
|
|
private List<Movie> crawlPage(int page) throws IOException {
|
|
List<Movie> movieList = new ArrayList<>();
|
|
String url = BASE_URL + "?start=" + (page * 25);
|
|
System.out.println("爬取页面: " + url);
|
|
|
|
Document document = Jsoup.connect(url)
|
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
|
.timeout(10000)
|
|
.get();
|
|
|
|
System.out.println("页面标题: " + document.title());
|
|
|
|
// 选择电影条目
|
|
Elements movieItems = document.select(".grid_view li");
|
|
System.out.println("找到电影条目数: " + movieItems.size());
|
|
|
|
for (Element item : movieItems) {
|
|
Movie movie = parseMovie(item);
|
|
if (movie != null) {
|
|
movieList.add(movie);
|
|
}
|
|
}
|
|
|
|
System.out.println("页面" + (page + 1) + "爬取成功,获取电影数: " + movieList.size());
|
|
return movieList;
|
|
}
|
|
|
|
private Movie parseMovie(Element item) {
|
|
Movie movie = new Movie();
|
|
|
|
try {
|
|
// 排名
|
|
Element rankElement = item.selectFirst(".pic em");
|
|
if (rankElement != null) {
|
|
movie.setRank(Integer.parseInt(rankElement.text().trim()));
|
|
}
|
|
|
|
// 标题
|
|
Element titleElement = item.selectFirst(".title");
|
|
if (titleElement != null) {
|
|
movie.setTitle(titleElement.text().trim());
|
|
}
|
|
|
|
// 评分
|
|
Element ratingElement = item.selectFirst(".rating_num");
|
|
if (ratingElement != null) {
|
|
movie.setRating(Double.parseDouble(ratingElement.text().trim()));
|
|
}
|
|
|
|
// 评价人数
|
|
Element ratingPeopleElement = item.selectFirst(".star span:nth-child(4)");
|
|
if (ratingPeopleElement != null) {
|
|
String ratingPeople = ratingPeopleElement.text().trim();
|
|
movie.setRatingPeople(Integer.parseInt(ratingPeople.replaceAll("[^0-9]", "")));
|
|
}
|
|
|
|
// 导演和演员
|
|
Element infoElement = item.selectFirst(".bd p:first-child");
|
|
if (infoElement != null) {
|
|
String info = infoElement.text().trim();
|
|
|
|
// 提取导演
|
|
if (info.contains("导演:")) {
|
|
int directorStart = info.indexOf("导演:") + 3;
|
|
int directorEnd = info.indexOf("主演:");
|
|
if (directorEnd == -1) {
|
|
directorEnd = info.indexOf(" ");
|
|
// 找到第一个数字年份的位置
|
|
for (int i = 0; i < info.length(); i++) {
|
|
if (Character.isDigit(info.charAt(i))) {
|
|
directorEnd = i;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if (directorEnd != -1) {
|
|
movie.setDirector(info.substring(directorStart, directorEnd).trim());
|
|
}
|
|
}
|
|
|
|
// 提取主演
|
|
if (info.contains("主演:")) {
|
|
int actorsStart = info.indexOf("主演:") + 3;
|
|
int actorsEnd = info.length();
|
|
// 找到第一个数字年份的位置
|
|
for (int i = actorsStart; i < info.length(); i++) {
|
|
if (Character.isDigit(info.charAt(i))) {
|
|
actorsEnd = i;
|
|
break;
|
|
}
|
|
}
|
|
movie.setActors(info.substring(actorsStart, actorsEnd).trim());
|
|
}
|
|
|
|
// 提取年份、国家/地区和类型
|
|
// 找到年份的开始位置(第一个数字)
|
|
int yearStart = -1;
|
|
for (int i = 0; i < info.length(); i++) {
|
|
if (Character.isDigit(info.charAt(i))) {
|
|
yearStart = i;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (yearStart != -1) {
|
|
// 提取年份(4位数字)
|
|
if (yearStart + 4 <= info.length()) {
|
|
String year = info.substring(yearStart, yearStart + 4);
|
|
if (year.matches("\\d{4}")) {
|
|
movie.setYear(year);
|
|
}
|
|
}
|
|
|
|
// 提取国家/地区和类型
|
|
int slashIndex = info.indexOf("/", yearStart);
|
|
if (slashIndex != -1) {
|
|
// 提取国家/地区
|
|
int nextSlashIndex = info.indexOf("/", slashIndex + 1);
|
|
if (nextSlashIndex != -1) {
|
|
String country = info.substring(slashIndex + 1, nextSlashIndex).trim();
|
|
movie.setCountry(country);
|
|
|
|
// 提取类型
|
|
String genre = info.substring(nextSlashIndex + 1).trim();
|
|
// 取第一个类型
|
|
if (!genre.isEmpty()) {
|
|
String[] genres = genre.split(" ");
|
|
if (genres.length > 0) {
|
|
movie.setGenre(genres[0]);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// 简介
|
|
Element quoteElement = item.selectFirst(".inq");
|
|
if (quoteElement != null) {
|
|
movie.setQuote(quoteElement.text().trim());
|
|
}
|
|
|
|
// 过滤无效电影
|
|
if (movie.getTitle() == null || movie.getTitle().isEmpty()) {
|
|
return null;
|
|
}
|
|
|
|
return movie;
|
|
} catch (Exception e) {
|
|
e.printStackTrace();
|
|
return null;
|
|
}
|
|
}
|
|
}
|