You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
170 lines
6.5 KiB
170 lines
6.5 KiB
package com.crawler.strategy;
|
|
|
|
import com.crawler.model.Article;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
|
|
|
|
public class DoubanTop250Strategy implements CrawlStrategy {
|
|
|
|
private static final int TOTAL_MOVIES = 250;
|
|
private static final int MOVIES_PER_PAGE = 25;
|
|
|
|
@Override
|
|
public List<Article> crawl(String url) {
|
|
List<Article> allMovies = new ArrayList<>();
|
|
try {
|
|
System.out.println("🎬 开始爬取豆瓣电影 Top 250...");
|
|
System.out.println("⏳ 预计需要爬取 " + (TOTAL_MOVIES / MOVIES_PER_PAGE) + " 页");
|
|
|
|
for (int page = 0; page < TOTAL_MOVIES; page += MOVIES_PER_PAGE) {
|
|
String pageUrl = "https://movie.douban.com/top250?start=" + page + "&filter=";
|
|
System.out.println("📄 正在爬取第 " + (page / MOVIES_PER_PAGE + 1) + " 页...");
|
|
|
|
List<Article> pageMovies = crawlPage(pageUrl, page / MOVIES_PER_PAGE + 1);
|
|
allMovies.addAll(pageMovies);
|
|
|
|
System.out.println("✅ 第 " + (page / MOVIES_PER_PAGE + 1) + " 页完成,已获取 " + allMovies.size() + " 部电影");
|
|
|
|
try {
|
|
Thread.sleep(1000);
|
|
} catch (InterruptedException e) {
|
|
Thread.currentThread().interrupt();
|
|
break;
|
|
}
|
|
}
|
|
|
|
System.out.println("🎉 完成!共爬取 " + allMovies.size() + " 部电影");
|
|
} catch (Exception e) {
|
|
System.err.println("❌ 爬取失败: " + e.getMessage());
|
|
Article errorArticle = new Article();
|
|
errorArticle.setTitle("Error crawling Douban Top 250");
|
|
errorArticle.setUrl(url);
|
|
errorArticle.setContent("Error details: " + e.getMessage());
|
|
errorArticle.setSource("douban");
|
|
allMovies.add(errorArticle);
|
|
}
|
|
return allMovies;
|
|
}
|
|
|
|
private List<Article> crawlPage(String url, int pageNum) {
|
|
List<Article> movies = new ArrayList<>();
|
|
try {
|
|
URL urlObj = new URL(url);
|
|
HttpURLConnection connection = (HttpURLConnection) urlObj.openConnection();
|
|
connection.setRequestMethod("GET");
|
|
connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
|
|
connection.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
|
|
connection.setConnectTimeout(15000);
|
|
connection.setReadTimeout(15000);
|
|
|
|
StringBuilder html = new StringBuilder();
|
|
try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
|
|
String line;
|
|
while ((line = reader.readLine()) != null) {
|
|
html.append(line).append("\n");
|
|
}
|
|
}
|
|
|
|
movies = parseMovies(html.toString());
|
|
} catch (Exception e) {
|
|
System.err.println("⚠️ 第 " + pageNum + " 页爬取失败: " + e.getMessage());
|
|
}
|
|
return movies;
|
|
}
|
|
|
|
private List<Article> parseMovies(String html) {
|
|
List<Article> movies = new ArrayList<>();
|
|
|
|
String moviePattern = "<div class=\"item\">[\\s\\S]*?</div>\\s*</div>\\s*</div>";
|
|
Pattern pattern = Pattern.compile(moviePattern, Pattern.DOTALL);
|
|
Matcher matcher = pattern.matcher(html);
|
|
|
|
while (matcher.find()) {
|
|
try {
|
|
Article movie = parseSingleMovie(matcher.group());
|
|
if (movie != null) {
|
|
movies.add(movie);
|
|
}
|
|
} catch (Exception e) {
|
|
continue;
|
|
}
|
|
}
|
|
return movies;
|
|
}
|
|
|
|
private Article parseSingleMovie(String movieHtml) {
|
|
Article movie = new Article();
|
|
movie.setSource("douban");
|
|
|
|
try {
|
|
Pattern titlePattern = Pattern.compile("<span class=\"title\">(.*?)</span>");
|
|
Matcher titleMatcher = titlePattern.matcher(movieHtml);
|
|
if (titleMatcher.find()) {
|
|
movie.setTitle(titleMatcher.group(1));
|
|
}
|
|
|
|
Pattern linkPattern = Pattern.compile("<a href=\"(.*?)\"");
|
|
Matcher linkMatcher = linkPattern.matcher(movieHtml);
|
|
if (linkMatcher.find()) {
|
|
movie.setUrl(linkMatcher.group(1));
|
|
}
|
|
|
|
Pattern ratingPattern = Pattern.compile("<span class=\"rating_num\">(.*?)</span>");
|
|
Matcher ratingMatcher = ratingPattern.matcher(movieHtml);
|
|
String rating = "";
|
|
if (ratingMatcher.find()) {
|
|
rating = ratingMatcher.group(1);
|
|
}
|
|
|
|
Pattern yearPattern = Pattern.compile("(\\d{4})\\s*/");
|
|
Matcher yearMatcher = yearPattern.matcher(movieHtml);
|
|
String year = "";
|
|
if (yearMatcher.find()) {
|
|
year = yearMatcher.group(1);
|
|
}
|
|
|
|
Pattern quotePattern = Pattern.compile("<span class=\"inq\">(.*?)</span>");
|
|
Matcher quoteMatcher = quotePattern.matcher(movieHtml);
|
|
String quote = "";
|
|
if (quoteMatcher.find()) {
|
|
quote = quoteMatcher.group(1);
|
|
}
|
|
|
|
Pattern infoPattern = Pattern.compile("<p class=\"\">(.*?)</p>", Pattern.DOTALL);
|
|
Matcher infoMatcher = infoPattern.matcher(movieHtml);
|
|
String info = "";
|
|
if (infoMatcher.find()) {
|
|
info = infoMatcher.group(1).replaceAll("<br\\s*/?>", "\n").replaceAll("<[^>]+>", "").trim();
|
|
}
|
|
|
|
StringBuilder content = new StringBuilder();
|
|
content.append("🎬 电影名称: ").append(movie.getTitle()).append("\n");
|
|
content.append("⭐ 评分: ").append(rating).append("\n");
|
|
content.append("📅 年份: ").append(year).append("\n");
|
|
if (!quote.isEmpty()) {
|
|
content.append("💬 简介: ").append(quote).append("\n");
|
|
}
|
|
content.append("\n📝 详细信息:\n").append(info);
|
|
|
|
movie.setContent(content.toString());
|
|
movie.setAuthor("豆瓣电影");
|
|
|
|
} catch (Exception e) {
|
|
return null;
|
|
}
|
|
|
|
return movie;
|
|
}
|
|
|
|
@Override
|
|
public String getStrategyName() {
|
|
return "douban";
|
|
}
|
|
}
|