package com.crawler.strategy;

import com.crawler.model.Article;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawl strategy that walks the paginated Douban "Top 250" movie list and turns each
 * list entry into an {@link Article}.
 *
 * <p>Pages are fetched sequentially with a one-second pause between requests. A page that
 * fails to download or parse is logged to stderr and skipped, so the result may contain
 * fewer than {@value #TOTAL_MOVIES} entries.
 *
 * <p>NOTE(review): the HTML-matching patterns below were reconstructed after the markup
 * inside the original string literals was lost. They follow the commonly seen structure of
 * the Douban list page ({@code div.item}, {@code span.title}, {@code span.rating_num},
 * {@code span.inq}, {@code div.bd > p}) and should be verified against the live page.
 */
public class DoubanTop250Strategy implements CrawlStrategy {

    private static final int TOTAL_MOVIES = 250;
    private static final int MOVIES_PER_PAGE = 25;
    private static final long PAGE_DELAY_MILLIS = 1000L;
    private static final int TIMEOUT_MILLIS = 15000;

    /** One list entry: from the opening item container to the end of its list element. */
    private static final Pattern ITEM_PATTERN =
            Pattern.compile("<div class=\"item\">[\\s\\S]*?</li>");
    /** First title span of an entry (additional title spans hold alternative titles). */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<span class=\"title\">(.*?)</span>");
    /** First anchor of an entry; presumably the link to the movie's detail page. */
    private static final Pattern LINK_PATTERN =
            Pattern.compile("<a\\s+href=\"([^\"]+)\"");
    private static final Pattern RATING_PATTERN =
            Pattern.compile("<span class=\"rating_num\"[^>]*>(.*?)</span>");
    private static final Pattern QUOTE_PATTERN =
            Pattern.compile("<span class=\"inq\"[^>]*>(.*?)</span>");
    /** First paragraph of the entry body (director / cast / year / country / genre). */
    private static final Pattern INFO_PATTERN =
            Pattern.compile("<div class=\"bd\">\\s*<p[^>]*>(.*?)</p>", Pattern.DOTALL);
    /**
     * A four-digit year at the start of a line of the cleaned info text. Searching the info
     * text (instead of the whole entry HTML) keeps digits inside link URLs from being
     * mistaken for the release year.
     */
    private static final Pattern YEAR_PATTERN =
            Pattern.compile("(?m)^((?:19|20)\\d{2})(?!\\d)");

    private static final Pattern BR_TAG = Pattern.compile("(?i)<br\\s*/?>");
    private static final Pattern ANY_TAG = Pattern.compile("<[^>]+>");
    private static final Pattern LINE_PADDING = Pattern.compile("[ \\t\\r]*\\n\\s*");

    /**
     * Crawls every page of the Top 250 list.
     *
     * @param url only used as the URL of the error article if the crawl aborts; the list
     *            pages themselves are always fetched from the fixed Douban address
     * @return the movies collected so far; on an unexpected failure a single error
     *         {@link Article} describing the problem is appended
     */
    @Override
    public List<Article> crawl(String url) {
        List<Article> allMovies = new ArrayList<>(TOTAL_MOVIES);
        try {
            System.out.println("🎬 开始爬取豆瓣电影 Top 250...");
            System.out.println("⏳ 预计需要爬取 " + (TOTAL_MOVIES / MOVIES_PER_PAGE) + " 页");

            for (int start = 0; start < TOTAL_MOVIES; start += MOVIES_PER_PAGE) {
                int pageNum = start / MOVIES_PER_PAGE + 1;
                String pageUrl = "https://movie.douban.com/top250?start=" + start + "&filter=";
                System.out.println("📄 正在爬取第 " + pageNum + " 页...");

                allMovies.addAll(crawlPage(pageUrl, pageNum));
                System.out.println("✅ 第 " + pageNum + " 页完成,已获取 " + allMovies.size() + " 部电影");

                // Pause between requests only; nothing follows the last page.
                if (start + MOVIES_PER_PAGE < TOTAL_MOVIES) {
                    try {
                        Thread.sleep(PAGE_DELAY_MILLIS);
                    } catch (InterruptedException e) {
                        // Restore the flag for the caller and stop crawling early.
                        Thread.currentThread().interrupt();
                        break;
                    }
                }
            }
            System.out.println("🎉 完成!共爬取 " + allMovies.size() + " 部电影");
        } catch (Exception e) {
            // Boundary catch: report the failure as an article instead of propagating.
            System.err.println("❌ 爬取失败: " + e.getMessage());
            Article errorArticle = new Article();
            errorArticle.setTitle("Error crawling Douban Top 250");
            errorArticle.setUrl(url);
            errorArticle.setContent("Error details: " + e.getMessage());
            errorArticle.setSource("douban");
            allMovies.add(errorArticle);
        }
        return allMovies;
    }

    /**
     * Downloads and parses one list page. Failures are logged and yield an empty list so
     * that the remaining pages are still attempted.
     *
     * @param url     address of the list page
     * @param pageNum 1-based page number, used only in log output
     * @return the movies found on the page, never {@code null}
     */
    private List<Article> crawlPage(String url, int pageNum) {
        try {
            return parseMovies(fetchHtml(url));
        } catch (IOException | RuntimeException e) {
            System.err.println("⚠️ 第 " + pageNum + " 页爬取失败: " + e.getMessage());
            return new ArrayList<>();
        }
    }

    /**
     * Performs a GET request with browser-like headers and returns the body decoded as UTF-8.
     *
     * @throws IOException if the request fails or the server does not answer with HTTP 200
     */
    private String fetchHtml(String url) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        try {
            connection.setRequestMethod("GET");
            connection.setRequestProperty("User-Agent",
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
            connection.setRequestProperty("Accept",
                    "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            int status = connection.getResponseCode();
            if (status != HttpURLConnection.HTTP_OK) {
                throw new IOException("HTTP " + status + " for " + url);
            }

            StringBuilder html = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    html.append(line).append("\n");
                }
            }
            return html.toString();
        } finally {
            // Release the underlying socket even when reading fails.
            connection.disconnect();
        }
    }

    /**
     * Splits a list page into its entries and parses each one; entries that cannot be
     * parsed are skipped.
     */
    private List<Article> parseMovies(String html) {
        List<Article> movies = new ArrayList<>();
        Matcher itemMatcher = ITEM_PATTERN.matcher(html);
        while (itemMatcher.find()) {
            Article movie = parseSingleMovie(itemMatcher.group());
            if (movie != null) {
                movies.add(movie);
            }
        }
        return movies;
    }

    /**
     * Builds an {@link Article} from the HTML of a single list entry.
     *
     * @return the article, or {@code null} if the entry has no title or parsing fails
     */
    private Article parseSingleMovie(String movieHtml) {
        try {
            String title = cleanText(firstGroup(TITLE_PATTERN, movieHtml));
            if (title.isEmpty()) {
                // Without a title the entry is not a usable movie record.
                return null;
            }

            Article movie = new Article();
            movie.setSource("douban");
            movie.setTitle(title);

            String link = firstGroup(LINK_PATTERN, movieHtml);
            if (!link.isEmpty()) {
                movie.setUrl(link);
            }

            String rating = cleanText(firstGroup(RATING_PATTERN, movieHtml));
            String quote = cleanText(firstGroup(QUOTE_PATTERN, movieHtml));
            String info = cleanText(firstGroup(INFO_PATTERN, movieHtml));
            String year = firstGroup(YEAR_PATTERN, info);

            StringBuilder content = new StringBuilder();
            content.append("🎬 电影名称: ").append(movie.getTitle()).append("\n");
            content.append("⭐ 评分: ").append(rating).append("\n");
            content.append("📅 年份: ").append(year).append("\n");
            if (!quote.isEmpty()) {
                content.append("💬 简介: ").append(quote).append("\n");
            }
            content.append("\n📝 详细信息:\n").append(info);

            movie.setContent(content.toString());
            movie.setAuthor("豆瓣电影");
            return movie;
        } catch (RuntimeException e) {
            // Best-effort: one malformed entry must not abort the whole page.
            System.err.println("⚠️ Skipping unparsable movie entry: " + e);
            return null;
        }
    }

    /** Returns capture group 1 of the first match, or an empty string if there is none. */
    private static String firstGroup(Pattern pattern, String input) {
        Matcher matcher = pattern.matcher(input);
        return matcher.find() ? matcher.group(1) : "";
    }

    /**
     * Converts an HTML fragment to plain text: line breaks become newlines, remaining tags
     * are removed, {@code &nbsp;} and {@code &amp;} are decoded, and padding around line
     * breaks is dropped.
     */
    private static String cleanText(String fragment) {
        String text = BR_TAG.matcher(fragment).replaceAll("\n");
        text = ANY_TAG.matcher(text).replaceAll("");
        text = text.replace("&nbsp;", " ").replace("&amp;", "&");
        text = LINE_PADDING.matcher(text).replaceAll("\n");
        return text.trim();
    }

    /** Returns the identifier of this strategy, {@code "douban"}. */
    @Override
    public String getStrategyName() {
        return "douban";
    }
}