1 changed files with 194 additions and 0 deletions
@ -0,0 +1,194 @@ |
|||||
|
package project.crawler; |
||||
|
|
||||
|
import project.bean.Movie; |
||||
|
import project.utils.DataCleaner; |
||||
|
import project.utils.HttpUtils; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
import java.util.regex.Matcher; |
||||
|
import java.util.regex.Pattern; |
||||
|
|
||||
|
public class MovieCrawler { |
||||
|
public static List<Movie> crawlMovies(int pageCount) throws Exception { |
||||
|
List<Movie> movies = new ArrayList<>(); |
||||
|
|
||||
|
for (int page = 1; page <= pageCount; page++) { |
||||
|
String url = "https://movie.douban.com/top250?start=" + (page - 1) * 25; |
||||
|
System.out.println("Crawling page " + page + " from " + url); |
||||
|
try { |
||||
|
String html = HttpUtils.getHtml(url); |
||||
|
System.out.println("Got HTML content, length: " + html.length()); |
||||
|
|
||||
|
// 打印 HTML 内容的前 500 个字符,了解实际结构
|
||||
|
if (html.length() > 500) { |
||||
|
System.out.println("HTML preview: " + html.substring(0, 500) + "..."); |
||||
|
} |
||||
|
|
||||
|
List<Movie> pageMovies = parseMovies(html); |
||||
|
System.out.println("Parsed " + pageMovies.size() + " movies from page " + page); |
||||
|
movies.addAll(pageMovies); |
||||
|
} catch (Exception e) { |
||||
|
System.out.println("Error crawling page " + page + ": " + e.getMessage()); |
||||
|
} |
||||
|
Thread.sleep(1000); // 控制请求频率
|
||||
|
} |
||||
|
|
||||
|
System.out.println("Total movies crawled: " + movies.size()); |
||||
|
return movies; |
||||
|
} |
||||
|
|
||||
|
private static List<Movie> parseMovies(String html) { |
||||
|
List<Movie> movies = new ArrayList<>(); |
||||
|
|
||||
|
// Find all movie items by looking for <div class="item"> and matching until </div> at the same nesting level
|
||||
|
int startIndex = 0; |
||||
|
int count = 0; |
||||
|
|
||||
|
while (true) { |
||||
|
int itemStart = html.indexOf("<div class=\"item\">", startIndex); |
||||
|
if (itemStart < 0) break; |
||||
|
|
||||
|
// Find the matching </div> by counting nested divs
|
||||
|
int pos = itemStart + "<div class=\"item\">".length(); |
||||
|
int depth = 1; |
||||
|
int itemEnd = -1; |
||||
|
|
||||
|
while (pos < html.length() && depth > 0) { |
||||
|
int nextOpen = html.indexOf("<div", pos); |
||||
|
int nextClose = html.indexOf("</div>", pos); |
||||
|
|
||||
|
if (nextClose < 0) break; // No closing tag found
|
||||
|
|
||||
|
if (nextOpen >= 0 && nextOpen < nextClose) { |
||||
|
// Found an opening div before closing
|
||||
|
depth++; |
||||
|
pos = nextOpen + 4; |
||||
|
} else { |
||||
|
// Found a closing div
|
||||
|
depth--; |
||||
|
if (depth == 0) { |
||||
|
itemEnd = nextClose + 6; |
||||
|
} |
||||
|
pos = nextClose + 6; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (itemEnd > itemStart) { |
||||
|
count++; |
||||
|
String movieHtml = html.substring(itemStart, itemEnd); |
||||
|
// Don't print movie HTML to avoid excessive output
|
||||
|
Movie movie = parseMovie(movieHtml); |
||||
|
if (movie != null) { |
||||
|
movies.add(movie); |
||||
|
} |
||||
|
startIndex = itemEnd; |
||||
|
} else { |
||||
|
break; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
System.out.println("Found " + count + " movie items, parsed " + movies.size() + " valid movies"); |
||||
|
return movies; |
||||
|
} |
||||
|
|
||||
|
private static Movie parseMovie(String movieHtml) { |
||||
|
try { |
||||
|
// Extract title from img alt attribute
|
||||
|
String title = ""; |
||||
|
int altIndex = movieHtml.indexOf("alt="); |
||||
|
if (altIndex > 0) { |
||||
|
int start = movieHtml.indexOf('"', altIndex); |
||||
|
int end = movieHtml.indexOf('"', start + 1); |
||||
|
if (start > 0 && end > 0) { |
||||
|
title = movieHtml.substring(start + 1, end).trim(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// Extract rating
|
||||
|
double rating = 0.0; |
||||
|
int ratingIndex = movieHtml.indexOf("rating_num"); |
||||
|
if (ratingIndex > 0) { |
||||
|
int start = movieHtml.indexOf('>', ratingIndex); |
||||
|
int end = movieHtml.indexOf("</span>", start); |
||||
|
if (start > 0 && end > 0) { |
||||
|
String ratingStr = movieHtml.substring(start + 1, end).trim(); |
||||
|
try { |
||||
|
rating = Double.parseDouble(ratingStr); |
||||
|
} catch (NumberFormatException e) { |
||||
|
rating = 0.0; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// Extract year and director from movie info
|
||||
|
int year = 0; |
||||
|
String director = "Unknown"; |
||||
|
|
||||
|
// Find the info section which contains year and director
|
||||
|
// Look for <p> tag without class or with specific class
|
||||
|
int infoStart = -1; |
||||
|
int pStart = movieHtml.indexOf("<p>"); |
||||
|
int pClassStart = movieHtml.indexOf("<p class=\"\">"); |
||||
|
|
||||
|
if (pStart >= 0) { |
||||
|
infoStart = pStart; |
||||
|
} |
||||
|
if (pClassStart >= 0 && (pStart < 0 || pClassStart < pStart)) { |
||||
|
infoStart = pClassStart; |
||||
|
} |
||||
|
|
||||
|
if (infoStart > 0) { |
||||
|
int infoEnd = movieHtml.indexOf("</p>", infoStart); |
||||
|
if (infoEnd > infoStart) { |
||||
|
String infoSection = movieHtml.substring(infoStart, infoEnd); |
||||
|
|
||||
|
// Extract year - look for 4-digit year after <br> tag
|
||||
|
int brIndex = infoSection.indexOf("<br>"); |
||||
|
if (brIndex > 0) { |
||||
|
String afterBr = infoSection.substring(brIndex + 4).trim(); |
||||
|
// Find first 4-digit number
|
||||
|
for (int i = 0; i <= afterBr.length() - 4; i++) { |
||||
|
String possibleYear = afterBr.substring(i, i + 4); |
||||
|
if (possibleYear.matches("\\d{4}")) { |
||||
|
try { |
||||
|
year = Integer.parseInt(possibleYear); |
||||
|
break; |
||||
|
} catch (NumberFormatException e) { |
||||
|
// Continue
|
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// Extract director - director info is between "导演:" and " "
|
||||
|
// Look for the pattern: 导演: [director name]
|
||||
|
int directorLabelIdx = infoSection.indexOf("\u5bfc\u6f14:"); // Unicode for "导演:"
|
||||
|
if (directorLabelIdx >= 0) { |
||||
|
int directorStart = directorLabelIdx + 3; // Skip "导演:"
|
||||
|
int directorEnd = infoSection.indexOf(" ", directorStart); |
||||
|
if (directorEnd > directorStart) { |
||||
|
director = infoSection.substring(directorStart, directorEnd).trim(); |
||||
|
// Clean up any remaining HTML
|
||||
|
director = director.replaceAll("<[^>]*>", "").trim(); |
||||
|
// Extract only Chinese name (before space)
|
||||
|
int spaceIdx = director.indexOf(" "); |
||||
|
if (spaceIdx > 0) { |
||||
|
director = director.substring(0, spaceIdx).trim(); |
||||
|
} |
||||
|
if (director.isEmpty()) director = "Unknown"; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// If title and rating are valid, create movie object
|
||||
|
if (!title.isEmpty() && rating > 0) { |
||||
|
return new Movie(title, rating, year, director); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
// Silently handle exceptions
|
||||
|
} |
||||
|
return null; |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue