1 changed files with 194 additions and 0 deletions
@ -0,0 +1,194 @@ |
|||
package project.crawler; |
|||
|
|||
import project.bean.Movie; |
|||
import project.utils.DataCleaner; |
|||
import project.utils.HttpUtils; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class MovieCrawler { |
|||
public static List<Movie> crawlMovies(int pageCount) throws Exception { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
|
|||
for (int page = 1; page <= pageCount; page++) { |
|||
String url = "https://movie.douban.com/top250?start=" + (page - 1) * 25; |
|||
System.out.println("Crawling page " + page + " from " + url); |
|||
try { |
|||
String html = HttpUtils.getHtml(url); |
|||
System.out.println("Got HTML content, length: " + html.length()); |
|||
|
|||
// 打印 HTML 内容的前 500 个字符,了解实际结构
|
|||
if (html.length() > 500) { |
|||
System.out.println("HTML preview: " + html.substring(0, 500) + "..."); |
|||
} |
|||
|
|||
List<Movie> pageMovies = parseMovies(html); |
|||
System.out.println("Parsed " + pageMovies.size() + " movies from page " + page); |
|||
movies.addAll(pageMovies); |
|||
} catch (Exception e) { |
|||
System.out.println("Error crawling page " + page + ": " + e.getMessage()); |
|||
} |
|||
Thread.sleep(1000); // 控制请求频率
|
|||
} |
|||
|
|||
System.out.println("Total movies crawled: " + movies.size()); |
|||
return movies; |
|||
} |
|||
|
|||
private static List<Movie> parseMovies(String html) { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
|
|||
// Find all movie items by looking for <div class="item"> and matching until </div> at the same nesting level
|
|||
int startIndex = 0; |
|||
int count = 0; |
|||
|
|||
while (true) { |
|||
int itemStart = html.indexOf("<div class=\"item\">", startIndex); |
|||
if (itemStart < 0) break; |
|||
|
|||
// Find the matching </div> by counting nested divs
|
|||
int pos = itemStart + "<div class=\"item\">".length(); |
|||
int depth = 1; |
|||
int itemEnd = -1; |
|||
|
|||
while (pos < html.length() && depth > 0) { |
|||
int nextOpen = html.indexOf("<div", pos); |
|||
int nextClose = html.indexOf("</div>", pos); |
|||
|
|||
if (nextClose < 0) break; // No closing tag found
|
|||
|
|||
if (nextOpen >= 0 && nextOpen < nextClose) { |
|||
// Found an opening div before closing
|
|||
depth++; |
|||
pos = nextOpen + 4; |
|||
} else { |
|||
// Found a closing div
|
|||
depth--; |
|||
if (depth == 0) { |
|||
itemEnd = nextClose + 6; |
|||
} |
|||
pos = nextClose + 6; |
|||
} |
|||
} |
|||
|
|||
if (itemEnd > itemStart) { |
|||
count++; |
|||
String movieHtml = html.substring(itemStart, itemEnd); |
|||
// Don't print movie HTML to avoid excessive output
|
|||
Movie movie = parseMovie(movieHtml); |
|||
if (movie != null) { |
|||
movies.add(movie); |
|||
} |
|||
startIndex = itemEnd; |
|||
} else { |
|||
break; |
|||
} |
|||
} |
|||
|
|||
System.out.println("Found " + count + " movie items, parsed " + movies.size() + " valid movies"); |
|||
return movies; |
|||
} |
|||
|
|||
private static Movie parseMovie(String movieHtml) { |
|||
try { |
|||
// Extract title from img alt attribute
|
|||
String title = ""; |
|||
int altIndex = movieHtml.indexOf("alt="); |
|||
if (altIndex > 0) { |
|||
int start = movieHtml.indexOf('"', altIndex); |
|||
int end = movieHtml.indexOf('"', start + 1); |
|||
if (start > 0 && end > 0) { |
|||
title = movieHtml.substring(start + 1, end).trim(); |
|||
} |
|||
} |
|||
|
|||
// Extract rating
|
|||
double rating = 0.0; |
|||
int ratingIndex = movieHtml.indexOf("rating_num"); |
|||
if (ratingIndex > 0) { |
|||
int start = movieHtml.indexOf('>', ratingIndex); |
|||
int end = movieHtml.indexOf("</span>", start); |
|||
if (start > 0 && end > 0) { |
|||
String ratingStr = movieHtml.substring(start + 1, end).trim(); |
|||
try { |
|||
rating = Double.parseDouble(ratingStr); |
|||
} catch (NumberFormatException e) { |
|||
rating = 0.0; |
|||
} |
|||
} |
|||
} |
|||
|
|||
// Extract year and director from movie info
|
|||
int year = 0; |
|||
String director = "Unknown"; |
|||
|
|||
// Find the info section which contains year and director
|
|||
// Look for <p> tag without class or with specific class
|
|||
int infoStart = -1; |
|||
int pStart = movieHtml.indexOf("<p>"); |
|||
int pClassStart = movieHtml.indexOf("<p class=\"\">"); |
|||
|
|||
if (pStart >= 0) { |
|||
infoStart = pStart; |
|||
} |
|||
if (pClassStart >= 0 && (pStart < 0 || pClassStart < pStart)) { |
|||
infoStart = pClassStart; |
|||
} |
|||
|
|||
if (infoStart > 0) { |
|||
int infoEnd = movieHtml.indexOf("</p>", infoStart); |
|||
if (infoEnd > infoStart) { |
|||
String infoSection = movieHtml.substring(infoStart, infoEnd); |
|||
|
|||
// Extract year - look for 4-digit year after <br> tag
|
|||
int brIndex = infoSection.indexOf("<br>"); |
|||
if (brIndex > 0) { |
|||
String afterBr = infoSection.substring(brIndex + 4).trim(); |
|||
// Find first 4-digit number
|
|||
for (int i = 0; i <= afterBr.length() - 4; i++) { |
|||
String possibleYear = afterBr.substring(i, i + 4); |
|||
if (possibleYear.matches("\\d{4}")) { |
|||
try { |
|||
year = Integer.parseInt(possibleYear); |
|||
break; |
|||
} catch (NumberFormatException e) { |
|||
// Continue
|
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
// Extract director - director info is between "导演:" and " "
|
|||
// Look for the pattern: 导演: [director name]
|
|||
int directorLabelIdx = infoSection.indexOf("\u5bfc\u6f14:"); // Unicode for "导演:"
|
|||
if (directorLabelIdx >= 0) { |
|||
int directorStart = directorLabelIdx + 3; // Skip "导演:"
|
|||
int directorEnd = infoSection.indexOf(" ", directorStart); |
|||
if (directorEnd > directorStart) { |
|||
director = infoSection.substring(directorStart, directorEnd).trim(); |
|||
// Clean up any remaining HTML
|
|||
director = director.replaceAll("<[^>]*>", "").trim(); |
|||
// Extract only Chinese name (before space)
|
|||
int spaceIdx = director.indexOf(" "); |
|||
if (spaceIdx > 0) { |
|||
director = director.substring(0, spaceIdx).trim(); |
|||
} |
|||
if (director.isEmpty()) director = "Unknown"; |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
// If title and rating are valid, create movie object
|
|||
if (!title.isEmpty() && rating > 0) { |
|||
return new Movie(title, rating, year, director); |
|||
} |
|||
} catch (Exception e) { |
|||
// Silently handle exceptions
|
|||
} |
|||
return null; |
|||
} |
|||
} |
|||
Loading…
Reference in new issue