Browse Source

w4-张思渊-202401070104

main
zhangsiyuan 3 weeks ago
parent
commit
6980b6ff4f
  1. 194
      project/src/project/crawler/MovieCrawler.java

194
project/src/project/crawler/MovieCrawler.java

@ -0,0 +1,194 @@
package project.crawler;
import project.bean.Movie;
import project.utils.DataCleaner;
import project.utils.HttpUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Crawls the Douban Top 250 movie list and converts each list entry into a {@link Movie}.
 *
 * <p>Pages are fetched through {@code HttpUtils.getHtml} and parsed with plain string
 * scanning (no HTML parser): every {@code <div class="item">} fragment is cut out of the
 * page and its title, rating, year and director are extracted from that fragment.
 */
public class MovieCrawler {

    /** Offset step of the {@code start} query parameter between two consecutive pages. */
    private static final int PAGE_SIZE = 25;

    /** Pause between two page requests, in milliseconds, to throttle the request rate. */
    private static final long REQUEST_DELAY_MS = 1000L;

    /** Maximum number of HTML characters echoed to the console as a preview. */
    private static final int PREVIEW_LENGTH = 500;

    private static final String ITEM_OPEN = "<div class=\"item\">";
    private static final String DIV_OPEN = "<div";
    private static final String DIV_CLOSE = "</div>";

    /** The label that precedes the director name in the info paragraph. */
    private static final String DIRECTOR_LABEL = "\u5bfc\u6f14:";

    /** Value used when no director can be extracted. */
    private static final String DEFAULT_DIRECTOR = "Unknown";

    /** Four consecutive digits; compiled once instead of on every candidate substring. */
    private static final Pattern YEAR_PATTERN = Pattern.compile("\\d{4}");

    /** Any HTML tag, used to strip leftover markup from the director text. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");

    /**
     * Fetches {@code pageCount} list pages and returns every movie that could be parsed.
     *
     * <p>A page that fails to download or parse is reported on standard output and
     * skipped; the remaining pages are still processed.
     *
     * @param pageCount number of pages to crawl, starting with page 1; values below 1
     *                  yield an empty list
     * @return the movies parsed from all successfully processed pages, in page order
     * @throws Exception if the thread is interrupted while waiting between requests
     */
    public static List<Movie> crawlMovies(int pageCount) throws Exception {
        List<Movie> movies = new ArrayList<>();
        for (int page = 1; page <= pageCount; page++) {
            String url = "https://movie.douban.com/top250?start=" + (page - 1) * PAGE_SIZE;
            System.out.println("Crawling page " + page + " from " + url);
            try {
                String html = HttpUtils.getHtml(url);
                System.out.println("Got HTML content, length: " + html.length());
                // Print the first characters of the HTML to inspect the actual structure.
                if (html.length() > PREVIEW_LENGTH) {
                    System.out.println("HTML preview: " + html.substring(0, PREVIEW_LENGTH) + "...");
                }
                List<Movie> pageMovies = parseMovies(html);
                System.out.println("Parsed " + pageMovies.size() + " movies from page " + page);
                movies.addAll(pageMovies);
            } catch (Exception e) {
                // Best effort: one bad page must not abort the crawl. The exception itself
                // is printed (not only its message) so message-less errors stay diagnosable.
                System.out.println("Error crawling page " + page + ": " + e);
            }
            // Throttle the request rate, but do not wait after the final page.
            if (page < pageCount) {
                Thread.sleep(REQUEST_DELAY_MS);
            }
        }
        System.out.println("Total movies crawled: " + movies.size());
        return movies;
    }

    /**
     * Cuts every {@code <div class="item">} fragment out of a page and parses it.
     *
     * <p>Scanning stops at the first item whose closing {@code </div>} cannot be found.
     *
     * @param html the complete HTML of one list page
     * @return the movies that were parsed successfully, in document order
     */
    private static List<Movie> parseMovies(String html) {
        List<Movie> movies = new ArrayList<>();
        int searchFrom = 0;
        int itemCount = 0;
        while (true) {
            int itemStart = html.indexOf(ITEM_OPEN, searchFrom);
            if (itemStart < 0) {
                break;
            }
            int itemEnd = findMatchingDivEnd(html, itemStart + ITEM_OPEN.length());
            if (itemEnd < 0) {
                // Unbalanced markup: nothing reliable can be extracted past this point.
                break;
            }
            itemCount++;
            Movie movie = parseMovie(html.substring(itemStart, itemEnd));
            if (movie != null) {
                movies.add(movie);
            }
            searchFrom = itemEnd;
        }
        System.out.println("Found " + itemCount + " movie items, parsed " + movies.size() + " valid movies");
        return movies;
    }

    /**
     * Finds the end of the {@code div} element whose content starts at {@code from} by
     * counting nested opening and closing {@code div} tags.
     *
     * @param html the page HTML
     * @param from index just after the opening tag of the element
     * @return the index just past the matching {@code </div>}, or -1 if there is none
     */
    private static int findMatchingDivEnd(String html, int from) {
        int depth = 1;
        int pos = from;
        while (pos < html.length()) {
            int nextOpen = html.indexOf(DIV_OPEN, pos);
            int nextClose = html.indexOf(DIV_CLOSE, pos);
            if (nextClose < 0) {
                return -1;
            }
            if (nextOpen >= 0 && nextOpen < nextClose) {
                // A nested div opens before the next close.
                depth++;
                pos = nextOpen + DIV_OPEN.length();
            } else {
                depth--;
                pos = nextClose + DIV_CLOSE.length();
                if (depth == 0) {
                    return pos;
                }
            }
        }
        return -1;
    }

    /**
     * Builds a {@link Movie} from the HTML fragment of a single list item.
     *
     * @param movieHtml the {@code <div class="item">} fragment
     * @return the parsed movie, or {@code null} when the fragment has no title, no
     *         positive rating, or parsing failed
     */
    private static Movie parseMovie(String movieHtml) {
        try {
            String title = extractTitle(movieHtml);
            double rating = extractRating(movieHtml);
            int year = 0;
            String director = DEFAULT_DIRECTOR;
            String info = extractInfoSection(movieHtml);
            if (info != null) {
                year = extractYear(info);
                director = extractDirector(info);
            }
            // Only entries with both a title and a positive rating count as valid.
            if (!title.isEmpty() && rating > 0) {
                return new Movie(title, rating, year, director);
            }
        } catch (Exception e) {
            // Skip the broken entry, but leave a trace instead of failing silently.
            System.out.println("Error parsing movie item: " + e);
        }
        return null;
    }

    /** Returns the text of the first double-quoted {@code alt} attribute, or "" if absent. */
    private static String extractTitle(String movieHtml) {
        int altIndex = movieHtml.indexOf("alt=");
        if (altIndex < 0) {
            return "";
        }
        int open = movieHtml.indexOf('"', altIndex);
        if (open < 0) {
            return "";
        }
        int close = movieHtml.indexOf('"', open + 1);
        if (close < 0) {
            return "";
        }
        return movieHtml.substring(open + 1, close).trim();
    }

    /** Returns the number inside the {@code rating_num} span, or 0.0 if missing or malformed. */
    private static double extractRating(String movieHtml) {
        int marker = movieHtml.indexOf("rating_num");
        if (marker < 0) {
            return 0.0;
        }
        int tagEnd = movieHtml.indexOf('>', marker);
        if (tagEnd < 0) {
            return 0.0;
        }
        int spanEnd = movieHtml.indexOf("</span>", tagEnd);
        if (spanEnd < 0) {
            return 0.0;
        }
        try {
            return Double.parseDouble(movieHtml.substring(tagEnd + 1, spanEnd).trim());
        } catch (NumberFormatException e) {
            return 0.0;
        }
    }

    /**
     * Returns the first info paragraph (from its opening {@code <p>} or
     * {@code <p class="">} tag up to, but excluding, {@code </p>}), or {@code null}.
     */
    private static String extractInfoSection(String movieHtml) {
        int plain = movieHtml.indexOf("<p>");
        int classed = movieHtml.indexOf("<p class=\"\">");
        int start = plain;
        // Use whichever of the two paragraph forms appears first.
        if (classed >= 0 && (plain < 0 || classed < plain)) {
            start = classed;
        }
        if (start < 0) {
            return null;
        }
        int end = movieHtml.indexOf("</p>", start);
        if (end < 0) {
            return null;
        }
        return movieHtml.substring(start, end);
    }

    /**
     * Returns the first run of four digits after the first line break of the info
     * paragraph, or 0 if there is none. Any {@code <br...>} form is accepted.
     */
    private static int extractYear(String info) {
        int br = info.indexOf("<br");
        if (br < 0) {
            return 0;
        }
        int brEnd = info.indexOf('>', br);
        if (brEnd < 0) {
            return 0;
        }
        Matcher matcher = YEAR_PATTERN.matcher(info.substring(brEnd + 1));
        return matcher.find() ? Integer.parseInt(matcher.group()) : 0;
    }

    /**
     * Returns the first director named after the director label, or "Unknown".
     *
     * <p>The name ends at the first {@code &nbsp;}, at the line break, or at the end of
     * the paragraph, whichever comes first, so text from the following line is never
     * mixed in. When the entry starts with a Chinese name followed by a transliteration,
     * only the Chinese name is kept; a name without Han characters is kept in full.
     */
    private static String extractDirector(String info) {
        int label = info.indexOf(DIRECTOR_LABEL);
        if (label < 0) {
            return DEFAULT_DIRECTOR;
        }
        int start = label + DIRECTOR_LABEL.length();
        int end = info.length();
        int nbsp = info.indexOf("&nbsp;", start);
        if (nbsp >= 0) {
            end = Math.min(end, nbsp);
        }
        int br = info.indexOf("<br", start);
        if (br >= 0) {
            end = Math.min(end, br);
        }
        String director = TAG_PATTERN.matcher(info.substring(start, end)).replaceAll("").trim();

        // Several directors are separated by '/'; keep only the first one.
        int slash = director.indexOf('/');
        if (slash >= 0) {
            director = director.substring(0, slash).trim();
        }

        // Drop the transliteration that follows a Chinese name, but do not truncate a
        // name that is written in Latin script only.
        int space = director.indexOf(' ');
        if (space > 0 && containsHan(director.substring(0, space))) {
            director = director.substring(0, space).trim();
        }
        return director.isEmpty() ? DEFAULT_DIRECTOR : director;
    }

    /** Returns whether {@code text} contains at least one Han (Chinese) character. */
    private static boolean containsHan(String text) {
        int i = 0;
        while (i < text.length()) {
            int codePoint = text.codePointAt(i);
            if (Character.UnicodeScript.of(codePoint) == Character.UnicodeScript.HAN) {
                return true;
            }
            i += Character.charCount(codePoint);
        }
        return false;
    }
}
Loading…
Cancel
Save