diff --git a/project/src/project/crawler/MovieCrawler.java b/project/src/project/crawler/MovieCrawler.java new file mode 100644 index 0000000..9dda973 --- /dev/null +++ b/project/src/project/crawler/MovieCrawler.java @@ -0,0 +1,194 @@ +package project.crawler; + +import project.bean.Movie; +import project.utils.DataCleaner; +import project.utils.HttpUtils; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class MovieCrawler { + public static List crawlMovies(int pageCount) throws Exception { + List movies = new ArrayList<>(); + + for (int page = 1; page <= pageCount; page++) { + String url = "https://movie.douban.com/top250?start=" + (page - 1) * 25; + System.out.println("Crawling page " + page + " from " + url); + try { + String html = HttpUtils.getHtml(url); + System.out.println("Got HTML content, length: " + html.length()); + + // 打印 HTML 内容的前 500 个字符,了解实际结构 + if (html.length() > 500) { + System.out.println("HTML preview: " + html.substring(0, 500) + "..."); + } + + List pageMovies = parseMovies(html); + System.out.println("Parsed " + pageMovies.size() + " movies from page " + page); + movies.addAll(pageMovies); + } catch (Exception e) { + System.out.println("Error crawling page " + page + ": " + e.getMessage()); + } + Thread.sleep(1000); // 控制请求频率 + } + + System.out.println("Total movies crawled: " + movies.size()); + return movies; + } + + private static List parseMovies(String html) { + List movies = new ArrayList<>(); + + // Find all movie items by looking for
and matching until
at the same nesting level + int startIndex = 0; + int count = 0; + + while (true) { + int itemStart = html.indexOf("
", startIndex); + if (itemStart < 0) break; + + // Find the matching
by counting nested divs + int pos = itemStart + "
".length(); + int depth = 1; + int itemEnd = -1; + + while (pos < html.length() && depth > 0) { + int nextOpen = html.indexOf("", pos); + + if (nextClose < 0) break; // No closing tag found + + if (nextOpen >= 0 && nextOpen < nextClose) { + // Found an opening div before closing + depth++; + pos = nextOpen + 4; + } else { + // Found a closing div + depth--; + if (depth == 0) { + itemEnd = nextClose + 6; + } + pos = nextClose + 6; + } + } + + if (itemEnd > itemStart) { + count++; + String movieHtml = html.substring(itemStart, itemEnd); + // Don't print movie HTML to avoid excessive output + Movie movie = parseMovie(movieHtml); + if (movie != null) { + movies.add(movie); + } + startIndex = itemEnd; + } else { + break; + } + } + + System.out.println("Found " + count + " movie items, parsed " + movies.size() + " valid movies"); + return movies; + } + + private static Movie parseMovie(String movieHtml) { + try { + // Extract title from img alt attribute + String title = ""; + int altIndex = movieHtml.indexOf("alt="); + if (altIndex > 0) { + int start = movieHtml.indexOf('"', altIndex); + int end = movieHtml.indexOf('"', start + 1); + if (start > 0 && end > 0) { + title = movieHtml.substring(start + 1, end).trim(); + } + } + + // Extract rating + double rating = 0.0; + int ratingIndex = movieHtml.indexOf("rating_num"); + if (ratingIndex > 0) { + int start = movieHtml.indexOf('>', ratingIndex); + int end = movieHtml.indexOf("", start); + if (start > 0 && end > 0) { + String ratingStr = movieHtml.substring(start + 1, end).trim(); + try { + rating = Double.parseDouble(ratingStr); + } catch (NumberFormatException e) { + rating = 0.0; + } + } + } + + // Extract year and director from movie info + int year = 0; + String director = "Unknown"; + + // Find the info section which contains year and director + // Look for

tag without class or with specific class + int infoStart = -1; + int pStart = movieHtml.indexOf("

"); + int pClassStart = movieHtml.indexOf("

"); + + if (pStart >= 0) { + infoStart = pStart; + } + if (pClassStart >= 0 && (pStart < 0 || pClassStart < pStart)) { + infoStart = pClassStart; + } + + if (infoStart > 0) { + int infoEnd = movieHtml.indexOf("

", infoStart); + if (infoEnd > infoStart) { + String infoSection = movieHtml.substring(infoStart, infoEnd); + + // Extract year - look for 4-digit year after
tag + int brIndex = infoSection.indexOf("
"); + if (brIndex > 0) { + String afterBr = infoSection.substring(brIndex + 4).trim(); + // Find first 4-digit number + for (int i = 0; i <= afterBr.length() - 4; i++) { + String possibleYear = afterBr.substring(i, i + 4); + if (possibleYear.matches("\\d{4}")) { + try { + year = Integer.parseInt(possibleYear); + break; + } catch (NumberFormatException e) { + // Continue + } + } + } + } + + // Extract director - director info is between "导演:" and " " + // Look for the pattern: 导演: [director name]  + int directorLabelIdx = infoSection.indexOf("\u5bfc\u6f14:"); // Unicode for "导演:" + if (directorLabelIdx >= 0) { + int directorStart = directorLabelIdx + 3; // Skip "导演:" + int directorEnd = infoSection.indexOf(" ", directorStart); + if (directorEnd > directorStart) { + director = infoSection.substring(directorStart, directorEnd).trim(); + // Clean up any remaining HTML + director = director.replaceAll("<[^>]*>", "").trim(); + // Extract only Chinese name (before space) + int spaceIdx = director.indexOf(" "); + if (spaceIdx > 0) { + director = director.substring(0, spaceIdx).trim(); + } + if (director.isEmpty()) director = "Unknown"; + } + } + } + } + + // If title and rating are valid, create movie object + if (!title.isEmpty() && rating > 0) { + return new Movie(title, rating, year, director); + } + } catch (Exception e) { + // Silently handle exceptions + } + return null; + } +} \ No newline at end of file