w4-张思渊-202401070104

3 months ago · 6980b6ff4f
1 changed files with 194 additions and 0 deletions
--- a/project/src/project/crawler/MovieCrawler.java
+++ b/project/src/project/crawler/MovieCrawler.java
@@ -0,0 +1,194 @@
 package project.crawler;
 import project.bean.Movie;
 import project.utils.DataCleaner;
 import project.utils.HttpUtils;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 /**
  * Crawls the Douban Top 250 listing pages and converts every listed entry into a {@link Movie}.
  *
  * <p>Parsing is done with plain string scanning (no HTML parser): each
  * {@code <div class="item">} fragment is cut out of the page by counting nested {@code div}
  * tags, and title, rating, year and director are then extracted from that fragment.
  */
 public class MovieCrawler {
    /** Listing URL prefix; the page offset is appended as the {@code start} query value. */
    private static final String LIST_URL = "https://movie.douban.com/top250?start=";
    /** Step used to turn a 1-based page number into the {@code start} offset. */
    private static final int PAGE_SIZE = 25;
    /** Pause between two consecutive page requests, in milliseconds. */
    private static final long REQUEST_DELAY_MS = 1000L;
    /** Number of leading characters of a page echoed to the console for debugging. */
    private static final int PREVIEW_LENGTH = 500;

    private static final String ITEM_OPEN_TAG = "<div class=\"item\">";
    private static final String DIV_OPEN_PREFIX = "<div";
    private static final String DIV_CLOSE_TAG = "</div>";
    /** The label "导演:" ("director:") that precedes the director name. */
    private static final String DIRECTOR_LABEL = "\u5bfc\u6f14:";
    private static final String NBSP = "&nbsp;";
    private static final String UNKNOWN_DIRECTOR = "Unknown";

    /** Matches {@code <br>}, {@code <br/>} and {@code <br />} in any letter case. */
    private static final Pattern LINE_BREAK = Pattern.compile("<br\\s*/?>", Pattern.CASE_INSENSITIVE);
    /** Four consecutive digits, used to locate the release year. */
    private static final Pattern YEAR = Pattern.compile("\\d{4}");
    /** Any HTML tag; used to strip leftover markup from the director name. */
    private static final Pattern HTML_TAG = Pattern.compile("<[^>]*>");

    /**
     * Downloads and parses the first {@code pageCount} listing pages.
     *
     * <p>A page that fails to download or parse is reported on standard output and skipped;
     * the remaining pages are still processed. The crawler pauses between consecutive
     * requests (but not after the last one) to limit the request rate.
     *
     * @param pageCount number of pages to fetch, starting at page 1; values below 1 yield an
     *                  empty list
     * @return all movies parsed from the pages that could be processed, in page order
     * @throws Exception if the thread is interrupted while crawling
     */
    public static List<Movie> crawlMovies(int pageCount) throws Exception {
        List<Movie> movies = new ArrayList<>();
        for (int page = 1; page <= pageCount; page++) {
            String url = LIST_URL + (page - 1) * PAGE_SIZE;
            System.out.println("Crawling page " + page + " from " + url);
            try {
                String html = HttpUtils.getHtml(url);
                System.out.println("Got HTML content, length: " + html.length());
                // Print the first characters of the page to show what the server really sent.
                if (html.length() > PREVIEW_LENGTH) {
                    System.out.println("HTML preview: " + html.substring(0, PREVIEW_LENGTH) + "...");
                }
                List<Movie> pageMovies = parseMovies(html);
                System.out.println("Parsed " + pageMovies.size() + " movies from page " + page);
                movies.addAll(pageMovies);
            } catch (Exception e) {
                if (e instanceof InterruptedException) {
                    // Never swallow an interrupt: restore the flag and let the caller see it.
                    Thread.currentThread().interrupt();
                    throw e;
                }
                System.out.println("Error crawling page " + page + ": " + e.getMessage());
            }
            if (page < pageCount) {
                Thread.sleep(REQUEST_DELAY_MS); // throttle the request rate between pages
            }
        }
        System.out.println("Total movies crawled: " + movies.size());
        return movies;
    }

    /**
     * Splits a listing page into its {@code <div class="item">} fragments and parses each one.
     *
     * <p>Scanning stops at the first item whose closing {@code </div>} cannot be found.
     *
     * @param html full HTML of one listing page
     * @return the movies that could be parsed, in document order
     */
    private static List<Movie> parseMovies(String html) {
        List<Movie> movies = new ArrayList<>();
        int itemCount = 0;
        int searchFrom = 0;
        while (true) {
            int itemStart = html.indexOf(ITEM_OPEN_TAG, searchFrom);
            if (itemStart < 0) {
                break;
            }
            int itemEnd = findMatchingDivEnd(html, itemStart + ITEM_OPEN_TAG.length());
            if (itemEnd < 0) {
                break; // unbalanced markup: nothing reliable can be cut out any more
            }
            itemCount++;
            Movie movie = parseMovie(html.substring(itemStart, itemEnd));
            if (movie != null) {
                movies.add(movie);
            }
            searchFrom = itemEnd;
        }
        System.out.println("Found " + itemCount + " movie items, parsed " + movies.size() + " valid movies");
        return movies;
    }

    /**
     * Finds the end of a {@code div} element whose opening tag has already been consumed.
     *
     * @param html text to scan
     * @param from index just after the opening tag of the element
     * @return index just after the matching {@code </div>}, or {@code -1} if there is none
     */
    private static int findMatchingDivEnd(String html, int from) {
        int depth = 1;
        int pos = from;
        while (pos < html.length()) {
            int nextClose = html.indexOf(DIV_CLOSE_TAG, pos);
            if (nextClose < 0) {
                return -1;
            }
            int nextOpen = indexOfDivOpen(html, pos);
            if (nextOpen >= 0 && nextOpen < nextClose) {
                depth++;
                pos = nextOpen + DIV_OPEN_PREFIX.length();
            } else {
                depth--;
                pos = nextClose + DIV_CLOSE_TAG.length();
                if (depth == 0) {
                    return pos;
                }
            }
        }
        return -1;
    }

    /**
     * Returns the index of the next real {@code <div} opening tag at or after {@code from}.
     *
     * <p>Text that merely starts with {@code <div} (for example {@code <divider>}) is skipped
     * so that it does not disturb the nesting count.
     */
    private static int indexOfDivOpen(String html, int from) {
        int idx = html.indexOf(DIV_OPEN_PREFIX, from);
        while (idx >= 0) {
            int after = idx + DIV_OPEN_PREFIX.length();
            if (after >= html.length()) {
                return -1;
            }
            char next = html.charAt(after);
            if (next == '>' || next == '/' || Character.isWhitespace(next)) {
                return idx;
            }
            idx = html.indexOf(DIV_OPEN_PREFIX, after);
        }
        return -1;
    }

    /**
     * Builds a {@link Movie} from the HTML fragment of a single list item.
     *
     * @param movieHtml the {@code <div class="item">...</div>} fragment
     * @return the movie, or {@code null} when no title or no positive rating could be extracted
     *         or the fragment could not be processed
     */
    private static Movie parseMovie(String movieHtml) {
        try {
            String title = extractTitle(movieHtml);
            double rating = extractRating(movieHtml);
            // An entry is only usable with a title and a positive rating.
            if (!title.isEmpty() && rating > 0) {
                String info = extractInfoSection(movieHtml);
                return new Movie(title, rating, extractYear(info), extractDirector(info));
            }
        } catch (Exception e) {
            // Best effort: report the broken item and keep going with the others.
            System.out.println("Skipping movie item that could not be parsed: " + e);
        }
        return null;
    }

    /** Returns the trimmed value of the first double-quoted string after {@code alt=}, or "". */
    private static String extractTitle(String movieHtml) {
        int altIdx = movieHtml.indexOf("alt=");
        if (altIdx < 0) {
            return "";
        }
        int open = movieHtml.indexOf('"', altIdx);
        if (open < 0) {
            return "";
        }
        int close = movieHtml.indexOf('"', open + 1);
        if (close < 0) {
            return "";
        }
        return movieHtml.substring(open + 1, close).trim();
    }

    /** Returns the number inside the {@code rating_num} span, or 0.0 if absent or malformed. */
    private static double extractRating(String movieHtml) {
        int marker = movieHtml.indexOf("rating_num");
        if (marker < 0) {
            return 0.0;
        }
        int open = movieHtml.indexOf('>', marker);
        if (open < 0) {
            return 0.0;
        }
        int close = movieHtml.indexOf("</span>", open);
        if (close < 0) {
            return 0.0;
        }
        try {
            return Double.parseDouble(movieHtml.substring(open + 1, close).trim());
        } catch (NumberFormatException ignored) {
            // A non-numeric rating is treated like a missing one.
            return 0.0;
        }
    }

    /**
     * Returns the text of the first {@code <p>} or {@code <p class="">} element (opening tag
     * included, closing tag excluded), or "" when there is no such complete element.
     */
    private static String extractInfoSection(String movieHtml) {
        int plain = movieHtml.indexOf("<p>");
        int emptyClass = movieHtml.indexOf("<p class=\"\">");
        int start;
        if (plain < 0) {
            start = emptyClass;
        } else if (emptyClass < 0) {
            start = plain;
        } else {
            start = Math.min(plain, emptyClass);
        }
        if (start < 0) {
            return "";
        }
        int end = movieHtml.indexOf("</p>", start);
        return end < 0 ? "" : movieHtml.substring(start, end);
    }

    /** Returns the first run of four digits after the first line break, or 0 if none exists. */
    private static int extractYear(String info) {
        Matcher lineBreak = LINE_BREAK.matcher(info);
        if (!lineBreak.find()) {
            return 0;
        }
        Matcher year = YEAR.matcher(info);
        if (!year.find(lineBreak.end())) {
            return 0;
        }
        return Integer.parseInt(year.group());
    }

    /**
     * Returns the first whitespace-free token following the director label.
     *
     * <p>The name ends at the first {@code &nbsp;}, but never extends past the line break
     * that terminates the director line, so text from the following line cannot leak into it.
     *
     * @return the director name, or {@code "Unknown"} when none is present
     */
    private static String extractDirector(String info) {
        int label = info.indexOf(DIRECTOR_LABEL);
        if (label < 0) {
            return UNKNOWN_DIRECTOR;
        }
        int start = label + DIRECTOR_LABEL.length();
        int end = info.length();
        Matcher lineBreak = LINE_BREAK.matcher(info);
        if (lineBreak.find(start)) {
            end = lineBreak.start();
        }
        int nbsp = info.indexOf(NBSP, start);
        if (nbsp >= 0 && nbsp < end) {
            end = nbsp;
        }
        String name = HTML_TAG.matcher(info.substring(start, end)).replaceAll("").trim();
        // Keep only the first token (the Chinese name precedes the romanised one).
        int space = name.indexOf(' ');
        if (space > 0) {
            name = name.substring(0, space).trim();
        }
        return name.isEmpty() ? UNKNOWN_DIRECTOR : name;
    }
 }