import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.List; public class Spider { protected String baseUrl; protected String userAgent; public Spider(String baseUrl) { this.baseUrl = baseUrl; this.userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"; } public Document fetchPage(String url) { try { return Jsoup.connect(url) .userAgent(userAgent) .timeout(10000) .get(); } catch (IOException e) { System.err.println("Error fetching " + url + ": " + e.getMessage()); return null; } } public List parsePage(Document doc) { throw new UnsupportedOperationException("Subclass must override parsePage method"); } public void saveData(List data, String filename) { if (data == null || data.isEmpty()) { System.out.println("No data to save"); return; } try (PrintWriter writer = new PrintWriter(new FileWriter(filename, false))) { writer.println("title,rating,link"); for (Movie movie : data) { writer.println(movie.getTitle() + "," + movie.getRating() + "," + movie.getLink()); } System.out.println("Data saved to " + filename); } catch (IOException e) { System.err.println("Error saving data: " + e.getMessage()); } } public List run(int startPage, Integer endPage) { List allData = new ArrayList<>(); int currentPage = startPage; while (true) { String url = endPage != null ? baseUrl + "?start=" + ((currentPage - 1) * 25) : baseUrl; System.out.println("Processing page " + currentPage + ": " + url); Document doc = fetchPage(url); if (doc == null) { break; } List pageData = parsePage(doc); if (pageData == null || pageData.isEmpty()) { break; } allData.addAll(pageData); if (endPage != null && currentPage >= endPage) { break; } currentPage++; try { Thread.sleep(1000); // 防止请求过快被封 } catch (InterruptedException e) { Thread.currentThread().interrupt(); break; } } return allData; } } class Movie { private String title; private String rating; private String link; public Movie(String title, String rating, String link) { this.title = title; this.rating = rating; this.link = link; } public String getTitle() { return title; } public String getRating() { return rating; } public String getLink() { return link; } } class DoubanMovieSpider extends Spider { public DoubanMovieSpider() { super("https://movie.douban.com/top250"); } @Override public List parsePage(Document doc) { List movies = new ArrayList<>(); Elements items = doc.select("div.item"); for (Element item : items) { String title = item.select("span.title").first().text(); String rating = item.select("span.rating_num").first().text(); String link = item.select("a").first().attr("href"); movies.add(new Movie(title, rating, link)); } return movies; } } class Main { public static void main(String[] args) { DoubanMovieSpider spider = new DoubanMovieSpider(); List data = spider.run(1, 10); // 爬取前10页 spider.saveData(data, "douban_movies.csv"); System.out.println("爬取完成,共获取" + data.size() + "条数据"); } }