You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

143 lines
4.2 KiB

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
/**
 * Base class for a small page-by-page web scraper.
 *
 * <p>Subclasses supply the target URL through the constructor and override
 * {@link #parsePage(Document)} to turn a fetched page into {@link Movie} records.
 * {@link #run(int, Integer)} drives the fetch/parse loop and
 * {@link #saveData(List, String)} writes the result as CSV.
 */
public class Spider {
    /** Multiplier used to turn a 1-based page number into the {@code start} query offset. */
    private static final int PAGE_SIZE = 25;
    /** Timeout value handed to the jsoup connection. */
    private static final int TIMEOUT_MILLIS = 10000;
    /** Pause between two consecutive page requests. */
    private static final long REQUEST_DELAY_MILLIS = 1000;

    protected String baseUrl;
    protected String userAgent;

    /**
     * Creates a spider for the given URL with a fixed desktop-browser user agent.
     *
     * @param baseUrl URL that page requests are built from
     */
    public Spider(String baseUrl) {
        this.baseUrl = baseUrl;
        this.userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36";
    }

    /**
     * Downloads and parses a single page.
     *
     * @param url address to fetch
     * @return the parsed document, or {@code null} if the request failed with an
     *         {@link IOException} (the error is reported on standard error)
     */
    public Document fetchPage(String url) {
        try {
            return Jsoup.connect(url)
                    .userAgent(userAgent)
                    .timeout(TIMEOUT_MILLIS)
                    .get();
        } catch (IOException e) {
            System.err.println("Error fetching " + url + ": " + e.getMessage());
            return null;
        }
    }

    /**
     * Extracts the records contained in one page. Must be overridden.
     *
     * @param doc page returned by {@link #fetchPage(String)}
     * @return the records found on the page
     * @throws UnsupportedOperationException always, in this base implementation
     */
    public List<Movie> parsePage(Document doc) {
        throw new UnsupportedOperationException("Subclass must override parsePage method");
    }

    /**
     * Writes the records to {@code filename} as a UTF-8 CSV file with the header
     * {@code title,rating,link}, replacing any existing file.
     *
     * <p>Fields containing a comma, a double quote, or a line break are quoted so
     * that they cannot shift columns; {@code null} fields are written as empty.
     * Nothing is written when {@code data} is {@code null} or empty. I/O failures
     * are reported on standard error rather than thrown.
     *
     * @param data     records to write
     * @param filename path of the output file
     */
    public void saveData(List<Movie> data, String filename) {
        if (data == null || data.isEmpty()) {
            System.out.println("No data to save");
            return;
        }
        // An explicit charset keeps non-ASCII titles intact regardless of the platform default.
        try (PrintWriter writer = new PrintWriter(filename, "UTF-8")) {
            writer.println("title,rating,link");
            for (Movie movie : data) {
                writer.println(escapeCsv(movie.getTitle())
                        + "," + escapeCsv(movie.getRating())
                        + "," + escapeCsv(movie.getLink()));
            }
            // PrintWriter never throws on write errors; it only records them.
            if (writer.checkError()) {
                System.err.println("Error saving data: failed writing to " + filename);
                return;
            }
            System.out.println("Data saved to " + filename);
        } catch (IOException e) {
            System.err.println("Error saving data: " + e.getMessage());
        }
    }

    /**
     * Fetches and parses pages, collecting every record found.
     *
     * <p>When {@code endPage} is given, pages {@code startPage..endPage} are requested
     * as {@code baseUrl?start=(page - 1) * 25}. When {@code endPage} is {@code null},
     * {@code baseUrl} itself is fetched exactly once ({@code startPage} is only used
     * for the progress message). The loop also stops early when a page cannot be
     * fetched, when a page yields no records, or when the thread is interrupted.
     *
     * @param startPage 1-based number of the first page
     * @param endPage   1-based number of the last page, or {@code null} for a single
     *                  fetch of {@code baseUrl}
     * @return all records collected before the loop stopped; never {@code null}
     */
    public List<Movie> run(int startPage, Integer endPage) {
        List<Movie> allData = new ArrayList<>();
        int currentPage = startPage;
        while (true) {
            String url = endPage != null
                    ? baseUrl + "?start=" + ((currentPage - 1) * PAGE_SIZE)
                    : baseUrl;
            System.out.println("Processing page " + currentPage + ": " + url);
            Document doc = fetchPage(url);
            if (doc == null) {
                break;
            }
            List<Movie> pageData = parsePage(doc);
            if (pageData == null || pageData.isEmpty()) {
                break;
            }
            allData.addAll(pageData);
            // Without an end page the URL never changes, so continuing would refetch
            // the same page forever and pile up duplicates.
            if (endPage == null || currentPage >= endPage) {
                break;
            }
            currentPage++;
            try {
                Thread.sleep(REQUEST_DELAY_MILLIS); // throttle requests to avoid being blocked
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                break;
            }
        }
        return allData;
    }

    /** Returns {@code field} as one CSV field, quoted when needed; {@code null} becomes empty. */
    private static String escapeCsv(String field) {
        if (field == null) {
            return "";
        }
        boolean needsQuoting = field.indexOf(',') >= 0
                || field.indexOf('"') >= 0
                || field.indexOf('\n') >= 0
                || field.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return field;
        }
        return "\"" + field.replace("\"", "\"\"") + "\"";
    }
}
/**
 * Immutable value object holding one scraped entry: its title, its rating text,
 * and the link it points to. Values are stored exactly as passed in.
 */
class Movie {
    private final String title;
    private final String rating;
    private final String link;

    /**
     * @param title  title text of the entry
     * @param rating rating, kept as the original text
     * @param link   URL associated with the entry
     */
    public Movie(String title, String rating, String link) {
        this.title = title;
        this.rating = rating;
        this.link = link;
    }

    /** @return the title passed to the constructor */
    public String getTitle() {
        return title;
    }

    /** @return the rating text passed to the constructor */
    public String getRating() {
        return rating;
    }

    /** @return the link passed to the constructor */
    public String getLink() {
        return link;
    }
}
/**
 * Spider for the list at {@code https://movie.douban.com/top250}.
 */
class DoubanMovieSpider extends Spider {
    public DoubanMovieSpider() {
        super("https://movie.douban.com/top250");
    }

    /**
     * Builds one {@link Movie} per {@code div.item} element, taking the text of the
     * first {@code span.title}, the text of the first {@code span.rating_num}, and
     * the {@code href} of the first {@code a} inside it.
     *
     * <p>Entries lacking any of these three elements are skipped with a message on
     * standard error, so a single malformed entry does not abort the crawl.
     *
     * @param doc page to parse; {@code null} yields an empty list
     * @return the movies found on the page, in document order; never {@code null}
     */
    @Override
    public List<Movie> parsePage(Document doc) {
        List<Movie> movies = new ArrayList<>();
        if (doc == null) {
            return movies;
        }
        for (Element item : doc.select("div.item")) {
            // first() returns null when the selector matched nothing.
            Element titleElement = item.select("span.title").first();
            Element ratingElement = item.select("span.rating_num").first();
            Element linkElement = item.select("a").first();
            if (titleElement == null || ratingElement == null || linkElement == null) {
                System.err.println("Skipping entry with missing title, rating, or link");
                continue;
            }
            movies.add(new Movie(
                    titleElement.text(),
                    ratingElement.text(),
                    linkElement.attr("href")));
        }
        return movies;
    }
}
/**
 * Command-line entry point: crawls pages 1 through 10 with
 * {@link DoubanMovieSpider}, saves the result to {@code douban_movies.csv},
 * and prints how many records were collected.
 */
class Main {
    public static void main(String[] args) {
        final int firstPage = 1;
        final int lastPage = 10; // crawl the first 10 pages
        final String outputFile = "douban_movies.csv";

        DoubanMovieSpider crawler = new DoubanMovieSpider();
        List<Movie> movies = crawler.run(firstPage, lastPage);
        crawler.saveData(movies, outputFile);

        int total = movies.size();
        System.out.println("爬取完成,共获取" + total + "条数据");
    }
}