You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
143 lines
4.2 KiB
143 lines
4.2 KiB
import org.jsoup.Jsoup;
|
|
import org.jsoup.nodes.Document;
|
|
import org.jsoup.nodes.Element;
|
|
import org.jsoup.select.Elements;
|
|
import java.io.FileWriter;
|
|
import java.io.IOException;
|
|
import java.io.PrintWriter;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
|
|
public class Spider {
|
|
protected String baseUrl;
|
|
protected String userAgent;
|
|
|
|
public Spider(String baseUrl) {
|
|
this.baseUrl = baseUrl;
|
|
this.userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36";
|
|
}
|
|
|
|
public Document fetchPage(String url) {
|
|
try {
|
|
return Jsoup.connect(url)
|
|
.userAgent(userAgent)
|
|
.timeout(10000)
|
|
.get();
|
|
} catch (IOException e) {
|
|
System.err.println("Error fetching " + url + ": " + e.getMessage());
|
|
return null;
|
|
}
|
|
}
|
|
|
|
public List<Movie> parsePage(Document doc) {
|
|
throw new UnsupportedOperationException("Subclass must override parsePage method");
|
|
}
|
|
|
|
public void saveData(List<Movie> data, String filename) {
|
|
if (data == null || data.isEmpty()) {
|
|
System.out.println("No data to save");
|
|
return;
|
|
}
|
|
|
|
try (PrintWriter writer = new PrintWriter(new FileWriter(filename, false))) {
|
|
writer.println("title,rating,link");
|
|
for (Movie movie : data) {
|
|
writer.println(movie.getTitle() + "," + movie.getRating() + "," + movie.getLink());
|
|
}
|
|
System.out.println("Data saved to " + filename);
|
|
} catch (IOException e) {
|
|
System.err.println("Error saving data: " + e.getMessage());
|
|
}
|
|
}
|
|
|
|
public List<Movie> run(int startPage, Integer endPage) {
|
|
List<Movie> allData = new ArrayList<>();
|
|
int currentPage = startPage;
|
|
|
|
while (true) {
|
|
String url = endPage != null ? baseUrl + "?start=" + ((currentPage - 1) * 25) : baseUrl;
|
|
System.out.println("Processing page " + currentPage + ": " + url);
|
|
|
|
Document doc = fetchPage(url);
|
|
if (doc == null) {
|
|
break;
|
|
}
|
|
|
|
List<Movie> pageData = parsePage(doc);
|
|
if (pageData == null || pageData.isEmpty()) {
|
|
break;
|
|
}
|
|
|
|
allData.addAll(pageData);
|
|
|
|
if (endPage != null && currentPage >= endPage) {
|
|
break;
|
|
}
|
|
|
|
currentPage++;
|
|
try {
|
|
Thread.sleep(1000); // 防止请求过快被封
|
|
} catch (InterruptedException e) {
|
|
Thread.currentThread().interrupt();
|
|
break;
|
|
}
|
|
}
|
|
|
|
return allData;
|
|
}
|
|
}
|
|
|
|
/**
 * Immutable value holder for one scraped movie entry: its title, its rating text and
 * the link it was listed with. Values are stored exactly as passed to the constructor.
 */
class Movie {
    private final String title;
    private final String rating;
    private final String link;

    /**
     * Creates a movie record.
     *
     * @param title  movie title
     * @param rating rating, kept as text
     * @param link   link associated with the movie
     */
    public Movie(String title, String rating, String link) {
        this.title = title;
        this.rating = rating;
        this.link = link;
    }

    /** Returns the title given at construction. */
    public String getTitle() {
        return title;
    }

    /** Returns the rating text given at construction. */
    public String getRating() {
        return rating;
    }

    /** Returns the link given at construction. */
    public String getLink() {
        return link;
    }
}
|
|
|
|
class DoubanMovieSpider extends Spider {
|
|
public DoubanMovieSpider() {
|
|
super("https://movie.douban.com/top250");
|
|
}
|
|
|
|
@Override
|
|
public List<Movie> parsePage(Document doc) {
|
|
List<Movie> movies = new ArrayList<>();
|
|
Elements items = doc.select("div.item");
|
|
|
|
for (Element item : items) {
|
|
String title = item.select("span.title").first().text();
|
|
String rating = item.select("span.rating_num").first().text();
|
|
String link = item.select("a").first().attr("href");
|
|
|
|
movies.add(new Movie(title, rating, link));
|
|
}
|
|
|
|
return movies;
|
|
}
|
|
}
|
|
|
|
class Main {
|
|
public static void main(String[] args) {
|
|
DoubanMovieSpider spider = new DoubanMovieSpider();
|
|
List<Movie> data = spider.run(1, 10); // 爬取前10页
|
|
spider.saveData(data, "douban_movies.csv");
|
|
System.out.println("爬取完成,共获取" + data.size() + "条数据");
|
|
}
|
|
}
|