4 changed files with 288 additions and 0 deletions
@ -0,0 +1,2 @@ |
|||||
|
[safe] |
||||
|
directory = * |
||||
@ -0,0 +1,143 @@ |
|||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import java.io.FileWriter; |
||||
|
import java.io.IOException; |
||||
|
import java.io.PrintWriter; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class Spider { |
||||
|
protected String baseUrl; |
||||
|
protected String userAgent; |
||||
|
|
||||
|
public Spider(String baseUrl) { |
||||
|
this.baseUrl = baseUrl; |
||||
|
this.userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"; |
||||
|
} |
||||
|
|
||||
|
public Document fetchPage(String url) { |
||||
|
try { |
||||
|
return Jsoup.connect(url) |
||||
|
.userAgent(userAgent) |
||||
|
.timeout(10000) |
||||
|
.get(); |
||||
|
} catch (IOException e) { |
||||
|
System.err.println("Error fetching " + url + ": " + e.getMessage()); |
||||
|
return null; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public List<Movie> parsePage(Document doc) { |
||||
|
throw new UnsupportedOperationException("Subclass must override parsePage method"); |
||||
|
} |
||||
|
|
||||
|
public void saveData(List<Movie> data, String filename) { |
||||
|
if (data == null || data.isEmpty()) { |
||||
|
System.out.println("No data to save"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
try (PrintWriter writer = new PrintWriter(new FileWriter(filename, false))) { |
||||
|
writer.println("title,rating,link"); |
||||
|
for (Movie movie : data) { |
||||
|
writer.println(movie.getTitle() + "," + movie.getRating() + "," + movie.getLink()); |
||||
|
} |
||||
|
System.out.println("Data saved to " + filename); |
||||
|
} catch (IOException e) { |
||||
|
System.err.println("Error saving data: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public List<Movie> run(int startPage, Integer endPage) { |
||||
|
List<Movie> allData = new ArrayList<>(); |
||||
|
int currentPage = startPage; |
||||
|
|
||||
|
while (true) { |
||||
|
String url = endPage != null ? baseUrl + "?start=" + ((currentPage - 1) * 25) : baseUrl; |
||||
|
System.out.println("Processing page " + currentPage + ": " + url); |
||||
|
|
||||
|
Document doc = fetchPage(url); |
||||
|
if (doc == null) { |
||||
|
break; |
||||
|
} |
||||
|
|
||||
|
List<Movie> pageData = parsePage(doc); |
||||
|
if (pageData == null || pageData.isEmpty()) { |
||||
|
break; |
||||
|
} |
||||
|
|
||||
|
allData.addAll(pageData); |
||||
|
|
||||
|
if (endPage != null && currentPage >= endPage) { |
||||
|
break; |
||||
|
} |
||||
|
|
||||
|
currentPage++; |
||||
|
try { |
||||
|
Thread.sleep(1000); // 防止请求过快被封
|
||||
|
} catch (InterruptedException e) { |
||||
|
Thread.currentThread().interrupt(); |
||||
|
break; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return allData; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/**
 * Immutable holder for one scraped list entry: its title, rating text and detail-page link.
 * Values are stored exactly as passed in; no validation or normalization is performed.
 */
class Movie {
    private final String title;
    private final String rating;
    private final String link;

    /**
     * @param title  title text of the entry
     * @param rating rating as text
     * @param link   URL associated with the entry
     */
    public Movie(String title, String rating, String link) {
        this.title = title;
        this.rating = rating;
        this.link = link;
    }

    /** @return the title given at construction */
    public String getTitle() {
        return title;
    }

    /** @return the rating text given at construction */
    public String getRating() {
        return rating;
    }

    /** @return the link given at construction */
    public String getLink() {
        return link;
    }
}
||||
|
|
||||
|
class DoubanMovieSpider extends Spider { |
||||
|
public DoubanMovieSpider() { |
||||
|
super("https://movie.douban.com/top250"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Movie> parsePage(Document doc) { |
||||
|
List<Movie> movies = new ArrayList<>(); |
||||
|
Elements items = doc.select("div.item"); |
||||
|
|
||||
|
for (Element item : items) { |
||||
|
String title = item.select("span.title").first().text(); |
||||
|
String rating = item.select("span.rating_num").first().text(); |
||||
|
String link = item.select("a").first().attr("href"); |
||||
|
|
||||
|
movies.add(new Movie(title, rating, link)); |
||||
|
} |
||||
|
|
||||
|
return movies; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
class Main { |
||||
|
public static void main(String[] args) { |
||||
|
DoubanMovieSpider spider = new DoubanMovieSpider(); |
||||
|
List<Movie> data = spider.run(1, 10); // 爬取前10页
|
||||
|
spider.saveData(data, "douban_movies.csv"); |
||||
|
System.out.println("爬取完成,共获取" + data.size() + "条数据"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,143 @@ |
|||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import java.io.FileWriter; |
||||
|
import java.io.IOException; |
||||
|
import java.io.PrintWriter; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class Spider { |
||||
|
protected String baseUrl; |
||||
|
protected String userAgent; |
||||
|
|
||||
|
public Spider(String baseUrl) { |
||||
|
this.baseUrl = baseUrl; |
||||
|
this.userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"; |
||||
|
} |
||||
|
|
||||
|
public Document fetchPage(String url) { |
||||
|
try { |
||||
|
return Jsoup.connect(url) |
||||
|
.userAgent(userAgent) |
||||
|
.timeout(10000) |
||||
|
.get(); |
||||
|
} catch (IOException e) { |
||||
|
System.err.println("Error fetching " + url + ": " + e.getMessage()); |
||||
|
return null; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public List<Movie> parsePage(Document doc) { |
||||
|
throw new UnsupportedOperationException("Subclass must override parsePage method"); |
||||
|
} |
||||
|
|
||||
|
public void saveData(List<Movie> data, String filename) { |
||||
|
if (data == null || data.isEmpty()) { |
||||
|
System.out.println("No data to save"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
try (PrintWriter writer = new PrintWriter(new FileWriter(filename, false))) { |
||||
|
writer.println("title,rating,link"); |
||||
|
for (Movie movie : data) { |
||||
|
writer.println(movie.getTitle() + "," + movie.getRating() + "," + movie.getLink()); |
||||
|
} |
||||
|
System.out.println("Data saved to " + filename); |
||||
|
} catch (IOException e) { |
||||
|
System.err.println("Error saving data: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public List<Movie> run(int startPage, Integer endPage) { |
||||
|
List<Movie> allData = new ArrayList<>(); |
||||
|
int currentPage = startPage; |
||||
|
|
||||
|
while (true) { |
||||
|
String url = endPage != null ? baseUrl + "?start=" + ((currentPage - 1) * 25) : baseUrl; |
||||
|
System.out.println("Processing page " + currentPage + ": " + url); |
||||
|
|
||||
|
Document doc = fetchPage(url); |
||||
|
if (doc == null) { |
||||
|
break; |
||||
|
} |
||||
|
|
||||
|
List<Movie> pageData = parsePage(doc); |
||||
|
if (pageData == null || pageData.isEmpty()) { |
||||
|
break; |
||||
|
} |
||||
|
|
||||
|
allData.addAll(pageData); |
||||
|
|
||||
|
if (endPage != null && currentPage >= endPage) { |
||||
|
break; |
||||
|
} |
||||
|
|
||||
|
currentPage++; |
||||
|
try { |
||||
|
Thread.sleep(1000); // 防止请求过快被封
|
||||
|
} catch (InterruptedException e) { |
||||
|
Thread.currentThread().interrupt(); |
||||
|
break; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return allData; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/**
 * Immutable holder for one scraped list entry: its title, rating text and detail-page link.
 * Values are stored exactly as passed in; no validation or normalization is performed.
 */
class Movie {
    private final String title;
    private final String rating;
    private final String link;

    /**
     * @param title  title text of the entry
     * @param rating rating as text
     * @param link   URL associated with the entry
     */
    public Movie(String title, String rating, String link) {
        this.title = title;
        this.rating = rating;
        this.link = link;
    }

    /** @return the title given at construction */
    public String getTitle() {
        return title;
    }

    /** @return the rating text given at construction */
    public String getRating() {
        return rating;
    }

    /** @return the link given at construction */
    public String getLink() {
        return link;
    }
}
||||
|
|
||||
|
class DoubanMovieSpider extends Spider { |
||||
|
public DoubanMovieSpider() { |
||||
|
super("https://movie.douban.com/top250"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Movie> parsePage(Document doc) { |
||||
|
List<Movie> movies = new ArrayList<>(); |
||||
|
Elements items = doc.select("div.item"); |
||||
|
|
||||
|
for (Element item : items) { |
||||
|
String title = item.select("span.title").first().text(); |
||||
|
String rating = item.select("span.rating_num").first().text(); |
||||
|
String link = item.select("a").first().attr("href"); |
||||
|
|
||||
|
movies.add(new Movie(title, rating, link)); |
||||
|
} |
||||
|
|
||||
|
return movies; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
class Main { |
||||
|
public static void main(String[] args) { |
||||
|
DoubanMovieSpider spider = new DoubanMovieSpider(); |
||||
|
List<Movie> data = spider.run(1, 10); // 爬取前10页
|
||||
|
spider.saveData(data, "douban_movies.csv"); |
||||
|
System.out.println("爬取完成,共获取" + data.size() + "条数据"); |
||||
|
} |
||||
|
} |
||||
Binary file not shown.
Loading…
Reference in new issue