4 changed files with 288 additions and 0 deletions
@ -0,0 +1,2 @@ |
|||
[safe] |
|||
directory = * |
|||
@ -0,0 +1,143 @@ |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import java.io.FileWriter; |
|||
import java.io.IOException; |
|||
import java.io.PrintWriter; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class Spider { |
|||
protected String baseUrl; |
|||
protected String userAgent; |
|||
|
|||
public Spider(String baseUrl) { |
|||
this.baseUrl = baseUrl; |
|||
this.userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"; |
|||
} |
|||
|
|||
public Document fetchPage(String url) { |
|||
try { |
|||
return Jsoup.connect(url) |
|||
.userAgent(userAgent) |
|||
.timeout(10000) |
|||
.get(); |
|||
} catch (IOException e) { |
|||
System.err.println("Error fetching " + url + ": " + e.getMessage()); |
|||
return null; |
|||
} |
|||
} |
|||
|
|||
public List<Movie> parsePage(Document doc) { |
|||
throw new UnsupportedOperationException("Subclass must override parsePage method"); |
|||
} |
|||
|
|||
public void saveData(List<Movie> data, String filename) { |
|||
if (data == null || data.isEmpty()) { |
|||
System.out.println("No data to save"); |
|||
return; |
|||
} |
|||
|
|||
try (PrintWriter writer = new PrintWriter(new FileWriter(filename, false))) { |
|||
writer.println("title,rating,link"); |
|||
for (Movie movie : data) { |
|||
writer.println(movie.getTitle() + "," + movie.getRating() + "," + movie.getLink()); |
|||
} |
|||
System.out.println("Data saved to " + filename); |
|||
} catch (IOException e) { |
|||
System.err.println("Error saving data: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
public List<Movie> run(int startPage, Integer endPage) { |
|||
List<Movie> allData = new ArrayList<>(); |
|||
int currentPage = startPage; |
|||
|
|||
while (true) { |
|||
String url = endPage != null ? baseUrl + "?start=" + ((currentPage - 1) * 25) : baseUrl; |
|||
System.out.println("Processing page " + currentPage + ": " + url); |
|||
|
|||
Document doc = fetchPage(url); |
|||
if (doc == null) { |
|||
break; |
|||
} |
|||
|
|||
List<Movie> pageData = parsePage(doc); |
|||
if (pageData == null || pageData.isEmpty()) { |
|||
break; |
|||
} |
|||
|
|||
allData.addAll(pageData); |
|||
|
|||
if (endPage != null && currentPage >= endPage) { |
|||
break; |
|||
} |
|||
|
|||
currentPage++; |
|||
try { |
|||
Thread.sleep(1000); // 防止请求过快被封
|
|||
} catch (InterruptedException e) { |
|||
Thread.currentThread().interrupt(); |
|||
break; |
|||
} |
|||
} |
|||
|
|||
return allData; |
|||
} |
|||
} |
|||
|
|||
/**
 * Immutable value holder for one scraped movie entry: its title, its rating text and
 * the link to its detail page. All values are stored exactly as passed in.
 */
class Movie {

    // Fields are final: the class exposes no setters, so instances never change.
    private final String title;
    private final String rating;
    private final String link;

    /**
     * @param title  movie title
     * @param rating rating, kept as text
     * @param link   URL associated with the movie
     */
    public Movie(String title, String rating, String link) {
        this.title = title;
        this.rating = rating;
        this.link = link;
    }

    /** @return the title given at construction */
    public String getTitle() {
        return title;
    }

    /** @return the rating text given at construction */
    public String getRating() {
        return rating;
    }

    /** @return the link given at construction */
    public String getLink() {
        return link;
    }
}
|||
|
|||
class DoubanMovieSpider extends Spider { |
|||
public DoubanMovieSpider() { |
|||
super("https://movie.douban.com/top250"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Movie> parsePage(Document doc) { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
Elements items = doc.select("div.item"); |
|||
|
|||
for (Element item : items) { |
|||
String title = item.select("span.title").first().text(); |
|||
String rating = item.select("span.rating_num").first().text(); |
|||
String link = item.select("a").first().attr("href"); |
|||
|
|||
movies.add(new Movie(title, rating, link)); |
|||
} |
|||
|
|||
return movies; |
|||
} |
|||
} |
|||
|
|||
class Main { |
|||
public static void main(String[] args) { |
|||
DoubanMovieSpider spider = new DoubanMovieSpider(); |
|||
List<Movie> data = spider.run(1, 10); // 爬取前10页
|
|||
spider.saveData(data, "douban_movies.csv"); |
|||
System.out.println("爬取完成,共获取" + data.size() + "条数据"); |
|||
} |
|||
} |
|||
@ -0,0 +1,143 @@ |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import java.io.FileWriter; |
|||
import java.io.IOException; |
|||
import java.io.PrintWriter; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class Spider { |
|||
protected String baseUrl; |
|||
protected String userAgent; |
|||
|
|||
public Spider(String baseUrl) { |
|||
this.baseUrl = baseUrl; |
|||
this.userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"; |
|||
} |
|||
|
|||
public Document fetchPage(String url) { |
|||
try { |
|||
return Jsoup.connect(url) |
|||
.userAgent(userAgent) |
|||
.timeout(10000) |
|||
.get(); |
|||
} catch (IOException e) { |
|||
System.err.println("Error fetching " + url + ": " + e.getMessage()); |
|||
return null; |
|||
} |
|||
} |
|||
|
|||
public List<Movie> parsePage(Document doc) { |
|||
throw new UnsupportedOperationException("Subclass must override parsePage method"); |
|||
} |
|||
|
|||
public void saveData(List<Movie> data, String filename) { |
|||
if (data == null || data.isEmpty()) { |
|||
System.out.println("No data to save"); |
|||
return; |
|||
} |
|||
|
|||
try (PrintWriter writer = new PrintWriter(new FileWriter(filename, false))) { |
|||
writer.println("title,rating,link"); |
|||
for (Movie movie : data) { |
|||
writer.println(movie.getTitle() + "," + movie.getRating() + "," + movie.getLink()); |
|||
} |
|||
System.out.println("Data saved to " + filename); |
|||
} catch (IOException e) { |
|||
System.err.println("Error saving data: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
public List<Movie> run(int startPage, Integer endPage) { |
|||
List<Movie> allData = new ArrayList<>(); |
|||
int currentPage = startPage; |
|||
|
|||
while (true) { |
|||
String url = endPage != null ? baseUrl + "?start=" + ((currentPage - 1) * 25) : baseUrl; |
|||
System.out.println("Processing page " + currentPage + ": " + url); |
|||
|
|||
Document doc = fetchPage(url); |
|||
if (doc == null) { |
|||
break; |
|||
} |
|||
|
|||
List<Movie> pageData = parsePage(doc); |
|||
if (pageData == null || pageData.isEmpty()) { |
|||
break; |
|||
} |
|||
|
|||
allData.addAll(pageData); |
|||
|
|||
if (endPage != null && currentPage >= endPage) { |
|||
break; |
|||
} |
|||
|
|||
currentPage++; |
|||
try { |
|||
Thread.sleep(1000); // 防止请求过快被封
|
|||
} catch (InterruptedException e) { |
|||
Thread.currentThread().interrupt(); |
|||
break; |
|||
} |
|||
} |
|||
|
|||
return allData; |
|||
} |
|||
} |
|||
|
|||
/**
 * Immutable value holder for one scraped movie entry: its title, its rating text and
 * the link to its detail page. All values are stored exactly as passed in.
 */
class Movie {

    // Fields are final: the class exposes no setters, so instances never change.
    private final String title;
    private final String rating;
    private final String link;

    /**
     * @param title  movie title
     * @param rating rating, kept as text
     * @param link   URL associated with the movie
     */
    public Movie(String title, String rating, String link) {
        this.title = title;
        this.rating = rating;
        this.link = link;
    }

    /** @return the title given at construction */
    public String getTitle() {
        return title;
    }

    /** @return the rating text given at construction */
    public String getRating() {
        return rating;
    }

    /** @return the link given at construction */
    public String getLink() {
        return link;
    }
}
|||
|
|||
class DoubanMovieSpider extends Spider { |
|||
public DoubanMovieSpider() { |
|||
super("https://movie.douban.com/top250"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Movie> parsePage(Document doc) { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
Elements items = doc.select("div.item"); |
|||
|
|||
for (Element item : items) { |
|||
String title = item.select("span.title").first().text(); |
|||
String rating = item.select("span.rating_num").first().text(); |
|||
String link = item.select("a").first().attr("href"); |
|||
|
|||
movies.add(new Movie(title, rating, link)); |
|||
} |
|||
|
|||
return movies; |
|||
} |
|||
} |
|||
|
|||
class Main { |
|||
public static void main(String[] args) { |
|||
DoubanMovieSpider spider = new DoubanMovieSpider(); |
|||
List<Movie> data = spider.run(1, 10); // 爬取前10页
|
|||
spider.saveData(data, "douban_movies.csv"); |
|||
System.out.println("爬取完成,共获取" + data.size() + "条数据"); |
|||
} |
|||
} |
|||
Binary file not shown.
Loading…
Reference in new issue