1 changed files with 141 additions and 0 deletions
@ -0,0 +1,141 @@ |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class MovieCrawler { |
|||
private static final String BASE_URL = "https://movie.douban.com/top250"; |
|||
private static final int DELAY_MS = 1000; // 控制请求频率
|
|||
|
|||
public List<Movie> crawlTopMovies(int limit) throws IOException { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
int page = 0; |
|||
|
|||
System.out.println("Starting to crawl movies..."); |
|||
|
|||
while (movies.size() < limit) { |
|||
String url = BASE_URL + "?start=" + page * 25; |
|||
System.out.println("Crawling page: " + url); |
|||
|
|||
try { |
|||
Document doc = Jsoup.connect(url) |
|||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") |
|||
.timeout(10000) |
|||
.get(); |
|||
|
|||
// 打印页面标题,确认是否成功获取页面
|
|||
System.out.println("Page title: " + doc.title()); |
|||
|
|||
// 选择电影元素
|
|||
Elements movieElements = doc.select(".item"); |
|||
System.out.println("Found " + movieElements.size() + " movie elements"); |
|||
|
|||
for (Element element : movieElements) { |
|||
if (movies.size() >= limit) break; |
|||
|
|||
int currentCount = movies.size() + 1; |
|||
System.out.println("Processing movie " + currentCount + "..."); |
|||
|
|||
try { |
|||
Movie movie = parseMovie(element); |
|||
if (movie != null) { |
|||
movies.add(movie); |
|||
System.out.println("Added movie: " + movie.getTitle()); |
|||
} else { |
|||
System.out.println("Skipping movie, parsing failed"); |
|||
} |
|||
|
|||
// 控制请求频率
|
|||
Thread.sleep(DELAY_MS); |
|||
} catch (Exception e) { |
|||
System.err.println("Error parsing movie: " + e.getMessage()); |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
|
|||
page++; |
|||
} catch (Exception e) { |
|||
System.err.println("Error crawling page: " + e.getMessage()); |
|||
e.printStackTrace(); |
|||
break; |
|||
} |
|||
} |
|||
|
|||
System.out.println("Crawling finished. Found " + movies.size() + " movies."); |
|||
return movies; |
|||
} |
|||
|
|||
private Movie parseMovie(Element element) { |
|||
try { |
|||
// 提取标题
|
|||
Element titleElement = element.selectFirst(".hd .title"); |
|||
if (titleElement == null) { |
|||
System.err.println("Title element not found"); |
|||
return null; |
|||
} |
|||
String title = titleElement.text(); |
|||
System.out.println("Title: " + title); |
|||
|
|||
// 提取年份
|
|||
Element yearElement = element.selectFirst(".bd p:first-child"); |
|||
if (yearElement == null) { |
|||
System.err.println("Year element not found"); |
|||
return null; |
|||
} |
|||
String yearText = yearElement.text().trim(); |
|||
System.out.println("Year text: " + yearText); |
|||
|
|||
// 从字符串中提取年份
|
|||
int year = 0; |
|||
// 使用正则表达式提取年份
|
|||
java.util.regex.Matcher matcher = java.util.regex.Pattern.compile("(\\d{4})").matcher(yearText); |
|||
if (matcher.find()) { |
|||
year = Integer.parseInt(matcher.group(1)); |
|||
} |
|||
if (year == 0) { |
|||
System.err.println("Year not found in text: " + yearText); |
|||
return null; |
|||
} |
|||
System.out.println("Year: " + year); |
|||
|
|||
// 提取评分
|
|||
Element ratingElement = element.selectFirst(".rating_num"); |
|||
if (ratingElement == null) { |
|||
System.err.println("Rating element not found"); |
|||
return null; |
|||
} |
|||
String ratingText = ratingElement.text(); |
|||
System.out.println("Rating text: " + ratingText); |
|||
|
|||
double rating = Double.parseDouble(ratingText); |
|||
System.out.println("Rating: " + rating); |
|||
|
|||
// 提取类型
|
|||
String genre = ""; |
|||
String infoText = yearElement.text(); |
|||
if (infoText.contains("/")) { |
|||
String[] parts = infoText.split("/"); |
|||
if (parts.length > 2) { |
|||
genre = parts[2].trim(); |
|||
} |
|||
} |
|||
System.out.println("Genre: " + genre); |
|||
|
|||
// 简化处理,不进入详情页
|
|||
String director = ""; |
|||
String actors = ""; |
|||
String synopsis = ""; |
|||
|
|||
System.out.println("Parsed movie: " + title + " (" + year + ") - " + rating + " - " + genre); |
|||
return new Movie(title, year, rating, genre, director, actors, synopsis); |
|||
} catch (Exception e) { |
|||
System.err.println("Error parsing movie: " + e.getMessage()); |
|||
e.printStackTrace(); |
|||
return null; |
|||
} |
|||
} |
|||
} |
|||
Loading…
Reference in new issue