Browse Source

上传文件至 'W3'

main
YangYuting 3 weeks ago
parent
commit
4b55d64bcf
  1. 141
      W3/MovieCrawler.java

141
W3/MovieCrawler.java

@ -0,0 +1,141 @@
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawls the Douban "Top 250" list pages and turns each list entry into a {@link Movie}.
 *
 * <p>Only the list pages are fetched; detail pages are never visited, so the director, actors
 * and synopsis fields of every returned movie are empty strings.
 */
public class MovieCrawler {
    private static final String BASE_URL = "https://movie.douban.com/top250";
    /** Step applied to the {@code start} query parameter for each successive page. */
    private static final int PAGE_SIZE = 25;
    /** Pause between two page requests, to throttle the request rate. */
    private static final int DELAY_MS = 1000;
    private static final int TIMEOUT_MS = 10000;
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36";
    /** First run of four digits in the info text is taken as the release year. */
    private static final java.util.regex.Pattern YEAR_PATTERN =
            java.util.regex.Pattern.compile("(\\d{4})");

    /**
     * Fetches list pages one after another until {@code limit} movies have been collected.
     *
     * <p>The crawl is best-effort: it stops early, returning what it has so far, when a page
     * cannot be fetched, when a page contains no {@code .item} elements (otherwise the loop
     * would request further pages forever), or when the calling thread is interrupted while
     * waiting between requests.
     *
     * @param limit maximum number of movies to return; a value {@code <= 0} yields an empty list
     * @return the movies parsed so far, never {@code null}; may hold fewer than {@code limit}
     * @throws IOException kept in the signature for compatibility with existing callers; fetch
     *     failures are logged and end the crawl rather than being propagated
     */
    public List<Movie> crawlTopMovies(int limit) throws IOException {
        List<Movie> movies = new ArrayList<>();
        int page = 0;
        System.out.println("Starting to crawl movies...");
        while (movies.size() < limit) {
            String url = BASE_URL + "?start=" + page * PAGE_SIZE;
            System.out.println("Crawling page: " + url);

            Elements movieElements;
            try {
                Document doc = Jsoup.connect(url)
                        .userAgent(USER_AGENT)
                        .timeout(TIMEOUT_MS)
                        .get();
                // Print the page title to confirm the page was actually retrieved.
                System.out.println("Page title: " + doc.title());
                movieElements = doc.select(".item");
            } catch (IOException | RuntimeException e) {
                // Boundary around the third-party fetch: keep the partial result instead of failing.
                System.err.println("Error crawling page: " + e.getMessage());
                e.printStackTrace();
                break;
            }

            System.out.println("Found " + movieElements.size() + " movie elements");
            if (movieElements.isEmpty()) {
                // Nothing to parse on this page; later pages are not expected to help, and
                // continuing would never terminate.
                System.out.println("No movie elements found, stopping.");
                break;
            }

            int collectedBefore = movies.size();
            for (Element element : movieElements) {
                if (movies.size() >= limit) {
                    break;
                }
                System.out.println("Processing movie " + (movies.size() + 1) + "...");
                Movie movie = parseMovie(element);
                if (movie != null) {
                    movies.add(movie);
                    System.out.println("Added movie: " + movie.getTitle());
                } else {
                    System.out.println("Skipping movie, parsing failed");
                }
            }
            if (movies.size() == collectedBefore) {
                // Every entry on the page failed to parse; the same is likely for the next page.
                System.err.println("No movie could be parsed from page, stopping.");
                break;
            }

            page++;
            // Throttle only when another request will actually be made.
            if (movies.size() < limit && !pauseBetweenRequests()) {
                break;
            }
        }
        System.out.println("Crawling finished. Found " + movies.size() + " movies.");
        return movies;
    }

    /**
     * Sleeps for {@link #DELAY_MS} milliseconds.
     *
     * @return {@code true} if the full pause elapsed, {@code false} if the thread was interrupted
     *     (the interrupt status is restored so callers further up can observe it)
     */
    private static boolean pauseBetweenRequests() {
        try {
            Thread.sleep(DELAY_MS);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            System.err.println("Interrupted while waiting between requests, stopping.");
            return false;
        }
    }

    /**
     * Extracts title, year, rating and genre from one list entry.
     *
     * @param element a {@code .item} element from a list page
     * @return the parsed movie, or {@code null} when a required element is missing, no year is
     *     found, or a value cannot be converted
     */
    private Movie parseMovie(Element element) {
        try {
            // Title
            Element titleElement = element.selectFirst(".hd .title");
            if (titleElement == null) {
                System.err.println("Title element not found");
                return null;
            }
            String title = titleElement.text();
            System.out.println("Title: " + title);

            // Info paragraph: source of both the year and the genre.
            Element infoElement = element.selectFirst(".bd p:first-child");
            if (infoElement == null) {
                System.err.println("Year element not found");
                return null;
            }
            String infoText = infoElement.text().trim();
            System.out.println("Year text: " + infoText);

            java.util.regex.Matcher yearMatcher = YEAR_PATTERN.matcher(infoText);
            if (!yearMatcher.find()) {
                System.err.println("Year not found in text: " + infoText);
                return null;
            }
            int year = Integer.parseInt(yearMatcher.group(1));
            System.out.println("Year: " + year);

            // Rating
            Element ratingElement = element.selectFirst(".rating_num");
            if (ratingElement == null) {
                System.err.println("Rating element not found");
                return null;
            }
            String ratingText = ratingElement.text();
            System.out.println("Rating text: " + ratingText);
            double rating = Double.parseDouble(ratingText);
            System.out.println("Rating: " + rating);

            // Genre
            String genre = extractGenre(infoText.substring(yearMatcher.start()));
            System.out.println("Genre: " + genre);

            // Simplified handling: detail pages are not visited, so these stay empty.
            String director = "";
            String actors = "";
            String synopsis = "";
            System.out.println("Parsed movie: " + title + " (" + year + ") - " + rating + " - " + genre);
            return new Movie(title, year, rating, genre, director, actors, synopsis);
        } catch (RuntimeException e) {
            // e.g. NumberFormatException from a malformed rating: skip this entry only.
            System.err.println("Error parsing movie: " + e.getMessage());
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Returns the genre from the part of the info text that starts at the year.
     *
     * <p>Assumes the layout {@code year / ... / genre}, i.e. the genre is the last
     * "/"-separated segment — TODO confirm against the live page. Starting at the year and
     * taking the last segment (instead of a fixed index into the whole text) keeps the result
     * stable when the text before the year contains "/" or when extra segments appear.
     *
     * @param fromYear info text beginning at the matched year
     * @return the trimmed genre segment, or an empty string when fewer than three segments exist
     */
    private static String extractGenre(String fromYear) {
        String[] segments = fromYear.split("/");
        if (segments.length <= 2) {
            return "";
        }
        // Non-breaking spaces are not removed by trim(); normalise them first.
        return segments[segments.length - 1].replace('\u00A0', ' ').trim();
    }
}
Loading…
Cancel
Save