1 changed files with 109 additions and 0 deletions
@ -0,0 +1,109 @@ |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
// 豆瓣电影爬虫:继承抽象父类,实现多态,修复403反爬
|
|||
public class MovieCrawler extends BaseMovieCrawler { |
|||
private static final String BASE_URL = "https://movie.douban.com/top250"; |
|||
// 延长请求间隔,降低反爬风险
|
|||
private static final int DELAY_MS = 2000; |
|||
|
|||
@Override |
|||
public List<Movie> crawl(int limit) throws IOException { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
int page = 0; |
|||
|
|||
System.out.println("=== 开始爬取豆瓣Top250电影 ==="); |
|||
|
|||
while (movies.size() < limit) { |
|||
String url = BASE_URL + "?start=" + (page * 25); |
|||
System.out.println("正在爬取页面:" + url); |
|||
|
|||
try { |
|||
// 🔴 核心修复:添加完整请求头,模拟真实浏览器,绕过403反爬
|
|||
Document doc = Jsoup.connect(url) |
|||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") |
|||
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9") |
|||
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8") |
|||
.header("Referer", "https://movie.douban.com/top250") |
|||
.header("Cache-Control", "max-age=0") |
|||
.header("Upgrade-Insecure-Requests", "1") |
|||
.cookie("bid", "abc123xyz456") |
|||
.timeout(15000) |
|||
.get(); |
|||
|
|||
// 选择页面中所有电影条目
|
|||
Elements movieElements = doc.select(".item"); |
|||
System.out.println("当前页面找到 " + movieElements.size() + " 部电影"); |
|||
|
|||
// 遍历每个电影条目,解析并封装成Movie对象
|
|||
for (Element element : movieElements) { |
|||
if (movies.size() >= limit) break; |
|||
Movie movie = parseMovie(element); |
|||
if (movie != null) { |
|||
movies.add(movie); |
|||
System.out.println("成功爬取:" + movie.getTitle()); |
|||
} |
|||
// 控制请求间隔,防止被封
|
|||
Thread.sleep(DELAY_MS); |
|||
} |
|||
|
|||
page++; |
|||
} catch (Exception e) { |
|||
System.err.println("爬取页面异常:" + e.getMessage()); |
|||
e.printStackTrace(); |
|||
break; |
|||
} |
|||
} |
|||
|
|||
System.out.println("=== 爬取完成,共获取 " + movies.size() + " 部电影 ==="); |
|||
return movies; |
|||
} |
|||
|
|||
@Override |
|||
protected Movie parseMovie(Element element) { |
|||
try { |
|||
// 1. 提取电影标题
|
|||
Element titleElement = element.selectFirst(".hd .title"); |
|||
if (titleElement == null) return null; |
|||
String title = titleElement.text(); |
|||
|
|||
// 2. 提取年份(正则匹配4位数字)
|
|||
Element infoElement = element.selectFirst(".bd p:first-child"); |
|||
if (infoElement == null) return null; |
|||
String infoText = infoElement.text(); |
|||
int year = 0; |
|||
Matcher matcher = Pattern.compile("(\\d{4})").matcher(infoText); |
|||
if (matcher.find()) { |
|||
year = Integer.parseInt(matcher.group(1)); |
|||
} |
|||
if (year == 0) return null; |
|||
|
|||
// 3. 提取评分
|
|||
Element ratingElement = element.selectFirst(".rating_num"); |
|||
if (ratingElement == null) return null; |
|||
double rating = Double.parseDouble(ratingElement.text()); |
|||
|
|||
// 4. 提取电影类型
|
|||
String genre = ""; |
|||
if (infoText.contains("/")) { |
|||
String[] parts = infoText.split("/"); |
|||
if (parts.length > 2) { |
|||
genre = parts[2].trim(); |
|||
} |
|||
} |
|||
|
|||
// 5. 封装成具体子类对象(院线电影示例)
|
|||
return new TheatreMovie(title, year, rating, genre, 59.9); |
|||
} catch (Exception e) { |
|||
System.err.println("解析电影异常:" + e.getMessage()); |
|||
return null; |
|||
} |
|||
} |
|||
} |
|||
Loading…
Reference in new issue