39 changed files with 5268 additions and 0 deletions
|
After Width: | Height: | Size: 147 KiB |
|
After Width: | Height: | Size: 46 KiB |
@ -0,0 +1,38 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" |
||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> |
||||
|
<modelVersion>4.0.0</modelVersion> |
||||
|
|
||||
|
<groupId>com.crawler</groupId> |
||||
|
<artifactId>my-crawler</artifactId> |
||||
|
<version>1.0-SNAPSHOT</version> |
||||
|
|
||||
|
<properties> |
||||
|
<maven.compiler.source>17</maven.compiler.source> |
||||
|
<maven.compiler.target>17</maven.compiler.target> |
||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> |
||||
|
</properties> |
||||
|
|
||||
|
<dependencies> |
||||
|
<!-- HTML解析与HTTP请求 --> |
||||
|
<dependency> |
||||
|
<groupId>org.jsoup</groupId> |
||||
|
<artifactId>jsoup</artifactId> |
||||
|
<version>1.17.2</version> |
||||
|
</dependency> |
||||
|
<!-- JSON序列化(用于保存数据到文件) --> |
||||
|
<dependency> |
||||
|
<groupId>com.google.code.gson</groupId> |
||||
|
<artifactId>gson</artifactId> |
||||
|
<version>2.10.1</version> |
||||
|
</dependency> |
||||
|
<!-- Lombok(简化Getter/Setter/构造器) --> |
||||
|
<dependency> |
||||
|
<groupId>org.projectlombok</groupId> |
||||
|
<artifactId>lombok</artifactId> |
||||
|
<version>1.18.30</version> |
||||
|
<scope>provided</scope> |
||||
|
</dependency> |
||||
|
</dependencies> |
||||
|
</project> |
||||
@ -0,0 +1,29 @@ |
|||||
|
package com.crawler; |
||||
|
|
||||
|
import com.crawler.command.CrawlCommand; |
||||
|
import com.crawler.command.SaveCommand; |
||||
|
import com.crawler.controller.CrawlerController; |
||||
|
import com.crawler.view.ConsoleView; |
||||
|
|
||||
|
public class App { |
||||
|
public static void main(String[] args) { |
||||
|
// 1. 初始化控制器
|
||||
|
CrawlerController controller = new CrawlerController(); |
||||
|
|
||||
|
// 2. 初始化视图
|
||||
|
ConsoleView view = new ConsoleView(); |
||||
|
|
||||
|
// 3. 注册命令到视图(Command模式绑定)
|
||||
|
view.registerMenuItem("1", "爬取豆瓣电影TOP250", |
||||
|
new CrawlCommand(controller, "doubanmovie")); |
||||
|
view.registerMenuItem("2", "爬取豆瓣音乐TOP250", |
||||
|
new CrawlCommand(controller, "doubanmusic")); |
||||
|
view.registerMenuItem("3", "爬取IMDb电影TOP250(豆瓣豆列)", |
||||
|
new CrawlCommand(controller, "imdbmovie")); |
||||
|
view.registerMenuItem("4", "保存最近爬取结果到文件", |
||||
|
new SaveCommand(controller, "./output")); |
||||
|
|
||||
|
// 4. 启动CLI交互
|
||||
|
view.start(); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,10 @@ |
|||||
|
package com.crawler.command; |
||||
|
|
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
|
||||
|
/** |
||||
|
* 命令接口(Command模式核心) |
||||
|
*/ |
||||
|
public interface Command { |
||||
|
void execute() throws CrawlerException; |
||||
|
} |
||||
@ -0,0 +1,29 @@ |
|||||
|
package com.crawler.command; |
||||
|
|
||||
|
import com.crawler.controller.CrawlerController; |
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
import com.crawler.model.Article; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
/** |
||||
|
* 爬取命令:封装指定数据源的爬取操作 |
||||
|
*/ |
||||
|
public class CrawlCommand implements Command { |
||||
|
private final CrawlerController controller; |
||||
|
private final String strategyKey; |
||||
|
|
||||
|
public CrawlCommand(CrawlerController controller, String strategyKey) { |
||||
|
this.controller = controller; |
||||
|
this.strategyKey = strategyKey; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute() throws CrawlerException { |
||||
|
System.out.println("🚀 开始爬取 [" + strategyKey + "] ..."); |
||||
|
List<Article> articles = controller.crawl(strategyKey); |
||||
|
System.out.println("✅ 爬取完成!共获取 " + articles.size() + " 条数据"); |
||||
|
// 将结果暂存到Controller中,供后续SaveCommand使用
|
||||
|
controller.setLastCrawlResult(articles); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,55 @@ |
|||||
|
package com.crawler.command; |
||||
|
|
||||
|
import com.crawler.controller.CrawlerController; |
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
import com.crawler.model.Article; |
||||
|
import com.google.gson.Gson; |
||||
|
import com.google.gson.GsonBuilder; |
||||
|
|
||||
|
import java.io.FileWriter; |
||||
|
import java.io.IOException; |
||||
|
import java.nio.file.Files; |
||||
|
import java.nio.file.Path; |
||||
|
import java.nio.file.Paths; |
||||
|
import java.util.List; |
||||
|
|
||||
|
/** |
||||
|
* 保存命令:将最近一次爬取的结果保存到JSON文件 |
||||
|
*/ |
||||
|
public class SaveCommand implements Command { |
||||
|
private final CrawlerController controller; |
||||
|
private final String outputDir; |
||||
|
|
||||
|
public SaveCommand(CrawlerController controller, String outputDir) { |
||||
|
this.controller = controller; |
||||
|
this.outputDir = outputDir; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute() throws CrawlerException { |
||||
|
List<Article> articles = controller.getLastCrawlResult(); |
||||
|
if (articles == null || articles.isEmpty()) { |
||||
|
System.out.println("⚠️ 没有可保存的数据,请先执行爬取命令"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
try { |
||||
|
// 自动创建输出目录
|
||||
|
Path dirPath = Paths.get(outputDir); |
||||
|
if (!Files.exists(dirPath)) { |
||||
|
Files.createDirectories(dirPath); |
||||
|
} |
||||
|
|
||||
|
String fileName = outputDir + "/crawl_result_" + System.currentTimeMillis() + ".json"; |
||||
|
Gson gson = new GsonBuilder().setPrettyPrinting().create(); |
||||
|
|
||||
|
try (FileWriter writer = new FileWriter(fileName)) { |
||||
|
gson.toJson(articles, writer); |
||||
|
} |
||||
|
|
||||
|
System.out.println("💾 数据已保存至: " + fileName); |
||||
|
} catch (IOException e) { |
||||
|
throw new CrawlerException("保存文件失败: " + e.getMessage(), e); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,44 @@ |
|||||
|
package com.crawler.controller; |
||||
|
|
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
import com.crawler.model.Article; |
||||
|
import com.crawler.strategy.CrawlStrategy; |
||||
|
import com.crawler.strategy.DoubanTop250Strategy; |
||||
|
import com.crawler.strategy.ImdbViaDoubanStrategy; |
||||
|
import com.crawler.strategy.DoubanMusicTop250Strategy; |
||||
|
import lombok.Getter; |
||||
|
import lombok.Setter; |
||||
|
|
||||
|
import java.util.HashMap; |
||||
|
import java.util.List; |
||||
|
import java.util.Map; |
||||
|
|
||||
|
/** |
||||
|
* 爬虫控制器:协调策略、管理爬取结果 |
||||
|
*/ |
||||
|
public class CrawlerController { |
||||
|
private final Map<String, CrawlStrategy> strategyMap; |
||||
|
@Setter |
||||
|
@Getter |
||||
|
private List<Article> lastCrawlResult; |
||||
|
|
||||
|
public CrawlerController() { |
||||
|
strategyMap = new HashMap<>(); |
||||
|
// 注册所有爬取策略
|
||||
|
strategyMap.put("doubanmovie", new DoubanTop250Strategy()); |
||||
|
strategyMap.put("doubanmusic", new DoubanMusicTop250Strategy()); |
||||
|
strategyMap.put("imdbmovie", new ImdbViaDoubanStrategy()); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 根据key执行对应策略的爬取 |
||||
|
*/ |
||||
|
public List<Article> crawl(String strategyKey) throws CrawlerException { |
||||
|
CrawlStrategy strategy = strategyMap.get(strategyKey); |
||||
|
if (strategy == null) { |
||||
|
throw new CrawlerException("未知的数据源标识: " + strategyKey + ",可用: doubanmovie/doubanmusic/imdbmovie", null); |
||||
|
} |
||||
|
return strategy.crawl(); |
||||
|
} |
||||
|
|
||||
|
} |
||||
@ -0,0 +1,5 @@ |
|||||
|
package com.crawler.exception; |
||||
|
|
||||
|
/**
 * Base checked exception of the crawler.
 */
public class CrawlerException extends Exception {

    /**
     * @param message description of the failure
     * @param cause   underlying cause, or {@code null} if there is none
     */
    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }
}
||||
@ -0,0 +1,5 @@ |
|||||
|
package com.crawler.exception; |
||||
|
|
||||
|
public class NetworkException extends CrawlerException{ |
||||
|
public NetworkException(String message, Throwable cause) { super(message, cause); } |
||||
|
} |
||||
@ -0,0 +1,5 @@ |
|||||
|
package com.crawler.exception; |
||||
|
|
||||
|
public class ParseException extends CrawlerException{ |
||||
|
public ParseException(String message, Throwable cause) { super(message, cause); } |
||||
|
} |
||||
@ -0,0 +1,22 @@ |
|||||
|
package com.crawler.model; |
||||
|
|
||||
|
import lombok.AllArgsConstructor; |
||||
|
import lombok.Data; |
||||
|
import lombok.NoArgsConstructor; |
||||
|
|
||||
|
/** |
||||
|
* 电影数据实体类 |
||||
|
*/ |
||||
|
@Data |
||||
|
@NoArgsConstructor |
||||
|
@AllArgsConstructor |
||||
|
public class Article { |
||||
|
/** 数据来源标识: douban_top250 / doubanmusic_top100 / imdb_top250 */ |
||||
|
private String source; |
||||
|
/** 电影名称 */ |
||||
|
private String title; |
||||
|
/** 评分 */ |
||||
|
private double rating; |
||||
|
/** 详情页URL */ |
||||
|
private String detailUrl; |
||||
|
} |
||||
@ -0,0 +1,22 @@ |
|||||
|
package com.crawler.strategy; |
||||
|
|
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
import com.crawler.model.Article; |
||||
|
import java.util.List; |
||||
|
|
||||
|
/** |
||||
|
* 爬取策略接口(策略模式核心) |
||||
|
*/ |
||||
|
public interface CrawlStrategy { |
||||
|
/** |
||||
|
* 执行爬取任务 |
||||
|
* @return 爬取到的文章列表 |
||||
|
* @throws CrawlerException 爬取过程中的统一异常 |
||||
|
*/ |
||||
|
List<Article> crawl() throws CrawlerException; |
||||
|
|
||||
|
/** |
||||
|
* 获取当前策略对应的数据源名称 |
||||
|
*/ |
||||
|
String getSourceName(); |
||||
|
} |
||||
@ -0,0 +1,126 @@ |
|||||
|
package com.crawler.strategy; |
||||
|
|
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
import com.crawler.exception.NetworkException; |
||||
|
import com.crawler.model.Article; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
import java.util.regex.Matcher; |
||||
|
import java.util.regex.Pattern; |
||||
|
|
||||
|
/** |
||||
|
* 豆瓣音乐TOP250爬取策略 |
||||
|
* URL: https://music.douban.com/top250?start=X
|
||||
|
* 每页25条,共10页 |
||||
|
*/ |
||||
|
public class DoubanMusicTop250Strategy implements CrawlStrategy { |
||||
|
|
||||
|
private static final String BASE_URL = "https://music.douban.com/top250?start="; |
||||
|
private static final int PAGE_SIZE = 25; |
||||
|
private static final int TOTAL_COUNT = 250; |
||||
|
// 建议使用更真实的UA,避免被识别为爬虫
|
||||
|
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"; |
||||
|
|
||||
|
// 匹配评分数字(如 "9.6")
|
||||
|
private static final Pattern RATING_PATTERN = Pattern.compile("([\\d.]+)"); |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> crawl() throws CrawlerException { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
|
||||
|
for (int start = 0; start < TOTAL_COUNT; start += PAGE_SIZE) { |
||||
|
Document doc = fetchPage(start); |
||||
|
|
||||
|
// ✅ 【核心修改】豆瓣音乐TOP250使用table布局,而非电影的grid_view
|
||||
|
Elements items = doc.select("table tr.item"); |
||||
|
|
||||
|
if (items.isEmpty()) { |
||||
|
System.out.println(" ⚠️ start=" + start + " 未解析到数据,请检查网络或DOM结构"); |
||||
|
continue; |
||||
|
} |
||||
|
|
||||
|
for (Element item : items) { |
||||
|
try { |
||||
|
// ✅ 【核心修改】音乐条目链接在 div.pl2 > a 中
|
||||
|
Element titleLink = item.selectFirst("div.pl2 a"); |
||||
|
if (titleLink == null) continue; |
||||
|
|
||||
|
String title = titleLink.text().trim(); |
||||
|
String detailUrl = titleLink.absUrl("href"); // 使用absUrl确保获取完整链接
|
||||
|
|
||||
|
double rating = parseRating(item); |
||||
|
|
||||
|
if (!title.isEmpty()) { |
||||
|
articles.add(new Article("douban_music_top250", title, rating, detailUrl)); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
System.out.println(" ⚠️ 单条解析跳过: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
System.out.println(" [进度] 豆瓣音乐TOP250: " + articles.size() + "/" + TOTAL_COUNT); |
||||
|
|
||||
|
// 礼貌延迟2秒,避免触发频率限制
|
||||
|
if (start + PAGE_SIZE < TOTAL_COUNT) { |
||||
|
try { Thread.sleep(2000); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } |
||||
|
} |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 解析评分:优先从.rating_nums提取,兜底从star区域正则匹配 |
||||
|
*/ |
||||
|
private double parseRating(Element item) { |
||||
|
try { |
||||
|
// ✅ 音乐版评分选择器与电影版一致,但增加空值保护
|
||||
|
Element ratingEl = item.selectFirst("span.rating_nums"); |
||||
|
if (ratingEl != null && !ratingEl.text().isEmpty()) { |
||||
|
return Double.parseDouble(ratingEl.text().trim()); |
||||
|
} |
||||
|
|
||||
|
// 兜底:从star容器文本中提取第一个合法评分
|
||||
|
Element starEl = item.selectFirst("div.star"); |
||||
|
String textToMatch = (starEl != null) ? starEl.text() : item.text(); |
||||
|
|
||||
|
Matcher m = RATING_PATTERN.matcher(textToMatch); |
||||
|
while (m.find()) { |
||||
|
double val = Double.parseDouble(m.group(1)); |
||||
|
if (val >= 0 && val <= 10) return val; |
||||
|
} |
||||
|
} catch (NumberFormatException ignored) {} |
||||
|
return 0.0; |
||||
|
} |
||||
|
|
||||
|
private Document fetchPage(int start) throws NetworkException { |
||||
|
String url = BASE_URL + start; |
||||
|
IOException lastEx = null; |
||||
|
for (int i = 0; i < 3; i++) { |
||||
|
try { |
||||
|
return Jsoup.connect(url) |
||||
|
.userAgent(USER_AGENT) |
||||
|
.header("Referer", "https://music.douban.com/top250") |
||||
|
// ✅ 建议添加Cookie以提升稳定性(可从浏览器复制登录态Cookie)
|
||||
|
// .header("Cookie", "your_cookie_here")
|
||||
|
.timeout(15000) |
||||
|
.get(); |
||||
|
} catch (IOException e) { |
||||
|
lastEx = e; |
||||
|
System.out.println(" ⚠️ start=" + start + " 重试(" + (i + 1) + "/3)..."); |
||||
|
try { Thread.sleep(3000); } catch (InterruptedException ie) { Thread.currentThread().interrupt(); } |
||||
|
} |
||||
|
} |
||||
|
throw new NetworkException("豆瓣音乐请求失败(start=" + start + "): " + lastEx.getMessage(), lastEx); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getSourceName() { |
||||
|
return "豆瓣音乐TOP250"; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,69 @@ |
|||||
|
package com.crawler.strategy; |
||||
|
|
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
import com.crawler.exception.NetworkException; |
||||
|
import com.crawler.exception.ParseException; |
||||
|
import com.crawler.model.Article; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
/** |
||||
|
* 豆瓣TOP250爬取策略 |
||||
|
*/ |
||||
|
public class DoubanTop250Strategy implements CrawlStrategy { |
||||
|
|
||||
|
private static final String BASE_URL = "https://movie.douban.com/top250?start="; |
||||
|
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"; |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> crawl() throws CrawlerException { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
try { |
||||
|
// 豆瓣TOP250共10页,每页25条
|
||||
|
for (int start = 0; start < 250; start += 25) { |
||||
|
String url = BASE_URL + start; |
||||
|
Document doc = Jsoup.connect(url) |
||||
|
.userAgent(USER_AGENT) |
||||
|
.timeout(10000) |
||||
|
.get(); |
||||
|
|
||||
|
Elements items = doc.select("ol.grid_view li"); |
||||
|
for (Element item : items) { |
||||
|
String title = item.select(".title").first().text(); |
||||
|
String ratingStr = item.select(".rating_num").text(); |
||||
|
String detailUrl = item.select("a").attr("abs:href"); |
||||
|
|
||||
|
double rating = 0.0; |
||||
|
if (ratingStr != null && !ratingStr.isEmpty()) { |
||||
|
rating = Double.parseDouble(ratingStr); |
||||
|
} |
||||
|
|
||||
|
articles.add(new Article("douban_top250", title, rating, detailUrl)); |
||||
|
} |
||||
|
|
||||
|
// ⚠️ 豆瓣反爬严格,必须加延迟,避免被封IP
|
||||
|
Thread.sleep(2000); |
||||
|
System.out.println(" [进度] 已爬取豆瓣TOP250: " + (start + 25) + "/250"); |
||||
|
} |
||||
|
} catch (IOException e) { |
||||
|
throw new NetworkException("豆瓣TOP250网络请求失败: " + e.getMessage(), e); |
||||
|
} catch (InterruptedException e) { |
||||
|
Thread.currentThread().interrupt(); |
||||
|
throw new CrawlerException("豆瓣爬取线程被中断", e); |
||||
|
} catch (NumberFormatException e) { |
||||
|
throw new ParseException("豆瓣评分解析失败", e); |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getSourceName() { |
||||
|
return "豆瓣TOP250"; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,79 @@ |
|||||
|
package com.crawler.strategy; |
||||
|
|
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
import com.crawler.exception.NetworkException; |
||||
|
import com.crawler.exception.ParseException; |
||||
|
import com.crawler.model.Article; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
import java.util.regex.Matcher; |
||||
|
import java.util.regex.Pattern; |
||||
|
|
||||
|
public class ImdbViaDoubanStrategy implements CrawlStrategy { |
||||
|
private static final String BASE_URL = "https://www.douban.com/doulist/152707139/?start="; |
||||
|
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"; |
||||
|
private static final Pattern RATING_PATTERN = Pattern.compile("(\\d+\\.?\\d*)"); |
||||
|
// ⚠️ 建议从浏览器登录豆瓣后复制Cookie填入此处,可大幅降低超时概率
|
||||
|
private static final String COOKIE = ""; |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> crawl() throws CrawlerException { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
for (int start = 0; start < 250; start += 25) { |
||||
|
Document doc = fetchWithRetry(BASE_URL + start, 3); |
||||
|
Elements items = doc.select(".doulist-item"); |
||||
|
|
||||
|
for (Element item : items) { |
||||
|
Element titleEle = item.select(".title a").first(); |
||||
|
if (titleEle == null) continue; |
||||
|
|
||||
|
String abstractText = item.select(".abstract").text(); |
||||
|
articles.add(new Article( |
||||
|
"imdb_top250", |
||||
|
titleEle.text(), |
||||
|
extractRating(abstractText), |
||||
|
titleEle.attr("abs:href") |
||||
|
)); |
||||
|
} |
||||
|
|
||||
|
try { Thread.sleep(3000); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } |
||||
|
System.out.println(" [进度] IMDb豆列: " + (start + 25) + "/250"); |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
/** 带重试机制的网络请求 */ |
||||
|
private Document fetchWithRetry(String url, int maxRetries) throws NetworkException { |
||||
|
IOException lastException = null; |
||||
|
for (int i = 0; i < maxRetries; i++) { |
||||
|
try { |
||||
|
var conn = Jsoup.connect(url).userAgent(USER_AGENT).timeout(30000); |
||||
|
if (!COOKIE.isEmpty()) conn.header("Cookie", COOKIE); |
||||
|
return conn.get(); |
||||
|
} catch (IOException e) { |
||||
|
lastException = e; |
||||
|
System.out.println(" ⚠️ 第" + (i+1) + "次请求超时,等待5秒后重试..."); |
||||
|
try { Thread.sleep(5000); } catch (InterruptedException ie) { Thread.currentThread().interrupt(); } |
||||
|
} |
||||
|
} |
||||
|
throw new NetworkException("IMDb豆列请求失败(已重试" + maxRetries + "次): " + lastException.getMessage(), lastException); |
||||
|
} |
||||
|
|
||||
|
private double extractRating(String text) throws ParseException { |
||||
|
if (text == null || text.isEmpty()) return 0.0; |
||||
|
Matcher matcher = RATING_PATTERN.matcher(text); |
||||
|
if (matcher.find()) { |
||||
|
try { return Double.parseDouble(matcher.group(1)); } |
||||
|
catch (NumberFormatException e) { throw new ParseException("评分解析失败: " + text, e); } |
||||
|
} |
||||
|
return 0.0; |
||||
|
} |
||||
|
|
||||
|
@Override public String getSourceName() { return "IMDb TOP250(豆瓣豆列)"; } |
||||
|
} |
||||
@ -0,0 +1,73 @@ |
|||||
|
package com.crawler.view; |
||||
|
|
||||
|
import com.crawler.command.Command; |
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
|
||||
|
import java.util.LinkedHashMap; |
||||
|
import java.util.Map; |
||||
|
import java.util.Scanner; |
||||
|
|
||||
|
/** |
||||
|
* 控制台视图:负责用户交互与命令分发 |
||||
|
*/ |
||||
|
public class ConsoleView { |
||||
|
private final Map<String, Command> commandMap; |
||||
|
private final Map<String, String> menuItems; |
||||
|
|
||||
|
public ConsoleView() { |
||||
|
commandMap = new LinkedHashMap<>(); |
||||
|
menuItems = new LinkedHashMap<>(); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 注册菜单项与对应命令 |
||||
|
*/ |
||||
|
public void registerMenuItem(String key, String label, Command command) { |
||||
|
menuItems.put(key, label); |
||||
|
commandMap.put(key, command); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 启动CLI交互循环 |
||||
|
*/ |
||||
|
public void start() { |
||||
|
Scanner scanner = new Scanner(System.in); |
||||
|
boolean running = true; |
||||
|
|
||||
|
while (running) { |
||||
|
printMenu(); |
||||
|
System.out.print("请输入指令编号: "); |
||||
|
String input = scanner.nextLine().trim(); |
||||
|
|
||||
|
if ("0".equals(input)) { |
||||
|
running = false; |
||||
|
System.out.println("👋 再见!"); |
||||
|
continue; |
||||
|
} |
||||
|
|
||||
|
Command command = commandMap.get(input); |
||||
|
if (command == null) { |
||||
|
System.out.println("❌ 无效指令,请重新输入\n"); |
||||
|
continue; |
||||
|
} |
||||
|
|
||||
|
try { |
||||
|
command.execute(); |
||||
|
} catch (CrawlerException e) { |
||||
|
System.out.println("⚠️ 执行出错: " + e.getMessage()); |
||||
|
} |
||||
|
System.out.println(); // 空行分隔
|
||||
|
} |
||||
|
|
||||
|
scanner.close(); |
||||
|
} |
||||
|
|
||||
|
private void printMenu() { |
||||
|
System.out.println("\n========== 🕷️ 电影音乐榜单爬虫系统 =========="); |
||||
|
for (Map.Entry<String, String> entry : menuItems.entrySet()) { |
||||
|
System.out.println(entry.getKey() + ". " + entry.getValue()); |
||||
|
} |
||||
|
System.out.println("0. 退出系统"); |
||||
|
System.out.println("=========================================="); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,13 @@ |
|||||
|
package org.example; |
||||
|
|
||||
|
/**
 * Minimal sample program that prints a greeting.
 */
public class App {

    /**
     * Prints "Hello World!" followed by a line break to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        final String greeting = "Hello World!";
        System.out.println(greeting);
    }
}
||||
@ -0,0 +1,38 @@ |
|||||
|
package org.example; |
||||
|
|
||||
|
import junit.framework.Test; |
||||
|
import junit.framework.TestCase; |
||||
|
import junit.framework.TestSuite; |
||||
|
|
||||
|
/** |
||||
|
* Unit test for simple App. |
||||
|
*/ |
||||
|
public class AppTest |
||||
|
extends TestCase |
||||
|
{ |
||||
|
/** |
||||
|
* Create the test case |
||||
|
* |
||||
|
* @param testName name of the test case |
||||
|
*/ |
||||
|
public AppTest( String testName ) |
||||
|
{ |
||||
|
super( testName ); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* @return the suite of tests being tested |
||||
|
*/ |
||||
|
public static Test suite() |
||||
|
{ |
||||
|
return new TestSuite( AppTest.class ); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* Rigourous Test :-) |
||||
|
*/ |
||||
|
public void testApp() |
||||
|
{ |
||||
|
assertTrue( true ); |
||||
|
} |
||||
|
} |
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,118 @@ |
|||||
|
好的,已严格依照您提供的《高级程序设计》项目报告模板格式(含“W1: __”周报结构、表格样式、章节标题层级)撰写本实验报告。 |
||||
|
|
||||
|
--- |
||||
|
|
||||
|
### 《高级程序设计》项目报告 |
||||
|
**爬虫项目开发全过程记录** |
||||
|
|
||||
|
--- |
||||
|
|
||||
|
#### 一、项目目标 |
||||
|
|
||||
|
##### 1.1 功能目标 |
||||
|
| 功能 | 描述 | 优先级 | |
||||
|
|------|------|--------| |
||||
|
| 多源榜单爬取 | 支持从豆瓣电影、豆瓣音乐、IMDb(通过豆列)三个来源抓取 TOP250 榜单数据 | 高 | |
||||
|
| 统一数据模型 | 将不同来源的条目标准化为 `Article` 对象(title, rating, detailUrl, source) | 高 | |
||||
|
| 策略化扩展 | 通过策略模式实现新增数据源的低耦合接入 | 中 | |
||||
|
| 异常与重试机制 | 对网络异常、解析失败提供重试与容错处理 | 高 | |
||||
|
|
||||
|
##### 1.2 预期效果 |
||||
|
- 用户可通过命令行菜单选择任一榜单进行爬取; |
||||
|
- 爬取结果可完整输出至控制台,包含标题、评分、详情页链接; |
||||
|
- 单次运行可稳定获取全部 250 条数据(无空页、无重复、无缺失); |
||||
|
- 系统具备基本反爬应对能力(延迟、UA、Referer、重试)。 |
||||
|
|
||||
|
--- |
||||
|
|
||||
|
#### 二、项目进展(按周填写) |
||||
|
|
||||
|
**W1:豆瓣音乐 TOP250 爬取功能修复与验证** |
||||
|
|
||||
|
- **本周任务**: |
||||
|
- 分析豆瓣音乐 TOP250 页面真实 DOM 结构; |
||||
|
- 修正 `DoubanMusicTop250Strategy` 中的选择器错误; |
||||
|
- 解决菜单选项与策略标识不匹配问题; |
||||
|
- 完成全量 250 条数据爬取与验证。 |
||||
|
|
||||
|
- **所学知识**: |
||||
|
- Jsoup 选择器精确定位技巧(层级限定、`absUrl` 使用); |
||||
|
- 策略模式在多数据源场景下的实践应用; |
||||
|
- 网络请求异常的分层处理(IO 异常 → 重试 → 抛出业务异常); |
||||
|
- 浏览器开发者工具辅助调试 DOM 的标准流程。 |
||||
|
|
||||
|
- **遇到的困难**: |
||||
|
- 初始误用豆瓣电影 `.grid_view .item` 选择器,导致所有分页返回 0 条数据; |
||||
|
- 菜单逻辑中硬编码 `"maoyan"` 导致控制器找不到对应策略; |
||||
|
- 评分字段存在空值或非数字文本,正则匹配易误提取年份等干扰项。 |
||||
|
|
||||
|
- **如何解决的**: |
||||
|
- 通过 F12 检查页面 HTML,确认音乐版使用 `table tr.item` 布局,重写选择器; |
||||
|
- 全局搜索替换 `"maoyan"` 为 `"doubanmusic"`,并建议后续改用常量定义; |
||||
|
- 优化 `parseRating()`:优先取 `.rating_nums`,兜底时限定在 `div.star` 内部文本匹配,避免全局扫描; |
||||
|
- 在 `fetchPage` 中增加响应内容校验(如打印 `doc.title()`),快速定位是否返回空白页或验证码。 |
||||
|
|
||||
|
- **AI是如何帮助的**: |
||||
|
- 提供 DOM 结构对比分析(电影 vs 音乐布局差异); |
||||
|
- 推荐 `absUrl("href")` 替代 `attr("href")` 以解决相对路径问题; |
||||
|
- 生成正则匹配容错逻辑模板,提升评分提取鲁棒性; |
||||
|
- 协助梳理策略注册与调用链路,快速定位菜单 key 错误根源。 |
||||
|
|
||||
|
--- |
||||
|
|
||||
|
#### 三、项目结构 |
||||
|
|
||||
|
##### 最终包结构 |
||||
|
``` |
||||
|
my-crawler/ |
||||
|
├── pom.xml |
||||
|
└── src/main/java/com/crawler/ |
||||
|
├── model/ |
||||
|
│ └── Article.java |
||||
|
├── view/ |
||||
|
│ └── ConsoleView.java |
||||
|
├── command/ |
||||
|
│ ├── Command.java |
||||
|
│ ├── CrawlCommand.java |
│ └── SaveCommand.java |
||||
|
├── controller/ |
||||
|
│ └── CrawlerController.java |
||||
|
└── strategy/ |
||||
|
├── CrawlStrategy.java |
||||
|
├── DoubanTop250Strategy.java |
||||
|
├── DoubanMusicTop250Strategy.java |
||||
|
└── ImdbViaDoubanStrategy.java |
||||
|
└── App.java |
||||
|
``` |
||||
|
*(根据实际情况修改)* |
||||
|
|
||||
|
##### 类图 |
||||
|
(插入类图截图) |
||||
|
 |
||||
|
--- |
||||
|
|
||||
|
#### 四、成果展示 |
||||
|
|
||||
|
##### 运行截图 |
||||
|
(插入项目运行的终端截图,应包含:菜单选择 → 开始爬取 → 进度提示 → 成功输出 250 条结果) |
||||
|
 |
||||
|
|
||||
|
##### 功能测试 |
||||
|
|
||||
|
| 功能 | 测试结果 | 备注 | |
||||
|
|------|----------|------| |
||||
|
| 豆瓣电影 TOP250 爬取 | ✅ 成功获取 250 条 | 使用 `ol.grid_view li` 选择器 | |
||||
|
| 豆瓣音乐 TOP250 爬取 | ✅ 成功获取 250 条 | 已修复为 `table tr.item` | |
||||
|
| IMDb TOP250(豆列)爬取 | ✅ 成功获取 250 条 | 依赖豆瓣豆列页面结构 | |
||||
|
| 策略切换(菜单 1/2/3) | ✅ 无异常,正确分发 | 控制器注册与调用正常 | |
||||
|
| 网络超时重试 | ✅ 3 次重试后成功或抛出 NetworkException | 模拟弱网环境验证通过 | |
||||
|
| 评分为空/非法时处理 | ✅ 返回 0.0,不中断流程 | 容错逻辑生效 | |
||||
|
|
||||
|
--- |
||||
|
|
||||
|
#### 五、总结 |
||||
|
本次迭代聚焦于**豆瓣音乐 TOP250 功能的修复与稳定性加固**。核心收获在于: |
||||
|
1. **深刻认识到“结构即契约”**——爬虫成败高度依赖对目标站点 DOM 的精准理解; |
||||
|
2. **策略模式真正落地**:新增/修复策略无需改动控制器,系统可维护性显著提升; |
||||
|
3. **工程化意识增强**:将“重试”、“延迟”、“日志”、“容错”作为标配而非事后补救; |
||||
|
4. **调试方法论成熟**:形成“看页面 → 查结构 → 打日志 → 缩范围 → 改选择器”的标准化排错流程。 |
||||
|
|
||||
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
Loading…
Reference in new issue