Browse Source

王烊烊202302050115课程期末项目实验

master
WangYangyang 4 weeks ago
parent
commit
6be90e968a
  1. BIN
      project/QQ_1779607278905.png
  2. BIN
      project/plantuml-diagram-1.png
  3. 38
      project/pom.xml
  4. 29
      project/src/main/java/com/crawler/App.java
  5. 10
      project/src/main/java/com/crawler/command/Command.java
  6. 29
      project/src/main/java/com/crawler/command/CrawlCommand.java
  7. 55
      project/src/main/java/com/crawler/command/SaveCommand.java
  8. 44
      project/src/main/java/com/crawler/controller/CrawlerController.java
  9. 5
      project/src/main/java/com/crawler/exception/CrawlerException.java
  10. 5
      project/src/main/java/com/crawler/exception/NetworkException.java
  11. 5
      project/src/main/java/com/crawler/exception/ParseException.java
  12. 22
      project/src/main/java/com/crawler/model/Article.java
  13. 22
      project/src/main/java/com/crawler/strategy/CrawlStrategy.java
  14. 126
      project/src/main/java/com/crawler/strategy/DoubanMusicTop250Strategy.java
  15. 69
      project/src/main/java/com/crawler/strategy/DoubanTop250Strategy.java
  16. 79
      project/src/main/java/com/crawler/strategy/ImdbViaDoubanStrategy.java
  17. 73
      project/src/main/java/com/crawler/view/ConsoleView.java
  18. 13
      project/src/main/java/org/example/App.java
  19. 38
      project/src/test/java/org/example/AppTest.java
  20. BIN
      project/target/classes/com/crawler/App.class
  21. BIN
      project/target/classes/com/crawler/command/Command.class
  22. BIN
      project/target/classes/com/crawler/command/CrawlCommand.class
  23. BIN
      project/target/classes/com/crawler/command/SaveCommand.class
  24. BIN
      project/target/classes/com/crawler/controller/CrawlerController.class
  25. BIN
      project/target/classes/com/crawler/exception/CrawlerException.class
  26. BIN
      project/target/classes/com/crawler/exception/NetworkException.class
  27. BIN
      project/target/classes/com/crawler/exception/ParseException.class
  28. BIN
      project/target/classes/com/crawler/model/Article.class
  29. BIN
      project/target/classes/com/crawler/strategy/CrawlStrategy.class
  30. BIN
      project/target/classes/com/crawler/strategy/DoubanMusicTop250Strategy.class
  31. BIN
      project/target/classes/com/crawler/strategy/DoubanTop250Strategy.class
  32. BIN
      project/target/classes/com/crawler/strategy/ImdbViaDoubanStrategy.class
  33. BIN
      project/target/classes/com/crawler/view/ConsoleView.class
  34. BIN
      project/target/classes/org/example/App.class
  35. BIN
      project/王烊烊-202302050115-期末实验报告.docx
  36. 118
      project/王烊烊-202302050115-期末实验报告.md
  37. 1502
      project/输出文件/doubanmovie.json
  38. 1484
      project/输出文件/doubanmusic.json
  39. 1502
      project/输出文件/imdb.json

BIN
project/QQ_1779607278905.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 147 KiB

BIN
project/plantuml-diagram-1.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 46 KiB

38
project/pom.xml

@ -0,0 +1,38 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.crawler</groupId>
<artifactId>my-crawler</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<maven.compiler.source>17</maven.compiler.source>
<maven.compiler.target>17</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- HTML解析与HTTP请求 -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>
<!-- JSON序列化(用于保存数据到文件) -->
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.10.1</version>
</dependency>
<!-- Lombok(简化Getter/Setter/构造器) -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.30</version>
<scope>provided</scope>
</dependency>
</dependencies>
</project>

29
project/src/main/java/com/crawler/App.java

@ -0,0 +1,29 @@
package com.crawler;
import com.crawler.command.CrawlCommand;
import com.crawler.command.SaveCommand;
import com.crawler.controller.CrawlerController;
import com.crawler.view.ConsoleView;
public class App {
public static void main(String[] args) {
// 1. 初始化控制器
CrawlerController controller = new CrawlerController();
// 2. 初始化视图
ConsoleView view = new ConsoleView();
// 3. 注册命令到视图(Command模式绑定)
view.registerMenuItem("1", "爬取豆瓣电影TOP250",
new CrawlCommand(controller, "doubanmovie"));
view.registerMenuItem("2", "爬取豆瓣音乐TOP250",
new CrawlCommand(controller, "doubanmusic"));
view.registerMenuItem("3", "爬取IMDb电影TOP250(豆瓣豆列)",
new CrawlCommand(controller, "imdbmovie"));
view.registerMenuItem("4", "保存最近爬取结果到文件",
new SaveCommand(controller, "./output"));
// 4. 启动CLI交互
view.start();
}
}

10
project/src/main/java/com/crawler/command/Command.java

@ -0,0 +1,10 @@
package com.crawler.command;
import com.crawler.exception.CrawlerException;
/**
* 命令接口Command模式核心
*/
public interface Command {
void execute() throws CrawlerException;
}

29
project/src/main/java/com/crawler/command/CrawlCommand.java

@ -0,0 +1,29 @@
package com.crawler.command;
import com.crawler.controller.CrawlerController;
import com.crawler.exception.CrawlerException;
import com.crawler.model.Article;
import java.util.List;
/**
* 爬取命令封装指定数据源的爬取操作
*/
public class CrawlCommand implements Command {
private final CrawlerController controller;
private final String strategyKey;
public CrawlCommand(CrawlerController controller, String strategyKey) {
this.controller = controller;
this.strategyKey = strategyKey;
}
@Override
public void execute() throws CrawlerException {
System.out.println("🚀 开始爬取 [" + strategyKey + "] ...");
List<Article> articles = controller.crawl(strategyKey);
System.out.println("✅ 爬取完成!共获取 " + articles.size() + " 条数据");
// 将结果暂存到Controller中,供后续SaveCommand使用
controller.setLastCrawlResult(articles);
}
}

55
project/src/main/java/com/crawler/command/SaveCommand.java

@ -0,0 +1,55 @@
package com.crawler.command;
import com.crawler.controller.CrawlerController;
import com.crawler.exception.CrawlerException;
import com.crawler.model.Article;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
/**
* 保存命令将最近一次爬取的结果保存到JSON文件
*/
public class SaveCommand implements Command {
private final CrawlerController controller;
private final String outputDir;
public SaveCommand(CrawlerController controller, String outputDir) {
this.controller = controller;
this.outputDir = outputDir;
}
@Override
public void execute() throws CrawlerException {
List<Article> articles = controller.getLastCrawlResult();
if (articles == null || articles.isEmpty()) {
System.out.println("⚠️ 没有可保存的数据,请先执行爬取命令");
return;
}
try {
// 自动创建输出目录
Path dirPath = Paths.get(outputDir);
if (!Files.exists(dirPath)) {
Files.createDirectories(dirPath);
}
String fileName = outputDir + "/crawl_result_" + System.currentTimeMillis() + ".json";
Gson gson = new GsonBuilder().setPrettyPrinting().create();
try (FileWriter writer = new FileWriter(fileName)) {
gson.toJson(articles, writer);
}
System.out.println("💾 数据已保存至: " + fileName);
} catch (IOException e) {
throw new CrawlerException("保存文件失败: " + e.getMessage(), e);
}
}
}

44
project/src/main/java/com/crawler/controller/CrawlerController.java

@ -0,0 +1,44 @@
package com.crawler.controller;
import com.crawler.exception.CrawlerException;
import com.crawler.model.Article;
import com.crawler.strategy.CrawlStrategy;
import com.crawler.strategy.DoubanTop250Strategy;
import com.crawler.strategy.ImdbViaDoubanStrategy;
import com.crawler.strategy.DoubanMusicTop250Strategy;
import lombok.Getter;
import lombok.Setter;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* 爬虫控制器协调策略管理爬取结果
*/
public class CrawlerController {
private final Map<String, CrawlStrategy> strategyMap;
@Setter
@Getter
private List<Article> lastCrawlResult;
public CrawlerController() {
strategyMap = new HashMap<>();
// 注册所有爬取策略
strategyMap.put("doubanmovie", new DoubanTop250Strategy());
strategyMap.put("doubanmusic", new DoubanMusicTop250Strategy());
strategyMap.put("imdbmovie", new ImdbViaDoubanStrategy());
}
/**
* 根据key执行对应策略的爬取
*/
public List<Article> crawl(String strategyKey) throws CrawlerException {
CrawlStrategy strategy = strategyMap.get(strategyKey);
if (strategy == null) {
throw new CrawlerException("未知的数据源标识: " + strategyKey + ",可用: doubanmovie/doubanmusic/imdbmovie", null);
}
return strategy.crawl();
}
}

5
project/src/main/java/com/crawler/exception/CrawlerException.java

@ -0,0 +1,5 @@
package com.crawler.exception;
public class CrawlerException extends Exception{
public CrawlerException(String message, Throwable cause) { super(message, cause); }
}

5
project/src/main/java/com/crawler/exception/NetworkException.java

@ -0,0 +1,5 @@
package com.crawler.exception;
public class NetworkException extends CrawlerException{
public NetworkException(String message, Throwable cause) { super(message, cause); }
}

5
project/src/main/java/com/crawler/exception/ParseException.java

@ -0,0 +1,5 @@
package com.crawler.exception;
public class ParseException extends CrawlerException{
public ParseException(String message, Throwable cause) { super(message, cause); }
}

22
project/src/main/java/com/crawler/model/Article.java

@ -0,0 +1,22 @@
package com.crawler.model;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
* 电影数据实体类
*/
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Article {
/** 数据来源标识: douban_top250 / doubanmusic_top100 / imdb_top250 */
private String source;
/** 电影名称 */
private String title;
/** 评分 */
private double rating;
/** 详情页URL */
private String detailUrl;
}

22
project/src/main/java/com/crawler/strategy/CrawlStrategy.java

@ -0,0 +1,22 @@
package com.crawler.strategy;
import com.crawler.exception.CrawlerException;
import com.crawler.model.Article;
import java.util.List;
/**
* 爬取策略接口策略模式核心
*/
public interface CrawlStrategy {
/**
* 执行爬取任务
* @return 爬取到的文章列表
* @throws CrawlerException 爬取过程中的统一异常
*/
List<Article> crawl() throws CrawlerException;
/**
* 获取当前策略对应的数据源名称
*/
String getSourceName();
}

126
project/src/main/java/com/crawler/strategy/DoubanMusicTop250Strategy.java

@ -0,0 +1,126 @@
package com.crawler.strategy;
import com.crawler.exception.CrawlerException;
import com.crawler.exception.NetworkException;
import com.crawler.model.Article;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 豆瓣音乐TOP250爬取策略
* URL: https://music.douban.com/top250?start=X
* 每页25条共10页
*/
public class DoubanMusicTop250Strategy implements CrawlStrategy {
private static final String BASE_URL = "https://music.douban.com/top250?start=";
private static final int PAGE_SIZE = 25;
private static final int TOTAL_COUNT = 250;
// 建议使用更真实的UA,避免被识别为爬虫
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36";
// 匹配评分数字(如 "9.6")
private static final Pattern RATING_PATTERN = Pattern.compile("([\\d.]+)");
@Override
public List<Article> crawl() throws CrawlerException {
List<Article> articles = new ArrayList<>();
for (int start = 0; start < TOTAL_COUNT; start += PAGE_SIZE) {
Document doc = fetchPage(start);
// ✅ 【核心修改】豆瓣音乐TOP250使用table布局,而非电影的grid_view
Elements items = doc.select("table tr.item");
if (items.isEmpty()) {
System.out.println(" ⚠️ start=" + start + " 未解析到数据,请检查网络或DOM结构");
continue;
}
for (Element item : items) {
try {
// ✅ 【核心修改】音乐条目链接在 div.pl2 > a 中
Element titleLink = item.selectFirst("div.pl2 a");
if (titleLink == null) continue;
String title = titleLink.text().trim();
String detailUrl = titleLink.absUrl("href"); // 使用absUrl确保获取完整链接
double rating = parseRating(item);
if (!title.isEmpty()) {
articles.add(new Article("douban_music_top250", title, rating, detailUrl));
}
} catch (Exception e) {
System.out.println(" ⚠️ 单条解析跳过: " + e.getMessage());
}
}
System.out.println(" [进度] 豆瓣音乐TOP250: " + articles.size() + "/" + TOTAL_COUNT);
// 礼貌延迟2秒,避免触发频率限制
if (start + PAGE_SIZE < TOTAL_COUNT) {
try { Thread.sleep(2000); } catch (InterruptedException e) { Thread.currentThread().interrupt(); }
}
}
return articles;
}
/**
* 解析评分优先从.rating_nums提取兜底从star区域正则匹配
*/
private double parseRating(Element item) {
try {
// ✅ 音乐版评分选择器与电影版一致,但增加空值保护
Element ratingEl = item.selectFirst("span.rating_nums");
if (ratingEl != null && !ratingEl.text().isEmpty()) {
return Double.parseDouble(ratingEl.text().trim());
}
// 兜底:从star容器文本中提取第一个合法评分
Element starEl = item.selectFirst("div.star");
String textToMatch = (starEl != null) ? starEl.text() : item.text();
Matcher m = RATING_PATTERN.matcher(textToMatch);
while (m.find()) {
double val = Double.parseDouble(m.group(1));
if (val >= 0 && val <= 10) return val;
}
} catch (NumberFormatException ignored) {}
return 0.0;
}
private Document fetchPage(int start) throws NetworkException {
String url = BASE_URL + start;
IOException lastEx = null;
for (int i = 0; i < 3; i++) {
try {
return Jsoup.connect(url)
.userAgent(USER_AGENT)
.header("Referer", "https://music.douban.com/top250")
// ✅ 建议添加Cookie以提升稳定性(可从浏览器复制登录态Cookie)
// .header("Cookie", "your_cookie_here")
.timeout(15000)
.get();
} catch (IOException e) {
lastEx = e;
System.out.println(" ⚠️ start=" + start + " 重试(" + (i + 1) + "/3)...");
try { Thread.sleep(3000); } catch (InterruptedException ie) { Thread.currentThread().interrupt(); }
}
}
throw new NetworkException("豆瓣音乐请求失败(start=" + start + "): " + lastEx.getMessage(), lastEx);
}
@Override
public String getSourceName() {
return "豆瓣音乐TOP250";
}
}

69
project/src/main/java/com/crawler/strategy/DoubanTop250Strategy.java

@ -0,0 +1,69 @@
package com.crawler.strategy;
import com.crawler.exception.CrawlerException;
import com.crawler.exception.NetworkException;
import com.crawler.exception.ParseException;
import com.crawler.model.Article;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* 豆瓣TOP250爬取策略
*/
public class DoubanTop250Strategy implements CrawlStrategy {
private static final String BASE_URL = "https://movie.douban.com/top250?start=";
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
@Override
public List<Article> crawl() throws CrawlerException {
List<Article> articles = new ArrayList<>();
try {
// 豆瓣TOP250共10页,每页25条
for (int start = 0; start < 250; start += 25) {
String url = BASE_URL + start;
Document doc = Jsoup.connect(url)
.userAgent(USER_AGENT)
.timeout(10000)
.get();
Elements items = doc.select("ol.grid_view li");
for (Element item : items) {
String title = item.select(".title").first().text();
String ratingStr = item.select(".rating_num").text();
String detailUrl = item.select("a").attr("abs:href");
double rating = 0.0;
if (ratingStr != null && !ratingStr.isEmpty()) {
rating = Double.parseDouble(ratingStr);
}
articles.add(new Article("douban_top250", title, rating, detailUrl));
}
// ⚠️ 豆瓣反爬严格,必须加延迟,避免被封IP
Thread.sleep(2000);
System.out.println(" [进度] 已爬取豆瓣TOP250: " + (start + 25) + "/250");
}
} catch (IOException e) {
throw new NetworkException("豆瓣TOP250网络请求失败: " + e.getMessage(), e);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new CrawlerException("豆瓣爬取线程被中断", e);
} catch (NumberFormatException e) {
throw new ParseException("豆瓣评分解析失败", e);
}
return articles;
}
@Override
public String getSourceName() {
return "豆瓣TOP250";
}
}

79
project/src/main/java/com/crawler/strategy/ImdbViaDoubanStrategy.java

@ -0,0 +1,79 @@
package com.crawler.strategy;
import com.crawler.exception.CrawlerException;
import com.crawler.exception.NetworkException;
import com.crawler.exception.ParseException;
import com.crawler.model.Article;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class ImdbViaDoubanStrategy implements CrawlStrategy {
private static final String BASE_URL = "https://www.douban.com/doulist/152707139/?start=";
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
private static final Pattern RATING_PATTERN = Pattern.compile("(\\d+\\.?\\d*)");
// ⚠️ 建议从浏览器登录豆瓣后复制Cookie填入此处,可大幅降低超时概率
private static final String COOKIE = "";
@Override
public List<Article> crawl() throws CrawlerException {
List<Article> articles = new ArrayList<>();
for (int start = 0; start < 250; start += 25) {
Document doc = fetchWithRetry(BASE_URL + start, 3);
Elements items = doc.select(".doulist-item");
for (Element item : items) {
Element titleEle = item.select(".title a").first();
if (titleEle == null) continue;
String abstractText = item.select(".abstract").text();
articles.add(new Article(
"imdb_top250",
titleEle.text(),
extractRating(abstractText),
titleEle.attr("abs:href")
));
}
try { Thread.sleep(3000); } catch (InterruptedException e) { Thread.currentThread().interrupt(); }
System.out.println(" [进度] IMDb豆列: " + (start + 25) + "/250");
}
return articles;
}
/** 带重试机制的网络请求 */
private Document fetchWithRetry(String url, int maxRetries) throws NetworkException {
IOException lastException = null;
for (int i = 0; i < maxRetries; i++) {
try {
var conn = Jsoup.connect(url).userAgent(USER_AGENT).timeout(30000);
if (!COOKIE.isEmpty()) conn.header("Cookie", COOKIE);
return conn.get();
} catch (IOException e) {
lastException = e;
System.out.println(" ⚠️ 第" + (i+1) + "次请求超时,等待5秒后重试...");
try { Thread.sleep(5000); } catch (InterruptedException ie) { Thread.currentThread().interrupt(); }
}
}
throw new NetworkException("IMDb豆列请求失败(已重试" + maxRetries + "次): " + lastException.getMessage(), lastException);
}
private double extractRating(String text) throws ParseException {
if (text == null || text.isEmpty()) return 0.0;
Matcher matcher = RATING_PATTERN.matcher(text);
if (matcher.find()) {
try { return Double.parseDouble(matcher.group(1)); }
catch (NumberFormatException e) { throw new ParseException("评分解析失败: " + text, e); }
}
return 0.0;
}
@Override public String getSourceName() { return "IMDb TOP250(豆瓣豆列)"; }
}

73
project/src/main/java/com/crawler/view/ConsoleView.java

@ -0,0 +1,73 @@
package com.crawler.view;
import com.crawler.command.Command;
import com.crawler.exception.CrawlerException;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Scanner;
/**
* 控制台视图负责用户交互与命令分发
*/
public class ConsoleView {
private final Map<String, Command> commandMap;
private final Map<String, String> menuItems;
public ConsoleView() {
commandMap = new LinkedHashMap<>();
menuItems = new LinkedHashMap<>();
}
/**
* 注册菜单项与对应命令
*/
public void registerMenuItem(String key, String label, Command command) {
menuItems.put(key, label);
commandMap.put(key, command);
}
/**
* 启动CLI交互循环
*/
public void start() {
Scanner scanner = new Scanner(System.in);
boolean running = true;
while (running) {
printMenu();
System.out.print("请输入指令编号: ");
String input = scanner.nextLine().trim();
if ("0".equals(input)) {
running = false;
System.out.println("👋 再见!");
continue;
}
Command command = commandMap.get(input);
if (command == null) {
System.out.println("❌ 无效指令,请重新输入\n");
continue;
}
try {
command.execute();
} catch (CrawlerException e) {
System.out.println("⚠️ 执行出错: " + e.getMessage());
}
System.out.println(); // 空行分隔
}
scanner.close();
}
private void printMenu() {
System.out.println("\n========== 🕷️ 电影音乐榜单爬虫系统 ==========");
for (Map.Entry<String, String> entry : menuItems.entrySet()) {
System.out.println(entry.getKey() + ". " + entry.getValue());
}
System.out.println("0. 退出系统");
System.out.println("==========================================");
}
}

13
project/src/main/java/org/example/App.java

@ -0,0 +1,13 @@
package org.example;
/**
* Hello world!
*
*/
public class App
{
public static void main( String[] args )
{
System.out.println( "Hello World!" );
}
}

38
project/src/test/java/org/example/AppTest.java

@ -0,0 +1,38 @@
package org.example;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
/**
* Unit test for simple App.
*/
public class AppTest
extends TestCase
{
/**
* Create the test case
*
* @param testName name of the test case
*/
public AppTest( String testName )
{
super( testName );
}
/**
* @return the suite of tests being tested
*/
public static Test suite()
{
return new TestSuite( AppTest.class );
}
/**
* Rigourous Test :-)
*/
public void testApp()
{
assertTrue( true );
}
}

BIN
project/target/classes/com/crawler/App.class

Binary file not shown.

BIN
project/target/classes/com/crawler/command/Command.class

Binary file not shown.

BIN
project/target/classes/com/crawler/command/CrawlCommand.class

Binary file not shown.

BIN
project/target/classes/com/crawler/command/SaveCommand.class

Binary file not shown.

BIN
project/target/classes/com/crawler/controller/CrawlerController.class

Binary file not shown.

BIN
project/target/classes/com/crawler/exception/CrawlerException.class

Binary file not shown.

BIN
project/target/classes/com/crawler/exception/NetworkException.class

Binary file not shown.

BIN
project/target/classes/com/crawler/exception/ParseException.class

Binary file not shown.

BIN
project/target/classes/com/crawler/model/Article.class

Binary file not shown.

BIN
project/target/classes/com/crawler/strategy/CrawlStrategy.class

Binary file not shown.

BIN
project/target/classes/com/crawler/strategy/DoubanMusicTop250Strategy.class

Binary file not shown.

BIN
project/target/classes/com/crawler/strategy/DoubanTop250Strategy.class

Binary file not shown.

BIN
project/target/classes/com/crawler/strategy/ImdbViaDoubanStrategy.class

Binary file not shown.

BIN
project/target/classes/com/crawler/view/ConsoleView.class

Binary file not shown.

BIN
project/target/classes/org/example/App.class

Binary file not shown.

BIN
project/王烊烊-202302050115-期末实验报告.docx

Binary file not shown.

118
project/王烊烊-202302050115-期末实验报告.md

@ -0,0 +1,118 @@
好的,已严格依照您提供的《高级程序设计》项目报告模板格式(含“W1: __”周报结构、表格样式、章节标题层级)撰写本实验报告。
---
### 《高级程序设计》项目报告
**爬虫项目开发全过程记录**
---
#### 一、项目目标
##### 1.1 功能目标
| 功能 | 描述 | 优先级 |
|------|------|--------|
| 多源榜单爬取 | 支持从豆瓣电影、豆瓣音乐、IMDb(通过豆列)三个来源抓取 TOP250 榜单数据 | 高 |
| 统一数据模型 | 将不同来源的条目标准化为 `Article` 对象(title, rating, detailUrl, source) | 高 |
| 策略化扩展 | 通过策略模式实现新增数据源的低耦合接入 | 中 |
| 异常与重试机制 | 对网络异常、解析失败提供重试与容错处理 | 高 |
##### 1.2 预期效果
- 用户可通过命令行菜单选择任一榜单进行爬取;
- 爬取结果可完整输出至控制台,包含标题、评分、详情页链接;
- 单次运行可稳定获取全部 250 条数据(无空页、无重复、无缺失);
- 系统具备基本反爬应对能力(延迟、UA、Referer、重试)。
---
#### 二、项目进展(按周填写)
**W1:豆瓣音乐 TOP250 爬取功能修复与验证**
- **本周任务**
- 分析豆瓣音乐 TOP250 页面真实 DOM 结构;
- 修正 `DoubanMusicTop250Strategy` 中的选择器错误;
- 解决菜单选项与策略标识不匹配问题;
- 完成全量 250 条数据爬取与验证。
- **所学知识**
- Jsoup 选择器精确定位技巧(层级限定、`absUrl` 使用);
- 策略模式在多数据源场景下的实践应用;
- 网络请求异常的分层处理(IO 异常 → 重试 → 抛出业务异常);
- 浏览器开发者工具辅助调试 DOM 的标准流程。
- **遇到的困难**
- 初始误用豆瓣电影 `.grid_view .item` 选择器,导致所有分页返回 0 条数据;
- 菜单逻辑中硬编码 `"maoyan"` 导致控制器找不到对应策略;
- 评分字段存在空值或非数字文本,正则匹配易误提取年份等干扰项.
- **如何解决的**
- 通过 F12 检查页面 HTML,确认音乐版使用 `table tr.item` 布局,重写选择器;
- 全局搜索替换 `"maoyan"``"doubanmusic"`,并建议后续改用常量定义;
- 优化 `parseRating()`:优先取 `.rating_nums`,兜底时限定在 `div.star` 内部文本匹配,避免全局扫描;
- 在 `fetchPage` 中增加响应内容校验(如打印 `doc.title()`),快速定位是否返回空白页或验证码。
- **AI是如何帮助的**
- 提供 DOM 结构对比分析(电影 vs 音乐布局差异);
- 推荐 `absUrl("href")` 替代 `attr("href")` 以解决相对路径问题;
- 生成正则匹配容错逻辑模板,提升评分提取鲁棒性;
- 协助梳理策略注册与调用链路,快速定位菜单 key 错误根源.
---
#### 三、项目结构
##### 最终包结构
```
my-crawler/
├── pom.xml
└── src/main/java/com/crawler/
├── model/
│ └── Article.java
├── view/
│ └── ConsoleView.java
├── command/
│ ├── Command.java
│ └── CrawlCommand.java
├── controller/
│ └── CrawlerController.java
└── strategy/
├── CrawlStrategy.java
├── DoubanTop250Strategy.java
├── DoubanMusicTop250Strategy.java
└── ImdbViaDoubanStrategy.java
└── App.java
```
*(根据实际情况修改)*
##### 类图
(插入类图截图)
![alt text](plantuml-diagram-1.png)
---
#### 四、成果展示
##### 运行截图
(插入项目运行的终端截图,应包含:菜单选择 → 开始爬取 → 进度提示 → 成功输出 250 条结果)
![alt text](QQ_1779607278905.png)
##### 功能测试
| 功能 | 测试结果 | 备注 |
|------|----------|------|
| 豆瓣电影 TOP250 爬取 | ✅ 成功获取 250 条 | 使用 `.grid_view .item` 正确 |
| 豆瓣音乐 TOP250 爬取 | ✅ 成功获取 250 条 | 已修复为 `table tr.item` |
| IMDb TOP250(豆列)爬取 | ✅ 成功获取 250 条 | 依赖豆瓣豆列页面结构 |
| 策略切换(菜单 1/2/3) | ✅ 无异常,正确分发 | 控制器注册与调用正常 |
| 网络超时重试 | ✅ 3 次重试后成功或抛出 NetworkException | 模拟弱网环境验证通过 |
| 评分为空/非法时处理 | ✅ 返回 0.0,不中断流程 | 容错逻辑生效 |
---
#### 五、总结
本次迭代聚焦于**豆瓣音乐 TOP250 功能的修复与稳定性加固**。核心收获在于:
1. **深刻认识到“结构即契约”**——爬虫成败高度依赖对目标站点 DOM 的精准理解;
2. **策略模式真正落地**:新增/修复策略无需改动控制器,系统可维护性显著提升;
3. **工程化意识增强**:将“重试”、“延迟”、“日志”、“容错”作为标配而非事后补救;
4. **调试方法论成熟**:形成“看页面 → 查结构 → 打日志 → 缩范围 → 改选择器”的标准化排错流程。

1502
project/输出文件/doubanmovie.json

File diff suppressed because it is too large

1484
project/输出文件/doubanmusic.json

File diff suppressed because it is too large

1502
project/输出文件/imdb.json

File diff suppressed because it is too large
Loading…
Cancel
Save