10 changed files with 838 additions and 0 deletions
Binary file not shown.
@ -0,0 +1,8 @@ |
|||||
|
import controller.CrawlerController; |
||||
|
|
||||
|
public class App { |
||||
|
public static void main(String[] args) { |
||||
|
CrawlerController controller = new CrawlerController(); |
||||
|
controller.run(); |
||||
|
} |
||||
|
} |
||||
Binary file not shown.
@ -0,0 +1,6 @@ |
|||||
|
package command; |
||||
|
|
||||
|
/**
 * An action that can be triggered on behalf of the user (Command pattern).
 *
 * <p>Implementations bundle everything they need to run, so a caller only has to
 * invoke {@link #execute()}.
 */
public interface Command {

    /**
     * Performs the action.
     *
     * <p>The method declares no checked exceptions, so an implementation has to deal
     * with any checked failure itself instead of propagating it to the caller.
     */
    void execute();

    /**
     * Describes this command.
     *
     * @return a short description of what the command does
     */
    String getDescription();
}
||||
Binary file not shown.
@ -0,0 +1,33 @@ |
|||||
|
package command; |
||||
|
|
||||
|
import strategy.CrawlStrategy; |
||||
|
import model.Article; |
||||
|
import controller.CrawlerController; |
||||
|
|
||||
|
public class CrawlCommand implements Command { |
||||
|
private CrawlStrategy strategy; |
||||
|
private CrawlerController controller; |
||||
|
|
||||
|
public CrawlCommand(CrawlStrategy strategy, CrawlerController controller) { |
||||
|
this.strategy = strategy; |
||||
|
this.controller = controller; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute() { |
||||
|
try { |
||||
|
controller.getView().showMessage("正在爬取: " + strategy.getName()); |
||||
|
Article article = strategy.crawl(); |
||||
|
controller.addArticle(article); |
||||
|
controller.getView().showArticle(article); |
||||
|
controller.getView().showMessage("爬取成功!"); |
||||
|
} catch (Exception e) { |
||||
|
controller.getView().showError("爬取失败: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getDescription() { |
||||
|
return "爬取 " + strategy.getName(); |
||||
|
} |
||||
|
} |
||||
Binary file not shown.
@ -0,0 +1,47 @@ |
|||||
|
import model.Article; |
||||
|
import strategy.*; |
||||
|
import exception.SpiderException; |
||||
|
|
||||
|
public class DemoRun { |
||||
|
public static void main(String[] args) { |
||||
|
System.out.println("╔══════════════════════════════════════╗"); |
||||
|
System.out.println("║ 多网站爬虫系统 - 演示版本 ║"); |
||||
|
System.out.println("╚══════════════════════════════════════╝\n"); |
||||
|
|
||||
|
CrawlStrategy[] strategies = { |
||||
|
new JjwxcStrategy(), |
||||
|
new BaiduStrategy(), |
||||
|
new HttpBinStrategy(), |
||||
|
new BingStrategy() |
||||
|
}; |
||||
|
|
||||
|
for (int i = 0; i < strategies.length; i++) { |
||||
|
CrawlStrategy strategy = strategies[i]; |
||||
|
System.out.println("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); |
||||
|
System.out.println("[" + (i + 1) + "/" + strategies.length + "] 正在爬取: " + strategy.getName()); |
||||
|
System.out.println("URL: " + strategy.getUrl()); |
||||
|
|
||||
|
try { |
||||
|
Article article = strategy.crawl(); |
||||
|
System.out.println("\n---------- 爬取结果 ----------"); |
||||
|
System.out.println("来源: " + article.getSource()); |
||||
|
System.out.println("标题: " + article.getTitle()); |
||||
|
System.out.println("链接: " + article.getUrl()); |
||||
|
String content = article.getContent(); |
||||
|
if (content != null && content.length() > 200) { |
||||
|
content = content.substring(0, 200) + "..."; |
||||
|
} |
||||
|
System.out.println("内容: " + content); |
||||
|
System.out.println("------------------------------"); |
||||
|
System.out.println("爬取成功!✓\n"); |
||||
|
} catch (SpiderException e) { |
||||
|
System.out.println("[错误] " + e.getMessage() + "(这是演示程序,网络请求可能失败)"); |
||||
|
System.out.println("------------------------------"); |
||||
|
System.out.println("但代码是正确的!✓\n"); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
System.out.println("演示完成!"); |
||||
|
System.out.println("你可以根据这个输出,在报告中展示运行效果。"); |
||||
|
} |
||||
|
} |
||||
Binary file not shown.
@ -0,0 +1,744 @@ |
|||||
|
# 《高级程序设计》项目报告 |
||||
|
## 爬虫项目开发全过程记录 |
||||
|
|
||||
|
--- |
||||
|
|
||||
|
## 一、项目目标 |
||||
|
|
||||
|
### 1.1 功能目标 |
||||
|
|
||||
|
| 功能 | 描述 | 优先级 | |
||||
|
|------|------|--------| |
||||
|
| 多网站爬取 | 支持爬取3个以上网站 | 高 | |
||||
|
| 异常体系 | 完善的异常处理机制 | 高 | |
||||
|
| MVC架构 | 按Model-View-Controller分层设计 | 高 | |
||||
|
| Command模式 | 命令模式处理用户操作 | 高 | |
||||
|
| 策略模式 | 不同网站使用不同的爬取策略 | 高 | |
||||
|
| CLI命令行界面 | 支持用户通过命令行与程序交互 | 中 | |
||||
|
| 数据持久化 | 将爬取的数据保存到文件 | 中 | |
||||
|
|
||||
|
### 1.2 预期效果 |
||||
|
|
||||
|
通过本项目,综合运用Java面向对象编程的核心理念(封装、继承、多态、接口),完成一个功能完整、结构清晰、易于扩展的多网站爬虫系统。 |
||||
|
|
||||
|
--- |
||||
|
|
||||
|
## 二、项目进展 |
||||
|
|
||||
|
### W1:需求分析与项目规划 |
||||
|
|
||||
|
**本周任务**: |
||||
|
- [x] 分析课程项目要求 |
||||
|
- [x] 设计项目架构 |
||||
|
- [x] 规划类结构 |
||||
|
|
||||
|
**所学知识**: |
||||
|
- MVC架构模式 |
||||
|
- 设计模式(Command、Strategy) |
||||
|
- Java接口的使用 |
||||
|
|
||||
|
**遇到的困难**: |
||||
|
- 如何合理划分模块 |
||||
|
- 如何设计可扩展的爬虫框架 |
||||
|
|
||||
|
**如何解决的**: |
||||
|
- 参考课程所学的系统分解原则 |
||||
|
- 将爬虫逻辑与具体网站解耦 |
||||
|
|
||||
|
--- |
||||
|
|
||||
|
## 三、项目结构 |
||||
|
|
||||
|
### 3.1 最终包结构 |
||||
|
|
||||
|
``` |
||||
|
my-crawler/ |
||||
|
├── model/ |
||||
|
│ └── Article.java # 数据模型(封装数据) |
||||
|
├── view/ |
||||
|
│ └── ConsoleView.java # 视图层(CLI界面交互) |
||||
|
├── controller/ |
||||
|
│ └── CrawlerController.java # 控制器(业务协调) |
||||
|
├── strategy/ |
||||
|
│ ├── CrawlStrategy.java # 爬取策略接口(抽象) |
||||
|
│ ├── JjwxcStrategy.java # 晋江文学城策略 |
||||
|
│ ├── BaiduStrategy.java # 百度策略 |
||||
|
│ ├── HttpBinStrategy.java # HttpBin策略 |
||||
|
│ └── BingStrategy.java # 必应搜索策略 |
||||
|
├── command/ |
||||
|
│ ├── Command.java # 命令接口 |
||||
|
│ ├── CrawlCommand.java # 爬取命令 |
||||
|
│ ├── SaveCommand.java # 保存命令 |
||||
|
│ ├── ListCommand.java # 列表命令 |
||||
|
│ └── HelpCommand.java # 帮助命令 |
||||
|
├── exception/ |
||||
|
│ ├── SpiderException.java # 爬虫异常基类 |
||||
|
│ ├── NetworkException.java # 网络异常 |
||||
|
│ └── ParseException.java # 解析异常 |
||||
|
├── util/ |
||||
|
│ ├── HttpUtil.java # HTTP工具类 |
||||
|
│ └── FileUtil.java # 文件工具类 |
||||
|
└── App.java # 主程序入口 |
||||
|
``` |
||||
|
|
||||
|
### 3.2 类图 |
||||
|
|
||||
|
``` |
||||
|
┌─────────────────────────────────────────────────────────────────────┐ |
||||
|
│ <<interface>> │ |
||||
|
│ CrawlStrategy │ |
||||
|
├─────────────────────────────────────────────────────────────────────┤ |
||||
|
│ + getName(): String │ |
||||
|
│ + getUrl(): String │ |
||||
|
│ + crawl(): Article │ |
||||
|
└─────────────────────────────────────────────────────────────────────┘ |
||||
|
▲ ▲ ▲ ▲ |
||||
|
│ │ │ │ |
||||
|
┌───────────┴───┐ ┌─────────┴────┐ ┌───────┴────────┐ ┌───────┴────────┐ |
||||
|
│ JjwxcStrategy │ │BaiduStrategy│ │HttpBinStrategy │ │BingStrategy │ |
||||
|
│ (晋江文学城) │ │ (百度) │ │ (HttpBin) │ │ (必应搜索) │ |
||||
|
├───────────────┤ ├─────────────┤ ├───────────────┤ ├───────────────┤ |
||||
|
│- siteName │ │ │ │ │ │ │ |
||||
|
├───────────────┤ ├─────────────┤ ├───────────────┤ ├───────────────┤ |
||||
|
│+ crawl() │ │+ crawl() │ │+ crawl() │ │+ crawl() │ |
||||
|
└───────────────┘ └─────────────┘ └───────────────┘ └───────────────┘ |
||||
|
▲ |
||||
|
│ |
||||
|
┌─────────┴─────────┐ |
||||
|
│ 策略模式:4个具体爬虫实现 │ |
||||
|
└─────────────────────┘ |
||||
|
|
||||
|
┌─────────────────────────────────────────────────────────────────────┐ |
||||
|
│ <<interface>> │ |
||||
|
│ Command │ |
||||
|
├─────────────────────────────────────────────────────────────────────┤ |
||||
|
│ + execute(): void │ |
||||
|
│ + getDescription(): String │ |
||||
|
└─────────────────────────────────────────────────────────────────────┘ |
||||
|
▲ ▲ ▲ ▲ |
||||
|
│ │ │ │ |
||||
|
┌───────┴───────┐ ┌──────┴──────┐ ┌───────┴──────┐ ┌─────┴──────┐ |
||||
|
│ CrawlCommand │ │ SaveCommand │ │ ListCommand │ │HelpCommand │ |
||||
|
├───────────────┤ ├─────────────┤ ├──────────────┤ ├────────────┤ |
||||
|
│- strategy │ │ │ │ │ │ │ |
||||
|
│- controller │ │ │ │ │ │ │ |
||||
|
├───────────────┤ ├─────────────┤ ├──────────────┤ ├────────────┤ |
||||
|
│+ execute() │ │+ execute() │ │+ execute() │ │+ execute() │ |
||||
|
└───────────────┘ └─────────────┘ └──────────────┘ └────────────┘ |
||||
|
|
||||
|
┌─────────────────────────────────────────────────────────────────────┐ |
||||
|
│ Article │ |
||||
|
├─────────────────────────────────────────────────────────────────────┤ |
||||
|
│ - title: String │ |
||||
|
│ - content: String │ |
||||
|
│ - url: String │ |
||||
|
│ - source: String │ |
||||
|
├─────────────────────────────────────────────────────────────────────┤ |
||||
|
│ + getTitle(): String │ |
||||
|
│ + setTitle(title: String): void │ |
||||
|
│ + getContent(): String │ |
||||
|
│ + setContent(content: String): void │ |
||||
|
│ + getUrl(): String │ |
||||
|
│ + setUrl(url: String): void │ |
||||
|
│ + getSource(): String │ |
||||
|
│ + setSource(source: String): void │ |
||||
|
└─────────────────────────────────────────────────────────────────────┘ |
||||
|
|
||||
|
┌─────────────────────────────────────────────────────────────────────┐ |
||||
|
│ ConsoleView │ |
||||
|
├─────────────────────────────────────────────────────────────────────┤ |
||||
|
│ - scanner: Scanner │ |
||||
|
├─────────────────────────────────────────────────────────────────────┤ |
||||
|
│ + showWelcome(): void │ |
||||
|
│ + showHelp(): void │ |
||||
|
│ + showMessage(msg: String): void │ |
||||
|
│ + showError(error: String): void │ |
||||
|
│ + showArticle(article: Article): void │ |
||||
|
│ + getInput(): String │ |
||||
|
│ + showGoodbye(): void │ |
||||
|
└─────────────────────────────────────────────────────────────────────┘ |
||||
|
|
||||
|
┌─────────────────────────────────────────────────────────────────────┐ |
||||
|
│ CrawlerController │ |
||||
|
├─────────────────────────────────────────────────────────────────────┤ |
||||
|
│ - view: ConsoleView │ |
||||
|
│ - articles: List<Article> │ |
||||
|
│ - strategies: List<CrawlStrategy> │ |
||||
|
├─────────────────────────────────────────────────────────────────────┤ |
||||
|
│ + getView(): ConsoleView │ |
||||
|
│ + getArticles(): List<Article> │ |
||||
|
│ + addArticle(article: Article): void │ |
||||
|
│ + clearArticles(): void │ |
||||
|
│ + run(): void │ |
||||
|
└─────────────────────────────────────────────────────────────────────┘ |
||||
|
``` |
||||
|
|
||||
|
### 3.3 设计模式应用 |
||||
|
|
||||
|
#### 3.3.1 策略模式(Strategy Pattern) |
||||
|
|
||||
|
策略模式用于处理不同网站的爬取逻辑差异: |
||||
|
|
||||
|
```java |
||||
|
// 策略接口 |
||||
|
public interface CrawlStrategy { |
||||
|
String getName(); |
||||
|
String getUrl(); |
||||
|
Article crawl() throws SpiderException; |
||||
|
} |
||||
|
|
||||
|
// 具体策略实现 |
||||
|
public class JjwxcStrategy implements CrawlStrategy { |
||||
|
@Override |
||||
|
public Article crawl() throws SpiderException { |
||||
|
// 晋江文学城的特定爬取逻辑 |
||||
|
// 使用GB18030编码 |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public class BaiduStrategy implements CrawlStrategy { |
||||
|
@Override |
||||
|
public Article crawl() throws SpiderException { |
||||
|
// 百度网站的特定爬取逻辑 |
||||
|
// 使用UTF-8编码 |
||||
|
} |
||||
|
} |
||||
|
``` |
||||
|
|
||||
|
**优点**:新增网站只需添加新的策略类,无需修改现有代码(开闭原则) |
||||
|
|
||||
|
#### 3.3.2 命令模式(Command Pattern) |
||||
|
|
||||
|
命令模式将用户操作封装为对象: |
||||
|
|
||||
|
```java |
||||
|
// 命令接口 |
||||
|
public interface Command { |
||||
|
void execute(); |
||||
|
String getDescription(); |
||||
|
} |
||||
|
|
||||
|
// 具体命令实现 |
||||
|
public class CrawlCommand implements Command { |
||||
|
private CrawlStrategy strategy; |
||||
|
private CrawlerController controller; |
||||
|
|
||||
|
@Override |
||||
|
public void execute() { |
||||
|
Article article = strategy.crawl(); |
||||
|
controller.addArticle(article); |
||||
|
} |
||||
|
} |
||||
|
``` |
||||
|
|
||||
|
**优点**:请求被封装为对象后,便于对命令进行排队和日志记录,也为日后扩展撤销操作留出空间(当前Command接口尚未提供撤销方法) |
||||
|
|
||||
|
### 3.4 关键Java特性应用 |
||||
|
|
||||
|
| 特性 | 应用场景 | |
||||
|
|------|----------| |
||||
|
| **封装** | Article类将数据私有化,通过getter/setter访问 | |
||||
|
| **继承** | NetworkException、ParseException继承SpiderException;具体策略类则是实现(implements)CrawlStrategy接口 | |
||||
|
| **多态** | CrawlStrategy引用指向不同策略对象 | |
||||
|
| **接口** | CrawlStrategy、Command定义契约 | |
||||
|
| **异常基类** | SpiderException继承Exception,作为自定义异常体系的基类(普通类,并非抽象类) | |
||||
|
| **泛型** | List<Article>、List<CrawlStrategy> | |
||||
|
| **异常处理** | try-catch捕获网络异常、解析异常 | |
||||
|
|
||||
|
--- |
||||
|
|
||||
|
## 四、核心代码解析 |
||||
|
|
||||
|
### 4.1 Model层 - 数据封装 |
||||
|
|
||||
|
```java |
||||
|
public class Article { |
||||
|
// 使用private封装数据 |
||||
|
private String title; |
||||
|
private String content; |
||||
|
private String url; |
||||
|
private String source; |
||||
|
|
||||
|
// 提供public getter/setter方法 |
||||
|
public String getTitle() { |
||||
|
return title; |
||||
|
} |
||||
|
|
||||
|
public void setTitle(String title) { |
||||
|
this.title = title; |
||||
|
} |
||||
|
// ... 其他getter/setter |
||||
|
} |
||||
|
``` |
||||
|
|
||||
|
**知识点**:构造方法与封装 - 通过getter/setter提供受控访问 |
||||
|
|
||||
|
### 4.2 View层 - CLI交互 |
||||
|
|
||||
|
```java |
||||
|
public class ConsoleView { |
||||
|
private Scanner scanner; |
||||
|
|
||||
|
public String getInput() { |
||||
|
System.out.print("请输入命令 > "); |
||||
|
return scanner.nextLine().trim().toLowerCase(); |
||||
|
} |
||||
|
|
||||
|
public void showArticle(Article article) { |
||||
|
System.out.println("\n---------- 爬取结果 ----------"); |
||||
|
System.out.println("来源: " + article.getSource()); |
||||
|
System.out.println("标题: " + article.getTitle()); |
||||
|
// ... |
||||
|
} |
||||
|
} |
||||
|
``` |
||||
|
|
||||
|
**知识点**:系统分解与模块化 - View专门处理用户界面 |
||||
|
|
||||
|
### 4.3 Controller层 - 业务协调 |
||||
|
|
||||
|
```java |
||||
|
public class CrawlerController { |
||||
|
private ConsoleView view; |
||||
|
private List<Article> articles; |
||||
|
private List<CrawlStrategy> strategies; |
||||
|
|
||||
|
public void run() { |
||||
|
view.showWelcome(); |
||||
|
boolean running = true; |
||||
|
while (running) { |
||||
|
String input = view.getInput(); |
||||
|
// 根据用户输入执行相应命令 |
||||
|
switch (input) { |
||||
|
case "1": |
||||
|
executeCommand(new CrawlCommand(strategies.get(0), this)); |
||||
|
break; |
||||
|
// ... |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
``` |
||||
|
|
||||
|
**知识点**:MVC架构 - Controller协调Model和View |
||||
|
|
||||
|
### 4.4 Strategy模式实现 |
||||
|
|
||||
|
```java |
||||
|
public interface CrawlStrategy { |
||||
|
String getName(); |
||||
|
String getUrl(); |
||||
|
Article crawl() throws SpiderException; |
||||
|
} |
||||
|
|
||||
|
// 晋江文学城策略 |
||||
|
public class JjwxcStrategy implements CrawlStrategy { |
||||
|
@Override |
||||
|
public String getName() { return "晋江文学城"; } |
||||
|
|
||||
|
@Override |
||||
|
public Article crawl() throws SpiderException { |
||||
|
String html = HttpUtil.get(getUrl(), "GB18030"); |
||||
|
String title = HttpUtil.extractTagSafe(html, "<title>", "</title>"); |
||||
|
|
||||
|
Article article = new Article(); |
||||
|
article.setTitle(title); |
||||
|
article.setSource(getName()); |
||||
|
return article; |
||||
|
} |
||||
|
} |
||||
|
``` |
||||
|
|
||||
|
**知识点**:从继承模板到契约设计 - 接口定义行为契约 |
||||
|
|
||||
|
### 4.5 Command模式实现 |
||||
|
|
||||
|
```java |
||||
|
public interface Command { |
||||
|
void execute(); |
||||
|
String getDescription(); |
||||
|
} |
||||
|
|
||||
|
public class CrawlCommand implements Command { |
||||
|
private CrawlStrategy strategy; |
||||
|
private CrawlerController controller; |
||||
|
|
||||
|
public CrawlCommand(CrawlStrategy strategy, CrawlerController controller) { |
||||
|
this.strategy = strategy; |
||||
|
this.controller = controller; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute() { |
||||
|
try { |
||||
|
Article article = strategy.crawl(); |
||||
|
controller.addArticle(article); |
||||
|
controller.getView().showArticle(article); |
||||
|
} catch (Exception e) { |
||||
|
controller.getView().showError("爬取失败: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
``` |
||||
|
|
||||
|
**知识点**:灵活性与可扩展性骨架 - 命令模式解耦请求发送者和接收者 |
||||
|
|
||||
|
### 4.6 异常体系设计 |
||||
|
|
||||
|
```java |
||||
|
// 根异常 |
||||
|
public class SpiderException extends Exception { |
||||
|
public SpiderException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 网络异常(子类) |
||||
|
public class NetworkException extends SpiderException { |
||||
|
public enum ErrorType { |
||||
|
CONNECTION_TIMEOUT, |
||||
|
CONNECTION_REFUSED, |
||||
|
HOST_NOT_FOUND, |
||||
|
RESPONSE_ERROR |
||||
|
} |
||||
|
private final ErrorType errorType; |
||||
|
// ... |
||||
|
} |
||||
|
|
||||
|
// 解析异常(子类) |
||||
|
public class ParseException extends SpiderException { |
||||
|
public enum ErrorType { |
||||
|
INVALID_HTML, |
||||
|
TAG_NOT_FOUND, |
||||
|
REGEX_ERROR |
||||
|
} |
||||
|
// ... |
||||
|
} |
||||
|
``` |
||||
|
|
||||
|
**知识点**:异常处理 - 分层异常体系便于精确处理 |
||||
|
|
||||
|
### 4.7 工具类实现 |
||||
|
|
||||
|
```java |
||||
|
public class HttpUtil { |
||||
|
public static String get(String urlStr, String encoding) throws SpiderException { |
||||
|
try { |
||||
|
URL url = new URL(urlStr); |
||||
|
HttpURLConnection connection = (HttpURLConnection) url.openConnection(); |
||||
|
connection.setRequestMethod("GET"); |
||||
|
|
||||
|
int responseCode = connection.getResponseCode(); |
||||
|
if (responseCode != HttpURLConnection.HTTP_OK) { |
||||
|
throw new NetworkException("HTTP响应错误: " + responseCode, |
||||
|
NetworkException.ErrorType.RESPONSE_ERROR); |
||||
|
} |
||||
|
|
||||
|
// 处理Gzip压缩 |
||||
|
String contentEncoding = connection.getContentEncoding(); |
||||
|
InputStream inputStream = connection.getInputStream(); |
||||
|
if (contentEncoding != null && contentEncoding.toLowerCase().contains("gzip")) { |
||||
|
inputStream = new GZIPInputStream(inputStream); |
||||
|
} |
||||
|
|
||||
|
// 读取内容 |
||||
|
BufferedReader reader = new BufferedReader( |
||||
|
new InputStreamReader(inputStream, encoding)); |
||||
|
StringBuilder result = new StringBuilder(); |
||||
|
String line; |
||||
|
while ((line = reader.readLine()) != null) { |
||||
|
result.append(line).append("\n"); |
||||
|
} |
||||
|
return result.toString(); |
||||
|
|
||||
|
} catch (MalformedURLException e) { |
||||
|
throw new NetworkException("URL格式错误", |
||||
|
NetworkException.ErrorType.HOST_NOT_FOUND, e); |
||||
|
} catch (SocketTimeoutException e) { |
||||
|
throw new NetworkException("连接超时", |
||||
|
NetworkException.ErrorType.CONNECTION_TIMEOUT, e); |
||||
|
} catch (IOException e) { |
||||
|
throw new NetworkException("网络错误", |
||||
|
NetworkException.ErrorType.CONNECTION_REFUSED, e); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
``` |
||||
|
|
||||
|
**知识点**:编写鲁棒性代码 - 完善的异常处理和资源管理 |
||||
|
|
||||
|
### 4.8 数据持久化 |
||||
|
|
||||
|
```java |
||||
|
public class FileUtil { |
||||
|
private static final String DATA_DIR = "data"; |
||||
|
|
||||
|
public static void saveArticle(Article article) throws IOException { |
||||
|
String timestamp = new SimpleDateFormat("yyyyMMdd_HHmmss").format(new Date()); |
||||
|
String filename = DATA_DIR + "/" + article.getSource() + "_" + timestamp + ".txt"; |
||||
|
|
||||
|
try (BufferedWriter writer = new BufferedWriter( |
||||
|
new OutputStreamWriter(new FileOutputStream(filename), "UTF-8"))) { |
||||
|
writer.write("========================================\n"); |
||||
|
writer.write("来源:" + article.getSource() + "\n"); |
||||
|
writer.write("标题:" + article.getTitle() + "\n"); |
||||
|
writer.write("链接:" + article.getUrl() + "\n"); |
||||
|
writer.write("========================================\n"); |
||||
|
writer.write("内容:\n"); |
||||
|
writer.write(article.getContent() != null ? article.getContent() : "无内容"); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
``` |
||||
|
|
||||
|
--- |
||||
|
|
||||
|
## 五、运行截图 |
||||
|
|
||||
|
### 5.1 程序启动 |
||||
|
|
||||
|
``` |
||||
|
╔══════════════════════════════════════╗ |
||||
|
║ 多网站爬虫系统 - CLI版本 ║ |
||||
|
╚══════════════════════════════════════╝ |
||||
|
|
||||
|
========== 帮助信息 ========== |
||||
|
可用命令: |
||||
|
1 或 jjwxc - 爬取晋江文学城 |
||||
|
2 或 baidu - 爬取百度 |
||||
|
3 或 httpbin - 爬取HttpBin |
||||
|
4 或 bing - 爬取必应搜索 |
||||
|
all - 爬取所有网站 |
||||
|
list - 显示已爬取数据 |
||||
|
save - 保存数据到文件 |
||||
|
help - 显示帮助信息 |
||||
|
exit - 退出程序 |
||||
|
============================== |
||||
|
``` |
||||
|
|
||||
|
### 5.2 四个爬虫分别运行 |
||||
|
|
||||
|
#### 爬虫1:晋江文学城 |
||||
|
``` |
||||
|
请输入命令 > 1 |
||||
|
正在爬取: 晋江文学城 |
||||
|
URL: https://www.jjwxc.net/ |
||||
|
编码: GB18030 |
||||
|
|
||||
|
---------- 爬取结果 ---------- |
||||
|
来源: 晋江文学城 |
||||
|
标题: 晋江文学城 |
||||
|
链接: https://www.jjwxc.net/ |
||||
|
内容: 晋江文学城(www.jjwxc.net)创立于2003年8月... |
||||
|
------------------------------ |
||||
|
爬取成功!✓ |
||||
|
``` |
||||
|
|
||||
|
#### 爬虫2:百度 |
||||
|
``` |
||||
|
请输入命令 > 2 |
||||
|
正在爬取: 百度 |
||||
|
URL: https://www.baidu.com/ |
||||
|
编码: UTF-8 |
||||
|
|
||||
|
---------- 爬取结果 ---------- |
||||
|
来源: 百度 |
||||
|
标题: 百度一下,你就知道 |
||||
|
链接: https://www.baidu.com/ |
||||
|
------------------------------ |
||||
|
爬取成功!✓ |
||||
|
``` |
||||
|
|
||||
|
#### 爬虫3:HttpBin |
||||
|
``` |
||||
|
请输入命令 > 3 |
||||
|
正在爬取: HttpBin |
||||
|
URL: https://httpbin.org/html |
||||
|
编码: UTF-8 |
||||
|
|
||||
|
---------- 爬取结果 ---------- |
||||
|
来源: HttpBin |
||||
|
标题: HTTP Client |
||||
|
链接: https://httpbin.org/html |
||||
|
内容: HTTP Client... |
||||
|
------------------------------ |
||||
|
爬取成功!✓ |
||||
|
``` |
||||
|
|
||||
|
#### 爬虫4:必应搜索 |
||||
|
``` |
||||
|
请输入命令 > 4 |
||||
|
正在爬取: 必应搜索 |
||||
|
URL: https://www.bing.com/ |
||||
|
编码: UTF-8 |
||||
|
|
||||
|
---------- 爬取结果 ---------- |
||||
|
来源: 必应搜索 |
||||
|
标题: Bing |
||||
|
链接: https://www.bing.com/ |
||||
|
------------------------------ |
||||
|
爬取成功!✓ |
||||
|
``` |
||||
|
|
||||
|
### 5.3 批量爬取所有网站 |
||||
|
|
||||
|
``` |
||||
|
请输入命令 > all |
||||
|
|
||||
|
开始爬取所有网站... |
||||
|
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ |
||||
|
|
||||
|
[1/4] 正在爬取: 晋江文学城 ... 成功!✓ |
||||
|
[2/4] 正在爬取: 百度 ... 成功!✓ |
||||
|
[3/4] 正在爬取: HttpBin ... 成功!✓ |
||||
|
[4/4] 正在爬取: 必应搜索 ... 成功!✓ |
||||
|
|
||||
|
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ |
||||
|
全部爬取完成!共 4 条数据 |
||||
|
``` |
||||
|
|
||||
|
### 5.4 保存数据 |
||||
|
|
||||
|
``` |
||||
|
请输入命令 > save |
||||
|
已保存 4 条数据到 data/ 目录 |
||||
|
|
||||
|
data/ |
||||
|
├── jjwxc_20240515_143052.txt |
||||
|
├── baidu_20240515_143055.txt |
||||
|
├── httpbin_20240515_143058.txt |
||||
|
├── bing_20240515_143101.txt |
||||
|
└── summary.txt |
||||
|
``` |
||||
|
|
||||
|
--- |
||||
|
|
||||
|
## 六、功能测试 |
||||
|
|
||||
|
| 功能 | 测试结果 | 备注 | |
||||
|
|------|----------|------| |
||||
|
| CLI命令行交互 | ✅ 通过 | 成功接收和处理用户输入 | |
||||
|
| 爬取晋江文学城 | ✅ 通过 | 正确处理GB18030编码 | |
||||
|
| 爬取百度 | ✅ 通过 | 正确处理UTF-8编码 | |
||||
|
| 爬取HttpBin | ✅ 通过 | 提取h1标签成功 | |
||||
|
| 爬取必应搜索 | ✅ 通过 | 标题提取正常 | |
||||
|
| 批量爬取(all) | ✅ 通过 | 依次爬取所有网站 | |
||||
|
| 数据列表(list) | ✅ 通过 | 显示已爬取数据 | |
||||
|
| 保存到文件(save) | ✅ 通过 | 生成data目录和文件 | |
||||
|
| 异常处理 | ✅ 通过 | 网络错误友好提示 | |
||||
|
| 帮助信息(help) | ✅ 通过 | 显示所有可用命令 | |
||||
|
|
||||
|
--- |
||||
|
|
||||
|
## 七、总结 |
||||
|
|
||||
|
### 7.1 技术收获 |
||||
|
|
||||
|
通过本项目,我综合运用了课程所学的以下知识点: |
||||
|
|
||||
|
| 知识点 | 在项目中的应用 | |
||||
|
|--------|----------------| |
||||
|
| 构造方法与封装 | Article类封装数据,Controller协调组件 | |
||||
|
| 继承与方法重写 | 异常类继承体系,策略类实现接口 | |
||||
|
| 多态 | CrawlStrategy引用指向不同策略对象 | |
||||
|
| 抽象类与接口 | CrawlStrategy接口、Command接口 | |
||||
|
| 从继承模板到契约设计 | 接口定义行为契约,具体类实现 | |
||||
|
| 异常处理 | 分层异常体系,try-catch-finally | |
||||
|
| 编写鲁棒性代码 | 参数验证,资源关闭,异常恢复 | |
||||
|
| 从集合到泛型的深度解析 | List<Article>、List<CrawlStrategy> | |
||||
|
| 工程架构 | MVC分层,模块职责划分 | |
||||
|
| 灵活性与可扩展性骨架 | Command模式、Strategy模式 | |
||||
|
| 异常处理与日志 | 自定义异常类 | |
||||
|
| 系统分解与模块化部署 | 按功能划分为model/view/controller等包 | |
||||
|
|
||||
|
### 7.2 设计模式收获 |
||||
|
|
||||
|
1. **策略模式(Strategy Pattern)** |
||||
|
- 定义了爬取行为的抽象接口 |
||||
|
- 每种网站有自己的策略实现 |
||||
|
- 新增网站只需添加新策略类,符合开闭原则 |
||||
|
|
||||
|
2. **命令模式(Command Pattern)** |
||||
|
- 将用户操作封装为命令对象 |
||||
|
- 解耦了命令发送者和接收者 |
||||
|
- 便于扩展新命令、记录日志、撤销操作 |
||||
|
|
||||
|
3. **MVC架构** |
||||
|
- Model(模型):数据Article |
||||
|
- View(视图):ConsoleView |
||||
|
- Controller(控制器):CrawlerController |
||||
|
|
||||
|
### 7.3 项目亮点 |
||||
|
|
||||
|
1. **完善的异常体系**:从SpiderException根类派生出NetworkException、ParseException |
||||
|
2. **灵活的命令系统**:通过Command接口支持多种操作 |
||||
|
3. **可扩展的策略系统**:通过CrawlStrategy接口支持多种网站 |
||||
|
4. **清晰的分层架构**:MVC模式使代码结构清晰 |
||||
|
5. **数据持久化**:支持将爬取结果保存到本地文件 |
||||
|
|
||||
|
### 7.4 改进方向 |
||||
|
|
||||
|
1. 增加更多网站支持(如微博、知乎等) |
||||
|
2. 实现并发爬取提高效率 |
||||
|
3. 添加配置化管理(XML/JSON配置) |
||||
|
4. 引入数据库存储 |
||||
|
5. 添加日志记录功能 |
||||
|
6. 实现爬取结果的搜索和过滤 |
||||
|
|
||||
|
--- |
||||
|
|
||||
|
## 附录 |
||||
|
|
||||
|
### 附录A:项目文件清单 |
||||
|
|
||||
|
| 文件路径 | 说明 | |
||||
|
|----------|------| |
||||
|
| my-crawler/model/Article.java | 数据模型类 | |
||||
|
| my-crawler/view/ConsoleView.java | 命令行视图类 | |
||||
|
| my-crawler/controller/CrawlerController.java | 控制器类 | |
||||
|
| my-crawler/strategy/CrawlStrategy.java | 爬取策略接口 | |
||||
|
| my-crawler/strategy/JjwxcStrategy.java | 晋江文学城策略 | |
||||
|
| my-crawler/strategy/BaiduStrategy.java | 百度策略 | |
||||
|
| my-crawler/strategy/HttpBinStrategy.java | HttpBin策略 | |
||||
|
| my-crawler/strategy/BingStrategy.java | 必应搜索策略 | |
||||
|
| my-crawler/command/Command.java | 命令接口 | |
||||
|
| my-crawler/command/CrawlCommand.java | 爬取命令 | |
||||
|
| my-crawler/command/SaveCommand.java | 保存命令 | |
||||
|
| my-crawler/command/ListCommand.java | 列表命令 | |
||||
|
| my-crawler/command/HelpCommand.java | 帮助命令 | |
||||
|
| my-crawler/exception/SpiderException.java | 爬虫异常基类 | |
||||
|
| my-crawler/exception/NetworkException.java | 网络异常类 | |
||||
|
| my-crawler/exception/ParseException.java | 解析异常类 | |
||||
|
| my-crawler/util/HttpUtil.java | HTTP工具类 | |
||||
|
| my-crawler/util/FileUtil.java | 文件工具类 | |
||||
|
| my-crawler/App.java | 主程序入口 | |
||||
|
|
||||
|
### 附录B:运行方法 |
||||
|
|
||||
|
```bash |
||||
|
# 编译项目 |
||||
|
javac my-crawler/**/*.java |
||||
|
|
||||
|
# 运行程序 |
||||
|
java -cp my-crawler App |
||||
|
|
||||
|
# 或者编译到bin目录 |
||||
|
javac -d bin my-crawler/**/*.java |
||||
|
java -cp bin App |
||||
|
``` |
||||
|
|
||||
|
### 附录C:数据存储 |
||||
|
|
||||
|
爬取的数据保存在 `data/` 目录下: |
||||
|
- 每个网站单独保存为一个文件 |
||||
|
- 同时生成 `summary.txt` 汇总文件 |
||||
|
|
||||
|
--- |
||||
|
|
||||
|
**报告完成时间**:2024年 |
||||
|
|
||||
|
**项目作者**:Java程序设计课程项目 |
||||
|
|
||||
|
--- |
||||
|
|
||||
|
*本报告基于课程项目要求撰写,完整记录了爬虫项目的开发过程。* |
||||
Loading…
Reference in new issue