From 23cb3b7f0b2686806656174f65c89b1a4ff76cff Mon Sep 17 00:00:00 2001 From: HuangZhikai <386754646@qq.com> Date: Sun, 31 May 2026 14:55:55 +0800 Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=96=87=E4=BB=B6=E8=87=B3?= =?UTF-8?q?=20'project'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- project/README.md | 358 +++++++++++++++++++++++++++++++++++++++++++ project/pom.xml | 79 ++++++++++ project/temp_doc.zip | Bin 0 -> 6172 bytes project/test.html | 23 +++ 4 files changed, 460 insertions(+) create mode 100644 project/README.md create mode 100644 project/pom.xml create mode 100644 project/temp_doc.zip create mode 100644 project/test.html diff --git a/project/README.md b/project/README.md new file mode 100644 index 0000000..f5ecc7f --- /dev/null +++ b/project/README.md @@ -0,0 +1,358 @@ +# Java爬虫框架 + +基于MVC架构的Java爬虫框架,支持多态扩展,可轻松添加新的网站爬虫。 + +## 功能特性 + +- **MVC架构**:清晰的分层设计,职责分明 +- **多态扩展**:通过继承BaseCrawler实现新爬虫 +- **命令行界面**:支持交互式命令操作 +- **自动识别**:根据URL自动选择合适的爬虫 +- **日期提取**:支持从URL中提取发布日期 + +## 支持的网站 + +| 网站 | 域名 | 爬虫名称 | +|------|------|----------| +| 湖南大学官网 | `*.hnu.edu.cn` | HunanUniversityCrawler | +| 湖南大学新闻网 | `news.hnu.edu.cn` | HunanUniversityNewsCrawler | +| 中国天气网 | `*.weather.com.cn` | ChinaWeatherCrawler | +| 骑砍中文站 | `*.mountblade.com.cn` | MountBladeCrawler | + +## 快速开始 + +### 编译项目 + +```bash +javac -d target/classes src/main/java/com/crawler/**/*.java +``` + +### 运行程序 + +```bash +java -cp target/classes com.crawler.Main +``` + +### 命令行使用 + +``` +======================================== +Java爬虫框架 +======================================== + +======================================== +Java爬虫框架 - 命令行模式 +======================================== +输入 'help' 查看可用指令 +======================================== +> help +可用指令: +--------- +help : 显示所有可用指令 +list : 查看使用过的指令历史 +crawl : 运行爬虫,输入URL自动选择爬虫 +exit : 退出程序 + +> crawl +请输入要爬取的URL: https://www.mountblade.com.cn +使用爬虫: MountBladeCrawler +... 
+``` + +## 项目结构 + +``` +src/main/java/com/crawler/ +├── Main.java # 主入口 +├── model/ +│ ├── CrawlerData.java # 爬取数据模型(标题、链接、来源、发布日期) +│ └── CrawlerConfig.java # 爬虫配置(超时时间、User-Agent) +├── view/ +│ └── CrawlerView.java # 视图层(输出结果展示) +├── controller/ +│ └── CrawlerController.java # 爬虫控制器 +├── crawler/ +│ ├── Crawler.java # 爬虫接口 +│ ├── BaseCrawler.java # 爬虫抽象基类 +│ ├── CrawlerFactory.java # 爬虫工厂(自动选择爬虫) +│ └── impl/ +│ ├── ExampleCrawler.java # 通用爬虫 +│ ├── TestCrawler.java # 测试爬虫 +│ ├── HunanUniversityCrawler.java +│ ├── HunanUniversityNewsCrawler.java +│ ├── ChinaWeatherCrawler.java +│ └── MountBladeCrawler.java +└── command/ + ├── Command.java # 命令接口 + ├── BaseCommand.java # 命令抽象基类 + ├── CommandHistory.java # 命令历史记录 + ├── HelpCommand.java # 帮助命令 + ├── ListCommand.java # 历史记录命令 + ├── CrawlCommand.java # 爬虫命令 + ├── ExitCommand.java # 退出命令 + └── CommandController.java # 命令控制器 +``` + +## 扩展新爬虫 + +只需继承 `BaseCrawler` 并重写两个方法: + +```java +package com.crawler.crawler.impl; + +import com.crawler.crawler.BaseCrawler; +import com.crawler.model.CrawlerData; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class MyWebsiteCrawler extends BaseCrawler { + private static final String BASE_URL = "https://www.mywebsite.com"; + + @Override + public String getCrawlerName() { + return "MyWebsiteCrawler"; + } + + @Override + protected List<CrawlerData> parseHtml(String html) { + List<CrawlerData> results = new ArrayList<>(); + + // 使用正则表达式解析HTML + Pattern pattern = Pattern.compile("<a[^>]*href=\"([^\"]+)\"[^>]*>([^<]+)</a>"); + Matcher matcher = pattern.matcher(html); + + while (matcher.find()) { + CrawlerData data = new CrawlerData(); + data.setTitle(matcher.group(2)); + data.setUrl(normalizeUrl(matcher.group(1))); + data.setSource(getCrawlerName()); + data.setPublishDate(extractDateFromUrl(matcher.group(1))); + results.add(data); + } + + return results; + } + + private String normalizeUrl(String url) { + if (url.startsWith("/")) { + return BASE_URL + url; + } + 
return url; + } + + private String extractDateFromUrl(String url) { + Pattern datePattern = Pattern.compile("/(\\d{4}-\\d{2}-\\d{2})/"); + Matcher matcher = datePattern.matcher(url); + return matcher.find() ? matcher.group(1) : null; + } +} +``` + +然后在 `CrawlerFactory.java` 中添加识别规则: + +```java +crawlerPatterns.put("MyWebsiteCrawler", + Pattern.compile(".*mywebsite\\.com.*", Pattern.CASE_INSENSITIVE)); +``` + +并在 `createCrawlerByName` 方法中添加: + +```java +case "MyWebsiteCrawler": + return new MyWebsiteCrawler(); +``` + +## 架构设计 + +### MVC模式 + +- **Model**:`CrawlerData`(数据模型)、`CrawlerConfig`(配置) +- **View**:`CrawlerView`(结果展示) +- **Controller**:`CrawlerController`(爬虫控制)、`CommandController`(命令控制) + +### 多态设计 + +- `Crawler` 接口定义标准方法 +- `BaseCrawler` 提供通用HTTP请求能力 +- 各爬虫实现类继承 `BaseCrawler` 并重写 `parseHtml` 方法 + +### 工厂模式 + +`CrawlerFactory` 根据URL模式自动选择合适的爬虫实现。 + +## 配置说明 + +`CrawlerConfig` 支持以下配置: + +- `timeout`:HTTP请求超时时间(默认30000毫秒) +- `userAgent`:User-Agent(默认模拟Chrome浏览器) + +## 命令列表 + +| 命令 | 功能 | +|------|------| +| `help` | 显示所有可用指令 | +| `list` | 查看使用过的指令历史 | +| `crawl` | 运行爬虫,输入目标URL,爬取后可保存结果 | +| `cache` | 缓存操作:save/load/list/delete | +| `exit` | 退出程序 | + +### cache 命令子操作 + +| 子操作 | 功能 | +|--------|------| +| `save` | 保存当前爬取数据到数据文件 | +| `load` | 从数据文件读取数据 | +| `list` | 列出 `data/` 目录中的所有文件 | +| `delete` | 删除指定的数据文件或所有文件 | + +### 数据目录 + +程序会自动创建 `data/` 目录用于保存爬取的数据文件。 + +### 爬取后自动保存 + +使用 `crawl` 命令爬取完成后,系统会自动询问是否保存结果: + +``` +爬虫运行完成,共获取 10 条数据 +======================================== + +是否保存爬取结果? (y/n): y +请输入保存路径 (默认: data/crawler_data.json): +数据已保存到: data/crawler_data.json +``` + +### 删除缓存文件示例 + +``` +> cache +请输入缓存操作 (save/load/list/delete): delete +======================================== +可选删除的文件: +======================================== +[1] crawler_data.json (1024 bytes) +[2] mountblade_data.json (2048 bytes) +[all] 删除所有文件 +======================================== +请输入要删除的文件序号或 'all': 1 +确定要删除 'crawler_data.json' 吗? 
(y/n): y +已删除: crawler_data.json +``` + +## 输出示例 + +``` +[12] +标题: 骑砍2《战帆》v1.2.4与本体v1.4.4测试版更新日志 +链接: https://www.mountblade.com.cn/news/Bannerlord/2026-05-13/3175.html +来源: MountBladeCrawler +发布日期: 2026-05-13 +---------------------------------------- +``` + +## 异常处理 + +项目采用分层异常体系设计,区分受检异常和非受检异常: + +### 异常分类 + +| 异常类型 | 说明 | 示例 | +|---------|------|------| +| **受检异常** | 可恢复异常,强制调用者处理 | `HttpRequestException`, `TimeoutException`, `HtmlParseException`, `DataExtractException` | +| **非受检异常** | 编程错误,不可恢复 | `InvalidUrlException`, `UnsupportedCrawlerException` | + +### 异常继承树 + +``` +CrawlerException (爬虫框架根异常 - 受检) +├── NetworkException (网络异常父类) +│ ├── HttpRequestException (HTTP请求失败) +│ └── TimeoutException (连接超时) +└── ParseException (解析异常父类) + ├── HtmlParseException (HTML解析失败) + └── DataExtractException (数据提取失败) + +ConfigurationException (配置异常父类 - 非受检) +├── InvalidUrlException (无效URL) +└── UnsupportedCrawlerException (不支持的爬虫) +``` + +### 异常处理示例 + +```java +try { + List<CrawlerData> data = crawler.crawl(); + view.showData(data); +} catch (HttpRequestException e) { + view.showErrorMessage("HTTP请求失败: " + e.getStatusCode()); +} catch (TimeoutException e) { + view.showErrorMessage("连接超时,请稍后重试"); +} catch (HtmlParseException e) { + view.showErrorMessage("HTML解析失败: " + e.getSourceUrl()); +} catch (CrawlerException e) { + view.showErrorMessage("爬虫执行失败: " + e.getMessage()); +} +``` + +完整的异常设计文档请参考项目中的 `EXCEPTIONS.md`。 + +## 数据序列化 + +项目提供基于Jackson的JSON序列化工具类,支持将爬取数据保存到文件和从文件读取。 + +### 使用示例 + +```java +import com.crawler.util.JsonSerializer; +import com.crawler.model.CrawlerData; +import java.util.List; + +List<CrawlerData> dataList = crawler.crawl(); + +JsonSerializer.serializeToFile(dataList, "output/crawler_data.json"); + +List<CrawlerData> loadedData = JsonSerializer.deserializeFromFile("output/crawler_data.json"); +``` + +### JsonSerializer 类方法 + +| 方法 | 功能 | +|------|------| +| `serializeToFile(List<CrawlerData>, String)` | 将数据列表序列化到指定文件 | +| 
`deserializeFromFile(String)` | 从文件反序列化数据列表 | +| `toJsonString(List)` | 将数据列表转换为JSON字符串 | +| `toJsonString(CrawlerData)` | 将单条数据转换为JSON字符串 | +| `fromJsonString(String)` | 从JSON字符串反序列化数据列表 | +| `fromJsonStringToSingle(String)` | 从JSON字符串反序列化单条数据 | + +### 输出格式示例 + +```json +[ + { + "title": "新闻标题", + "content": "新闻内容", + "url": "https://example.com/news/1", + "source": "ExampleCrawler", + "publishDate": "2026-05-21" + } +] +``` + +## 技术栈 + +- Java 21+ +- Java HttpClient(内置HTTP客户端) +- Jackson(JSON序列化) +- 正则表达式(HTML解析) + +## 注意事项 + +1. 请遵守目标网站的robots.txt规则 +2. 不要频繁请求,避免给目标服务器造成压力 +3. 某些网站可能有反爬机制,可能需要添加额外的请求头 +4. 建议在爬取前获取网站的爬取授权 \ No newline at end of file diff --git a/project/pom.xml b/project/pom.xml new file mode 100644 index 0000000..f731ef3 --- /dev/null +++ b/project/pom.xml @@ -0,0 +1,79 @@ + + + 4.0.0 + + com.crawler + crawler-framework + 1.0.0 + jar + + crawler-framework + Java MVC Crawler Framework + + + 11 + 11 + UTF-8 + + + + + junit + junit + 4.13.2 + test + + + com.fasterxml.jackson.core + jackson-databind + 2.15.2 + + + com.fasterxml.jackson.datatype + jackson-datatype-jsr310 + 2.15.2 + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + 11 + 11 + + + + + org.apache.maven.plugins + maven-assembly-plugin + 3.3.0 + + + jar-with-dependencies + + + + com.crawler.Main + + + + + + make-assembly + package + + single + + + + + + + \ No newline at end of file diff --git a/project/temp_doc.zip b/project/temp_doc.zip new file mode 100644 index 0000000000000000000000000000000000000000..cf4a5a1b8891856e421d8498532bbd8e1c46725f GIT binary patch literal 6172 zcma)=1yCIAmWGjmAwX~l1h+vFCO~j^mx18!1RLC)!QCwhPOtUES4lzCP#u-k$1yAd08}A|wn93?voJP6MPrg7E%WU(ylkW(jpO z*6?<+bTtHfIoN+vZ-w@-W8V@>+L;a=K#0N)u>E5l!cS2FQ4L`_xRFZPzx{2PMpvG* zgg^@py*wFoKH@+J&lAVcjbm;Kh{(JsW6uoPI(U9ur3@yf^4elh*{;HLxTm+6P^=bq zYeAYm-65rB+UojO<=uZdxF%QX>!I+LxCrfFwcZu~rku&@9|4x|r z1mVEZdbXzF@?%7TOY{-RqwpwLdz{w*AgP^FXOr)7A@c??&QO2NR)^}6VlDb_BDVUE z(LeOLuk(lWDY5XsHvWKnncq9--aTPvFv&Gn6e>u 
z1=!}S@9CQONHr$gKlR}f)o>+8?!8Z;P}|G&eKXc+qUtf-(8zOV0q39G4}rB|DY65H zh09>Ph%WS>19Bgf(PpN73znx8X5w!ieuUiNxi~iZ;3aVt(}Xbjj^SeLd_~Vzmebn#{wBre zE!oyjjmB%~HV(@trD?4#NhpZ$@$tH}gqvc4&nk|*wsN#)W)Zrnz)h{u;h%uSgV?2w zmSK%_76!+r-qTQa<80?#peuAOC)(@&9bd+|kAIp(BUHd4*nfBI!WJjdtbiUG~rh7&B_f zS`cwxVEOMzUqAUC96aGGo*m_%b!#L_W|Aszh zDbrNakHJsLV?31^DKozIbm;>{aPS}#negHYXX?WBP{1sVgjO|Fel_Myuh;F2(pjP- z8V94$O_ED0Uy8_r#0tv2mZFZ(+*!J9!boLc4?!nHWjPMHt;TP|xx;ymP@w9mR-feB zeAVn^Qy3z&xU}$M$QqW_2}b6fmLDkw;)~}qto-aLmgz_tI6f|4Tn|@kKx}(!k+50b z#yU6nR}NNP>0Y`wbTrF_od)4}&b(|C(cVxU8DVXhIaq7RNs&Ptf8l)i^Yq3 zXSh4u{~tYQcwngiid>c?V7dGkE-Te+t)+QYihG}m<*fZISkRf#V~H4I&q$*t!6VO# zOOQt{Cr8QvUIXDhL47J7EB=JxSB6E4^%hH@sp7r=vUB+Jv}GXy>L%oO^aG39!_V)eD)8#{wog^b zclplAgSsxk1=vi){q^%J^vd}Im~g_oU-4ZrImjjCMmPQI2cDNJRZFtxv!>HPr2tzw@X2H^%@A zUZWn-ql+r?M(>6F_xXD;QJzYJ_VP~U&S}q+`1LZ$sQq$RK`!zF#fSzh9yWb-P6A4O zrWEsiBkDUy)o+>GuQTR*!NE>G#tDbSnU$4V?46e-U(Pz?51&^rc@I_^*h#ZI`PF%i zYGJQSSymgY_b4|obqV{sviWJshXP0avV2H96ew;DP$m^yXh&{cvZabI8fu8V`RM? zR%I}nkF`kE_+@Wr&(&vIOi5S=Cb+W?xCwoyt0oEDmWSqCqjFup-7h&pzCE!O{-D8Q zQ~z6aAg^wIj}#?QN>3eh{@MqZk|-x^R0Xm5$gjlfTZ(3L@~b)yM&d~lZGIO4!@Qax zgHgn$^#)&X<5*dffgw_py%wD;1Bc4jitg3&RR_+7ViF04T9}Tu0vA~Lb`9K7a8>Ab z%P%j(b;qvrIKIXy)y?W1`1zz%GQ-3c{zoMwNOR_Zwwzfqk?fJt){nlnL2pqiP#+y4 zjun9yAy-7IOv$dgUNtz)vMJqn#7pq(C9cx&7`Lc5dU|cq-KZmpD7;KK(_LL9aS%Id zm2_xYAajdS_KPQss_|enBR5dy^D7tsqz!B28og3@B1{;bdkJ05OIv3N4+HwtN9zzh zPGwmCyhUXu?ezS)(;G53A@{i(p?D+TT)Q#rh|=$;{Nqph1~6F^LiBglf~RCm4|k3Q z&|7A6Z2~lfDGFog%Qa@KNPQTeb)Ag4=~PlA_cn;&V(GWzo=A z)f_HWKdnY0p^o2DPm>mhLj8XF(3`n3Mp`oZz>K&EVh@1{s@nX(jiAXo$PuT{WXD;%#|q>PJJ5cMVoWeU+7Og~cxj}YF#y1oNa zs}EMYA>#NZh))7oxmao+IV%+S*F?kpDmFgbt9!L{chY^bKO!|vpw<4Gnik|!T^vRT zDzHm5POtF#TCA^2V~q54IbY-biY8!$>Ia%1oK&xDuy=16Ft1ssZ*s!-$Qff}Ucp^` zHfyqOZ#bXF6)8U_Yq3}02*AxF!3ks_vUoW*ILl|aLXPLy`x2OgxI{G(=)uaII-LsR z?Et%Ti1H;}NQuY_DAn1GsO`q6ieAmMt9AvcsPnZMR)AJnvgD_CAufpIJ^ABPMhenrT=xZ_qK*Nls=~yj?%iD_})>h~D>^dFOW^?FB@309# zwaE0DF_L+ccu=C2(*d&|Pfzx0%3E?RUmR!yP@*kNRNO+G>s3JEeF4JpvM;5_{KM@E 
zQ1U5m@~38aCSs+WlOqf1=q*E5yJgpAV2JqbNyZ8!X1x z_(C&5hxO}#j};mK*B^;h%rkXr#QXJei=D+X%|dQLyKbj(@%Re8<;DO`@xd914G)A5 zX0bl6ztU^1+j`T#ax85ERjODpC{&csHl|PFUzxD|*l+Pw=Mn3(W>ThAJjmFnit`D| zl&1bWH1)0yVKPY`Tt1GtWcn!l&kLYug4rBBmxwfXsd#Qq%16m#iosc)u5qD8NN&#y zhrxYb&>AJIM}=ZtMR~D5SOwR^1+Z0WYSz z@fWGJaA6QB?s{FU;h;DzOXOJbSs(EtZzLEzuK>22F#6%Ot2`q$BKz%N@+@F7K(m0? z)a*rXni4*ZK6?_VEDDtD!d7_oy(_GlI#vUrgTAQlIrj@QS`8KFN8U+#RVVHl^C`ii z?-jjXil#`jGEexXZFDc8#XdiF-6dUQLcW?05};)zqhbZFxSIrzjHS=hmNJMGQ<2pM zNoQDoKIM#K|EO9SXI?GK_V^gp@B3m9Avz@)3;o}G0W z@7X6&I$RkcJWuDr%rB2#( zH{-ATpR0O^drH_!OTR_inWSy3#d#5QDNsE#7ekQd@(7oZV4CQ<5G&lexFSX8)6F+A zp?nfimZl1Do@wf2wE8q6v|OI>R^hEY)L2-jv6pq*g}{o!8~r>dsfO*dPo(ygZCYrE z6e|arnKKU9%;^uTl(eq;5>`1etR)b#(Tfj{(6IvC>EUzaE+=lhF^08s!@&l|8tSeBXX`Xl6#ZsN)dP4!}%Snaj(EM6=Di$ABaD(q1$b}$2Ish z>YKB~6GJq+62{YOR7K9*LW& zwoYg{++tb+8LpdBYt6tPlV%jPU6_vF}l_)CN+Afhr zNh0RNdBPoqodp+g7r_CMupbd$-b%uJ2oZgbv|M^APXd4v=r0D|9K)m!sK4#OHKX5~ z46wkqs!F+o2{4aH2(l%q$9N;Tzm_`hVC}gDH;e-N+ahw1*SH@mM4eh9Wu}W6g$bz8 zXVooAMm7D^CVAWh_Um`PPGKR_K3hucPghH(N{b=nGdlo9+<)7m0=xGut#!e#UHeAU zV5*%zoN+DRQJ&rt^ zg2447rT%{lgeK+wHFeRVYFwIJ1m@A@5efX+FIfdZ*|yu38(%@Wq)^Z+>_yBaB9rg#)EoL@IsA z#k;$0z4C0u(D^(Tp4p28uWW$AZ1&x)DBFvu9)C_y&tR1$V6CU&^kW-v9!1Lx{ zf4QZ-X_FZc9A+3GDUt)UP>yh_IKw8*l5Wq|(dCo_F;Wynbm#_Av(w?knM!VXC}c7R z6+;2NS{+dgQ4wS;+{B&M6bX0A6W;|#w%s1>hrL&1nBB(3PrIh2{ha3Beh@HrVu6H0 zH6IcC=3LG@+VWC~5*vZ3Tpxov#e0^46I`&)-&z%o0o~yuRs(fQ;2DZbRn0zmt$}^* zAqOsAc`v6{Hb+_|v&e0E`k0s2?=3J z{`^JUl8zi{5ri&^?;D9BQp?#nW-(=^2%NTiQcx?S-U-Fw4XuLxx>>7WxF=Bkc;Qcu2%aS&>kBdC3R>q!I$rb_S|*t9F(GK{=>o-$)pl8J=8c z$)L_0w($p<)lAJZ;yuknQZ_Vns!8Vc%BHVS!XwX=T2r(0AoLlMj+FD}4Bhy+!5Jr@ z=UGR;Yp8qZe$8XzP~O0sI2V9bUS}i40cTE5SfhK>d=q`CYDOA9yUNnkDw|B*b-f-^ z`KoYUhXmU>k^7jZ>*lOY$Rz7aNezFoUaRIL zuGS3bo81VGT8|21k0z_BF(GxsZlKmQrFM%hSmP!er%l&)V1piYH1MI|itLM2m-~YK z?gvB>8HEVxFOd8v1$_X?{~G@XR{r}0f6~GSSo&8O-s|%RI{ka%pH%JvCjJ#3_u%pG zXz}mafAXjYa`#ur-t*l5&Hf*b_xJQajr*_ZF@S%X^-r7rJ^fD$Jh=O>$S3;GUjO?{ if3D5H7Lrc<&zb%-5JV9T{o!W7`>pd{Us;lev;PA5F3Obv literal 0 
HcmV?d00001 diff --git a/project/test.html b/project/test.html new file mode 100644 index 0000000..6121e0b --- /dev/null +++ b/project/test.html @@ -0,0 +1,23 @@ + + + + Test Page + + +

测试页面

+
+ + + +
+ + \ No newline at end of file