diff --git a/project/.DS_Store b/project/.DS_Store
new file mode 100644
index 0000000..dd6c2d8
Binary files /dev/null and b/project/.DS_Store differ
diff --git a/project/my-crawler/.DS_Store b/project/my-crawler/.DS_Store
new file mode 100644
index 0000000..81066eb
Binary files /dev/null and b/project/my-crawler/.DS_Store differ
diff --git a/project/my-crawler/.idea/.gitignore b/project/my-crawler/.idea/.gitignore
new file mode 100644
index 0000000..b6b1ecf
--- /dev/null
+++ b/project/my-crawler/.idea/.gitignore
@@ -0,0 +1,10 @@
+# 默认忽略的文件
+/shelf/
+/workspace.xml
+# 已忽略包含查询文件的默认文件夹
+/queries/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# 基于编辑器的 HTTP 客户端请求
+/httpRequests/
diff --git a/project/my-crawler/.idea/compiler.xml b/project/my-crawler/.idea/compiler.xml
new file mode 100644
index 0000000..4af8308
--- /dev/null
+++ b/project/my-crawler/.idea/compiler.xml
@@ -0,0 +1,13 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/project/my-crawler/.idea/encodings.xml b/project/my-crawler/.idea/encodings.xml
new file mode 100644
index 0000000..aa00ffa
--- /dev/null
+++ b/project/my-crawler/.idea/encodings.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/project/my-crawler/.idea/jarRepositories.xml b/project/my-crawler/.idea/jarRepositories.xml
new file mode 100644
index 0000000..712ab9d
--- /dev/null
+++ b/project/my-crawler/.idea/jarRepositories.xml
@@ -0,0 +1,20 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/project/my-crawler/.idea/misc.xml b/project/my-crawler/.idea/misc.xml
new file mode 100644
index 0000000..9dc782b
--- /dev/null
+++ b/project/my-crawler/.idea/misc.xml
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/project/my-crawler/202506010204-孟鑫垚-期末实验报告.docx b/project/my-crawler/202506010204-孟鑫垚-期末实验报告.docx
new file mode 100644
index 0000000..9bb98ff
Binary files /dev/null and b/project/my-crawler/202506010204-孟鑫垚-期末实验报告.docx differ
diff --git a/project/my-crawler/data/.DS_Store b/project/my-crawler/data/.DS_Store
new file mode 100644
index 0000000..233a7b2
Binary files /dev/null and b/project/my-crawler/data/.DS_Store differ
diff --git a/project/my-crawler/data/articles_20260531_003057.txt b/project/my-crawler/data/articles_20260531_003057.txt
new file mode 100644
index 0000000..011c021
--- /dev/null
+++ b/project/my-crawler/data/articles_20260531_003057.txt
@@ -0,0 +1,20 @@
+========================================
+ 文章数据批次保存
+========================================
+
+保存时间: 2026-05-31 00:30:57
+文章数量: 1
+
+========================================
+
+----------------------------------------
+文章 1
+----------------------------------------
+ID: 1
+标题: 豆瓣音乐 Top 250
+URL: https://music.douban.com/top250
+来源: https://music.douban.com/top250
+爬取时间: 2026-05-31 00:30:52
+
+内容:
+2025年度榜单 豆瓣音乐 Top 250 We Sing. We Dance. We Steal Things. Jason Mraz / 2008-05-13 / Import / Audio CD / 民谣 9.1 ( 117008人评价 ) Viva La Vida Death And All His Friends Coldplay / 2008-06-17 / 专辑 / CD / 摇滚 9.0 ( 121109人评价 ) 华丽的冒险 華麗的冒險 陈绮贞 / 2005-09-23 / 专辑 / CD / 流行 9.0 ( 93952人评价 ) 范特西 Fantasy 周杰伦 / 2001-09-14 / 专辑 / CD / 流行 9.5 ( 190826人评价 ) 后青春期的诗 後。青春期的詩 五月天 / 2008-10-23 / 专辑 / CD / 摇滚 9.0 ( 97666人评价 ) 是时候 It's Time 孙燕姿 / 2011-03-08 / 专辑 / CD / 流行 8.7 ( 84411人评价 ) Lenka Lenka / 2008-09-23 / 专辑 / Audio CD / 流行 8.6 ( 83946人评价 ) Start from Here 从这里开始 王若琳 / 2008-01-11 / 专辑 / CD / 爵士 8.8 ( 77184人评价 ) 旅行的意义 陈绮贞 / 2004-02-02 / 单曲 / CD / 流行 9.1 ( 101953人评价 ) 太阳 Immortal 陈绮贞 / 2009-01-22 / 专辑 / CD / 流行 8.8 ( 79731人评价 ) Once (Soundtrack) Once / 电影《曾经》原声大碟 Glen Hansard,Marketa Irglova / 2007-05-22 / Soundtrack / CD / 原声 9.2 ( 73664人评价 ) Not Going Anywhere 守候 Keren Ann / 2004-08-24 / Import / Audio CD / 民谣 8.9 ( 62733人评价 ) American Idiot Green Day / 2004-09-21 / Explicit Lyrics / Audio CD / 摇滚 9.0 ( 75471人评价 ) 思念是一种病 OK 张震岳 / 2007-07-06 / 专辑 / CD / 流行 8.9 ( 85810人评价 ) 無與倫比的美麗 无与伦比的美丽 苏打绿 / 2007-11-02 / 专辑 / CD / 流行 8.8 ( 92327人评价 ) 亲爱的...我还不知道 親愛的…我還不知道 张悬 / 2007-07-20 / 专辑 / CD / 流行 8.8 ( 69689人评价 ) 城市 The City 张悬 / 2009-05-22 / 专辑 / CD / 摇滚 8.7 ( 69137人评价 ) O Damien Rice / 2002-02-01 / 专辑 / CD / 流行 9.1 ( 53450人评价 ) Wake Me Up When September Ends 九月结束的时候叫醒我 Green Day / 2005-06-13 / 单曲 / CD / 摇滚 9.4 ( 55101人评价 ) 叶惠美 葉惠美 周杰伦 / 2003-07-31 / 专辑 / CD / 流行 9.3 ( 118425人评价 ) 七里香 Common Jasmin Orange 周杰伦 / 2004 / 专辑 / CD / 流行 9.2 ( 179593人评价 ) 21 Adele / 2011-01-24 / 专辑 / CD / 流行 9.3 ( 77434人评价 ) My Life Will... 张悬 / 2006-06-09 / 专辑 / CD / 流行 8.8 ( 61363人评价 ) 寓言 王菲 / 2000 / 专辑 / CD / 流行 9.4 ( 73631人评价 ) 你在煩惱什麼 你在烦恼什么 苏打绿 / 2011-11-11 / 专辑 / CD / 流行 9.0 ( 59752人评价 ) <
diff --git a/project/my-crawler/data/articles_20260531_004841.txt b/project/my-crawler/data/articles_20260531_004841.txt
new file mode 100644
index 0000000..8e06fb4
--- /dev/null
+++ b/project/my-crawler/data/articles_20260531_004841.txt
@@ -0,0 +1,21 @@
+========================================
+ 文章数据批次保存
+========================================
+
+保存时间: 2026-05-31 00:48:41
+文章数量: 1
+
+========================================
+
+----------------------------------------
+文章 1
+----------------------------------------
+ID: 1
+标题: 推荐名句_古文岛_原古诗文网
+URL: https://www.gushiwen.cn/mingjus/
+来源: https://www.gushiwen.cn/mingjus/
+爬取时间: 2026-05-31 00:48:07
+
+内容:
+荐名句 类型: 春天 夏天 秋天 冬天 爱国 写雪 思念 爱情 思乡 离别 月亮 梅花 励志 荷花 写雨 友情 感恩 写风 西湖 读书 菊花 长江 黄河 竹子 哲理 泰山 边塞 柳树 写鸟 桃花 老师 母亲 伤感 田园 写云 庐山 山水 星星 老子 史记 论语 庄子 孟子 中庸 易传 左传 荀子 礼记 尚书 汉书 墨子 列子 管子 晋书 节日 春节 元宵节 寒食节 清明节 端午节 七夕节 中秋节 重阳节 菜根谭 红楼梦 鬼谷子 三国志 韩非子 战国策 淮南子 三字经 后汉书 商君书 增广贤文 资治通鉴 孙子兵法 小窗幽记 围炉夜话 格言联璧 文心雕龙 三国演义 吕氏春秋 幼学琼林 警世通言 作者: 李白 苏轼 杜甫 王维 屈原 陆游 荀子 韩愈 曹操 杜牧 李贺 李煜 柳永 晏殊 秦观 曹植 高适 王勃 岳飞 朱熹 岑参 姜夔 孟郊 韦庄 元稹 曾巩 苏辙 唐寅 张先 曹丕 鲍照 张岱 李益 苏洵 贾岛 于谦 杨慎 宋玉 阮籍 张籍 辛弃疾 李清照 白居易 李商隐 陶渊明 孟浩然 刘禹锡 诸葛亮 欧阳修 王安石 范仲淹 杨万里 黄庭坚 王昌龄 龚自珍 温庭筠 谢灵运 文天祥 柳宗元 曾国藩 韦应物 刘长卿 司马光 晏几道 司马迁 元好问 曹雪芹 范成大 卢照邻 陈子昂 周邦彦 张九龄 骆宾王 王守仁 关汉卿 马致远 朱敦儒 顾炎武 纳兰性德 司马相如 朝代: 先秦 两汉 魏晋 南北朝 隋代 唐代 五代 宋代 金朝 元代 明代 清代 形式: 诗文 古籍 谚语 对联 故音乐者,所以动荡血脉,通流精神而和正心也。 —— 《史记·乐书》 凉州女儿满高楼,梳头已学京都样。 —— 陆游《五月十一日夜且半梦从大驾亲征尽复汉唐故地》 懒向沙头醉玉瓶,唤君同赏小窗明。 —— 陆游《浣溪沙·和无咎韵》 京华结交尽奇士,意气相期共生死。 —— 陆游《金错刀行》 耿斜河,疏星淡月,断云微度。 —— 张元干《贺新郎·送胡邦衡待制赴新州》 树在道边而多子,必苦李也。 —— 《晋书·列传·第十三章》 残柳宫前空露叶,夕阳川上浩烟波。 —— 刘沧《经炀帝行宫》 年年今夜,月华如练,长是人千里。 —— 范仲淹《御街行·秋日怀旧》 饮君酒,为君吟。 —— 李白《扶风豪士歌》 青春花姊不同时。凄凉生较迟。 —— 吴文英《醉桃源·芙蓉》 落叶满空山,何处寻行迹。 —— 韦应物《寄全椒山中道士》 千万恨,为君剖。 —— 顾贞观《金缕曲词二首·其二》 反水不收,后悔无及。 —— 《后汉书·本纪·光武帝纪上》 沉静自居,必不招物议。 —— 《北齐书·列传·卷十四》 直辞正谏,论道佐时。 —— 《贞观政要·卷一·论君道第一》 空自倚,清香未减,风流不在人知。 —— 晁冲之《汉宫春·梅》 亭皋木叶下,陇首秋云飞。 —— 柳恽《捣衣诗》 事无终始,无务多业;举物而暗,无务博闻。 —— 《墨子·02章 修身》 陌上少年郎,满身兰麝扑人香。 —— 顾夐《荷叶杯·弱柳好花尽拆》 龟灵未免刳肠患,马失应无折足忧。 —— 白居易《放言五首·其二》 胡风吹朔雪,千里度龙山。 —— 鲍照《学刘公干体五首·其三》 未及施政教,所忧变炎凉。 —— 韦应物《夏至避暑北池》 隔壁岂无耳,窗外岂无人? —— 《增广贤文·下集》 生怕离怀别苦,多少事、欲说还休。 —— 李清照《凤凰台上忆吹箫·香冷金猊》 单于若问君家世,莫道中朝第一人。 —— 苏轼《送子由使契丹》 春在城南芳草路。 —— 辛弃疾《玉楼春·风前欲劝春光住》 齐人有一妻一妾而处室者,其良人出,则必餍酒肉而后反。 —— 孟子及弟子《齐人有一妻一妾》 贤不贤,才也;遇不遇,时也。 —— 《论衡·卷一·逢遇篇》 群贤毕至,少长咸集。 —— 王羲之《兰亭集序 / 兰亭序》 今日乐相乐,别后莫相忘。 —— 曹植《怨歌行》 夫缀文者情动而辞发,观文者披文以入情,沿波讨源,虽幽必显。 —— 《文心雕龙·知音》 自爱残妆晓镜中,环钗漫篸绿丝丛。 —— 元稹《离思五首》 故春非我春,夏非我夏,秋非我秋,冬非我冬。 —— 佚名《日出入》 如彼萱草兮,使我忧忘。 —— 贯休《善哉行·伤古曲无知音》 遥看汉水鸭头绿,恰似葡萄初酦醅。 —— 李白《襄阳歌》 宦游吾倦矣,玉人留我醉,明日万花寒食,得且住,为佳耳。 —— 辛弃疾《霜天晓角·旅兴》 强人之所不能,事必不立;禁人之所必犯,法必不得矣。 —— 《资治通鉴·唐纪·唐纪五十八》 总一样,文人宿草。 —— 黄景仁《贺新郎·太白墓和雅存韵》 风露浩然,山河影转,今古照凄凉。 —— 陈亮《一丛花·溪堂玩月作》 严霜九月中,送我出远郊。 —— 陶渊明《拟挽歌辞三首》 红妆肯为苍生计,女妖娆能有几? 
—— 张可久《水仙子·怀古》 小大之狱,虽不能察,必以情。 —— 《左传·庄公·庄公十年》 曲岸持觞,垂杨系马,此地曾经别。 —— 辛弃疾《念奴娇·书东流村壁》 何必桑乾方是远,中流以北即天涯! —— 杨万里《初入淮河四绝句》 耳闻之不如目见之,目见之不如足践之 —— 《说苑·政理》 可怜此地无车马,颠倒青苔落绛英。 —— 韩愈《题张十一旅舍三咏榴花 / 题榴花》 绿桑高下映平川,赛罢田神笑语喧。 —— 欧阳修《田家》 白鸟忽点破,残阳还照开。 —— 范仲淹《野色》 堪笑一场颠倒梦,元来恰似浮云。 —— 朱敦儒《临江仙·堪笑一场颠倒梦》 愿言思伯,甘心首疾。 —— 佚名《伯兮》
+
diff --git a/project/my-crawler/data/articles_20260531_010148.txt b/project/my-crawler/data/articles_20260531_010148.txt
new file mode 100644
index 0000000..6053f9a
--- /dev/null
+++ b/project/my-crawler/data/articles_20260531_010148.txt
@@ -0,0 +1,21 @@
+========================================
+ 文章数据批次保存
+========================================
+
+保存时间: 2026-05-31 01:01:48
+文章数量: 1
+
+========================================
+
+----------------------------------------
+文章 1
+----------------------------------------
+ID: 1
+标题: 长沙市, 湖南省月度天气预报 - weather.com
+URL: https://weather.com/zh-CN/weather/monthly/l/2add4f77b58b85fbed59ae07151489bce5504e7ae027a17fa94ad0149eceef0f
+来源: https://weather.com/zh-CN/weather/monthly/l/2add4f77b58b85fbed59ae07151489bce5504e7ae027a17fa94ad0149eceef0f
+爬取时间: 2026-05-31 01:01:32
+
+内容:
+长沙市, 湖南省月度天气预报 - weather.com Hamburger The Weather Company Today Moon Phase - Day 8 26 Not Available -- -- Moon Phase - Day 9 27 Not Available -- -- Moon Phase - Day 10 28 Not Available -- -- Moon Phase - Day 11 29 Not Available -- -- Moon Phase - Day 12 30 Not Available -- -- Moon Phase - Day 13 1 Not Available -- -- Moon Phase - Day 14 2 Partly Cloudy Day 27 ° 19 ° Moon Phase - Day 15 3 Scattered Showers Day 26 ° 15 ° Moon Phase - Day 16 4 Partly Cloudy Day 25 ° 15 ° Moon Phase - Day 17 5 Partly Cloudy Day 28 ° 16 ° Moon Phase - Day 19 6 Partly Cloudy Day 30 ° 18 ° Moon Phase - Day 20 7 Partly Cloudy Day 32 ° 21 ° Moon Phase - Day 21 8 Showers 22 ° 18 ° Moon Phase - Day 22 9 Showers 21 ° 15 ° Moon Phase - Day 23 10 Partly Cloudy Day 24 ° 16 ° Moon Phase - Day 24 11 Partly Cloudy Day 27 ° 18 ° Moon Phase - Day 25 12 Partly Cloudy Day 30 ° 20 ° Moon Phase - Day 26 13 Showers 27 ° 19 ° Moon Phase - Day 27 14 Showers 23 ° 20 ° Moon Phase - Day 28 15 Cloudy 28 ° 21 ° Moon Phase - Day 29 16 Partly Cloudy Day 31 ° 23 ° Moon Phase - Day 1 17 Partly Cloudy Day 29 ° 22 ° Moon Phase - Day 2 18 Partly Cloudy Day 31 ° 23 ° Moon Phase - Day 3 19 Partly Cloudy Day 30 ° 21 ° Moon Phase - Day 4 20 Showers 23 ° 19 ° Moon Phase - Day 5 21 Showers 23 ° 20 ° Moon Phase - Day 5 22 Showers 26 ° 22 ° Moon Phase - Day 6 23 Scattered Showers Day 29 ° 23 ° Moon Phase - Day 7 24 Scattered Showers Day 30 ° 25 ° Moon Phase - Day 8 25 Windy 33 ° 27 ° Moon Phase - Day 9 26 Partly Cloudy Day 35 ° 26 ° Moon Phase - Day 10 27 Partly Cloudy Day 30 ° 22 ° Moon Phase - Day 11 28 Showers 27 ° 22 ° Moon Phase - Day 12 29 Partly Cloudy Day 29 ° 22 ° Moon Phase - Day 13 30 Showers 24 ° 19 ° Moon Phase - Day 14 31 Partly Cloudy Day 31 ° 22 ° Moon Phase - Day 15 1 Mostly Clear Day 33 ° 23 ° Moon Phase - Day 16 2 Mostly Clear Day 34 ° 25 ° Moon Phase - Day 17 3 Scattered Thunderstorms 30 ° 24 ° Moon Phase - Day 18 4 Scattered Showers Day 30 ° 24 ° Moon Phase - Day 19 5 Showers 31 ° 24 ° Moon Phase - Day 20 
6 Scattered Thunderstorms 31 ° 24 ° Close 白天 31 ° Partly Cloudy Day Rain drop 3% 西 6 公里/小时 少云。 最高 31°C。 微风且风向多变。 Record High 最高纪录 36 ° Average High 平均最高 28 ° Sunrise 日出 5:31 Sunset 日落 19:19 夜间 22 ° Clear Night Rain drop 4% 北 6 公里/小时 大部晴朗。 最低 22°C。 微风且风向多变。 Record Low 最低记录 18 ° Average Low 平均最低 21 ° Moonrise 月出 19:32 Moonset 月落 4:57 Moon Phase - Day 14 满月 Moon Phase - Day 21 7 Showers 30 ° 23 ° Moon Phase - Day 22 8 Showers 29 ° 22 ° Moon Phase - Day 23 9 Scattered Showers Day 27 ° 21 ° Moon Phase - Day 24 10 Scattered Showers Day 28 ° 22 ° Moon Phase - Day 25 11 Partly Cloudy Day 29 ° 23 ° Moon Phase - Day 26 12 Scattered Showers Day 30 ° 24 ° Moon Phase - Day 27 13 Showers 30 ° 24 ° Not Available 14 Not Available -- -- Moon Phase - Day 0 15 平均气温 -- -- Moon Phase - Day 1 16 平均气温 -- -- Moon Phase - Day 2 17 平均气温 -- -- Moon Phase - Day 3 18 平均气温 -- -- Moon Phase - Day 5 19 平均气温 -- -- Moon Phase - Day 6 20 平均气温 -- -- Moon Phase - Day 6 21 平均气温 -- -- Moon Phase - Day 7 22 平均气温 -- -- Moon Phase - Day 8 23 平均气温 -- -- Moon Phase - Day 9 24 平均气温 -- -- Not Available 25 平均气温 -- -- Not Available 26 平均气温 -- -- Not Available 27 平均气温 -- -- Not Available 28 平均气温 -- -- Not Available 29 平均气温 -- -- Not Available 30 平均气温 -- -- Not Available 1 平均气温 -- -- Not Available 2 平均气温 -- -- Not Available 3 平均气温 -- -- Not Available 4 平均气温 -- -- Advertisement Advertisement 历史记录 5月31日 高 低 降水量 平均值 28 ° C 21 ° -- 记录 34 ° ( 2011 ) 17 ° ( 1993 ) -- 历史气温状况 昨日 24 ° 19 ° 0.25 毫米 过去七天 35 ° 19 ° 6.84 当月气温 35 ° 15 ° 61.66 历史月平均气温 五月 27 ° 19 ° 201.68 六月 30 ° 23 ° 224.28 七月 33 ° 26 ° 162.81
+
diff --git a/project/my-crawler/data/articles_20260531_105816.txt b/project/my-crawler/data/articles_20260531_105816.txt
new file mode 100644
index 0000000..747fc6d
--- /dev/null
+++ b/project/my-crawler/data/articles_20260531_105816.txt
@@ -0,0 +1,20 @@
+========================================
+ 文章数据批次保存
+========================================
+
+保存时间: 2026-05-31 10:58:16
+文章数量: 1
+
+========================================
+
+----------------------------------------
+文章 1
+----------------------------------------
+ID: 1
+标题: 豆瓣电影 Top 250
+URL: https://movie.douban.com/top250
+来源: https://movie.douban.com/top250
+爬取时间: 2026-05-31 10:58:08
+
+内容:
+豆瓣电影 Top 250 我没看过的 1 肖申克的救赎 / The Shawshank Redemption / 月黑高飞(港) / 刺激1995(台) [可播放] 导演: 弗兰克·德拉邦特 Frank Darabont 主演: 蒂姆·罗宾斯 Tim Robbins /... 1994 / 美国 / 犯罪 剧情 9.7 3291320人评价 希望让人自由。 2 霸王别姬 / 再见,我的妾 / Farewell My Concubine [可播放] 导演: 陈凯歌 Kaige Chen 主演: 张国荣 Leslie Cheung / 张丰毅 Fengyi Zha... 1993 / 中国大陆 中国香港 / 剧情 爱情 同性 9.6 2428769人评价 风华绝代。 3 泰坦尼克号 / Titanic / 铁达尼号(港 / 台) [可播放] 导演: 詹姆斯·卡梅隆 James Cameron 主演: 莱昂纳多·迪卡普里奥 Leonardo... 1997 / 美国 / 剧情 爱情 灾难 9.5 2503890人评价 失去的才是永恒的。 4 阿甘正传 / Forrest Gump / 福雷斯特·冈普 [可播放] 导演: 罗伯特·泽米吉斯 Robert Zemeckis 主演: 汤姆·汉克斯 Tom Hanks / ... 1994 / 美国 / 剧情 爱情 9.5 2434861人评价 一部美国近现代史。 5 千与千寻 / 千と千尋の神隠し / 神隐少女(台) / 千与千寻的神隐 导演: 宫崎骏 Hayao Miyazaki 主演: 柊瑠美 Rumi Hîragi / 入野自由 Miy... 2001 / 日本 / 剧情 动画 奇幻 9.4 2542350人评价 最好的宫崎骏,最好的久石让。 6 美丽人生 / La vita è bella / 一个快乐的传说(港) / Life Is Beautiful [可播放] 导演: 罗伯托·贝尼尼 Roberto Benigni 主演: 罗伯托·贝尼尼 Roberto Beni... 1997 / 意大利 / 剧情 喜剧 爱情 战争 9.5 1485065人评价 最美的谎言。 7 星际穿越 / Interstellar / 星际启示录(港) / 星际效应(台) [可播放] 导演: 克里斯托弗·诺兰 Christopher Nolan 主演: 马修·麦康纳 Matthew Mc... 2014 / 美国 英国 加拿大 / 剧情 科幻 冒险 9.4 2189832人评价 爱是一种力量,让我们超越时空感知它的存在。 8 这个杀手不太冷 / Léon / 终极追杀令(台) / 杀手莱昂 [可播放] 导演: 吕克·贝松 Luc Besson 主演: 让·雷诺 Jean Reno / 娜塔莉·波特曼 ... 1994 / 法国 美国 / 剧情 动作 犯罪 9.4 2556406人评价 怪蜀黍和小萝莉不得不说的故事。 9 盗梦空间 / Inception / 潜行凶间(港) / 全面启动(台) [可播放] 导演: 克里斯托弗·诺兰 Christopher Nolan 主演: 莱昂纳多·迪卡普里奥 Le... 2010 / 美国 英国 / 剧情 科幻 悬疑 冒险 9.4 2329971人评价 诺兰给了我们一场无法盗取的梦。 10 楚门的世界 / The Truman Show / 真人Show(港) / 真人戏 [可播放] 导演: 彼得·威尔 Peter Weir 主演: 金·凯瑞 Jim Carrey / 劳拉·琳妮 Lau... 1998 / 美国 / 剧情 科幻 9.4 2024358人评价 如果再也不能见到你,祝你早安,午安,晚安。 11 辛德勒的名单 / Schindler's List / 舒特拉的名单(港) / 辛德勒名单 [可播放] 导演: 史蒂文·斯皮尔伯格 Steven Spielberg 主演: 连姆·尼森 Liam Neeson... 1993 / 美国 / 剧情 历史 战争 9.5 1249331人评价 拯救一个人,就是拯救整个世界。 12 忠犬八公的故事 / Hachi: A Dog's Tale / 秋田犬八千(港) / 忠犬小八(台) 导演: 莱塞·霍尔斯道姆 Lasse Hallström 主演: 理查·基尔 Richard Ger... 
2009 / 美国 英国 / 剧情 9.4 1545994人评价 永远都不能忘记你所爱的人。 13 海上钢琴师 / La leggenda del pianista sull'oceano / 声光伴我飞(港) / 一九零零的传奇 [可播放] 导演: 朱塞佩·托纳多雷 Giuseppe Tornatore 主演: 蒂姆·罗斯 Tim Roth / ... 1998 / 意大利 / 剧情 音乐 9.3 1882971人评价 每个人都要走一条自己坚定了的路,就算是粉身碎骨。 14 疯狂动物城 / Zootopia / 优兽大都会(港) / 动物方城市(台) [可播放] 导演: 拜伦·霍华德 Byron Howard / 瑞奇·摩尔 Rich Moore 主演: 金妮弗·... 2016 / 美国 / 喜剧 动画 冒险 9.3 2325984人评价 迪士尼给我们营造的乌托邦就是这样,永远善良勇敢,永远出乎意料。 15 三傻大闹宝莱坞 / 3 Idiots / 三个傻瓜(台) / 作死不离3兄弟(港) 导演: 拉库马·希拉尼 Rajkumar Hirani 主演: 阿米尔·汗 Aamir Khan / 卡... 2009 / 印度 / 剧情 喜剧 爱情 歌舞 9.2 2082426人评价 英俊版憨豆,高情商版谢耳朵。 16 机器人总动员 / WALL·E / 太空奇兵·威E(港) / 瓦力(台) [可播放] 导演: 安德鲁·斯坦顿 Andrew Stanton 主演: 本·贝尔特 Ben Burtt / 艾丽... 2008 / 美国 / 科幻 动画 冒险 9.3 1496808人评价 小瓦力,大人生。 17 放牛班的春天 / Les choristes / 歌声伴我心(港) / 唱诗班男孩 [可播放] 导演: 克里斯托夫·巴拉蒂 Christophe Barratier 主演: 让-巴蒂斯特·莫尼... 2004 / 法国 瑞士 德国 / 剧情 音乐 9.3 1476718人评价 天籁一般的童声,是最接近上帝的存在。 18 无间道 / 無間道 / Infernal Affairs / Mou gaan dou [可播放] 导演: 刘伟强 / 麦兆辉 主演: 刘德华 Andy Lau / 梁朝伟 Tony Leung Chiu W... 2002 / 中国香港 / 剧情 犯罪 惊悚 9.3 1568857人评价 香港电影史上永不过时的杰作。 19 控方证人 / Witness for the Prosecution / 雄才伟略 / 情妇 [可播放] 导演: 比利·怀尔德 Billy Wilder 主演: 泰隆·鲍华 Tyrone Power / 玛琳·... 1957 / 美国 / 剧情 犯罪 悬疑 惊悚 9.6 746252人评价 比利·怀德满分作品。 20 寻梦环游记 / Coco / 玩转极乐园(港) / 可可夜总会(台) [可播放] 导演: 李·昂克里奇 Lee Unkrich / 阿德里安·莫利纳 Adrian Molina 主演: ... 2017 / 美国 / 喜剧 动画 奇幻 音乐 9.1 1995362人评价 死亡不是真的逝去,遗忘才是永恒的消亡。 21 大话西游之大圣娶亲 / 西遊記大結局之仙履奇緣 / 西游记完结篇仙履奇缘 / 齐天大圣西游记 [可播放] 导演: 刘镇伟 Jeffrey Lau 主演: 周星驰 Stephen Chow / 吴孟达 Man Tat Ng... 1995 / 中国香港 中国大陆 / 喜剧 爱情 奇幻 古装 9.2 1712805人评价 一生所爱。 22 熔炉 / 도가니 / 无声呐喊(港) / 漩涡 导演: 黄东赫 Dong-hyuk Hwang 主演: 孔侑 Yoo Gong / 郑有美 Yu-mi Jung /... 2011 / 韩国 / 剧情 9.3 1037989人评价 我们一路奋战不是为了改变世界,而是为了不让世界改变我们。 23 触不可及 / Intouchables / 闪亮人生(港) / 逆转人生(台) [可播放] 导演: 奥利维·那卡什 Olivier Nakache / 艾力克·托兰达 Eric Toledano 主... 2011 / 法国 / 剧情 喜剧 9.3 1299097人评价 满满温情的高雅喜剧。 24 教父 / The Godfather / Mario Puzo's The Godfather [可播放] 导演: 弗朗西斯·福特·科波拉 Francis Ford Coppola 主演: 马龙·白兰度 M... 
1972 / 美国 / 剧情 犯罪 9.3 1110066人评价 千万不要记恨你的对手,这样会让你失去理智。 25 末代皇帝 / The Last Emperor / 末代皇帝溥仪(港) / L'ultimo imperatore [可播放] 导演: 贝纳尔多·贝托鲁奇 Bernardo Bertolucci 主演: 尊龙 John Lone / 陈... 1987 / 英国 意大利 中国大陆 法国 / 剧情 传记 历史 9.3 1031583人评价 “不要跟我比惨,我比你更惨”再适合这部电影不过了。 <
diff --git a/project/my-crawler/data/index.txt b/project/my-crawler/data/index.txt
new file mode 100644
index 0000000..fa20714
--- /dev/null
+++ b/project/my-crawler/data/index.txt
@@ -0,0 +1,31 @@
+========================================
+ 文章索引
+========================================
+
+共有 1 篇文章
+
+[1] 豆瓣音乐 Top 250
+ URL: https://music.douban.com/top250
+ 文件名: article_1.txt
+ 爬取时间: 2026-05-31 00:24:38
+
+[保存记录] 2026-05-31 00:30:57
+ 批次文件: articles_20260531_003057.txt
+ 文章数量: 1
+
+[保存记录] 2026-05-31 00:48:41
+ 批次文件: articles_20260531_004841.txt
+ 文章数量: 1
+
+[保存记录] 2026-05-31 01:01:48
+ 批次文件: articles_20260531_010148.txt
+ 文章数量: 1
+
+[保存记录] 2026-05-31 10:58:16
+ 批次文件: articles_20260531_105816.txt
+ 文章数量: 1
+
+[保存记录] 2026-05-31 11:00:37
+ 批次文件: articles_20260531_110037.txt
+ 文章数量: 1
+
diff --git a/project/my-crawler/pom.xml b/project/my-crawler/pom.xml
new file mode 100644
index 0000000..58ffd40
--- /dev/null
+++ b/project/my-crawler/pom.xml
@@ -0,0 +1,50 @@
+
+
+ 4.0.0
+
+ com.crawler
+ my-crawler
+ 1.0-SNAPSHOT
+ jar
+
+ My Crawler
+ A simple web crawler application
+
+
+ 11
+ 11
+ UTF-8
+
+
+
+
+ org.jsoup
+ jsoup
+ 1.16.1
+
+
+ com.google.code.gson
+ gson
+ 2.10.1
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-jar-plugin
+ 3.3.0
+
+
+
+ com.crawler.App
+
+
+
+
+
+
+
diff --git a/project/my-crawler/src/.DS_Store b/project/my-crawler/src/.DS_Store
new file mode 100644
index 0000000..14fa703
Binary files /dev/null and b/project/my-crawler/src/.DS_Store differ
diff --git a/project/my-crawler/src/main/.DS_Store b/project/my-crawler/src/main/.DS_Store
new file mode 100644
index 0000000..02cd51b
Binary files /dev/null and b/project/my-crawler/src/main/.DS_Store differ
diff --git a/project/my-crawler/src/main/java/.DS_Store b/project/my-crawler/src/main/java/.DS_Store
new file mode 100644
index 0000000..62211a2
Binary files /dev/null and b/project/my-crawler/src/main/java/.DS_Store differ
diff --git a/project/my-crawler/src/main/java/com/.DS_Store b/project/my-crawler/src/main/java/com/.DS_Store
new file mode 100644
index 0000000..a1294a8
Binary files /dev/null and b/project/my-crawler/src/main/java/com/.DS_Store differ
diff --git a/project/my-crawler/src/main/java/com/crawler/App.java b/project/my-crawler/src/main/java/com/crawler/App.java
new file mode 100644
index 0000000..a7b17d4
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/App.java
@@ -0,0 +1,63 @@
+package com.crawler;
+
+import com.crawler.command.*;
+import com.crawler.controller.CrawlerController;
+import com.crawler.repository.ArticleRepository;
+import com.crawler.repository.InMemoryArticleRepository;
+import com.crawler.view.ConsoleView;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+public class App {
+ private final Map commands = new HashMap<>();
+ private final ConsoleView view;
+ private final AtomicBoolean running = new AtomicBoolean(true);
+
+ public App() {
+ view = new ConsoleView();
+ ArticleRepository repository = new InMemoryArticleRepository();
+ CrawlerController controller = new CrawlerController(repository, view);
+
+ commands.put("crawl", new CrawlCommand(controller, view));
+ commands.put("list", new ListCommand(controller));
+ commands.put("save", new SaveCommand(controller));
+ commands.put("load", new LoadCommand(controller));
+ commands.put("help", new HelpCommand(view));
+ commands.put("exit", new ExitCommand(view, () -> running.set(false)));
+ }
+
+ public void run() {
+ view.displayWelcome();
+ view.displayHelp();
+
+ while (running.get()) {
+ try {
+ String input = view.readInput();
+ if (input.isEmpty()) {
+ continue;
+ }
+
+ String[] parts = input.split("\\s+", 3);
+ String commandName = parts[0].toLowerCase();
+ String[] args = parts.length > 1 ? java.util.Arrays.copyOfRange(parts, 1, parts.length) : new String[0];
+
+ Command command = commands.get(commandName);
+ if (command != null) {
+ command.execute(args);
+ } else {
+ view.displayError("Unknown command: " + commandName);
+ view.displayInfo("Type 'help' for available commands");
+ }
+ } catch (Exception e) {
+ view.displayError("Error: " + e.getMessage());
+ }
+ }
+ }
+
+ public static void main(String[] args) {
+ App app = new App();
+ app.run();
+ }
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/command/Command.java b/project/my-crawler/src/main/java/com/crawler/command/Command.java
new file mode 100644
index 0000000..17d3c1d
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/command/Command.java
@@ -0,0 +1,7 @@
+package com.crawler.command;
+
+/**
+ * Contract for a console command: something that can be executed with a
+ * list of string arguments and that can report its own name and description.
+ */
+public interface Command {
+ /**
+  * Runs the command.
+  *
+  * @param args arguments for the command; may be empty
+  * @throws Exception if the command fails
+  */
+ void execute(String[] args) throws Exception;
+ /** @return the name of this command */
+ String getCommandName();
+ /** @return a human-readable description of this command */
+ String getDescription();
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/command/CrawlCommand.java b/project/my-crawler/src/main/java/com/crawler/command/CrawlCommand.java
new file mode 100644
index 0000000..24b1739
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/command/CrawlCommand.java
@@ -0,0 +1,41 @@
+package com.crawler.command;
+
+import com.crawler.controller.CrawlerController;
+import com.crawler.view.ConsoleView;
+
+/**
+ * Console command that crawls one URL through the {@link CrawlerController}.
+ *
+ * <p>Arguments: {@code args[0]} is the URL (required); {@code args[1]} is the
+ * strategy name (optional, defaults to {@code "jsoup"}).
+ */
+public class CrawlCommand implements Command {
+    /** Strategy name used when the caller does not supply one. */
+    private static final String DEFAULT_STRATEGY = "jsoup";
+
+    private final CrawlerController controller;
+    private final ConsoleView view;
+
+    /**
+     * @param controller controller that performs the crawl
+     * @param view       view used to report usage and failure messages
+     */
+    public CrawlCommand(CrawlerController controller, ConsoleView view) {
+        this.controller = controller;
+        this.view = view;
+    }
+
+    /**
+     * Crawls the URL given in {@code args[0]}. A missing URL prints the usage
+     * line; a failing crawl is reported through the view instead of being
+     * propagated to the caller.
+     *
+     * @param args URL followed by an optional strategy name
+     */
+    @Override
+    public void execute(String[] args) {
+        if (args == null || args.length < 1) {
+            // The URL is mandatory, so the usage line must name it explicitly.
+            view.displayError("Usage: crawl <url> [strategy]");
+            return;
+        }
+
+        String url = args[0];
+        String strategy = args.length > 1 ? args[1] : DEFAULT_STRATEGY;
+
+        try {
+            controller.crawl(url, strategy);
+        } catch (Exception e) {
+            view.displayError("Crawl failed: " + e.getMessage());
+        }
+    }
+
+    /** @return the keyword {@code "crawl"} */
+    @Override
+    public String getCommandName() {
+        return "crawl";
+    }
+
+    /** @return a one-line summary of the command */
+    @Override
+    public String getDescription() {
+        return "Crawl a website";
+    }
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/command/ExitCommand.java b/project/my-crawler/src/main/java/com/crawler/command/ExitCommand.java
new file mode 100644
index 0000000..760c7af
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/command/ExitCommand.java
@@ -0,0 +1,31 @@
+package com.crawler.command;
+
+import com.crawler.view.ConsoleView;
+
+/**
+ * Console command that says goodbye through the view and then invokes an
+ * optional callback supplied by whoever created the command.
+ */
+public class ExitCommand implements Command {
+    private final ConsoleView view;
+    private Runnable exitCallback;
+
+    /**
+     * @param view         view used to print the goodbye message
+     * @param exitCallback action to run after the goodbye message; may be null
+     */
+    public ExitCommand(ConsoleView view, Runnable exitCallback) {
+        this.view = view;
+        this.exitCallback = exitCallback;
+    }
+
+    /** @return the keyword {@code "exit"} */
+    @Override
+    public String getCommandName() {
+        return "exit";
+    }
+
+    /** @return a one-line summary of the command */
+    @Override
+    public String getDescription() {
+        return "Exit the application";
+    }
+
+    /** Prints the goodbye message, then runs the callback when one was given. */
+    @Override
+    public void execute(String[] args) {
+        this.view.displayGoodbye();
+        if (this.exitCallback == null) {
+            return;
+        }
+        this.exitCallback.run();
+    }
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/command/HelpCommand.java b/project/my-crawler/src/main/java/com/crawler/command/HelpCommand.java
new file mode 100644
index 0000000..9e624d6
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/command/HelpCommand.java
@@ -0,0 +1,26 @@
+package com.crawler.command;
+
+import com.crawler.view.ConsoleView;
+
+/** Console command that prints the help text through the view. */
+public class HelpCommand implements Command {
+    private final ConsoleView view;
+
+    /** @param view console view whose {@code displayHelp()} is invoked on execution */
+    public HelpCommand(ConsoleView view) {
+        this.view = view;
+    }
+
+    /** @return the keyword {@code "help"} */
+    @Override
+    public String getCommandName() {
+        return "help";
+    }
+
+    /** @return a one-line summary of the command */
+    @Override
+    public String getDescription() {
+        return "Show help message";
+    }
+
+    /** Prints the help text; {@code args} is not read. */
+    @Override
+    public void execute(String[] args) {
+        this.view.displayHelp();
+    }
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/command/ListCommand.java b/project/my-crawler/src/main/java/com/crawler/command/ListCommand.java
new file mode 100644
index 0000000..5f26db5
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/command/ListCommand.java
@@ -0,0 +1,26 @@
+package com.crawler.command;
+
+import com.crawler.controller.CrawlerController;
+
+/** Console command that asks the controller to list the stored articles. */
+public class ListCommand implements Command {
+    private final CrawlerController controller;
+
+    /** @param controller controller whose {@code listArticles()} is invoked on execution */
+    public ListCommand(CrawlerController controller) {
+        this.controller = controller;
+    }
+
+    /** @return the keyword {@code "list"} */
+    @Override
+    public String getCommandName() {
+        return "list";
+    }
+
+    /** @return a one-line summary of the command */
+    @Override
+    public String getDescription() {
+        return "List all crawled articles";
+    }
+
+    /** Delegates to the controller; {@code args} is not read. */
+    @Override
+    public void execute(String[] args) {
+        this.controller.listArticles();
+    }
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/command/LoadCommand.java b/project/my-crawler/src/main/java/com/crawler/command/LoadCommand.java
new file mode 100644
index 0000000..d90160c
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/command/LoadCommand.java
@@ -0,0 +1,26 @@
+package com.crawler.command;
+
+import com.crawler.controller.CrawlerController;
+
+/** Console command that asks the controller to load previously saved articles. */
+public class LoadCommand implements Command {
+    private final CrawlerController controller;
+
+    /** @param controller controller whose {@code loadData()} is invoked on execution */
+    public LoadCommand(CrawlerController controller) {
+        this.controller = controller;
+    }
+
+    /** @return the keyword {@code "load"} */
+    @Override
+    public String getCommandName() {
+        return "load";
+    }
+
+    /** @return a one-line summary of the command */
+    @Override
+    public String getDescription() {
+        return "Load articles from data file";
+    }
+
+    /** Delegates to the controller; {@code args} is not read. */
+    @Override
+    public void execute(String[] args) {
+        this.controller.loadData();
+    }
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/command/SaveCommand.java b/project/my-crawler/src/main/java/com/crawler/command/SaveCommand.java
new file mode 100644
index 0000000..6f14760
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/command/SaveCommand.java
@@ -0,0 +1,26 @@
+package com.crawler.command;
+
+import com.crawler.controller.CrawlerController;
+
+/** Console command that asks the controller to persist the stored articles. */
+public class SaveCommand implements Command {
+    private final CrawlerController controller;
+
+    /** @param controller controller whose {@code saveData()} is invoked on execution */
+    public SaveCommand(CrawlerController controller) {
+        this.controller = controller;
+    }
+
+    /** @return the keyword {@code "save"} */
+    @Override
+    public String getCommandName() {
+        return "save";
+    }
+
+    /** @return a one-line summary of the command */
+    @Override
+    public String getDescription() {
+        return "Save articles to data file";
+    }
+
+    /** Delegates to the controller; {@code args} is not read. */
+    @Override
+    public void execute(String[] args) {
+        this.controller.saveData();
+    }
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/controller/CrawlerController.java b/project/my-crawler/src/main/java/com/crawler/controller/CrawlerController.java
new file mode 100644
index 0000000..4609d44
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/controller/CrawlerController.java
@@ -0,0 +1,67 @@
+package com.crawler.controller;
+
+import java.util.List;
+
+import com.crawler.factory.StrategyFactory;
+import com.crawler.model.Article;
+import com.crawler.repository.ArticleRepository;
+import com.crawler.strategy.CrawlStrategy;
+import com.crawler.util.DataPersistence;
+import com.crawler.view.ConsoleView;
+
+/**
+ * Coordinates crawling, listing, saving and loading of articles between the
+ * {@link ArticleRepository}, the crawl strategies and the {@link ConsoleView}.
+ */
+public class CrawlerController {
+    /** Strategy name used when the caller passes {@code null}. */
+    private static final String DEFAULT_STRATEGY = "jsoup";
+
+    private final ArticleRepository repository;
+    private final ConsoleView view;
+
+    /**
+     * Creates the controller and immediately imports any previously saved
+     * articles into the repository.
+     *
+     * @param repository store that receives crawled and loaded articles
+     * @param view       view used for progress and result messages
+     */
+    public CrawlerController(ArticleRepository repository, ConsoleView view) {
+        this.repository = repository;
+        this.view = view;
+        loadSavedData();
+    }
+
+    /** Imports persisted articles, if there are any, and reports how many were loaded. */
+    private void loadSavedData() {
+        List<Article> savedArticles = DataPersistence.loadArticles();
+        // NOTE(review): the null check is defensive; confirm whether loadArticles() can return null.
+        if (savedArticles != null && !savedArticles.isEmpty()) {
+            repository.saveAll(savedArticles);
+            view.displayInfo("Loaded " + savedArticles.size() + " saved articles");
+        }
+    }
+
+    /**
+     * Crawls {@code url} with the named strategy and stores every returned article.
+     *
+     * <p>Surrounding whitespace is removed from the URL, and {@code https://}
+     * is prepended when the URL has no http/https scheme (the scheme check
+     * ignores case). The same URL may be crawled repeatedly; there is no
+     * duplicate check.
+     *
+     * @param url          address to crawl; must not be null or blank
+     * @param strategyName strategy key passed to {@link StrategyFactory};
+     *                     {@code null} selects {@code "jsoup"}
+     * @throws IllegalArgumentException if {@code url} is null or blank
+     * @throws Exception                whatever the selected strategy throws
+     */
+    public void crawl(String url, String strategyName) throws Exception {
+        if (url == null || url.trim().isEmpty()) {
+            throw new IllegalArgumentException("URL cannot be empty");
+        }
+
+        // Use the trimmed value from here on; the blank check above already trimmed.
+        url = url.trim();
+        String lowerCaseUrl = url.toLowerCase(java.util.Locale.ROOT);
+        if (!lowerCaseUrl.startsWith("http://") && !lowerCaseUrl.startsWith("https://")) {
+            url = "https://" + url;
+        }
+        String effectiveStrategy = strategyName != null ? strategyName : DEFAULT_STRATEGY;
+
+        // The duplicate-URL check was removed on purpose: re-crawling the same URL is allowed.
+        view.displayInfo("Crawling: " + url);
+        view.displayInfo("Using strategy: " + effectiveStrategy);
+
+        CrawlStrategy strategy = StrategyFactory.getStrategy(effectiveStrategy);
+        List<Article> articles = strategy.crawl(url);
+
+        for (Article article : articles) {
+            repository.save(article);
+            view.displaySuccess("Crawled: " + article.getTitle());
+        }
+    }
+
+    /** Shows every article currently held by the repository. */
+    public void listArticles() {
+        List<Article> articles = repository.findAll();
+        view.displayArticleList(articles);
+    }
+
+    /** Persists every article currently held by the repository. */
+    public void saveData() {
+        List<Article> articles = repository.findAll();
+        DataPersistence.saveArticles(articles);
+    }
+
+    /**
+     * Replaces the repository content with the persisted articles.
+     *
+     * <p>The persisted data is read before the repository is cleared, so a
+     * load that throws leaves the in-memory articles untouched.
+     */
+    public void loadData() {
+        List<Article> savedArticles = DataPersistence.loadArticles();
+        if (savedArticles == null) {
+            // NOTE(review): treating null as "nothing to load" — confirm loadArticles() contract.
+            return;
+        }
+        repository.deleteAll();
+        repository.saveAll(savedArticles);
+    }
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/exception/CrawlerException.java b/project/my-crawler/src/main/java/com/crawler/exception/CrawlerException.java
new file mode 100644
index 0000000..ff2583c
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/exception/CrawlerException.java
@@ -0,0 +1,11 @@
+package com.crawler.exception;
+
+/** Unchecked base exception of the crawler; the other crawler exceptions extend it. */
+public class CrawlerException extends RuntimeException {
+    /**
+     * @param message detail message
+     * @param cause   underlying cause
+     */
+    public CrawlerException(String message, Throwable cause) {
+        super(message, cause);
+    }
+
+    /** @param message detail message */
+    public CrawlerException(String message) {
+        super(message);
+    }
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/exception/NetworkException.java b/project/my-crawler/src/main/java/com/crawler/exception/NetworkException.java
new file mode 100644
index 0000000..8d8b9e7
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/exception/NetworkException.java
@@ -0,0 +1,11 @@
+package com.crawler.exception;
+
+/** {@link CrawlerException} subtype for network-related failures. */
+public class NetworkException extends CrawlerException {
+    /**
+     * @param message detail message
+     * @param cause   underlying cause
+     */
+    public NetworkException(String message, Throwable cause) {
+        super(message, cause);
+    }
+
+    /** @param message detail message */
+    public NetworkException(String message) {
+        super(message);
+    }
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/exception/ParseException.java b/project/my-crawler/src/main/java/com/crawler/exception/ParseException.java
new file mode 100644
index 0000000..9248f23
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/exception/ParseException.java
@@ -0,0 +1,11 @@
+package com.crawler.exception;
+
+/** {@link CrawlerException} subtype for parsing failures. */
+public class ParseException extends CrawlerException {
+    /**
+     * @param message detail message
+     * @param cause   underlying cause
+     */
+    public ParseException(String message, Throwable cause) {
+        super(message, cause);
+    }
+
+    /** @param message detail message */
+    public ParseException(String message) {
+        super(message);
+    }
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/exception/UrlFormatException.java b/project/my-crawler/src/main/java/com/crawler/exception/UrlFormatException.java
new file mode 100644
index 0000000..3a60fab
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/exception/UrlFormatException.java
@@ -0,0 +1,11 @@
+package com.crawler.exception;
+
+/** {@link CrawlerException} subtype for malformed URLs. */
+public class UrlFormatException extends CrawlerException {
+    /**
+     * @param message detail message
+     * @param cause   underlying cause
+     */
+    public UrlFormatException(String message, Throwable cause) {
+        super(message, cause);
+    }
+
+    /** @param message detail message */
+    public UrlFormatException(String message) {
+        super(message);
+    }
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/factory/StrategyFactory.java b/project/my-crawler/src/main/java/com/crawler/factory/StrategyFactory.java
new file mode 100644
index 0000000..b8a0310
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/factory/StrategyFactory.java
@@ -0,0 +1,27 @@
+package com.crawler.factory;
+
+import com.crawler.strategy.*;
+import java.util.HashMap;
+import java.util.Map;
+
+public class StrategyFactory {
+ private static final Map strategies = new HashMap<>();
+
+ static {
+ strategies.put("blog", new BlogCrawlStrategy());
+ strategies.put("news", new NewsCrawlStrategy());
+ strategies.put("jsoup", new JsoupCrawlStrategy());
+ }
+
+ public static CrawlStrategy getStrategy(String strategyName) {
+ return strategies.getOrDefault(strategyName.toLowerCase(), new JsoupCrawlStrategy());
+ }
+
+ public static boolean hasStrategy(String strategyName) {
+ return strategies.containsKey(strategyName.toLowerCase());
+ }
+
+ public static String[] getAvailableStrategies() {
+ return strategies.keySet().toArray(new String[0]);
+ }
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/model/Article.java b/project/my-crawler/src/main/java/com/crawler/model/Article.java
new file mode 100644
index 0000000..cb5599e
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/model/Article.java
@@ -0,0 +1,104 @@
+package com.crawler.model;
+
+import java.io.Serializable;
+import java.time.LocalDateTime;
+
+/** Serializable data holder for one crawled item: identity, location, text and timestamps. */
+public class Article implements Serializable {
+    private static final long serialVersionUID = 1L;
+    private String id;
+    private String title;
+    private String url;
+    private String content;
+    private String author;
+    private LocalDateTime publishDate;
+    private LocalDateTime crawlDate;
+    private String source;
+
+    /** Creates an article whose crawl date is the current local time; every other field starts as null. */
+    public Article() {
+        this.crawlDate = LocalDateTime.now();
+    }
+
+    /** Creates an article with the given title, URL and content, crawl-dated with the current local time. */
+    public Article(String title, String url, String content) {
+        this();
+        this.title = title;
+        this.url = url;
+        this.content = content;
+    }
+
+    /** Returns the identifier, or null if none has been set. */
+    public String getId() {
+        return this.id;
+    }
+    /** Sets the identifier. */
+    public void setId(String id) {
+        this.id = id;
+    }
+    /** Returns the title. */
+    public String getTitle() {
+        return this.title;
+    }
+    /** Sets the title. */
+    public void setTitle(String title) {
+        this.title = title;
+    }
+    /** Returns the URL the item was taken from. */
+    public String getUrl() {
+        return this.url;
+    }
+    /** Sets the URL. */
+    public void setUrl(String url) {
+        this.url = url;
+    }
+    /** Returns the text content. */
+    public String getContent() {
+        return this.content;
+    }
+    /** Sets the text content. */
+    public void setContent(String content) {
+        this.content = content;
+    }
+    /** Returns the author, or null if none has been set. */
+    public String getAuthor() {
+        return this.author;
+    }
+    /** Sets the author. */
+    public void setAuthor(String author) {
+        this.author = author;
+    }
+    /** Returns the publication time, or null if none has been set. */
+    public LocalDateTime getPublishDate() {
+        return this.publishDate;
+    }
+    /** Sets the publication time. */
+    public void setPublishDate(LocalDateTime publishDate) {
+        this.publishDate = publishDate;
+    }
+    /** Returns the crawl time; the constructors initialise it to the construction time. */
+    public LocalDateTime getCrawlDate() {
+        return this.crawlDate;
+    }
+    /** Replaces the crawl time. */
+    public void setCrawlDate(LocalDateTime crawlDate) {
+        this.crawlDate = crawlDate;
+    }
+    /** Returns the source label, or null if none has been set. */
+    public String getSource() {
+        return this.source;
+    }
+    /** Sets the source label. */
+    public void setSource(String source) {
+        this.source = source;
+    }
+
+    /** Renders every field except the content. */
+    @Override
+    public String toString() {
+        return String.format(
+                "Article{id='%s', title='%s', url='%s', author='%s', publishDate=%s, crawlDate=%s, source='%s'}",
+                id, title, url, author,
+                publishDate, crawlDate, source);
+    }
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/repository/ArticleRepository.java b/project/my-crawler/src/main/java/com/crawler/repository/ArticleRepository.java
new file mode 100644
index 0000000..0feb516
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/repository/ArticleRepository.java
@@ -0,0 +1,18 @@
+package com.crawler.repository;
+
+import com.crawler.model.Article;
+import java.util.List;
+import java.util.Optional;
+
+public interface ArticleRepository { // storage contract for crawled Article objects
+    void save(Article article);                // store one article
+    void saveAll(List<Article> articles);      // store every article in the list
+    Optional<Article> findById(String id);     // the article with this id, if any
+    Optional<Article> findByUrl(String url);   // the article with this URL, if any
+    List<Article> findAll();                   // all stored articles
+    List<Article> findBySource(String source); // the articles whose source equals the argument
+    void deleteById(String id);                // remove the article with this id
+    void deleteAll();                          // remove every article
+    int count();                               // number of stored articles
+    boolean existsByUrl(String url);           // whether an article with this URL is stored
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/repository/InMemoryArticleRepository.java b/project/my-crawler/src/main/java/com/crawler/repository/InMemoryArticleRepository.java
new file mode 100644
index 0000000..31c5578
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/repository/InMemoryArticleRepository.java
@@ -0,0 +1,78 @@
+package com.crawler.repository;
+
+import com.crawler.model.Article;
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Collectors;
+
+/** In-memory {@link ArticleRepository}: an id-to-article map plus a URL-to-id index, both ConcurrentHashMaps. */
+public class InMemoryArticleRepository implements ArticleRepository {
+    private final Map<String, Article> articles = new ConcurrentHashMap<>();
+    private final Map<String, String> urlToIdMap = new ConcurrentHashMap<>();
+    private final AtomicInteger idGenerator = new AtomicInteger(1);
+
+    /** Stores the article under its id (generating the next free numeric id when it has none) and indexes its URL, if any. */
+    @Override
+    public void save(Article article) {
+        if (article.getId() == null) {
+            String candidate;
+            do { // skip ids already occupied by articles that arrived with a preset id
+                candidate = String.valueOf(idGenerator.getAndIncrement());
+            } while (articles.containsKey(candidate));
+            article.setId(candidate);
+        }
+        Article replaced = articles.put(article.getId(), article);
+        if (replaced != null && replaced.getUrl() != null && !replaced.getUrl().equals(article.getUrl())) {
+            urlToIdMap.remove(replaced.getUrl(), article.getId()); // the overwritten article's URL must not resolve to this id any more
+        }
+        if (article.getUrl() != null) {
+            urlToIdMap.put(article.getUrl(), article.getId());
+        }
+    }
+
+    /** Saves the articles one by one, in list order. */
+    @Override
+    public void saveAll(List<Article> articleList) {
+        articleList.forEach(this::save);
+    }
+
+    /** Returns the article stored under the id; empty when the id is null or unknown. */
+    @Override
+    public Optional<Article> findById(String id) {
+        return Optional.ofNullable(id).map(articles::get);
+    }
+    /** Resolves the URL through the index; empty when the URL is null, unknown, or its article is gone. */
+    @Override
+    public Optional<Article> findByUrl(String url) {
+        return Optional.ofNullable(url).map(urlToIdMap::get).map(articles::get);
+    }
+
+    /** Returns a new list holding all stored articles; modifying the list does not affect the repository. */
+    @Override
+    public List<Article> findAll() { return new ArrayList<>(articles.values()); }
+
+    /** Returns the articles whose source equals the argument; a null argument matches articles without a source. */
+    @Override
+    public List<Article> findBySource(String source) {
+        return articles.values().stream().filter(a -> Objects.equals(source, a.getSource())).collect(Collectors.toList());
+    }
+
+    /** Removes the article and, only if the index still points at this id, its URL entry; a null id is ignored. */
+    @Override
+    public void deleteById(String id) {
+        Article removed = id == null ? null : articles.remove(id);
+        if (removed != null && removed.getUrl() != null) {
+            urlToIdMap.remove(removed.getUrl(), id); // conditional remove: another article may own this URL by now
+        }
+    }
+    /** Empties both the article map and the URL index. */
+    @Override
+    public void deleteAll() { articles.clear(); urlToIdMap.clear(); }
+    /** Returns the number of stored articles. */
+    @Override
+    public int count() { return articles.size(); }
+    /** Reports whether the URL index contains the URL; null is never present. */
+    @Override
+    public boolean existsByUrl(String url) { return url != null && urlToIdMap.containsKey(url); }
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/strategy/BlogCrawlStrategy.java b/project/my-crawler/src/main/java/com/crawler/strategy/BlogCrawlStrategy.java
new file mode 100644
index 0000000..f85782a
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/strategy/BlogCrawlStrategy.java
@@ -0,0 +1,76 @@
+package com.crawler.strategy;
+
+import com.crawler.model.Article;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class BlogCrawlStrategy implements CrawlStrategy { // strategy "blog": fetches one page and wraps it in a single Article
+ /** Downloads url with an HTTP GET and returns a one-element list: the extracted article, or an "error article" carrying the exception message. */
+ @Override
+ public List crawl(String url) {
+ List articles = new ArrayList<>();
+ try {
+ URL urlObj = new URL(url);
+ HttpURLConnection connection = (HttpURLConnection) urlObj.openConnection();
+ connection.setRequestMethod("GET");
+ connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
+ connection.setConnectTimeout(10000); // milliseconds
+ connection.setReadTimeout(10000); // milliseconds
+ // Read the whole response body as UTF-8, terminating every line with "\n".
+ StringBuilder content = new StringBuilder();
+ try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
+ String line;
+ while ((line = reader.readLine()) != null) {
+ content.append(line).append("\n");
+ }
+ }
+ // Build the article; the "Blog: " title prefix, the source and the author are fixed values of this strategy.
+ Article article = new Article();
+ article.setTitle("Blog: " + extractTitle(content.toString()));
+ article.setUrl(url);
+ article.setSource("blog");
+ article.setContent(extractText(content.toString()));
+ article.setAuthor("Blog Author");
+
+ articles.add(article);
+
+ } catch (Exception e) { // any failure is returned as an article instead of being thrown
+ Article errorArticle = new Article();
+ errorArticle.setTitle("Error crawling blog: " + url);
+ errorArticle.setUrl(url);
+ errorArticle.setContent("Error details: " + e.getMessage());
+ errorArticle.setSource("blog");
+ articles.add(errorArticle);
+ }
+ return articles;
+ }
+ // Returns group 1 of the first match, trimmed, or "Untitled Blog". NOTE(review): the pattern literal looks truncated (markup apparently stripped from it) - confirm against the original source.
+ private String extractTitle(String html) {
+ Pattern pattern = Pattern.compile("]*>([^<]+)", Pattern.CASE_INSENSITIVE);
+ Matcher matcher = pattern.matcher(html);
+ if (matcher.find()) {
+ return matcher.group(1).trim();
+ }
+ return "Untitled Blog";
+ }
+ // Replaces tags with spaces and collapses whitespace. NOTE(review): the first two patterns are empty strings here, presumably lost script/style patterns - confirm.
+ private String extractText(String html) {
+ return html.replaceAll("", "")
+ .replaceAll("", "")
+ .replaceAll("<[^>]+>", " ")
+ .replaceAll("\\s+", " ")
+ .trim();
+ }
+ /** Returns the fixed identifier "blog". */
+ @Override
+ public String getStrategyName() {
+ return "blog";
+ }
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/strategy/CrawlStrategy.java b/project/my-crawler/src/main/java/com/crawler/strategy/CrawlStrategy.java
new file mode 100644
index 0000000..1aa1e59
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/strategy/CrawlStrategy.java
@@ -0,0 +1,9 @@
+package com.crawler.strategy;
+
+import com.crawler.model.Article;
+import java.util.List;
+
+public interface CrawlStrategy { // one pluggable way of turning a URL into articles
+    List<Article> crawl(String url) throws Exception; // fetches the URL and returns the articles extracted from it
+    String getStrategyName(); // short identifier of the strategy, e.g. "blog"
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/strategy/DoubanTop250Strategy.java b/project/my-crawler/src/main/java/com/crawler/strategy/DoubanTop250Strategy.java
new file mode 100644
index 0000000..8446a9c
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/strategy/DoubanTop250Strategy.java
@@ -0,0 +1,170 @@
+package com.crawler.strategy;
+
+import com.crawler.model.Article;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class DoubanTop250Strategy implements CrawlStrategy { // strategy "douban": walks the paged Douban Top 250 list
+ // Paging constants: crawl() requests the start offsets 0, 25, ..., 225 (TOTAL_MOVIES / MOVIES_PER_PAGE = 10 pages).
+ private static final int TOTAL_MOVIES = 250;
+ private static final int MOVIES_PER_PAGE = 25;
+ /** Crawls every list page, sleeping 1 s after each; the url argument is only used when building the error article. */
+ @Override
+ public List crawl(String url) {
+ List allMovies = new ArrayList<>();
+ try {
+ System.out.println("🎬 开始爬取豆瓣电影 Top 250...");
+ System.out.println("⏳ 预计需要爬取 " + (TOTAL_MOVIES / MOVIES_PER_PAGE) + " 页");
+ // "page" is the start offset sent in the query string; the 1-based page number is page / MOVIES_PER_PAGE + 1.
+ for (int page = 0; page < TOTAL_MOVIES; page += MOVIES_PER_PAGE) {
+ String pageUrl = "https://movie.douban.com/top250?start=" + page + "&filter=";
+ System.out.println("📄 正在爬取第 " + (page / MOVIES_PER_PAGE + 1) + " 页...");
+
+ List pageMovies = crawlPage(pageUrl, page / MOVIES_PER_PAGE + 1);
+ allMovies.addAll(pageMovies);
+
+ System.out.println("✅ 第 " + (page / MOVIES_PER_PAGE + 1) + " 页完成,已获取 " + allMovies.size() + " 部电影");
+ // Pause between requests; on interrupt, restore the interrupt flag and stop paging.
+ try {
+ Thread.sleep(1000);
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ break;
+ }
+ }
+
+ System.out.println("🎉 完成!共爬取 " + allMovies.size() + " 部电影");
+ } catch (Exception e) { // crawlPage catches its own exceptions, so this only sees failures raised elsewhere in the loop
+ System.err.println("❌ 爬取失败: " + e.getMessage());
+ Article errorArticle = new Article();
+ errorArticle.setTitle("Error crawling Douban Top 250");
+ errorArticle.setUrl(url);
+ errorArticle.setContent("Error details: " + e.getMessage());
+ errorArticle.setSource("douban");
+ allMovies.add(errorArticle);
+ }
+ return allMovies;
+ }
+ /** Fetches one page as UTF-8 text and parses it; on any exception prints a warning and returns an empty list. */
+ private List crawlPage(String url, int pageNum) {
+ List movies = new ArrayList<>();
+ try {
+ URL urlObj = new URL(url);
+ HttpURLConnection connection = (HttpURLConnection) urlObj.openConnection();
+ connection.setRequestMethod("GET");
+ connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
+ connection.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
+ connection.setConnectTimeout(15000); // milliseconds
+ connection.setReadTimeout(15000); // milliseconds
+
+ StringBuilder html = new StringBuilder();
+ try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
+ String line;
+ while ((line = reader.readLine()) != null) {
+ html.append(line).append("\n");
+ }
+ }
+
+ movies = parseMovies(html.toString());
+ } catch (Exception e) {
+ System.err.println("⚠️ 第 " + pageNum + " 页爬取失败: " + e.getMessage());
+ }
+ return movies;
+ }
+ /** Finds each movie entry with a regex and collects the entries that parseSingleMovie converts successfully. */
+ private List parseMovies(String html) {
+ List movies = new ArrayList<>();
+ // NOTE(review): the pattern literal below is split across two lines and looks stripped of its markup - confirm against the original source.
+ String moviePattern = "[\\s\\S]*?
\\s*\\s*";
+ Pattern pattern = Pattern.compile(moviePattern, Pattern.DOTALL);
+ Matcher matcher = pattern.matcher(html);
+
+ while (matcher.find()) {
+ try {
+ Article movie = parseSingleMovie(matcher.group());
+ if (movie != null) {
+ movies.add(movie);
+ }
+ } catch (Exception e) { // entries that fail to parse are skipped
+ continue;
+ }
+ }
+ return movies;
+ }
+ /** Extracts title, rating, year, quote and info from one entry and formats them into the article content; returns null on any exception. */
+ private Article parseSingleMovie(String movieHtml) {
+ Article movie = new Article();
+ movie.setSource("douban");
+
+ try {
+ Pattern titlePattern = Pattern.compile("(.*?)");
+ Matcher titleMatcher = titlePattern.matcher(movieHtml);
+ if (titleMatcher.find()) {
+ movie.setTitle(titleMatcher.group(1));
+ }
+ // NOTE(review): ratingPattern is used below but never declared; the link and rating pattern lines appear merged and truncated - confirm.
+ Pattern linkPattern = Pattern.compile("(.*?)");
+ Matcher ratingMatcher = ratingPattern.matcher(movieHtml);
+ String rating = "";
+ if (ratingMatcher.find()) {
+ rating = ratingMatcher.group(1);
+ }
+
+ Pattern yearPattern = Pattern.compile("(\\d{4})\\s*/");
+ Matcher yearMatcher = yearPattern.matcher(movieHtml);
+ String year = "";
+ if (yearMatcher.find()) {
+ year = yearMatcher.group(1);
+ }
+
+ Pattern quotePattern = Pattern.compile("(.*?)");
+ Matcher quoteMatcher = quotePattern.matcher(movieHtml);
+ String quote = "";
+ if (quoteMatcher.find()) {
+ quote = quoteMatcher.group(1);
+ }
+ // NOTE(review): the info pattern and the first replaceAll argument below are also split mid-literal - confirm.
+ Pattern infoPattern = Pattern.compile("(.*?)
", Pattern.DOTALL);
+ Matcher infoMatcher = infoPattern.matcher(movieHtml);
+ String info = "";
+ if (infoMatcher.find()) {
+ info = infoMatcher.group(1).replaceAll("
", "\n").replaceAll("<[^>]+>", "").trim();
+ }
+ // Assemble the summary text stored as the article content; the quote line is omitted when no quote was found.
+ StringBuilder content = new StringBuilder();
+ content.append("🎬 电影名称: ").append(movie.getTitle()).append("\n");
+ content.append("⭐ 评分: ").append(rating).append("\n");
+ content.append("📅 年份: ").append(year).append("\n");
+ if (!quote.isEmpty()) {
+ content.append("💬 简介: ").append(quote).append("\n");
+ }
+ content.append("\n📝 详细信息:\n").append(info);
+
+ movie.setContent(content.toString());
+ movie.setAuthor("豆瓣电影");
+
+ } catch (Exception e) {
+ return null;
+ }
+
+ return movie;
+ }
+ /** Returns the fixed identifier "douban". */
+ @Override
+ public String getStrategyName() {
+ return "douban";
+ }
+}
\ No newline at end of file
diff --git a/project/my-crawler/src/main/java/com/crawler/strategy/JsoupCrawlStrategy.java b/project/my-crawler/src/main/java/com/crawler/strategy/JsoupCrawlStrategy.java
new file mode 100644
index 0000000..e02fe25
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/strategy/JsoupCrawlStrategy.java
@@ -0,0 +1,75 @@
+package com.crawler.strategy;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.crawler.model.Article;
+
+public class JsoupCrawlStrategy implements CrawlStrategy { // strategy "jsoup": generic single-page fetch built on HttpURLConnection and regexes
+ /** Downloads url with an HTTP GET and returns a one-element list: the extracted article, or an "error article" carrying the exception message. */
+ @Override
+ public List crawl(String url) {
+ List articles = new ArrayList<>();
+ try {
+ URL urlObj = new URL(url);
+ HttpURLConnection connection = (HttpURLConnection) urlObj.openConnection();
+ connection.setRequestMethod("GET");
+ connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
+ connection.setConnectTimeout(10000); // milliseconds
+ connection.setReadTimeout(10000); // milliseconds
+ // Read the whole response body as UTF-8, terminating every line with "\n".
+ StringBuilder content = new StringBuilder();
+ try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
+ String line;
+ while ((line = reader.readLine()) != null) {
+ content.append(line).append("\n");
+ }
+ }
+ // Build the article; this strategy stores the URL itself as the source and sets no author.
+ Article article = new Article();
+ article.setTitle(extractTitle(content.toString()));
+ article.setUrl(url);
+ article.setSource(url);
+ article.setContent(extractText(content.toString()));
+
+ articles.add(article);
+
+ } catch (Exception e) { // any failure is returned as an article instead of being thrown
+ Article errorArticle = new Article();
+ errorArticle.setTitle("Error crawling: " + url);
+ errorArticle.setUrl(url);
+ errorArticle.setContent("Error details: " + e.getMessage());
+ errorArticle.setSource(url);
+ articles.add(errorArticle);
+ }
+ return articles;
+ }
+ // Returns group 1 of the first match, trimmed, or "Untitled Page". NOTE(review): the pattern literal looks truncated (markup apparently stripped from it) - confirm against the original source.
+ private String extractTitle(String html) {
+ Pattern pattern = Pattern.compile("]*>([^<]+)", Pattern.CASE_INSENSITIVE);
+ Matcher matcher = pattern.matcher(html);
+ if (matcher.find()) {
+ return matcher.group(1).trim();
+ }
+ return "Untitled Page";
+ }
+ // Replaces tags with spaces and collapses whitespace. NOTE(review): the first two patterns are empty strings here, presumably lost script/style patterns - confirm.
+ private String extractText(String html) {
+ return html.replaceAll("", "")
+ .replaceAll("", "")
+ .replaceAll("<[^>]+>", " ")
+ .replaceAll("\\s+", " ")
+ .trim();
+ }
+ /** Returns the fixed identifier "jsoup". */
+ @Override
+ public String getStrategyName() {
+ return "jsoup";
+ }
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/strategy/NewsCrawlStrategy.java b/project/my-crawler/src/main/java/com/crawler/strategy/NewsCrawlStrategy.java
new file mode 100644
index 0000000..82a5450
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/strategy/NewsCrawlStrategy.java
@@ -0,0 +1,76 @@
+package com.crawler.strategy;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.crawler.model.Article;
+
+public class NewsCrawlStrategy implements CrawlStrategy { // strategy "news": fetches one page and wraps it in a single Article
+ /** Downloads url with an HTTP GET and returns a one-element list: the extracted article, or an "error article" carrying the exception message. */
+ @Override
+ public List crawl(String url) {
+ List articles = new ArrayList<>();
+ try {
+ URL urlObj = new URL(url);
+ HttpURLConnection connection = (HttpURLConnection) urlObj.openConnection();
+ connection.setRequestMethod("GET");
+ connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
+ connection.setConnectTimeout(10000); // milliseconds
+ connection.setReadTimeout(10000); // milliseconds
+ // Read the whole response body as UTF-8, terminating every line with "\n".
+ StringBuilder content = new StringBuilder();
+ try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
+ String line;
+ while ((line = reader.readLine()) != null) {
+ content.append(line).append("\n");
+ }
+ }
+ // Build the article; the "News: " title prefix, the source and the author are fixed values of this strategy.
+ Article article = new Article();
+ article.setTitle("News: " + extractTitle(content.toString()));
+ article.setUrl(url);
+ article.setSource("news");
+ article.setContent(extractText(content.toString()));
+ article.setAuthor("News Reporter");
+
+ articles.add(article);
+
+ } catch (Exception e) { // any failure is returned as an article instead of being thrown
+ Article errorArticle = new Article();
+ errorArticle.setTitle("Error crawling news: " + url);
+ errorArticle.setUrl(url);
+ errorArticle.setContent("Error details: " + e.getMessage());
+ errorArticle.setSource("news");
+ articles.add(errorArticle);
+ }
+ return articles;
+ }
+ // Returns group 1 of the first match, trimmed, or "Untitled News". NOTE(review): the pattern literal looks truncated (markup apparently stripped from it) - confirm against the original source.
+ private String extractTitle(String html) {
+ Pattern pattern = Pattern.compile("]*>([^<]+)", Pattern.CASE_INSENSITIVE);
+ Matcher matcher = pattern.matcher(html);
+ if (matcher.find()) {
+ return matcher.group(1).trim();
+ }
+ return "Untitled News";
+ }
+ // Replaces tags with spaces and collapses whitespace. NOTE(review): the first two patterns are empty strings here, presumably lost script/style patterns - confirm.
+ private String extractText(String html) {
+ return html.replaceAll("", "")
+ .replaceAll("", "")
+ .replaceAll("<[^>]+>", " ")
+ .replaceAll("\\s+", " ")
+ .trim();
+ }
+ /** Returns the fixed identifier "news". */
+ @Override
+ public String getStrategyName() {
+ return "news";
+ }
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/util/ColorUtil.java b/project/my-crawler/src/main/java/com/crawler/util/ColorUtil.java
new file mode 100644
index 0000000..6003abc
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/util/ColorUtil.java
@@ -0,0 +1,54 @@
+package com.crawler.util;
+
+/** ANSI escape-sequence constants plus helpers that wrap text in a code followed by {@link #RESET}. */
+public class ColorUtil {
+    // Reset code and foreground colours.
+    public static final String RESET = "\u001B[0m";
+    public static final String BLACK = "\u001B[30m";
+    public static final String RED = "\u001B[31m";
+    public static final String GREEN = "\u001B[32m";
+    public static final String YELLOW = "\u001B[33m";
+    public static final String BLUE = "\u001B[34m";
+    public static final String PURPLE = "\u001B[35m";
+    public static final String CYAN = "\u001B[36m";
+    public static final String WHITE = "\u001B[37m";
+    // Background colours.
+    public static final String BLACK_BG = "\u001B[40m";
+    public static final String RED_BG = "\u001B[41m";
+    public static final String GREEN_BG = "\u001B[42m";
+    public static final String YELLOW_BG = "\u001B[43m";
+    public static final String BLUE_BG = "\u001B[44m";
+    public static final String PURPLE_BG = "\u001B[45m";
+    public static final String CYAN_BG = "\u001B[46m";
+    public static final String WHITE_BG = "\u001B[47m";
+
+    /**
+     * Prefixes the text with the given code and appends {@link #RESET}; null arguments render as "null".
+     */
+    public static String colorize(String text, String color) {
+        return new StringBuilder().append(color).append(text).append(RESET).toString();
+    }
+
+    /** Wraps the text in the green foreground code. */
+    public static String green(String text) { return GREEN + text + RESET; }
+
+    /** Wraps the text in the red foreground code. */
+    public static String red(String text) { return RED + text + RESET; }
+
+    /** Wraps the text in the yellow foreground code. */
+    public static String yellow(String text) { return YELLOW + text + RESET; }
+
+    /** Wraps the text in the blue foreground code. */
+    public static String blue(String text) { return BLUE + text + RESET; }
+
+    /** Wraps the text in the cyan foreground code. */
+    public static String cyan(String text) { return CYAN + text + RESET; }
+
+    /** Wraps the text in the purple foreground code. */
+    public static String purple(String text) { return PURPLE + text + RESET; }
+    /** Wraps the text in the bold code (ESC[1m) followed by RESET. */
+    public static String bold(String text) {
+        return colorize(text, "\u001B[1m");
+    }
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/util/DataPersistence.java b/project/my-crawler/src/main/java/com/crawler/util/DataPersistence.java
new file mode 100644
index 0000000..5c9cd3d
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/util/DataPersistence.java
@@ -0,0 +1,193 @@
+package com.crawler.util;
+
+import com.crawler.model.Article;
+
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+import java.time.LocalDateTime;
+import java.time.format.DateTimeFormatter;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * File-based article store: every save writes one UTF-8 batch file "data/articles_TIMESTAMP.txt"
+ * and appends an entry to "data/index.txt". The format is line-oriented, so on load a content line
+ * starting with "-----" or "=====" is dropped and a line of the exact form "文章 N" starts a new article.
+ */
+public class DataPersistence {
+    private static final String DATA_FOLDER = "data";
+    private static final String INDEX_FILE = DATA_FOLDER + File.separator + "index.txt";
+    private static final String BANNER = "========================================\n";
+    private static final String SEPARATOR = "----------------------------------------\n";
+    private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+    private static final DateTimeFormatter FILE_TIMESTAMP_FORMATTER = DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss");
+
+    static {
+        File folder = new File(DATA_FOLDER);
+        if (!folder.exists()) {
+            folder.mkdirs();
+        }
+    }
+
+    /** Writes the articles to a new timestamped batch file and records the batch in the index; failures are printed to stderr, not thrown. */
+    public static void saveArticles(List<Article> articles) {
+        try {
+            LocalDateTime now = LocalDateTime.now(); // one instant for both the file name and the header
+            String timestamp = now.format(FILE_TIMESTAMP_FORMATTER);
+            String batchFileName = DATA_FOLDER + File.separator + "articles_" + timestamp + ".txt";
+            try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(batchFileName), StandardCharsets.UTF_8))) {
+                writer.write(BANNER);
+                writer.write(" 文章数据批次保存\n");
+                writer.write(BANNER + "\n");
+                writer.write("保存时间: " + now.format(DATE_FORMATTER) + "\n");
+                writer.write("文章数量: " + articles.size() + "\n\n");
+                writer.write(BANNER + "\n");
+                int number = 0;
+                for (Article article : articles) {
+                    writeArticle(writer, article, ++number);
+                }
+            }
+            updateIndex(timestamp, articles.size());
+            System.out.println(ColorUtil.green("✓ Saved " + articles.size() + " articles to '" + batchFileName + "'"));
+        } catch (Exception e) {
+            System.err.println(ColorUtil.red("✗ Failed to save articles: " + e.getMessage()));
+        }
+    }
+
+    /** Writes one article: a separator-framed "文章 N" header, the non-null metadata fields, then the content. */
+    private static void writeArticle(BufferedWriter writer, Article article, int number) throws IOException {
+        writer.write(SEPARATOR);
+        writer.write("文章 " + number + "\n");
+        writer.write(SEPARATOR);
+        writeField(writer, "ID: ", article.getId());
+        writeField(writer, "标题: ", article.getTitle());
+        writeField(writer, "URL: ", article.getUrl());
+        writeField(writer, "作者: ", article.getAuthor());
+        writeField(writer, "来源: ", article.getSource());
+        writeField(writer, "发布时间: ", formatDate(article.getPublishDate()));
+        writeField(writer, "爬取时间: ", formatDate(article.getCrawlDate()));
+        writer.write("\n内容:\n");
+        if (article.getContent() != null) {
+            writer.write(article.getContent());
+        }
+        writer.write("\n\n");
+    }
+
+    /** Writes label and value on a single line; null values are skipped so they are not stored as the text "null". */
+    private static void writeField(BufferedWriter writer, String label, String value) throws IOException {
+        if (value != null) {
+            writer.write(label + value.replaceAll("\\R", " ") + "\n"); // embedded line breaks would corrupt the line-oriented format
+        }
+    }
+
+    /** Formats a date with DATE_FORMATTER; null stays null. */
+    private static String formatDate(LocalDateTime date) {
+        return date == null ? null : date.format(DATE_FORMATTER);
+    }
+
+    /** Appends one entry for the batch to the index file, writing the index banner first when the file is new. */
+    private static void updateIndex(String timestamp, int articleCount) throws IOException {
+        boolean fileExists = new File(INDEX_FILE).exists();
+        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(INDEX_FILE, true), StandardCharsets.UTF_8))) {
+            if (!fileExists) {
+                writer.write(BANNER);
+                writer.write(" 文章保存历史记录索引\n");
+                writer.write(BANNER + "\n");
+            }
+            writer.write("[保存记录] " + LocalDateTime.now().format(DATE_FORMATTER) + "\n");
+            writer.write(" 批次文件: articles_" + timestamp + ".txt\n");
+            writer.write(" 文章数量: " + articleCount + "\n");
+            writer.write("\n");
+        }
+    }
+
+    /** Loads every "articles_*.txt" batch file of the data folder in file-name order; unreadable files are reported on stderr and skipped. */
+    public static List<Article> loadArticles() {
+        List<Article> articles = new ArrayList<>();
+        File folder = new File(DATA_FOLDER);
+        if (!folder.exists()) {
+            return articles;
+        }
+        File[] files = folder.listFiles((dir, name) -> name.startsWith("articles_") && name.endsWith(".txt"));
+        if (files != null) {
+            java.util.Arrays.sort(files); // listFiles() guarantees no order; the names embed the save timestamp
+            for (File file : files) {
+                try {
+                    articles.addAll(loadBatchArticle(file));
+                } catch (Exception e) {
+                    System.err.println(ColorUtil.yellow("⚠ 无法加载文件: " + file.getName()));
+                }
+            }
+        }
+        System.out.println(ColorUtil.green("✓ Loaded " + articles.size() + " articles from '" + DATA_FOLDER + "' folder"));
+        return articles;
+    }
+
+    /** Parses one batch file; metadata lines are only interpreted between an article header and its "内容:" marker. */
+    private static List<Article> loadBatchArticle(File file) throws IOException {
+        List<Article> articles = new ArrayList<>();
+        Article currentArticle = null;
+        StringBuilder content = new StringBuilder();
+        boolean inContent = false;
+
+        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
+            String line;
+            while ((line = reader.readLine()) != null) {
+                if (line.matches("文章 \\d+")) {
+                    finishArticle(currentArticle, content, articles);
+                    currentArticle = new Article();
+                    content = new StringBuilder();
+                    inContent = false;
+                } else if (currentArticle == null) {
+                    continue; // batch preamble before the first article header
+                } else if (inContent) {
+                    if (!line.startsWith("-----") && !line.startsWith("=====")) {
+                        content.append(line).append("\n");
+                    }
+                } else if (line.equals("内容:")) {
+                    inContent = true;
+                } else {
+                    readField(currentArticle, line);
+                }
+            }
+            finishArticle(currentArticle, content, articles);
+        }
+        return articles;
+    }
+
+    /** Stores the content, minus the trailing line breaks added when saving, and collects the article; does nothing for null. */
+    private static void finishArticle(Article article, StringBuilder content, List<Article> target) {
+        if (article != null) {
+            article.setContent(content.toString().replaceAll("\\n+\\z", ""));
+            target.add(article);
+        }
+    }
+
+    /** Applies one "label: value" metadata line to the article; unrecognised lines are ignored. */
+    private static void readField(Article article, String line) {
+        if (line.startsWith("ID: ")) {
+            article.setId(line.substring(4));
+        } else if (line.startsWith("标题: ")) {
+            article.setTitle(line.substring(4));
+        } else if (line.startsWith("URL: ")) {
+            article.setUrl(line.substring(5));
+        } else if (line.startsWith("作者: ")) {
+            article.setAuthor(line.substring(4));
+        } else if (line.startsWith("来源: ")) {
+            article.setSource(line.substring(4));
+        } else if (line.startsWith("发布时间: ")) {
+            article.setPublishDate(parseDate(line.substring(6), null));
+        } else if (line.startsWith("爬取时间: ")) {
+            article.setCrawlDate(parseDate(line.substring(6), article.getCrawlDate()));
+        }
+    }
+
+    /** Parses a DATE_FORMATTER timestamp, returning the fallback when the text does not match. */
+    private static LocalDateTime parseDate(String text, LocalDateTime fallback) {
+        try {
+            return LocalDateTime.parse(text, DATE_FORMATTER);
+        } catch (java.time.format.DateTimeParseException e) {
+            return fallback;
+        }
+    }
+}
diff --git a/project/my-crawler/src/main/java/com/crawler/view/ConsoleView.java b/project/my-crawler/src/main/java/com/crawler/view/ConsoleView.java
new file mode 100644
index 0000000..71d86e9
--- /dev/null
+++ b/project/my-crawler/src/main/java/com/crawler/view/ConsoleView.java
@@ -0,0 +1,101 @@
+package com.crawler.view;
+
+import java.time.format.DateTimeFormatter;
+import java.util.List;
+import java.util.Scanner;
+
+import com.crawler.model.Article;
+import com.crawler.util.ColorUtil;
+
+/**
+ * Console front end of the crawler: prints the banner, help text, article listings and
+ * status messages to {@code System.out} and reads command lines from {@code System.in}.
+ */
+public class ConsoleView {
+    private static final Scanner scanner = new Scanner(System.in);
+    private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+    /** Maximum number of characters (code points) printed per row of article content. */
+    private static final int CONTENT_WIDTH = 80;
+
+    /** Prints the start-up banner followed by a blank line. */
+    public void displayWelcome() {
+        System.out.println(ColorUtil.cyan("========================================"));
+        System.out.println(ColorUtil.cyan(" Welcome to My Crawler "));
+        System.out.println(ColorUtil.cyan("========================================"));
+        System.out.println();
+    }
+
+    /** Prints the list of commands and the list of crawl strategies. */
+    public void displayHelp() {
+        System.out.println(ColorUtil.yellow("Available commands:"));
+        System.out.println(ColorUtil.green(" crawl [strategy] - Crawl a website"));
+        System.out.println(ColorUtil.green(" list - List all crawled articles"));
+        System.out.println(ColorUtil.green(" save - Save articles to data file"));
+        System.out.println(ColorUtil.green(" load - Load articles from data file"));
+        System.out.println(ColorUtil.green(" help - Show this help message"));
+        System.out.println(ColorUtil.green(" exit - Exit the application"));
+        System.out.println();
+        System.out.println(ColorUtil.yellow("Available strategies:"));
+        System.out.println(ColorUtil.cyan(" blog - Blog crawling strategy"));
+        System.out.println(ColorUtil.cyan(" news - News crawling strategy"));
+        System.out.println(ColorUtil.cyan(" jsoup - Generic JSoup strategy (default)"));
+        System.out.println();
+    }
+
+    /**
+     * Prints every article with a 1-based index, or a notice when the list is empty.
+     *
+     * @param articles articles to show; must not be null
+     */
+    public void displayArticleList(List<Article> articles) {
+        if (articles.isEmpty()) {
+            System.out.println(ColorUtil.yellow("No articles found."));
+            return;
+        }
+
+        System.out.println(ColorUtil.cyan("=== Crawled Articles (" + articles.size() + ") ==="));
+        System.out.println();
+
+        for (int i = 0; i < articles.size(); i++) {
+            displayArticleDetail(articles.get(i), i + 1);
+        }
+    }
+
+    /**
+     * Prints one article: a framed title line, its metadata and its wrapped content.
+     * Author, source, publish date, crawl date and content are printed only when non-null.
+     *
+     * @param article article to show; must not be null
+     * @param index   number printed in brackets in front of the title
+     */
+    public void displayArticleDetail(Article article, int index) {
+        System.out.println(ColorUtil.bold(ColorUtil.green("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")));
+        System.out.println(ColorUtil.bold(ColorUtil.yellow("[" + index + "] " + article.getTitle())));
+        System.out.println(ColorUtil.bold(ColorUtil.green("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")));
+        System.out.println(ColorUtil.cyan(" ID: ") + article.getId());
+        System.out.println(ColorUtil.cyan(" URL: ") + article.getUrl());
+        if (article.getAuthor() != null) {
+            System.out.println(ColorUtil.cyan(" Author: ") + article.getAuthor());
+        }
+        if (article.getSource() != null) {
+            System.out.println(ColorUtil.cyan(" Source: ") + article.getSource());
+        }
+        if (article.getPublishDate() != null) {
+            System.out.println(ColorUtil.cyan(" Published: ") + article.getPublishDate().format(DATE_FORMATTER));
+        }
+        // Guarded like the other optional fields: formatting a null date would throw.
+        if (article.getCrawlDate() != null) {
+            System.out.println(ColorUtil.cyan(" Crawled: ") + article.getCrawlDate().format(DATE_FORMATTER));
+        }
+        System.out.println(ColorUtil.cyan(" Content: "));
+        if (article.getContent() != null) {
+            printWrapped(article.getContent());
+        }
+        System.out.println();
+    }
+
+    /**
+     * Prints {@code text} indented, one output row per chunk of at most
+     * {@link #CONTENT_WIDTH} code points. The text is first split on line breaks so that
+     * every source line is wrapped and indented on its own; a single regex split over the
+     * whole text stops wrapping at the first line break because '.' does not match it.
+     */
+    private static void printWrapped(String text) {
+        for (String row : text.split("\\R")) {
+            int start = 0;
+            // do/while so that an empty row still produces one (blank) output row.
+            do {
+                int remaining = row.codePointCount(start, row.length());
+                int end = row.offsetByCodePoints(start, Math.min(CONTENT_WIDTH, remaining));
+                System.out.println(" " + row.substring(start, end));
+                start = end;
+            } while (start < row.length());
+        }
+    }
+
+    /** Prints {@code message} in green behind a check mark. */
+    public void displaySuccess(String message) {
+        System.out.println(ColorUtil.green("✓ " + message));
+    }
+
+    /** Prints {@code message} in red behind a cross mark. */
+    public void displayError(String message) {
+        System.out.println(ColorUtil.red("✗ " + message));
+    }
+
+    /** Prints {@code message} in blue behind an info mark. */
+    public void displayInfo(String message) {
+        System.out.println(ColorUtil.blue("ℹ " + message));
+    }
+
+    /** Prints {@code message} in yellow behind a warning mark. */
+    public void displayWarning(String message) {
+        System.out.println(ColorUtil.yellow("⚠ " + message));
+    }
+
+    /**
+     * Shows the prompt and reads one line from standard input.
+     *
+     * @return the entered line with leading and trailing whitespace removed
+     * @throws java.util.NoSuchElementException if standard input has reached end of stream
+     */
+    public String readInput() {
+        System.out.print(ColorUtil.purple("> "));
+        return scanner.nextLine().trim();
+    }
+
+    /** Prints the farewell message. */
+    public void displayGoodbye() {
+        System.out.println(ColorUtil.cyan("Goodbye! Thank you for using My Crawler."));
+    }
+}
diff --git a/project/my-crawler/target/classes/.DS_Store b/project/my-crawler/target/classes/.DS_Store
new file mode 100644
index 0000000..62211a2
Binary files /dev/null and b/project/my-crawler/target/classes/.DS_Store differ
diff --git a/project/my-crawler/target/classes/com/.DS_Store b/project/my-crawler/target/classes/com/.DS_Store
new file mode 100644
index 0000000..a1294a8
Binary files /dev/null and b/project/my-crawler/target/classes/com/.DS_Store differ
diff --git a/project/my-crawler/target/classes/com/crawler/App.class b/project/my-crawler/target/classes/com/crawler/App.class
new file mode 100644
index 0000000..2f36f15
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/App.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/command/Command.class b/project/my-crawler/target/classes/com/crawler/command/Command.class
new file mode 100644
index 0000000..20bfa18
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/command/Command.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/command/CrawlCommand.class b/project/my-crawler/target/classes/com/crawler/command/CrawlCommand.class
new file mode 100644
index 0000000..eb4c208
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/command/CrawlCommand.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/command/ExitCommand.class b/project/my-crawler/target/classes/com/crawler/command/ExitCommand.class
new file mode 100644
index 0000000..10fcca7
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/command/ExitCommand.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/command/HelpCommand.class b/project/my-crawler/target/classes/com/crawler/command/HelpCommand.class
new file mode 100644
index 0000000..ccb9710
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/command/HelpCommand.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/command/ListCommand.class b/project/my-crawler/target/classes/com/crawler/command/ListCommand.class
new file mode 100644
index 0000000..5af58ff
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/command/ListCommand.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/command/LoadCommand.class b/project/my-crawler/target/classes/com/crawler/command/LoadCommand.class
new file mode 100644
index 0000000..39549b7
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/command/LoadCommand.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/command/SaveCommand.class b/project/my-crawler/target/classes/com/crawler/command/SaveCommand.class
new file mode 100644
index 0000000..45e2819
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/command/SaveCommand.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/controller/CrawlerController.class b/project/my-crawler/target/classes/com/crawler/controller/CrawlerController.class
new file mode 100644
index 0000000..fbc4a44
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/controller/CrawlerController.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/exception/CrawlerException.class b/project/my-crawler/target/classes/com/crawler/exception/CrawlerException.class
new file mode 100644
index 0000000..c6470c6
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/exception/CrawlerException.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/exception/NetworkException.class b/project/my-crawler/target/classes/com/crawler/exception/NetworkException.class
new file mode 100644
index 0000000..eec2756
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/exception/NetworkException.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/exception/ParseException.class b/project/my-crawler/target/classes/com/crawler/exception/ParseException.class
new file mode 100644
index 0000000..3050d1f
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/exception/ParseException.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/exception/UrlFormatException.class b/project/my-crawler/target/classes/com/crawler/exception/UrlFormatException.class
new file mode 100644
index 0000000..db5e12a
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/exception/UrlFormatException.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/factory/StrategyFactory.class b/project/my-crawler/target/classes/com/crawler/factory/StrategyFactory.class
new file mode 100644
index 0000000..74b23de
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/factory/StrategyFactory.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/model/Article.class b/project/my-crawler/target/classes/com/crawler/model/Article.class
new file mode 100644
index 0000000..cab1018
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/model/Article.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/repository/ArticleRepository.class b/project/my-crawler/target/classes/com/crawler/repository/ArticleRepository.class
new file mode 100644
index 0000000..9c22bb5
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/repository/ArticleRepository.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/repository/InMemoryArticleRepository.class b/project/my-crawler/target/classes/com/crawler/repository/InMemoryArticleRepository.class
new file mode 100644
index 0000000..7484a6f
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/repository/InMemoryArticleRepository.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/strategy/BlogCrawlStrategy.class b/project/my-crawler/target/classes/com/crawler/strategy/BlogCrawlStrategy.class
new file mode 100644
index 0000000..8fa840f
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/strategy/BlogCrawlStrategy.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/strategy/CrawlStrategy.class b/project/my-crawler/target/classes/com/crawler/strategy/CrawlStrategy.class
new file mode 100644
index 0000000..e04b917
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/strategy/CrawlStrategy.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/strategy/DoubanTop250Strategy.class b/project/my-crawler/target/classes/com/crawler/strategy/DoubanTop250Strategy.class
new file mode 100644
index 0000000..575d6c2
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/strategy/DoubanTop250Strategy.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/strategy/JsoupCrawlStrategy.class b/project/my-crawler/target/classes/com/crawler/strategy/JsoupCrawlStrategy.class
new file mode 100644
index 0000000..e4dd2bd
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/strategy/JsoupCrawlStrategy.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/strategy/NewsCrawlStrategy.class b/project/my-crawler/target/classes/com/crawler/strategy/NewsCrawlStrategy.class
new file mode 100644
index 0000000..2af3d70
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/strategy/NewsCrawlStrategy.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/util/ColorUtil.class b/project/my-crawler/target/classes/com/crawler/util/ColorUtil.class
new file mode 100644
index 0000000..b00b7cf
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/util/ColorUtil.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/util/DataPersistence.class b/project/my-crawler/target/classes/com/crawler/util/DataPersistence.class
new file mode 100644
index 0000000..b3327e9
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/util/DataPersistence.class differ
diff --git a/project/my-crawler/target/classes/com/crawler/view/ConsoleView.class b/project/my-crawler/target/classes/com/crawler/view/ConsoleView.class
new file mode 100644
index 0000000..6b23724
Binary files /dev/null and b/project/my-crawler/target/classes/com/crawler/view/ConsoleView.class differ