Browse Source

提交Java爬虫项目代码与报告

main
Wangyifei 3 weeks ago
parent
commit
9abdaf4a41
  1. BIN
      project/202506050305-王亦菲-期末实验报告.docx
  2. BIN
      project/charts/book_author_bar.png
  3. BIN
      project/charts/book_price_pie.png
  4. BIN
      project/charts/movie_director_bar.png
  5. BIN
      project/charts/movie_genre_pie.png
  6. BIN
      project/charts/movie_rating_bar.png
  7. BIN
      project/charts/music_artist_bar.png
  8. BIN
      project/charts/music_duration_pie.png
  9. BIN
      project/charts/news_sentiment_pie.png
  10. BIN
      project/charts/news_word_bar.png
  11. 81
      project/data/book_data.json
  12. 141
      project/data/movie_data.json
  13. 81
      project/data/music_data.json
  14. 101
      project/data/news_data.json
  15. 105
      project/pom.xml
  16. 48
      project/run.bat
  17. 1084
      project/src/main/java/com/example/Main.java
  18. 173
      project/src/main/java/com/example/analysis/BookAnalyzer.java
  19. 331
      project/src/main/java/com/example/analysis/ChartGenerator.java
  20. 187
      project/src/main/java/com/example/analysis/DataCleaner.java
  21. 285
      project/src/main/java/com/example/analysis/MovieAnalyzer.java
  22. 189
      project/src/main/java/com/example/analysis/MusicAnalyzer.java
  23. 298
      project/src/main/java/com/example/analysis/NewsAnalyzer.java
  24. 399
      project/src/main/java/com/example/chart/JFreeChartGenerator.java
  25. 6
      project/src/main/java/com/example/command/Command.java
  26. 51
      project/src/main/java/com/example/command/CommandInvoker.java
  27. 39
      project/src/main/java/com/example/command/ExportCommand.java
  28. 29
      project/src/main/java/com/example/command/GetHotCommand.java
  29. 45
      project/src/main/java/com/example/command/ImportCommand.java
  30. 35
      project/src/main/java/com/example/command/SearchCommand.java
  31. 91
      project/src/main/java/com/example/controller/SpiderController.java
  32. 47
      project/src/main/java/com/example/core/CrawlResult.java
  33. 260
      project/src/main/java/com/example/core/MusicSpider.java
  34. 33
      project/src/main/java/com/example/core/Platform.java
  35. 47
      project/src/main/java/com/example/exception/ExceptionHandler.java
  36. 12
      project/src/main/java/com/example/exception/NetworkException.java
  37. 12
      project/src/main/java/com/example/exception/ParseException.java
  38. 19
      project/src/main/java/com/example/exception/SpiderException.java
  39. 12
      project/src/main/java/com/example/exception/StorageException.java
  40. 56
      project/src/main/java/com/example/invoker/SpiderInvoker.java
  41. 37
      project/src/main/java/com/example/model/Article.java
  42. 121
      project/src/main/java/com/example/model/BookItem.java
  43. 86
      project/src/main/java/com/example/model/Chart.java
  44. 99
      project/src/main/java/com/example/model/ChartItem.java
  45. 39
      project/src/main/java/com/example/model/ChartType.java
  46. 43
      project/src/main/java/com/example/model/Comment.java
  47. 78
      project/src/main/java/com/example/model/MovieItem.java
  48. 29
      project/src/main/java/com/example/model/NewsItem.java
  49. 54
      project/src/main/java/com/example/model/Song.java
  50. 198
      project/src/main/java/com/example/service/impl/EnhancedHttpClient.java
  51. 391
      project/src/main/java/com/example/spider/NetEaseMusicSpider.java
  52. 494
      project/src/main/java/com/example/spider/book/DangdangBookSpider.java
  53. 355
      project/src/main/java/com/example/spider/movie/DoubanMovieSpider.java
  54. 172
      project/src/main/java/com/example/spider/news/ChinanewsSpider.java
  55. 90
      project/src/main/java/com/example/storage/DataExporter.java
  56. 433
      project/src/main/java/com/example/storage/DatabaseManager.java
  57. 47
      project/src/main/java/com/example/storage/JsonExporter.java
  58. 44
      project/src/main/java/com/example/storage/JsonImporter.java
  59. 20
      project/src/main/java/com/example/strategy/AntiBlockStrategy.java
  60. 114
      project/src/main/java/com/example/strategy/DefaultAntiBlockStrategy.java
  61. 194
      project/src/main/java/com/example/strategy/EnhancedAntiBlockStrategy.java
  62. 99
      project/src/main/java/com/example/strategy/RequestThrottler.java
  63. 16
      project/src/main/java/com/example/strategy/SpiderStrategy.java
  64. 86
      project/src/main/java/com/example/strategy/UserAgentPool.java
  65. 112
      project/src/main/java/com/example/view/ConsoleView.java
  66. 46
      project/src/main/resources/spider-config.json

BIN
project/202506050305-王亦菲-期末实验报告.docx

Binary file not shown.

BIN
project/charts/book_author_bar.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

BIN
project/charts/book_price_pie.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

BIN
project/charts/movie_director_bar.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

BIN
project/charts/movie_genre_pie.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB

BIN
project/charts/movie_rating_bar.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

BIN
project/charts/music_artist_bar.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

BIN
project/charts/music_duration_pie.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

BIN
project/charts/news_sentiment_pie.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB

BIN
project/charts/news_word_bar.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 37 KiB

81
project/data/book_data.json

@ -0,0 +1,81 @@
[ {
"id" : "12100008885",
"title" : "小王子(畅销500万册,罗翔老师推荐,李继宏口碑译作,作者基金会官方认证)【果麦经典】",
"author" : "",
"rating" : "9.5",
"publisher" : "当当图书",
"publishDate" : "",
"price" : "19.90"
}, {
"id" : "24019291",
"title" : "神奇的答案之书(2024全新修订版,足足784页)畅销92个月的神奇之书,已给百万读者带去神秘指引,愿一切无解都有解。",
"author" : "梅森 知书达礼 出品",
"rating" : "9.5",
"publisher" : "当当图书",
"publishDate" : "",
"price" : "26.30"
}, {
"id" : "29913529",
"title" : "云边有个小卖部新版 畅销1000万册新版 新增长篇后记 番外插画 多重精美赠品 三封面设计 当当自营",
"author" : "张嘉佳 著,博集天卷 出品",
"rating" : "9.5",
"publisher" : "当当图书",
"publishDate" : "",
"price" : "21.80"
}, {
"id" : "12100014884",
"title" : "学为贵真经教学体系畅销教材 刘洪波雅思真经5 雅思听力王陆雅思王听力真题语料库 杨帅口语 杨帅雅思口语900句 等名师编",
"author" : "",
"rating" : "9.5",
"publisher" : "当当图书",
"publishDate" : "",
"price" : "14.25"
}, {
"id" : "12100018678",
"title" : "北科保健畅销书",
"author" : "",
"rating" : "9.5",
"publisher" : "当当图书",
"publishDate" : "",
"price" : "65.60"
}, {
"id" : "12100020748",
"title" : "小学生畅销课外阅读书小鲤鱼跳龙门小巴掌童话快乐读书吧一年级课外阅读书2年级课外阅读漫画十万个为什么百科科普漫画版四大名著",
"author" : "",
"rating" : "8.5",
"publisher" : "当当图书",
"publishDate" : "",
"price" : "12.00"
}, {
"id" : "29816257",
"title" : "堂吉诃德(全2册):现象级畅销书全本全译无删减 上下两册共920页 插图版・收录古斯塔夫・多雷插画 著名翻译家傅东华经典",
"author" : "(西)塞万提斯 著 傅东华 译",
"rating" : "9.2",
"publisher" : "当当图书",
"publishDate" : "",
"price" : "78.40"
}, {
"id" : "29936526",
"title" : "龛世 (畅销书作家木苏里古风奇幻经典代表作,网络原名《铜钱龛世》,薛闲×玄悯,一人一龙,踏险遇奇,破阵解局,浮生共梦)",
"author" : "木苏里 著;欣欣向爱 出品",
"rating" : "9.0",
"publisher" : "当当图书",
"publishDate" : "",
"price" : "78.00"
}, {
"id" : "29873524",
"title" : "图解黄帝内经 挂图版:畅销百万册图解经典系列全集,忠于原著!附赠两张全彩挂图 人人都能看得懂用得上",
"author" : "(明)李时珍著,健康大学堂编委会 编著",
"rating" : "9.5",
"publisher" : "当当图书",
"publishDate" : "",
"price" : "37.40"
}, {
"id" : "27848420",
"title" : "畅销套装-救命饮食三部曲:非药而愈+我医我素+极简全蔬食(逆转和预防疾病的健康饮食)",
"author" : "徐嘉、卢丽爱、素愫",
"rating" : "9.2",
"publisher" : "当当图书",
"publishDate" : "",
"price" : "130.90"
} ]

141
project/data/movie_data.json

@ -0,0 +1,141 @@
[ {
"id" : "1292052",
"title" : "肖申克的救赎",
"rating" : "9.7",
"releaseDate" : "1994",
"genre" : "剧情",
"director" : "弗兰克·德拉邦特 F..."
}, {
"id" : "1291546",
"title" : "霸王别姬",
"rating" : "9.6",
"releaseDate" : "1993",
"genre" : "剧情",
"director" : "陈凯歌 Kaige ..."
}, {
"id" : "1292722",
"title" : "泰坦尼克号",
"rating" : "9.5",
"releaseDate" : "1997",
"genre" : "剧情",
"director" : "詹姆斯·卡梅隆 Ja..."
}, {
"id" : "1292720",
"title" : "阿甘正传",
"rating" : "9.5",
"releaseDate" : "1994",
"genre" : "剧情",
"director" : "罗伯特·泽米吉斯 R..."
}, {
"id" : "1291561",
"title" : "千与千寻",
"rating" : "9.4",
"releaseDate" : "2001",
"genre" : "剧情",
"director" : "宫崎骏 Hayao ..."
}, {
"id" : "1292063",
"title" : "美丽人生",
"rating" : "9.5",
"releaseDate" : "1997",
"genre" : "剧情",
"director" : "罗伯托·贝尼尼 Ro..."
}, {
"id" : "1889243",
"title" : "星际穿越",
"rating" : "9.4",
"releaseDate" : "2014",
"genre" : "剧情",
"director" : "克里斯托弗·诺兰 C..."
}, {
"id" : "1295644",
"title" : "这个杀手不太冷",
"rating" : "9.4",
"releaseDate" : "1994",
"genre" : "剧情",
"director" : "吕克·贝松 Luc ..."
}, {
"id" : "3541415",
"title" : "盗梦空间",
"rating" : "9.4",
"releaseDate" : "2010",
"genre" : "剧情",
"director" : "克里斯托弗·诺兰 C..."
}, {
"id" : "1292064",
"title" : "楚门的世界",
"rating" : "9.4",
"releaseDate" : "1998",
"genre" : "剧情",
"director" : "彼得·威尔 Pete..."
}, {
"id" : "1295124",
"title" : "辛德勒的名单",
"rating" : "9.5",
"releaseDate" : "1993",
"genre" : "剧情",
"director" : "史蒂文·斯皮尔伯格 ..."
}, {
"id" : "3011091",
"title" : "忠犬八公的故事",
"rating" : "9.4",
"releaseDate" : "2009",
"genre" : "剧情",
"director" : "莱塞·霍尔斯道姆 L..."
}, {
"id" : "1292001",
"title" : "海上钢琴师",
"rating" : "9.3",
"releaseDate" : "1998",
"genre" : "剧情",
"director" : "朱塞佩·托纳多雷 G..."
}, {
"id" : "25662329",
"title" : "疯狂动物城",
"rating" : "9.3",
"releaseDate" : "2016",
"genre" : "喜剧",
"director" : "拜伦·霍华德 Byr..."
}, {
"id" : "3793023",
"title" : "三傻大闹宝莱坞",
"rating" : "9.2",
"releaseDate" : "2009",
"genre" : "剧情",
"director" : "拉库马·希拉尼 Ra..."
}, {
"id" : "2131459",
"title" : "机器人总动员",
"rating" : "9.3",
"releaseDate" : "2008",
"genre" : "科幻",
"director" : "安德鲁·斯坦顿 An..."
}, {
"id" : "1291549",
"title" : "放牛班的春天",
"rating" : "9.3",
"releaseDate" : "2004",
"genre" : "剧情",
"director" : "克里斯托夫·巴拉蒂 ..."
}, {
"id" : "1307914",
"title" : "无间道",
"rating" : "9.3",
"releaseDate" : "2002",
"genre" : "剧情",
"director" : "刘伟强 / 麦兆辉"
}, {
"id" : "1296141",
"title" : "控方证人",
"rating" : "9.6",
"releaseDate" : "1957",
"genre" : "剧情",
"director" : "比利·怀尔德 Bil..."
}, {
"id" : "20495023",
"title" : "寻梦环游记",
"rating" : "9.1",
"releaseDate" : "2017",
"genre" : "喜剧",
"director" : "李·昂克里奇 Lee..."
} ]

81
project/data/music_data.json

@ -0,0 +1,81 @@
[ {
"songId" : 30953009,
"name" : "See You Again",
"artists" : [ "Wiz Khalifa", "Charlie Puth" ],
"album" : "Furious 7: Original Motion Picture Soundtrack (Deluxe)",
"duration" : "3:49",
"platform" : "网易云音乐",
"artistsString" : "Wiz Khalifa, Charlie Puth"
}, {
"songId" : 491943377,
"name" : "See You Again",
"artists" : [ "Tyler, The Creator", "Kali Uchis" ],
"album" : "Flower Boy",
"duration" : "3:00",
"platform" : "网易云音乐",
"artistsString" : "Tyler, The Creator, Kali Uchis"
}, {
"songId" : 32009001,
"name" : "See You Again (Piano Demo Version)",
"artists" : [ "Charlie Puth" ],
"album" : "See You Again (Piano Demo)",
"duration" : "3:48",
"platform" : "网易云音乐",
"artistsString" : "Charlie Puth"
}, {
"songId" : 2154142235,
"name" : "牢大のSee You Again",
"artists" : [ "Apxl林" ],
"album" : "牢大の小曲",
"duration" : "1:35",
"platform" : "网易云音乐",
"artistsString" : "Apxl林"
}, {
"songId" : 1885677249,
"name" : "See You Again",
"artists" : [ "See You Again" ],
"album" : "Heaven's Voice: Pop Selection, Vol. 2",
"duration" : "3:48",
"platform" : "网易云音乐",
"artistsString" : "See You Again"
}, {
"songId" : 1893104207,
"name" : "See You Again",
"artists" : [ "王馨娴", "张雪峰" ],
"album" : "育才2017级6班毕业电影原声带",
"duration" : "3:47",
"platform" : "网易云音乐",
"artistsString" : "王馨娴, 张雪峰"
}, {
"songId" : 1418865944,
"name" : "See You Again(Piano Demo Version)(翻自 Charlie Puth)",
"artists" : [ "Mllano" ],
"album" : "科比·布莱恩特 Mamba never out",
"duration" : "3:51",
"platform" : "网易云音乐",
"artistsString" : "Mllano"
}, {
"songId" : 509115922,
"name" : "See You Again",
"artists" : [ "Charlie Puth", "Nick G" ],
"album" : "Ultimate Pop Hits, Vol. 12",
"duration" : "3:48",
"platform" : "网易云音乐",
"artistsString" : "Charlie Puth, Nick G"
}, {
"songId" : 2148709173,
"name" : "See You Again",
"artists" : [ "Tyler, The Creator" ],
"album" : "The Sunseeker",
"duration" : "4:33",
"platform" : "网易云音乐",
"artistsString" : "Tyler, The Creator"
}, {
"songId" : 526470566,
"name" : "SEE YOU AGAIN",
"artists" : [ "窦靖童" ],
"album" : "BTV跨年冰雪盛典",
"duration" : "4:49",
"platform" : "网易云音乐",
"artistsString" : "窦靖童"
} ]

101
project/data/news_data.json

@ -0,0 +1,101 @@
[ {
"title" : "东西问|大窑遗址:50万年后被唤醒的文明密码",
"url" : "https://www.chinanews.com.cn/dxw/2026/05-27/10629810.shtml",
"publishTime" : "Wed, 27 May 2026 22",
"summary" : "  中新社呼和浩特5月27日电 题:大窑遗址:50万年后被唤醒的文明密码"
}, {
"title" : "国务院成立山西长治山西通洲集团留神峪煤业有限公司“5·22”特别重大瓦斯爆炸事故调查组",
"url" : "https://www.chinanews.com.cn/sh/2026/05-27/10629814.shtml",
"publishTime" : "Wed, 27 May 2026 22",
"summary" : "  5月22日19时29分许,山西省长治市沁源县山西通洲集团留神峪煤业有限公司井下发生瓦斯爆炸事故,造成重大人员伤亡。为认真贯彻落实习近平总书记重要指示精神,按照李强总理等中央领导同志批示要求,根据国家有关法律法规规定,国务院成立事故调查组,由应急管理部牵头,公安部、自然资源部、全国总工会、国家能源局、国家矿山安监局和山西省人民政府等相关方面参加,对山西长治山西通洲集团留神峪煤业有限公司“5·22”特别重大瓦斯爆炸事故进行调查。"
}, {
"title" : "东风着陆场完成最后一次全系统综合演练 准备就绪迎神二十一航天员回家",
"url" : "https://www.chinanews.com.cn/gn/2026/05-27/10629811.shtml",
"publishTime" : "Wed, 27 May 2026 22",
"summary" : "  神舟二十一号航天员乘组将于近日乘神舟二十二号载人飞船返回地球。东风着陆场于27日晚完成第二次全系统综合演练,全面检验搜救回收任务组织指挥、协同配合以及应急保障等能力。"
}, {
"title" : "成都医保通报药房销售“回流药”、药店套取医保基金等问题",
"url" : "https://www.chinanews.com.cn/sh/2026/05-27/10629797.shtml",
"publishTime" : "Wed, 27 May 2026 22",
"summary" : "  中新网5月27日电 据“成都医保”微信公众号消息,成都市医疗保障局27日发布情况通报称,近日,国家医保局在开展定点零售药店违法违规使用医保基金专项飞检工作中,发现天府新区泰安堂医药连锁公司正兴丽园大药房销售“回流药”,郫都区康惠仁堂药店违规留存社会保障卡、协助套取医保基金等问题。对此,成都市医疗保障局高度重视,组成专门工作组,会同天府新区及郫都区医保、市场监管、卫健等部门,对涉事药店和人员进行调查核实。"
}, {
"title" : "中国企业助力柬埔寨橡胶产业转型升级",
"url" : "https://www.chinanews.com.cn/gj/2026/05-27/10629806.shtml",
"publishTime" : "Wed, 27 May 2026 22",
"summary" : "  中新社金边5月27日电 (记者 杨强)由中国企业投资建设的美兰集团柬埔寨橡胶工厂( KIMS RUBBER)27日在柬埔寨特本克蒙省宣布投产。"
}, {
"title" : "哈萨克斯坦一架安-2飞机坠毁致1死1伤",
"url" : "https://www.chinanews.com.cn/gj/2026/05-27/10629775.shtml",
"publishTime" : "Wed, 27 May 2026 22",
"summary" : "  中新社阿斯塔纳5月27日电 据哈通社27日消息,一架安-2型飞机当天在哈萨克斯坦巴甫洛达尔州坠毁,造成1人死亡、1人受伤。"
}, {
"title" : "重庆市纪委监委驻市交通运输委纪检监察组原一级巡视员贾如兴接受审查调查",
"url" : "https://www.chinanews.com.cn/gn/2026/05-27/10629746.shtml",
"publishTime" : "Wed, 27 May 2026 22",
"summary" : "  中新网5月27日电 据重庆市纪委监委消息,重庆市纪委监委驻市交通运输委纪检监察组原一级巡视员贾如兴涉嫌严重违纪违法,目前正接受重庆市纪委监委纪律审查和监察调查。"
}, {
"title" : "2026西藏“体育赛事季”活动将于6至9月举办",
"url" : "https://www.chinanews.com.cn/ty/2026/05-27/10629756.shtml",
"publishTime" : "Wed, 27 May 2026 22",
"summary" : "  中新网拉萨5月27日电 (李林)27日,西藏自治区人民政府新闻办公室举行2026“体育赛事季”活动新闻发布会。会议介绍,2026西藏“体育赛事季”活动将于6至9月举办。目前,列入2026“体育赛事季”活动计划的重点赛事活动有23场次。"
}, {
"title" : "中国哲学社会科学自主知识体系与田野调查学术沙龙暨白玛措博士新书分享会在拉萨举行",
"url" : "https://www.chinanews.com.cn/gn/2026/05-27/10629755.shtml",
"publishTime" : "Wed, 27 May 2026 22",
"summary" : "  中新网拉萨5月27日电 (李林)27日,中国哲学社会科学自主知识体系与田野调查学术沙龙暨白玛措博士新书分享会在西藏自治区社科院举行。活动以“长时段田野调查与中国自主知识体系构建”为主题,围绕西藏自治区社科院研究员、人类学博士白玛措新近出版的《大地艺术家:北方牧人》和《草原上的敬老院》两部专著展开深入研讨。"
}, {
"title" : "复旦大学“十大科技进展”评选结果揭晓",
"url" : "https://www.chinanews.com.cn/sh/2026/05-27/10629738.shtml",
"publishTime" : "Wed, 27 May 2026 22",
"summary" : "  中新网上海5月27日电 (记者 陈静)27日,在复旦大学第60届校庆科学报告会上,2025年度复旦大学“十大科技进展”评选结果揭晓。"
}, {
"title" : "去年以来北京警方破获涉医保基金类案件70余起",
"url" : "https://www.chinanews.com.cn/sh/2026/05-27/10629757.shtml",
"publishTime" : "Wed, 27 May 2026 22",
"summary" : "  中新网北京5月27日电 (记者 吕少威)记者27日从北京市公安局获悉,去年以来,北京警方依托“公安+行政”执法协作机制,围绕诈骗医保基金和非法倒卖医保回收药等违法犯罪行为,开展多波次打击整治行动,截至目前,全局共破获70余起相关案件,对400余名犯罪嫌疑人依法采取刑事强制措施,切实维护了北京医保基金安全。"
}, {
"title" : "菲律宾执法部门近来频频抓扣中国公民 中国驻菲使馆表示严重关切",
"url" : "https://www.chinanews.com.cn/hr/2026/05-27/10629760.shtml",
"publishTime" : "Wed, 27 May 2026 22",
"summary" : "  中新网马尼拉5月27日电 中国驻菲律宾大使馆发言人季凌鹏27日就菲执法部门近来频频抓扣中国公民表明立场。"
}, {
"title" : "香港广州暨菁荟在港成立 助力穗港青年成长",
"url" : "https://www.chinanews.com.cn/dwq/2026/05-27/10629768.shtml",
"publishTime" : "Wed, 27 May 2026 22",
"summary" : "  中新网香港5月27日电 香港广州暨菁荟成立大会暨第一届理事会就职仪式26日在港举行。"
}, {
"title" : "西藏芒康县举办现代生产生活技能县级复赛",
"url" : "https://www.chinanews.com.cn/sh/2026/05-27/10629731.shtml",
"publishTime" : "Wed, 27 May 2026 22",
"summary" : "  中新网拉萨5月27日电 (李林)26日,西藏昌都市芒康县举办第二届“红色昌都·振兴奋进”比赛活动现代生产生活技能县级复赛。比赛设主赛场和5个分赛场,涵盖清洁能源施工、电焊等11项现代技艺工种。"
}, {
"title" : "香港特区政府委任香港科技园公司、新田科技城公司董事局主席",
"url" : "https://www.chinanews.com.cn/dwq/2026/05-27/10629783.shtml",
"publishTime" : "Wed, 27 May 2026 22",
"summary" : "  中新网香港5月27日电 香港特区政府27日公布,经香港特区行政长官李家超批准,委任钟郝仪为香港科技园公司(科技园公司)董事局主席,自7月1日起生效,任期2年,接替任期将于6月30日届满的现任科技园公司董事局主席查毅超;委任查毅超为新田科技城公司董事局主席,自7月1日起生效,任期3年。"
}, {
"title" : "海南将以主宾省身份亮相第四届链博会",
"url" : "https://www.chinanews.com.cn/cj/2026/05-27/10629784.shtml",
"publishTime" : "Wed, 27 May 2026 22",
"summary" : "  中新网海口5月27日电 (记者 王子谦)第四届中国国际供应链促进博览会(简称“链博会”)将于6月22日至26日在北京举行。记者27日从海南省贸促会了解到,海南作为主宾省,将以“链上自贸港 共享新机遇”为主题,全方位推介海南自贸港政策制度优势、开放机遇与营商环境,精准邀约客商赴琼实地考察、洽谈合作。"
}, {
"title" : "《成都市养犬管理条例》获批:城市“人宠共处”难题的立法求解",
"url" : "https://www.chinanews.com.cn/sh/2026/05-27/10629786.shtml",
"publishTime" : "Wed, 27 May 2026 22",
"summary" : "  中新网成都5月27日电 (单鹏)《成都市养犬管理条例》27日经四川省十四届人大常委会第二十七次会议批准,将于今年8月1日起正式施行。"
}, {
"title" : "第二十届深圳国际金融博览会启幕 以AI赋能产融协同",
"url" : "https://www.chinanews.com.cn/cj/2026/05-27/10629794.shtml",
"publishTime" : "Wed, 27 May 2026 22",
"summary" : "  中新网深圳5月27日电 (记者 程景伟 索有为)由深圳市人民政府主办的第二十届深圳国际金融博览会27日在深圳会展中心(福田)开幕。博览会同期举办深交所全球投资者大会、深港金融合作委员会第四次会议、中国基金报全球资产管理论坛等系列活动。"
}, {
"title" : "山东省政协委员建言助推稳岗扩容提质 促进重点群体就业",
"url" : "https://www.chinanews.com.cn/cj/2026/05-27/10629730.shtml",
"publishTime" : "Wed, 27 May 2026 22",
"summary" : "  中新网济南5月27日电(王采怡)中国人民政治协商会议山东省委员会5月27日召开“助推稳岗扩容提质 促进重点群体就业”月度协商会。会上,政协委员、专家学者、基层代表围绕助力重点群体就业岗位扩容增量、破解中小微企业用工难题、优化金融服务保障等提出意见建议,并与山东省发展和改革委员会、省教育厅、省人力资源和社会保障厅、省退役军人事务厅负责同志互动交流,共商务实举措。会前,山东省政协通过网络议政等方式,收集意见建议410余条,“民声连线”访问量14万人次。"
}, {
"title" : "云南一医院推行“续诊”微改革 3日内一次免挂号复诊",
"url" : "https://www.chinanews.com.cn/sh/2026/05-27/10629795.shtml",
"publishTime" : "Wed, 27 May 2026 22",
"summary" : "  中新网昆明5月27日电 (陈静)昆明医科大学第一附属医院27日消息,为破解门诊重复挂号痛点、保障诊疗连续性,该医院将于5月28日在云南省级医疗机构率先推行门诊“续诊”微改革,出台“首诊后3日内一次免挂号续诊”举措,降低患者就医时间与经济成本。"
} ]

105
project/pom.xml

@ -0,0 +1,105 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build descriptor for the crawler project.
  Builds a plain jar whose manifest entry point is com.example.Main and
  compiles all sources as UTF-8 for Java 21.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.example</groupId>
    <artifactId>netease-spider</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>
    <name>netease-spider</name>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>21</maven.compiler.source>
        <maven.compiler.target>21</maven.compiler.target>
    </properties>
    <dependencies>
        <!-- JSON (de)serialisation; all Jackson modules are pinned to the same version. -->
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>2.16.1</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
            <version>2.16.1</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-annotations</artifactId>
            <version>2.16.1</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.datatype</groupId>
            <artifactId>jackson-datatype-jsr310</artifactId>
            <version>2.16.1</version>
        </dependency>
        <!-- HTML parsing -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.17.2</version>
        </dependency>
        <!-- HTTP client -->
        <dependency>
            <groupId>com.squareup.okhttp3</groupId>
            <artifactId>okhttp</artifactId>
            <version>4.12.0</version>
        </dependency>
        <!-- Chinese-to-pinyin conversion -->
        <dependency>
            <groupId>com.belerweb</groupId>
            <artifactId>pinyin4j</artifactId>
            <version>2.5.1</version>
        </dependency>
        <!-- Chart image rendering -->
        <dependency>
            <groupId>org.jfree</groupId>
            <artifactId>jfreechart</artifactId>
            <version>1.5.4</version>
        </dependency>
        <!-- Embedded SQLite JDBC driver -->
        <dependency>
            <groupId>org.xerial</groupId>
            <artifactId>sqlite-jdbc</artifactId>
            <version>3.45.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.junit.jupiter</groupId>
            <artifactId>junit-jupiter</artifactId>
            <version>5.10.2</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.12.1</version>
                <configuration>
                    <source>21</source>
                    <target>21</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
                <version>3.3.0</version>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.example.Main</mainClass>
                        </manifest>
                    </archive>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>exec-maven-plugin</artifactId>
                <version>3.1.1</version>
                <!--
                  Declare the entry point here so that a plain "mvn exec:java"
                  works without -Dexec.mainClass on the command line.
                -->
                <configuration>
                    <mainClass>com.example.Main</mainClass>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>

48
project/run.bat

@ -0,0 +1,48 @@
@echo off
rem Launcher for the crawler: checks the project layout, compiles it with the
rem bundled Maven distribution, then starts com.example.Main via exec:java.
rem
rem Exit codes are tested with "if errorlevel 1" rather than "%errorlevel%".
rem Inside a parenthesised block %errorlevel% is expanded when the block is
rem parsed, so it would still hold the value from BEFORE the block ran and
rem the nested checks would test a stale result.
chcp 65001 >nul
title 网络爬虫系统
echo ========================================
echo 正在启动...
echo ========================================
echo.
rem Run from the directory that contains this script.
cd /d "%~dp0"
echo [1/3] 检查项目...
if not exist "pom.xml" (
    echo [错误] 找不到 pom.xml!
    pause
    exit /b 1
)
if not exist "apache-maven-3.9.14" (
    echo [错误] 找不到 apache-maven-3.9.14 文件夹!
    pause
    exit /b 1
)
echo [2/3] 编译项目...
rem First attempt is silent; on failure the retry below shows full Maven output.
call .\apache-maven-3.9.14\bin\mvn.cmd clean compile -DskipTests >nul 2>&1
if errorlevel 1 (
    echo [错误] 编译失败!
    echo.
    echo 正在尝试重新下载依赖...
    call .\apache-maven-3.9.14\bin\mvn.cmd dependency:resolve
    if errorlevel 1 (
        echo [错误] 依赖下载失败!
        pause
        exit /b 1
    )
    call .\apache-maven-3.9.14\bin\mvn.cmd clean compile -DskipTests
    rem Do not try to launch a program that still fails to compile.
    if errorlevel 1 (
        echo [错误] 编译失败!
        pause
        exit /b 1
    )
)
echo [3/3] 启动程序...
echo.
call .\apache-maven-3.9.14\bin\mvn.cmd exec:java -Dexec.mainClass="com.example.Main"
echo.
echo ========================================
echo 程序已退出
echo ========================================
pause

1084
project/src/main/java/com/example/Main.java

File diff suppressed because it is too large

173
project/src/main/java/com/example/analysis/BookAnalyzer.java

@ -0,0 +1,173 @@
package com.example.analysis;
import com.example.model.BookItem;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Computes summary statistics for a list of scraped {@link BookItem}s:
 * record counts, the most frequent authors and publishers, rating
 * min/max/average, and a price histogram with the average price.
 */
public class BookAnalyzer {

    /** Number of entries kept in the author and publisher rankings. */
    private static final int TOP_N = 5;

    /**
     * Cleans the given books and aggregates them into an {@link AnalysisResult}.
     *
     * @param books the raw book records; may be {@code null} or empty
     * @return the aggregated statistics; for {@code null} or empty input an
     *         {@code AnalysisResult} whose counters are 0 and whose object
     *         fields are left {@code null}
     */
    public static AnalysisResult analyzeBooks(List<BookItem> books) {
        AnalysisResult result = new AnalysisResult();
        if (books == null || books.isEmpty()) {
            return result;
        }
        result.totalCount = books.size();

        // Data cleaning: normalise the fields and drop records without a title.
        List<BookItem> cleanedBooks = books.stream()
                .filter(Objects::nonNull)
                .map(BookAnalyzer::cleanBookItem)
                .filter(b -> b.getTitle() != null && !b.getTitle().isEmpty())
                .collect(Collectors.toList());
        result.cleanedCount = cleanedBooks.size();

        result.topAuthors = topByFrequency(cleanedBooks, BookItem::getAuthor);
        result.topPublishers = topByFrequency(cleanedBooks, BookItem::getPublisher);

        // Rating statistics over every rating that parses as a number,
        // sorted ascending so min and max are the first and last elements.
        List<Double> validRatings = cleanedBooks.stream()
                .map(b -> parseDoubleOrNull(b.getRating()))
                .filter(Objects::nonNull)
                .sorted()
                .collect(Collectors.toList());
        result.ratingCount = validRatings.size();
        if (!validRatings.isEmpty()) {
            double average = validRatings.stream()
                    .mapToDouble(Double::doubleValue)
                    .average()
                    .orElse(0);
            result.averageRating = String.format("%.2f", average);
            result.minRating = String.format("%.1f", validRatings.get(0));
            result.maxRating = String.format("%.1f", validRatings.get(validRatings.size() - 1));
        } else {
            result.averageRating = "N/A";
            result.minRating = "N/A";
            result.maxRating = "N/A";
        }

        // Price histogram; LinkedHashMap keeps the buckets in ascending order.
        Map<String, Integer> priceDistribution = new LinkedHashMap<>();
        priceDistribution.put("50元以下", 0);
        priceDistribution.put("50-100元", 0);
        priceDistribution.put("100-200元", 0);
        priceDistribution.put("200元以上", 0);

        // Accumulate as double: an int accumulator would silently truncate the
        // fractional part of every price and skew the average downwards.
        double totalPrice = 0;
        int priceCount = 0;
        for (BookItem book : cleanedBooks) {
            Double price = parseDoubleOrNull(book.getPrice());
            if (price == null) {
                continue; // unparseable prices are excluded from the statistics
            }
            totalPrice += price;
            priceCount++;
            priceDistribution.merge(priceBucket(price), 1, Integer::sum);
        }
        result.priceDistribution = priceDistribution;
        result.averagePrice = priceCount > 0
                ? String.format("%.2f", totalPrice / priceCount)
                : "N/A";
        return result;
    }

    /**
     * Counts how often each non-empty key occurs and returns the
     * {@link #TOP_N} most frequent entries, highest count first.
     */
    private static List<Map.Entry<String, Integer>> topByFrequency(
            List<BookItem> books, java.util.function.Function<BookItem, String> keyOf) {
        Map<String, Integer> counts = books.stream()
                .filter(b -> keyOf.apply(b) != null && !keyOf.apply(b).isEmpty())
                .collect(Collectors.groupingBy(keyOf, Collectors.summingInt(b -> 1)));
        return counts.entrySet().stream()
                .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
                .limit(TOP_N)
                .collect(Collectors.toList());
    }

    /** Parses {@code text} as a double, returning {@code null} when it is absent or not numeric. */
    private static Double parseDoubleOrNull(String text) {
        if (text == null) {
            return null;
        }
        try {
            return Double.parseDouble(text);
        } catch (NumberFormatException ignored) {
            // Non-numeric scraped values are expected; treat them as missing.
            return null;
        }
    }

    /** Maps a price to the label of its histogram bucket. */
    private static String priceBucket(double price) {
        if (price < 50) {
            return "50元以下";
        } else if (price <= 100) {
            return "50-100元";
        } else if (price <= 200) {
            return "100-200元";
        }
        return "200元以上";
    }

    /**
     * Builds a cleaned copy of {@code item} from its title, author, publisher,
     * rating and price; a missing rating or price is replaced with "0".
     * Other fields of the source item are not carried over.
     */
    private static BookItem cleanBookItem(BookItem item) {
        String title = DataCleaner.trimAndNormalize(item.getTitle());
        String author = DataCleaner.trimAndNormalize(item.getAuthor());
        String publisher = DataCleaner.trimAndNormalize(item.getPublisher());
        String rating = DataCleaner.handleMissingValue(item.getRating(), "0");
        String price = DataCleaner.handleMissingValue(item.getPrice(), "0");
        return new BookItem(title, author, publisher, rating, price);
    }

    /** Plain holder for the statistics produced by {@link #analyzeBooks(List)}. */
    public static class AnalysisResult {
        /** Number of records passed in. */
        public int totalCount;
        /** Number of records left after cleaning. */
        public int cleanedCount;
        /** Most frequent authors with their counts, highest first. */
        public List<Map.Entry<String, Integer>> topAuthors;
        /** Most frequent publishers with their counts, highest first. */
        public List<Map.Entry<String, Integer>> topPublishers;
        /** Average rating formatted to two decimals, or "N/A". */
        public String averageRating;
        /** Lowest rating formatted to one decimal, or "N/A". */
        public String minRating;
        /** Highest rating formatted to one decimal, or "N/A". */
        public String maxRating;
        /** Number of records whose rating parsed as a number. */
        public int ratingCount;
        /** Record count per price bucket, in ascending bucket order. */
        public Map<String, Integer> priceDistribution;
        /** Average price formatted to two decimals, or "N/A". */
        public String averagePrice;

        /** Prints the report to standard output as text tables and charts. */
        public void print() {
            System.out.println("\n============================================================");
            System.out.println(" [ 图书数据分析报告 ]");
            System.out.println("============================================================");
            System.out.println("\n------------------------------------------------------------");
            System.out.println(" [ 基本统计 ]");
            System.out.println("------------------------------------------------------------");
            System.out.printf(" %-22s | %12s%n", "项目", "数值");
            System.out.printf(" %-22s | %12d%n", "原始数据量", totalCount);
            System.out.printf(" %-22s | %12d%n", "清洗后数据量", cleanedCount);
            // String fields stay null when the analysis ran on empty input.
            System.out.printf(" %-22s | %12s%n", "平均评分", orNotAvailable(averageRating));
            System.out.printf(" %-22s | %12s%n", "最低评分", orNotAvailable(minRating));
            System.out.printf(" %-22s | %12s%n", "最高评分", orNotAvailable(maxRating));
            System.out.printf(" %-22s | %12s%n", "平均价格", orNotAvailable(averagePrice) + " 元");
            System.out.println("------------------------------------------------------------");
            System.out.println("\n------------------------------------------------------------");
            System.out.println(" [ 热门作者 TOP5 ]");
            System.out.println("------------------------------------------------------------");
            System.out.println(ChartGenerator.generateTextBarChart(toChartData(topAuthors), 30));
            System.out.println("\n------------------------------------------------------------");
            System.out.println(" [ 热门出版社 TOP5 ]");
            System.out.println("------------------------------------------------------------");
            System.out.println(ChartGenerator.generateTextBarChart(toChartData(topPublishers), 30));
            System.out.println("\n------------------------------------------------------------");
            System.out.println(" [ 价格分布 ]");
            System.out.println("------------------------------------------------------------");
            System.out.println(ChartGenerator.generateTextPieChart(
                    priceDistribution != null ? priceDistribution : new HashMap<>()));
            System.out.println("\n============================================================");
        }

        /** Returns {@code value}, or "N/A" when it was never populated. */
        private static String orNotAvailable(String value) {
            return Objects.requireNonNullElse(value, "N/A");
        }

        /** Converts a ranking list into the map form the chart generator takes; {@code null} becomes an empty map. */
        private static Map<String, Integer> toChartData(List<Map.Entry<String, Integer>> ranking) {
            if (ranking == null) {
                return new HashMap<>();
            }
            return ranking.stream()
                    .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
        }
    }
}

331
project/src/main/java/com/example/analysis/ChartGenerator.java

@ -0,0 +1,331 @@
package com.example.analysis;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;
import java.util.Map;
/**
* 图表生成工具类
* 支持文本图表和HTML图表生成
*/
public class ChartGenerator {
/**
 * Skeleton of the HTML report page. It loads ECharts 5.4.3 from the jsDelivr
 * CDN and contains four {@code %s} placeholders, in order: the page heading,
 * the content of the stats grid, the markup that follows the grid, and the
 * body of the trailing {@code <script>} element.
 *
 * NOTE(review): the CSS below contains literal '%' characters ("0%", "100%").
 * If this template is filled in via String.format / formatted, those would
 * have to be written as "%%" - confirm against the code that uses this
 * constant.
 */
private static final String HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>数据分析报告</title>
<script src="https://cdn.jsdelivr.net/npm/echarts@5.4.3/dist/echarts.min.js"></script>
<style>
body { font-family: 'Microsoft YaHei', sans-serif; margin: 20px; background: #f5f5f5; }
.chart-container { width: 800px; height: 400px; margin: 30px auto; background: white; border-radius: 8px; box-shadow: 0 2px 12px rgba(0,0,0,0.1); }
.report-title { text-align: center; color: #333; margin-bottom: 20px; }
.section { margin: 20px 0; padding: 20px; background: white; border-radius: 8px; }
.section-title { color: #666; border-bottom: 2px solid #007bff; padding-bottom: 10px; }
.stats-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 20px; }
.stat-card { text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 8px; }
.stat-value { font-size: 32px; font-weight: bold; }
.stat-label { font-size: 14px; opacity: 0.9; }
table { width: 100%; border-collapse: collapse; margin-top: 15px; }
th, td { padding: 12px; text-align: left; border-bottom: 1px solid #eee; }
th { background: #f8f9fa; color: #495057; }
</style>
</head>
<body>
<h1 class="report-title">%s</h1>
<div class="stats-grid">
%s
</div>
%s
<script>
%s
</script>
</body>
</html>
""";
/**
 * Renders a map of label-to-count pairs as an ASCII bar chart.
 * Only the ten largest entries are shown, ordered by descending count, and
 * the longest bar is capped at {@code min(maxBarWidth, 30)} characters.
 *
 * @param data        counts keyed by label; may be {@code null} or empty
 * @param maxBarWidth requested width of the longest bar
 * @return the rendered chart, or a "no data" marker for null/empty input
 */
public static String generateTextBarChart(Map<String, Integer> data, int maxBarWidth) {
    if (data == null || data.isEmpty()) {
        return " [无数据]";
    }

    int peak = data.values().stream().max(Integer::compare).orElse(1);
    int longestKey = data.keySet().stream().mapToInt(String::length).max().orElse(10);
    // Label column is clamped to the range 8..20 characters.
    int keyWidth = Math.min(Math.max(longestKey, 8), 20);

    List<Map.Entry<String, Integer>> top = data.entrySet().stream()
            .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
            .limit(10)
            .toList();
    int countWidth = top.stream()
            .mapToInt(e -> String.valueOf(e.getValue()).length())
            .max()
            .orElse(3);
    int barWidth = Math.min(maxBarWidth, 30);

    // The same horizontal rule is used above the header, below it, and at the bottom.
    String border = " +--" + "-".repeat(keyWidth + 2)
            + "+--" + "-".repeat(countWidth + 2)
            + "+--" + "-".repeat(barWidth + 2) + "+\n";
    String keyFormat = "%-" + keyWidth + "s";

    StringBuilder out = new StringBuilder("\n");
    out.append(border);
    out.append(" | ").append(String.format(keyFormat, "指标"));
    out.append(" | ").append(String.format("%" + countWidth + "s", "数量"));
    out.append(" | 分布图").append(" |\n");
    out.append(border);

    for (Map.Entry<String, Integer> row : top) {
        // Scale against the largest value; every row gets at least one '#'.
        int scaled = (int) ((double) row.getValue() / peak * barWidth);
        String bar = "#".repeat(Math.max(1, scaled));
        String label = String.format(keyFormat, truncate(row.getKey(), keyWidth));
        String count = String.format("%" + countWidth + "d", row.getValue());
        out.append(" | ").append(label)
                .append(" | ").append(count)
                .append(" | ").append(bar).append(" |\n");
    }

    out.append(border);
    return out.toString();
}
/**
 * Renders a map of label-to-count pairs as a textual "pie chart": one row per
 * category with its share of the total and a proportional bar of '#'.
 * Only the eight largest categories are listed; percentages are relative to
 * the sum over all entries.
 *
 * @param data counts keyed by category; may be {@code null} or empty
 * @return the rendered chart, or a "no data" marker when there is nothing to
 *         show (null/empty input or a total of zero)
 */
public static String generateTextPieChart(Map<String, Integer> data) {
    if (data == null || data.isEmpty()) {
        return " [无数据]";
    }

    int total = data.values().stream().mapToInt(Integer::intValue).sum();
    if (total == 0) {
        return " [无数据]";
    }

    List<Map.Entry<String, Integer>> top = data.entrySet().stream()
            .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
            .limit(8)
            .toList();
    int longestKey = top.stream().mapToInt(e -> e.getKey().length()).max().orElse(10);
    // Category column is clamped to the range 8..16 characters.
    int keyWidth = Math.min(Math.max(longestKey, 8), 16);

    // The same horizontal rule is used above the header, below it, and at the bottom.
    String border = " +--" + "-".repeat(keyWidth + 2)
            + "+--" + "-".repeat(8)
            + "+--" + "-".repeat(35) + "+\n";
    String keyFormat = "%-" + keyWidth + "s";

    StringBuilder out = new StringBuilder("\n");
    out.append(border);
    out.append(" | ").append(String.format(keyFormat, "类别"));
    out.append(" | 占比 | 分布图").append(" |\n");
    out.append(border);

    for (Map.Entry<String, Integer> row : top) {
        double percentage = (double) row.getValue() / total * 100;
        // A full share maps to 30 characters; every row gets at least one '#'.
        int scaled = (int) (percentage / 100 * 30);
        String slice = "#".repeat(Math.max(1, scaled));
        String label = String.format(keyFormat, truncate(row.getKey(), keyWidth));
        out.append(" | ").append(label)
                .append(" | ").append(String.format("%5.1f%%", percentage))
                .append(" | ").append(slice).append(" |\n");
    }

    out.append(border);
    out.append(" 总数: ").append(total).append("\n");
    return out.toString();
}
/**
 * Renders rows as a plain-text table with '+', '-' and '|' borders.
 *
 * <p>Each column is as wide as its longest cell (header included). Cells are
 * left-aligned; missing or {@code null} cells are rendered empty, and cells
 * beyond {@code headers.length} are ignored.
 *
 * <p>Padding is done by hand rather than through a {@code "%-Ns"} format string:
 * a column whose header and cells are all empty has width 0, and {@code "%-0s"}
 * is not a valid format specifier.
 *
 * @param headers column titles; defines the number of columns
 * @param rows    table rows
 * @return the rendered table, one line per row, each terminated by '\n'
 */
public static String generateTextTable(String[] headers, List<String[]> rows) {
    int columns = headers.length;

    // Compute the width of every column.
    int[] colWidths = new int[columns];
    for (int c = 0; c < columns; c++) {
        colWidths[c] = headers[c].length();
    }
    for (String[] row : rows) {
        int shared = Math.min(row.length, columns);
        for (int c = 0; c < shared; c++) {
            if (row[c] != null) {
                colWidths[c] = Math.max(colWidths[c], row[c].length());
            }
        }
    }

    // Horizontal rule shared by the top, header and bottom borders.
    StringBuilder rule = new StringBuilder("+");
    for (int width : colWidths) {
        rule.append("-".repeat(width + 2)).append("+");
    }
    String separator = rule.append("\n").toString();

    StringBuilder sb = new StringBuilder();
    sb.append(separator);
    sb.append("|");
    for (int c = 0; c < columns; c++) {
        sb.append(" ").append(headers[c])
          .append(" ".repeat(colWidths[c] - headers[c].length()))
          .append(" |");
    }
    sb.append("\n");
    sb.append(separator);

    for (String[] row : rows) {
        sb.append("|");
        for (int c = 0; c < columns; c++) {
            // A cell can never exceed its column width, so no truncation is needed.
            String value = (c < row.length && row[c] != null) ? row[c] : "";
            sb.append(" ").append(value)
              .append(" ".repeat(colWidths[c] - value.length()))
              .append(" |");
        }
        sb.append("\n");
    }
    sb.append(separator);
    return sb.toString();
}
/**
 * Generates an HTML report (containing ECharts charts) and writes it to disk.
 *
 * <p>The four string arguments are substituted, in order, into
 * {@code HTML_TEMPLATE}. Missing parent directories of {@code outputPath} are
 * created. The file is written as UTF-8 regardless of the platform default so
 * that non-ASCII report text is not corrupted.
 * NOTE(review): assumes HTML_TEMPLATE declares a UTF-8 charset - confirm.
 *
 * @param title        report title
 * @param statsHtml    HTML fragment for the statistics area
 * @param chartsHtml   HTML fragment containing the chart containers
 * @param chartScripts JavaScript that initializes the charts
 * @param outputPath   destination file path
 * @throws IOException if the file cannot be created or written
 */
public static void generateHtmlReport(String title, String statsHtml, String chartsHtml, String chartScripts, String outputPath) throws IOException {
    String html = String.format(HTML_TEMPLATE, title, statsHtml, chartsHtml, chartScripts);
    File outputFile = new File(outputPath);
    // getParentFile() is null for a bare file name such as "report.html".
    File parentDir = outputFile.getParentFile();
    if (parentDir != null) {
        // If creation fails, opening the writer below reports the error.
        parentDir.mkdirs();
    }
    try (FileWriter writer = new FileWriter(outputFile, java.nio.charset.StandardCharsets.UTF_8)) {
        writer.write(html);
    }
    System.out.println("HTML报告已生成: " + outputFile.getAbsolutePath());
}
/**
 * Builds the HTML section that hosts a bar chart: a heading plus an empty
 * container element that the chart script later looks up by id.
 *
 * @param chartId id given to the container element
 * @param title   section heading (inserted as-is, not escaped)
 * @return the HTML fragment, terminated by a newline
 */
public static String generateBarChartHtml(String chartId, String title) {
    String sectionTemplate =
            "<div class=\"section\">\n"
            + "<h2 class=\"section-title\">%s</h2>\n"
            + "<div id=\"%s\" class=\"chart-container\"></div>\n"
            + "</div>\n";
    return String.format(sectionTemplate, title, chartId);
}
/**
 * Builds the HTML section that hosts a pie chart: a heading plus an empty
 * container element that the chart script later looks up by id.
 *
 * @param chartId id given to the container element
 * @param title   section heading (inserted as-is, not escaped)
 * @return the HTML fragment, terminated by a newline
 */
public static String generatePieChartHtml(String chartId, String title) {
    StringBuilder section = new StringBuilder();
    section.append("<div class=\"section\">\n");
    section.append(String.format("<h2 class=\"section-title\">%s</h2>\n", title));
    section.append(String.format("<div id=\"%s\" class=\"chart-container\"></div>\n", chartId));
    section.append("</div>\n");
    return section.toString();
}
/**
 * Builds the JavaScript that initializes an ECharts bar chart inside the element
 * whose id is {@code chartId}.
 *
 * <p>{@code chartId} is also used as the suffix of the generated JavaScript
 * variable name, so it has to be a valid identifier fragment.
 *
 * @param chartId id of the container element
 * @param title   chart title (JS-escaped)
 * @param xAxis   category labels (JS-escaped)
 * @param yAxis   values, one per category
 * @return the script fragment, terminated by a newline
 */
public static String generateBarChartScript(String chartId, String title, String[] xAxis, int[] yAxis) {
    // Categories become a JS array of single-quoted string literals.
    java.util.StringJoiner categories = new java.util.StringJoiner(",", "[", "]");
    for (String label : xAxis) {
        categories.add("'" + escapeJs(label) + "'");
    }
    // Values become a JS array of plain numbers.
    java.util.StringJoiner counts = new java.util.StringJoiner(",", "[", "]");
    for (int count : yAxis) {
        counts.add(Integer.toString(count));
    }
    String optionTemplate =
            "var chart%s = echarts.init(document.getElementById('%s'));\n"
            + "chart%s.setOption({\n"
            + "title: { text: '%s' },\n"
            + "tooltip: {},\n"
            + "xAxis: { data: %s },\n"
            + "yAxis: {},\n"
            + "series: [{ type: 'bar', data: %s, itemStyle: { color: '#667eea' } }]\n"
            + "});\n";
    return String.format(optionTemplate, chartId, chartId, chartId, escapeJs(title), categories, counts);
}
/**
 * Builds the JavaScript that initializes an ECharts ring-style pie chart inside
 * the element whose id is {@code chartId}.
 *
 * <p>{@code chartId} is also used as the suffix of the generated JavaScript
 * variable name, so it has to be a valid identifier fragment.
 *
 * @param chartId id of the container element
 * @param title   chart title (JS-escaped)
 * @param names   slice names (JS-escaped)
 * @param values  slice values; must have at least {@code names.length} entries
 * @return the script fragment, terminated by a newline
 */
public static String generatePieChartScript(String chartId, String title, String[] names, int[] values) {
    StringBuilder data = new StringBuilder("[");
    for (int i = 0; i < names.length; i++) {
        if (i > 0) {
            data.append(",");
        }
        data.append("{value:").append(values[i])
            .append(",name:'").append(escapeJs(names[i])).append("'}");
    }
    data.append("]");
    // Every literal percent sign must be written as %% because the template goes
    // through String.format; a bare "%)" is rejected as an unknown conversion.
    return String.format("""
            var chart%s = echarts.init(document.getElementById('%s'));
            chart%s.setOption({
            title: { text: '%s' },
            tooltip: { trigger: 'item', formatter: '{b}: {c} ({d}%%)' },
            legend: { orient: 'vertical', right: 10, top: 'center' },
            series: [{ type: 'pie', radius: ['40%%', '70%%'], data: %s }]
            });
            """, chartId, chartId, chartId, escapeJs(title), data);
}
/**
 * Builds the HTML for one statistic card: the value on top, its label below.
 *
 * @param label caption shown under the value (inserted as-is, not escaped)
 * @param value the statistic to display (inserted as-is, not escaped)
 * @return the HTML fragment, terminated by a newline
 */
public static String generateStatCard(String label, String value) {
    String cardTemplate =
            "<div class=\"stat-card\">\n"
            + "<div class=\"stat-value\">%s</div>\n"
            + "<div class=\"stat-label\">%s</div>\n"
            + "</div>\n";
    return String.format(cardTemplate, value, label);
}
/**
 * Builds an HTML table wrapped in a "section" div.
 *
 * <p>Header and cell text is HTML-escaped; {@code null} cells are rendered empty.
 * Each row emits exactly as many cells as it contains, independent of the
 * number of headers.
 *
 * @param headers column titles
 * @param rows    table rows
 * @return the HTML fragment on a single line
 */
public static String generateHtmlTable(String[] headers, List<String[]> rows) {
    StringBuilder html = new StringBuilder("<div class=\"section\"><table>");

    // Header row.
    html.append("<thead><tr>");
    for (int h = 0; h < headers.length; h++) {
        html.append("<th>").append(escapeHtml(headers[h])).append("</th>");
    }
    html.append("</tr></thead><tbody>");

    // Data rows.
    for (String[] row : rows) {
        html.append("<tr>");
        for (String cell : row) {
            String text = (cell == null) ? "" : cell;
            html.append("<td>").append(escapeHtml(text)).append("</td>");
        }
        html.append("</tr>");
    }

    return html.append("</tbody></table></div>").toString();
}
/**
 * Shortens {@code str} to at most {@code maxLength} characters, ending it with
 * "..." when something was cut off.
 *
 * <p>When {@code maxLength} is below 3 there is no room for the ellipsis, so the
 * string is cut hard instead of asking {@code substring} for a negative end index.
 *
 * @param str       text to shorten; {@code null} yields ""
 * @param maxLength maximum result length; values of zero or less yield ""
 * @return the original or shortened text
 */
private static String truncate(String str, int maxLength) {
    if (str == null || maxLength <= 0) {
        return "";
    }
    if (str.length() <= maxLength) {
        return str;
    }
    if (maxLength < 3) {
        return str.substring(0, maxLength);
    }
    return str.substring(0, maxLength - 3) + "...";
}
/**
 * Escapes text for embedding inside a quoted JavaScript string literal that is
 * itself placed in an inline script element.
 *
 * <p>Besides backslashes and quotes, line breaks are escaped (a raw line break
 * ends a quoted JS literal with a syntax error) and {@code </} is rewritten to
 * {@code <\/} so that data containing a closing script tag cannot terminate the
 * surrounding script element.
 *
 * @param str text to escape; {@code null} yields ""
 * @return the escaped text
 */
private static String escapeJs(String str) {
    if (str == null) {
        return "";
    }
    // Backslashes first, so the escapes added afterwards are not doubled.
    return str.replace("\\", "\\\\")
            .replace("'", "\\'")
            .replace("\"", "\\\"")
            .replace("\r", "\\r")
            .replace("\n", "\\n")
            .replace("</", "<\\/");
}
/**
 * Escapes the characters {@code & < > "} so the text can be placed in HTML.
 *
 * @param str text to escape; {@code null} yields ""
 * @return the escaped text
 */
private static String escapeHtml(String str) {
    if (str == null) {
        return "";
    }
    StringBuilder escaped = new StringBuilder(str.length());
    for (int i = 0; i < str.length(); i++) {
        char c = str.charAt(i);
        switch (c) {
            case '&' -> escaped.append("&amp;");
            case '<' -> escaped.append("&lt;");
            case '>' -> escaped.append("&gt;");
            case '"' -> escaped.append("&quot;");
            default -> escaped.append(c);
        }
    }
    return escaped.toString();
}
}

187
project/src/main/java/com/example/analysis/DataCleaner.java

@ -0,0 +1,187 @@
package com.example.analysis;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
/**
 * Data-cleaning utility class.
 * Provides whitespace removal, HTML tag stripping, date normalization and
 * missing-value handling, plus one record cleaner per data domain.
 */
public class DataCleaner {
    private static final Pattern HTML_PATTERN = Pattern.compile("<[^>]*>");
    private static final Pattern MULTI_SPACE_PATTERN = Pattern.compile("\\s+");
    /** First run of four digits; compiled once instead of on every call. */
    private static final Pattern YEAR_PATTERN = Pattern.compile("(\\d{4})");

    /**
     * Accepted input layouts, most specific first.
     *
     * <p>The order matters: {@code SimpleDateFormat.parse(String)} succeeds as soon
     * as a prefix of the text matches, so a date-only pattern listed before a
     * date-time pattern would match "2024-01-15 10:30:00" and drop the time.
     */
    private static final String[] DATE_FORMATS = {
        "yyyy-MM-dd HH:mm:ss",
        "yyyy/MM/dd HH:mm:ss",
        "yyyy-MM-dd",
        "yyyy/MM/dd",
        "yyyy年MM月dd日",
        "yyyy-MM",
        "yyyy年MM月",
        "yyyy"
    };

    /**
     * Removes every whitespace character from the string.
     *
     * @param input text to process; {@code null} yields ""
     * @return the text without whitespace
     */
    public static String removeSpaces(String input) {
        if (input == null) return "";
        return MULTI_SPACE_PATTERN.matcher(input).replaceAll("");
    }

    /**
     * Trims the string and collapses each inner run of whitespace to one space.
     *
     * @param input text to process; {@code null} yields ""
     * @return the normalized text
     */
    public static String trimAndNormalize(String input) {
        if (input == null) return "";
        return MULTI_SPACE_PATTERN.matcher(input.trim()).replaceAll(" ");
    }

    /**
     * Removes HTML tags and normalizes the remaining whitespace.
     *
     * @param input text to process; {@code null} yields ""
     * @return the text without tags
     */
    public static String removeHtmlTags(String input) {
        if (input == null) return "";
        String result = HTML_PATTERN.matcher(input).replaceAll("");
        return trimAndNormalize(result);
    }

    /**
     * Re-formats a date string.
     *
     * <p>The input is tried against each known layout in turn. If none matches,
     * the first four-digit run is taken as a year and January 1st of that year
     * is formatted with {@code targetFormat}.
     *
     * @param input        raw date string
     * @param targetFormat target pattern, e.g. "yyyy-MM-dd"
     * @return the formatted date, or "" when nothing could be parsed
     */
    public static String formatDate(String input, String targetFormat) {
        if (input == null || input.isEmpty()) return "";
        String normalizedInput = trimAndNormalize(input);
        for (String format : DATE_FORMATS) {
            try {
                Date date = new SimpleDateFormat(format).parse(normalizedInput);
                return new SimpleDateFormat(targetFormat).format(date);
            } catch (ParseException ignored) {
                // Not this layout; try the next one.
            }
        }
        // Nothing matched: fall back to the year alone.
        String year = extractYear(input);
        if (year.isEmpty()) {
            return "";
        }
        String firstOfYear = year + "-01-01";
        try {
            Date date = new SimpleDateFormat("yyyy-MM-dd").parse(firstOfYear);
            return new SimpleDateFormat(targetFormat).format(date);
        } catch (ParseException ignored) {
            // Four digits always parse; kept only as a defensive fallback.
            return firstOfYear;
        }
    }

    /**
     * Extracts the first four-digit run from the text.
     *
     * @param input text to search; {@code null} yields ""
     * @return the four digits, or "" when there are none
     */
    public static String extractYear(String input) {
        if (input == null) return "";
        java.util.regex.Matcher matcher = YEAR_PATTERN.matcher(input);
        return matcher.find() ? matcher.group(1) : "";
    }

    /**
     * Replaces a missing text value with a default.
     *
     * <p>A value counts as missing when it is {@code null} or, after trimming,
     * empty, "null" or "N/A" (case-insensitive).
     *
     * @param value        raw value
     * @param defaultValue value returned when {@code value} is missing
     * @return the trimmed value, or {@code defaultValue}
     */
    public static String handleMissingValue(String value, String defaultValue) {
        if (value == null) {
            return defaultValue;
        }
        // Trim before testing so that "  " and " N/A " are recognized as missing.
        String trimmed = value.trim();
        if (trimmed.isEmpty() || "null".equalsIgnoreCase(trimmed) || "N/A".equalsIgnoreCase(trimmed)) {
            return defaultValue;
        }
        return trimmed;
    }

    /**
     * Parses a numeric value, substituting a default when it is missing or invalid.
     *
     * @param value        raw value
     * @param defaultValue value returned when {@code value} cannot be parsed
     * @return the parsed number, or {@code defaultValue}
     */
    public static double handleMissingNumber(String value, double defaultValue) {
        if (value == null || value.isEmpty()) {
            return defaultValue;
        }
        try {
            return Double.parseDouble(value.trim());
        } catch (NumberFormatException e) {
            return defaultValue;
        }
    }

    /**
     * Cleans one raw movie record.
     *
     * @param rawData raw field values keyed by field name
     * @return a new map with the cleaned fields
     */
    public static Map<String, String> cleanMovieData(Map<String, String> rawData) {
        Map<String, String> cleaned = new HashMap<>();
        cleaned.put("title", trimAndNormalize(rawData.get("title")));
        cleaned.put("rating", handleMissingValue(rawData.get("rating"), "0"));
        cleaned.put("releaseDate", formatDate(rawData.get("releaseDate"), "yyyy-MM-dd"));
        cleaned.put("genre", trimAndNormalize(rawData.get("genre")));
        cleaned.put("director", trimAndNormalize(rawData.get("director")));
        cleaned.put("id", handleMissingValue(rawData.get("id"), ""));
        return cleaned;
    }

    /**
     * Cleans one raw book record.
     *
     * @param rawData raw field values keyed by field name
     * @return a new map with the cleaned fields
     */
    public static Map<String, String> cleanBookData(Map<String, String> rawData) {
        Map<String, String> cleaned = new HashMap<>();
        cleaned.put("title", trimAndNormalize(rawData.get("title")));
        cleaned.put("author", trimAndNormalize(rawData.get("author")));
        cleaned.put("publisher", trimAndNormalize(rawData.get("publisher")));
        cleaned.put("rating", handleMissingValue(rawData.get("rating"), "0"));
        cleaned.put("price", handleMissingValue(rawData.get("price"), "0"));
        return cleaned;
    }

    /**
     * Cleans one raw news record.
     *
     * @param rawData raw field values keyed by field name
     * @return a new map with the cleaned fields
     */
    public static Map<String, String> cleanNewsData(Map<String, String> rawData) {
        Map<String, String> cleaned = new HashMap<>();
        cleaned.put("title", trimAndNormalize(rawData.get("title")));
        cleaned.put("summary", removeHtmlTags(rawData.get("summary")));
        cleaned.put("publishTime", formatDate(rawData.get("publishTime"), "yyyy-MM-dd HH:mm:ss"));
        cleaned.put("url", handleMissingValue(rawData.get("url"), ""));
        return cleaned;
    }

    /**
     * Cleans one raw music record.
     *
     * @param rawData raw field values keyed by field name
     * @return a new map with the cleaned fields
     */
    public static Map<String, String> cleanMusicData(Map<String, String> rawData) {
        Map<String, String> cleaned = new HashMap<>();
        cleaned.put("name", trimAndNormalize(rawData.get("name")));
        cleaned.put("artists", trimAndNormalize(rawData.get("artists")));
        cleaned.put("album", trimAndNormalize(rawData.get("album")));
        cleaned.put("duration", handleMissingValue(rawData.get("duration"), "0:00"));
        cleaned.put("songId", handleMissingValue(rawData.get("songId"), ""));
        return cleaned;
    }
}

285
project/src/main/java/com/example/analysis/MovieAnalyzer.java

@ -0,0 +1,285 @@
package com.example.analysis;
import com.example.model.MovieItem;
import java.util.*;
import java.util.stream.Collectors;
/**
* 电影数据分析器
* 提供评分分布年份相关性导演排行等分析功能
*/
public class MovieAnalyzer {
public static AnalysisResult analyzeMovies(List<MovieItem> movies) {
AnalysisResult result = new AnalysisResult();
if (movies == null || movies.isEmpty()) {
return result;
}
result.totalCount = movies.size();
// 数据清洗
List<MovieItem> cleanedMovies = movies.stream()
.map(MovieAnalyzer::cleanMovieItem)
.filter(m -> !m.getTitle().isEmpty())
.collect(Collectors.toList());
result.cleanedCount = cleanedMovies.size();
// 统计导演出现次数
Map<String, Integer> directorCount = cleanedMovies.stream()
.filter(m -> m.getDirector() != null && !m.getDirector().isEmpty())
.collect(Collectors.groupingBy(
m -> m.getDirector(),
Collectors.summingInt(m -> 1)
));
result.topDirectors = directorCount.entrySet().stream()
.sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
.limit(5)
.collect(Collectors.toList());
// 统计类型分布
Map<String, Integer> genreCount = new HashMap<>();
for (MovieItem movie : cleanedMovies) {
if (movie.getGenre() != null && !movie.getGenre().isEmpty()) {
String[] genres = movie.getGenre().split("[,,]");
for (String genre : genres) {
genre = genre.trim();
if (!genre.isEmpty()) {
genreCount.merge(genre, 1, Integer::sum);
}
}
}
}
result.genreDistribution = genreCount;
// 计算评分统计
List<Double> validRatings = cleanedMovies.stream()
.map(m -> {
try {
return Double.parseDouble(m.getRating());
} catch (Exception e) {
return null;
}
})
.filter(Objects::nonNull)
.sorted()
.collect(Collectors.toList());
result.ratingCount = validRatings.size();
if (!validRatings.isEmpty()) {
result.averageRating = String.format("%.2f", validRatings.stream().mapToDouble(Double::doubleValue).average().orElse(0));
result.minRating = String.format("%.1f", validRatings.get(0));
result.maxRating = String.format("%.1f", validRatings.get(validRatings.size() - 1));
result.medianRating = String.format("%.2f", calculateMedian(validRatings));
result.ratingStdDev = String.format("%.2f", calculateStdDev(validRatings));
} else {
result.averageRating = "N/A";
result.minRating = "N/A";
result.maxRating = "N/A";
result.medianRating = "N/A";
result.ratingStdDev = "N/A";
}
// 评分分布统计
Map<String, Integer> ratingDistribution = new LinkedHashMap<>();
ratingDistribution.put("0-2分", 0);
ratingDistribution.put("2-4分", 0);
ratingDistribution.put("4-6分", 0);
ratingDistribution.put("6-8分", 0);
ratingDistribution.put("8-10分", 0);
for (Double rating : validRatings) {
if (rating >= 0 && rating < 2) {
ratingDistribution.merge("0-2分", 1, Integer::sum);
} else if (rating >= 2 && rating < 4) {
ratingDistribution.merge("2-4分", 1, Integer::sum);
} else if (rating >= 4 && rating < 6) {
ratingDistribution.merge("4-6分", 1, Integer::sum);
} else if (rating >= 6 && rating < 8) {
ratingDistribution.merge("6-8分", 1, Integer::sum);
} else if (rating >= 8 && rating <= 10) {
ratingDistribution.merge("8-10分", 1, Integer::sum);
}
}
result.ratingDistribution = ratingDistribution;
// 统计年份分布
Map<String, Integer> yearCount = cleanedMovies.stream()
.filter(m -> m.getReleaseDate() != null && !m.getReleaseDate().isEmpty())
.map(m -> {
String date = m.getReleaseDate();
return date.substring(0, Math.min(4, date.length()));
})
.collect(Collectors.groupingBy(
year -> year,
Collectors.summingInt(year -> 1)
));
result.yearDistribution = yearCount.entrySet().stream()
.sorted(Map.Entry.<String, Integer>comparingByKey().reversed())
.collect(Collectors.toList());
// 年份与评分相关性分析
Map<String, Double> yearAvgRating = new HashMap<>();
Map<String, Integer> yearRatingCount = new HashMap<>();
for (MovieItem movie : cleanedMovies) {
if (movie.getReleaseDate() != null && !movie.getReleaseDate().isEmpty()) {
String year = movie.getReleaseDate().substring(0, Math.min(4, movie.getReleaseDate().length()));
try {
double rating = Double.parseDouble(movie.getRating());
yearAvgRating.merge(year, rating, Double::sum);
yearRatingCount.merge(year, 1, Integer::sum);
} catch (Exception e) {
// ignore
}
}
}
Map<String, String> yearRatingAvg = new HashMap<>();
for (Map.Entry<String, Double> entry : yearAvgRating.entrySet()) {
int count = yearRatingCount.getOrDefault(entry.getKey(), 1);
yearRatingAvg.put(entry.getKey(), String.format("%.2f", entry.getValue() / count));
}
result.yearAverageRating = yearRatingAvg;
// 找出评分最高的电影
result.topRatedMovies = cleanedMovies.stream()
.filter(m -> {
try {
Double.parseDouble(m.getRating());
return true;
} catch (Exception e) {
return false;
}
})
.sorted((a, b) -> {
try {
double r1 = Double.parseDouble(a.getRating());
double r2 = Double.parseDouble(b.getRating());
return Double.compare(r2, r1);
} catch (Exception e) {
return 0;
}
})
.limit(5)
.collect(Collectors.toList());
return result;
}
private static MovieItem cleanMovieItem(MovieItem item) {
String title = DataCleaner.trimAndNormalize(item.getTitle());
String rating = DataCleaner.handleMissingValue(item.getRating(), "0");
String releaseDate = DataCleaner.formatDate(item.getReleaseDate(), "yyyy-MM-dd");
String genre = DataCleaner.trimAndNormalize(item.getGenre());
String director = DataCleaner.trimAndNormalize(item.getDirector());
return new MovieItem(item.getId(), title, rating, releaseDate, genre, director);
}
private static double calculateMedian(List<Double> values) {
int size = values.size();
if (size % 2 == 0) {
return (values.get(size / 2 - 1) + values.get(size / 2)) / 2.0;
} else {
return values.get(size / 2);
}
}
private static double calculateStdDev(List<Double> values) {
double mean = values.stream().mapToDouble(Double::doubleValue).average().orElse(0);
double variance = values.stream()
.mapToDouble(v -> Math.pow(v - mean, 2))
.average()
.orElse(0);
return Math.sqrt(variance);
}
public static class AnalysisResult {
public int totalCount;
public int cleanedCount;
public List<Map.Entry<String, Integer>> topDirectors;
public Map<String, Integer> genreDistribution;
public String averageRating;
public String minRating;
public String maxRating;
public String medianRating;
public String ratingStdDev;
public int ratingCount;
public Map<String, Integer> ratingDistribution;
public List<Map.Entry<String, Integer>> yearDistribution;
public Map<String, String> yearAverageRating;
public List<MovieItem> topRatedMovies;
public void print() {
System.out.println("\n============================================================");
System.out.println(" [ 电影数据分析报告 ]");
System.out.println("============================================================");
System.out.println("\n------------------------------------------------------------");
System.out.println(" [ 基本统计 ]");
System.out.println("------------------------------------------------------------");
System.out.printf(" %-22s | %12s%n", "项目", "数值");
System.out.printf(" %-22s | %12d%n", "原始数据量", totalCount);
System.out.printf(" %-22s | %12d%n", "清洗后数据量", cleanedCount);
System.out.printf(" %-22s | %12s%n", "平均评分", averageRating);
System.out.printf(" %-22s | %12s%n", "最低评分", minRating);
System.out.printf(" %-22s | %12s%n", "最高评分", maxRating);
System.out.printf(" %-22s | %12s%n", "中位数评分", medianRating);
System.out.printf(" %-22s | %12s%n", "评分标准差", ratingStdDev);
System.out.println("------------------------------------------------------------");
System.out.println("\n------------------------------------------------------------");
System.out.println(" [ 评分最高的5部电影 ]");
System.out.println("------------------------------------------------------------");
System.out.printf(" %-4s | %-30s | %-8s | %-10s%n", "排名", "片名", "评分", "年份");
System.out.println(" ---- | ------------------------------ | -------- | ----------");
if (topRatedMovies != null) {
for (int i = 0; i < topRatedMovies.size(); i++) {
MovieItem m = topRatedMovies.get(i);
System.out.printf(" %-4d | %-30s | %-8s | %-10s%n",
i + 1,
truncate(m.getTitle(), 30),
m.getRating(),
m.getReleaseDate() != null ? m.getReleaseDate().substring(0, Math.min(4, m.getReleaseDate().length())) : "");
}
}
System.out.println("------------------------------------------------------------");
System.out.println("\n------------------------------------------------------------");
System.out.println(" [ 热门导演 TOP5 ]");
System.out.println("------------------------------------------------------------");
System.out.println(ChartGenerator.generateTextBarChart(
topDirectors != null ? topDirectors.stream()
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)) : new HashMap<>(), 30));
System.out.println("\n------------------------------------------------------------");
System.out.println(" [ 类型分布 ]");
System.out.println("------------------------------------------------------------");
System.out.println(ChartGenerator.generateTextPieChart(genreDistribution != null ? genreDistribution : new HashMap<>()));
System.out.println("\n------------------------------------------------------------");
System.out.println(" [ 评分分布 ]");
System.out.println("------------------------------------------------------------");
System.out.println(ChartGenerator.generateTextBarChart(ratingDistribution != null ? ratingDistribution : new HashMap<>(), 30));
System.out.println("\n------------------------------------------------------------");
System.out.println(" [ 年份分布(最近10年) ]");
System.out.println("------------------------------------------------------------");
Map<String, Integer> recentYears = new HashMap<>();
if (yearDistribution != null) {
yearDistribution.stream().limit(10).forEach(e -> recentYears.put(e.getKey(), e.getValue()));
}
System.out.println(ChartGenerator.generateTextBarChart(recentYears, 30));
System.out.println("\n============================================================");
}
private String truncate(String str, int maxLength) {
if (str == null) return "";
if (str.length() <= maxLength) return str;
return str.substring(0, maxLength - 3) + "...";
}
}
}

189
project/src/main/java/com/example/analysis/MusicAnalyzer.java

@ -0,0 +1,189 @@
package com.example.analysis;
import com.example.model.Song;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Music data analyzer.
 *
 * <p>Cleans a list of {@link Song}s and derives the most frequent artists, the
 * number of distinct albums, duration statistics and the duration and platform
 * distributions.
 */
public class MusicAnalyzer {

    /** Placeholder for statistics that could not be computed. */
    private static final String NOT_AVAILABLE = "N/A";

    /**
     * Runs the full analysis.
     *
     * @param songs raw songs; {@code null} elements are skipped
     * @return the analysis result; for {@code null} or empty input all counts are
     *         zero, collections are unset and statistics read "N/A"
     */
    public static AnalysisResult analyzeSongs(List<Song> songs) {
        AnalysisResult result = new AnalysisResult();
        if (songs == null || songs.isEmpty()) {
            return result;
        }
        result.totalCount = songs.size();

        // Data cleaning: normalize every song and drop those without a name.
        List<Song> cleanedSongs = songs.stream()
                .filter(Objects::nonNull)
                .map(MusicAnalyzer::cleanSongItem)
                .filter(s -> !s.getName().isEmpty())
                .collect(Collectors.toList());
        result.cleanedCount = cleanedSongs.size();

        // Artist frequency; a song with several artists counts once for each.
        Map<String, Integer> artistCount = new HashMap<>();
        for (Song song : cleanedSongs) {
            if (song.getArtists() == null) {
                continue;
            }
            for (String artist : song.getArtists()) {
                if (artist != null && !artist.isEmpty()) {
                    artistCount.merge(artist, 1, Integer::sum);
                }
            }
        }
        result.topArtists = artistCount.entrySet().stream()
                .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
                .limit(5)
                .collect(Collectors.toList());

        // Number of distinct albums.
        Set<String> albums = cleanedSongs.stream()
                .map(Song::getAlbum)
                .filter(album -> album != null && !album.isEmpty())
                .collect(Collectors.toSet());
        result.albumCount = albums.size();

        // Duration statistics over songs with a positive, parseable duration.
        List<Integer> durations = cleanedSongs.stream()
                .map(s -> parseDuration(s.getDuration()))
                .filter(d -> d > 0)
                .sorted()
                .collect(Collectors.toList());
        if (!durations.isEmpty()) {
            int totalDuration = durations.stream().mapToInt(Integer::intValue).sum();
            result.averageDuration = formatDuration(totalDuration / durations.size());
            result.minDuration = formatDuration(durations.get(0));
            result.maxDuration = formatDuration(durations.get(durations.size() - 1));
        }
        result.durationDistribution = bucketDurations(durations);

        // Songs per platform. groupingBy rejects null keys, so songs without a
        // platform are left out instead of aborting the whole analysis.
        result.platformDistribution = cleanedSongs.stream()
                .filter(s -> s.getPlatform() != null)
                .collect(Collectors.groupingBy(Song::getPlatform, Collectors.summingInt(s -> 1)));
        return result;
    }

    /** Counts durations (in seconds) per one-minute bucket. */
    private static Map<String, Integer> bucketDurations(List<Integer> durations) {
        Map<String, Integer> distribution = new LinkedHashMap<>();
        distribution.put("2分钟以下", 0);
        distribution.put("2-3分钟", 0);
        distribution.put("3-4分钟", 0);
        distribution.put("4-5分钟", 0);
        distribution.put("5分钟以上", 0);
        for (int duration : durations) {
            String bucket;
            if (duration < 120) {
                bucket = "2分钟以下";
            } else if (duration < 180) {
                bucket = "2-3分钟";
            } else if (duration < 240) {
                bucket = "3-4分钟";
            } else if (duration < 300) {
                bucket = "4-5分钟";
            } else {
                bucket = "5分钟以上";
            }
            distribution.merge(bucket, 1, Integer::sum);
        }
        return distribution;
    }

    /**
     * Builds a cleaned copy of the song; id and platform are carried over
     * unchanged. A missing artist list becomes an empty list.
     */
    private static Song cleanSongItem(Song item) {
        String name = DataCleaner.trimAndNormalize(item.getName());
        List<String> artists = new ArrayList<>();
        if (item.getArtists() != null) {
            for (String artist : item.getArtists()) {
                String cleaned = DataCleaner.trimAndNormalize(artist);
                if (!cleaned.isEmpty()) {
                    artists.add(cleaned);
                }
            }
        }
        String album = DataCleaner.trimAndNormalize(item.getAlbum());
        String duration = DataCleaner.handleMissingValue(item.getDuration(), "0:00");
        return new Song(item.getSongId(), name, artists, album, duration, item.getPlatform());
    }

    /**
     * Parses "m:ss" or "h:mm:ss" into seconds.
     *
     * @return the duration in seconds, or 0 when the text is missing or malformed
     */
    private static int parseDuration(String durationStr) {
        if (durationStr == null || durationStr.isEmpty()) {
            return 0;
        }
        String[] parts = durationStr.split(":");
        try {
            if (parts.length == 2) {
                return Integer.parseInt(parts[0]) * 60 + Integer.parseInt(parts[1]);
            }
            if (parts.length == 3) {
                return Integer.parseInt(parts[0]) * 3600
                        + Integer.parseInt(parts[1]) * 60
                        + Integer.parseInt(parts[2]);
            }
        } catch (NumberFormatException ignored) {
            // Malformed component: treated as "no duration".
        }
        return 0;
    }

    /** Formats seconds as "m:ss", or "h:mm:ss" from one hour upwards. */
    private static String formatDuration(int seconds) {
        int hours = seconds / 3600;
        int minutes = (seconds % 3600) / 60;
        int secs = seconds % 60;
        if (hours > 0) {
            return String.format("%d:%02d:%02d", hours, minutes, secs);
        }
        return String.format("%d:%02d", minutes, secs);
    }

    /**
     * Result of a music analysis. Collection fields stay {@code null} when the
     * analysis was given no input; {@link #print()} tolerates that.
     */
    public static class AnalysisResult {
        private static final String HEAVY_RULE = "============================================================";
        private static final String LIGHT_RULE = "------------------------------------------------------------";

        public int totalCount;
        public int cleanedCount;
        public List<Map.Entry<String, Integer>> topArtists;
        public int albumCount;
        // Statistic strings default to "N/A" so an empty analysis never prints "null".
        public String averageDuration = NOT_AVAILABLE;
        public String minDuration = NOT_AVAILABLE;
        public String maxDuration = NOT_AVAILABLE;
        public Map<String, Integer> durationDistribution;
        public Map<String, Integer> platformDistribution;

        /** Prints the report to standard output. */
        public void print() {
            System.out.println("\n" + HEAVY_RULE);
            System.out.println(" [ 音乐数据分析报告 ]");
            System.out.println(HEAVY_RULE);

            printSectionHeader(" [ 基本统计 ]");
            System.out.printf(" %-22s | %12d%n", "原始数据量", totalCount);
            System.out.printf(" %-22s | %12d%n", "清洗后数据量", cleanedCount);
            System.out.printf(" %-22s | %12d%n", "专辑数量", albumCount);
            System.out.printf(" %-22s | %12s%n", "平均时长", averageDuration);
            System.out.printf(" %-22s | %12s%n", "最短时长", minDuration);
            System.out.printf(" %-22s | %12s%n", "最长时长", maxDuration);
            System.out.println(LIGHT_RULE);

            printSectionHeader(" [ 热门歌手 TOP5 ]");
            Map<String, Integer> artistMap = new HashMap<>();
            if (topArtists != null) {
                topArtists.forEach(e -> artistMap.put(e.getKey(), e.getValue()));
            }
            System.out.println(ChartGenerator.generateTextBarChart(artistMap, 30));

            printSectionHeader(" [ 时长分布 ]");
            System.out.println(ChartGenerator.generateTextPieChart(orEmpty(durationDistribution)));

            printSectionHeader(" [ 平台分布 ]");
            System.out.println(ChartGenerator.generateTextPieChart(orEmpty(platformDistribution)));

            System.out.println("\n" + HEAVY_RULE);
        }

        /** Prints a blank line, a rule, the section title and another rule. */
        private static void printSectionHeader(String title) {
            System.out.println("\n" + LIGHT_RULE);
            System.out.println(title);
            System.out.println(LIGHT_RULE);
        }

        /** Returns the map itself, or an empty map when it is {@code null}. */
        private static Map<String, Integer> orEmpty(Map<String, Integer> map) {
            return map != null ? map : new HashMap<>();
        }
    }
}

298
project/src/main/java/com/example/analysis/NewsAnalyzer.java

@ -0,0 +1,298 @@
package com.example.analysis;
import com.example.model.NewsItem;
import java.util.*;
import java.util.stream.Collectors;
/**
* 新闻数据分析器
* 提供关键词提取热点趋势情感分析等功能
*/
public class NewsAnalyzer {
// Lexicon of positive-sentiment words used by analyzeSentiment: each word found
// in a text adds one to its score. Repeated entries are harmless because the
// words are stored in a set.
private static final Set<String> POSITIVE_WORDS = new HashSet<>(Arrays.asList(
"成功", "胜利", "优秀", "卓越", "创新", "突破", "进展", "增长", "上升", "上涨",
"利好", "稳定", "稳健", "繁荣", "进步", "提升", "增强", "扩大", "开放", "合作",
"强劲", "积极", "乐观", "改善", "显著", "明显", "快速", "持续", "健康", "和谐",
"领先", "开创", "引领", "辉煌", "成就", "杰出", "重要", "重大", "关键", "核心",
"满意", "好评", "推荐", "点赞", "支持", "肯定", "赞扬", "表扬", "祝贺", "庆祝",
"首次", "第一", "最好", "最高", "最大", "最强", "最优", "最美", "最新", "最优",
"安全", "顺利", "圆满", "完美", "精彩", "成功", "实现", "完成", "达到", "获得",
"提升", "提高", "加快", "加速", "增长", "增产", "增收", "增效", "节能", "环保"
));
// Lexicon of negative-sentiment words used by analyzeSentiment: each word found
// in a text subtracts one from its score. Repeated entries are harmless because
// the words are stored in a set.
private static final Set<String> NEGATIVE_WORDS = new HashSet<>(Arrays.asList(
"失败", "失败", "下降", "下跌", "下滑", "亏损", "危机", "风险", "问题", "隐患",
"负面", "挑战", "困难", "压力", "担忧", "紧张", "低迷", "疲软", "萎缩", "衰退",
"警告", "质疑", "争议", "丑闻", "事故", "损失", "损失", "影响", "恶化", "放缓",
"暴跌", "倒闭", "裁员", "紧缩", "减少", "下降", "下滑", "回落", "跳水", "崩盘",
"违约", "诈骗", "违法", "犯罪", "腐败", "贪污", "贿赂", "贿赂", "黑幕", "内幕",
"批评", "指责", "谴责", "反对", "抗议", "示威", "冲突", "暴力", "伤亡", "死亡",
"灾害", "灾难", "灾难", "灾情", "疫情", "病情", "病情", "危害", "威胁", "破坏",
"浪费", "污染", "超标", "违规", "违法", "违章", "违纪", "腐败", "堕落", "丑闻"
));
// Stop words excluded from the title and summary word-frequency counts
// (exposed to the analysis through getStopWords). Repeated entries are harmless
// because the words are stored in a set.
private static final Set<String> STOP_WORDS = new HashSet<>(Arrays.asList(
"的", "是", "在", "和", "有", "我", "他", "她", "它", "这", "那", "了", "着", "过",
"也", "都", "就", "而", "及", "与", "等", "对", "对于", "关于", "由于", "因此",
"但是", "然而", "可以", "可能", "应该", "必须", "不", "没", "没有", "不是", "不会",
"能", "会", "该", "要", "会", "可", "能", "得", "很", "都", "把", "被", "让", "给",
"向", "从", "到", "为", "以", "及", "或", "且", "又", "却", "只", "还", "已", "已经",
"正在", "将", "将要", "曾", "曾经", "刚", "刚刚", "才", "刚才", "再", "又一次",
"一", "一些", "一点", "一定", "一样", "一边", "一起", "一下", "一旦", "一同",
"自己", "别人", "大家", "我们", "你们", "他们", "她们", "它们", "这个", "那个",
"什么", "怎么", "怎样", "如何", "为何", "为什么", "多少", "几个", "谁", "哪里",
"这里", "那里", "这时候", "那时候", "现在", "目前", "今天", "昨天", "明天",
"说", "表示", "指出", "认为", "觉得", "知道", "看到", "发现", "提出", "介绍",
"开始", "进行", "结束", "完成", "继续", "停止", "禁止", "要求", "希望", "需要",
"使用", "通过", "根据", "按照", "为了", "以便", "自从", "除非", "即使", "虽然",
"尽管", "无论", "不管", "只要", "只有", "而且", "并且", "或者", "还是", "不是",
"那么", "这么", "这样", "那样", "如此", "之", "其", "本", "此", "该", "某", "各",
"每", "后", "前", "中", "间", "内", "外", "上", "下", "左", "右", "旁", "侧"
));
/**
 * Word delimiter for the frequency analysis: whitespace, Unicode separators
 * (e.g. the ideographic space), ASCII punctuation/symbols and Unicode punctuation
 * (e.g. fullwidth commas, corner quotes). ASCII-only delimiters would leave
 * CJK-punctuated text as one giant "word".
 */
private static final String WORD_DELIMITER_REGEX = "[\\s\\p{Z}\\p{Punct}\\p{P}]+";

/**
 * Runs the full news analysis: cleaning, average title/summary length, word
 * frequencies, source and date distributions, and lexicon-based sentiment counts.
 *
 * @param newsList raw news items
 * @return the analysis result; a fresh, unfilled result for {@code null} or
 *         empty input
 */
public static AnalysisResult analyzeNews(List<NewsItem> newsList) {
    AnalysisResult result = new AnalysisResult();
    if (newsList == null || newsList.isEmpty()) {
        return result;
    }
    result.totalCount = newsList.size();

    // Data cleaning: normalize every item and drop those without a title.
    List<NewsItem> cleanedNews = newsList.stream()
            .map(NewsAnalyzer::cleanNewsItem)
            .filter(n -> !n.getTitle().isEmpty())
            .collect(Collectors.toList());
    result.cleanedCount = cleanedNews.size();

    // Average title and summary length (integer division, as before).
    int totalTitleLength = cleanedNews.stream()
            .mapToInt(n -> n.getTitle() != null ? n.getTitle().length() : 0)
            .sum();
    result.averageTitleLength = cleanedNews.size() > 0 ? totalTitleLength / cleanedNews.size() : 0;
    int totalSummaryLength = cleanedNews.stream()
            .mapToInt(n -> n.getSummary() != null ? n.getSummary().length() : 0)
            .sum();
    result.averageSummaryLength = cleanedNews.size() > 0 ? totalSummaryLength / cleanedNews.size() : 0;

    // Word frequency of titles and summaries.
    Set<String> stopWords = getStopWords();
    result.topTitleWords = rankFrequentWords(
            cleanedNews.stream().map(NewsItem::getTitle).collect(Collectors.toList()), stopWords, 10);
    result.topSummaryWords = rankFrequentWords(
            cleanedNews.stream().map(NewsItem::getSummary).collect(Collectors.toList()), stopWords, 10);

    // Items per source domain.
    Map<String, Integer> sourceCount = cleanedNews.stream()
            .filter(n -> n.getUrl() != null)
            .collect(Collectors.groupingBy(
                    n -> extractDomain(n.getUrl()),
                    Collectors.summingInt(n -> 1)));
    result.sourceDistribution = sourceCount;

    // Items per day (first ten characters of the publish time); latest seven days.
    Map<String, Integer> dateCount = cleanedNews.stream()
            .filter(n -> n.getPublishTime() != null && !n.getPublishTime().isEmpty())
            .collect(Collectors.groupingBy(
                    n -> n.getPublishTime().substring(0, Math.min(10, n.getPublishTime().length())),
                    Collectors.summingInt(n -> 1)));
    result.dateDistribution = dateCount.entrySet().stream()
            .sorted(Map.Entry.<String, Integer>comparingByKey().reversed())
            .limit(7)
            .collect(Collectors.toList());

    // Sentiment: classify each item by the sign of its lexicon score.
    int positiveCount = 0;
    int negativeCount = 0;
    int neutralCount = 0;
    for (NewsItem news : cleanedNews) {
        String text = (news.getTitle() != null ? news.getTitle() : "")
                + " " + (news.getSummary() != null ? news.getSummary() : "");
        int sentiment = analyzeSentiment(text);
        if (sentiment > 0) {
            positiveCount++;
        } else if (sentiment < 0) {
            negativeCount++;
        } else {
            neutralCount++;
        }
    }
    result.positiveCount = positiveCount;
    result.negativeCount = negativeCount;
    result.neutralCount = neutralCount;
    return result;
}

/**
 * Counts the words of all texts and returns the most frequent ones.
 * Words shorter than two characters and stop words are ignored; {@code null}
 * texts are skipped.
 */
private static List<Map.Entry<String, Integer>> rankFrequentWords(List<String> texts, Set<String> stopWords, int limit) {
    Map<String, Integer> wordCount = new HashMap<>();
    for (String text : texts) {
        if (text == null) {
            continue;
        }
        for (String token : text.split(WORD_DELIMITER_REGEX)) {
            // Locale.ROOT keeps lower-casing independent of the JVM's default locale.
            String word = token.trim().toLowerCase(Locale.ROOT);
            if (word.length() >= 2 && !stopWords.contains(word)) {
                wordCount.merge(word, 1, Integer::sum);
            }
        }
    }
    return wordCount.entrySet().stream()
            .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
            .limit(limit)
            .collect(Collectors.toList());
}
/**
 * Produces a cleaned copy of a news item.
 * Each field is routed through the matching {@code DataCleaner} helper and the
 * results are assembled into a new {@code NewsItem}; the input item is only read.
 *
 * @param item the raw news item
 * @return a new item built from the cleaned title, summary, url and publish time
 */
private static NewsItem cleanNewsItem(NewsItem item) {
    final String cleanTitle = DataCleaner.trimAndNormalize(item.getTitle());
    final String cleanSummary = DataCleaner.removeHtmlTags(item.getSummary());
    final String normalizedTime =
            DataCleaner.formatDate(item.getPublishTime(), "yyyy-MM-dd HH:mm:ss");
    // A missing URL is replaced by the empty string rather than left as-is.
    final String safeUrl = DataCleaner.handleMissingValue(item.getUrl(), "");
    return new NewsItem(cleanTitle, cleanSummary, safeUrl, normalizedTime);
}
/**
 * Dictionary-based sentiment score.
 * Every entry of POSITIVE_WORDS found in the text (case-insensitive substring
 * match) adds one point; every entry of NEGATIVE_WORDS found subtracts one.
 * A word counts at most once no matter how often it occurs.
 *
 * @param text text to score; {@code null} or empty scores 0
 * @return positive for positive sentiment, negative for negative, 0 for neutral
 */
private static int analyzeSentiment(String text) {
    if (text == null || text.isEmpty()) {
        return 0;
    }
    final String haystack = text.toLowerCase();
    int positiveHits = 0;
    for (String candidate : POSITIVE_WORDS) {
        if (haystack.contains(candidate.toLowerCase())) {
            positiveHits++;
        }
    }
    int negativeHits = 0;
    for (String candidate : NEGATIVE_WORDS) {
        if (haystack.contains(candidate.toLowerCase())) {
            negativeHits++;
        }
    }
    return positiveHits - negativeHits;
}
/**
 * Returns the stop-word set used to filter tokens during word-frequency counting.
 * This is a plain accessor for the STOP_WORDS constant; the same instance is
 * returned on every call.
 */
private static Set<String> getStopWords() {
return STOP_WORDS;
}
/**
 * Extracts the host part of a URL for grouping news items by source.
 * <p>
 * The {@code http://} / {@code https://} prefix is removed (matched
 * case-insensitively) and the remainder is cut at the first {@code '/'},
 * {@code '?'} or {@code '#'}, so paths, query strings and fragments never end
 * up in the result. Any port or user-info component is left in place.
 *
 * @param url the URL to inspect; may be {@code null}
 * @return the host portion, or {@code "unknown"} when the input is
 *         {@code null}, blank, or contains no host
 */
private static String extractDomain(String url) {
    if (url == null) {
        return "unknown";
    }
    String host = url.trim();
    // regionMatches(ignoreCase = true, ...) accepts "HTTP://" as well as "http://".
    if (host.regionMatches(true, 0, "http://", 0, 7)) {
        host = host.substring(7);
    } else if (host.regionMatches(true, 0, "https://", 0, 8)) {
        host = host.substring(8);
    }
    // The host ends at the first path, query or fragment delimiter.
    int end = host.length();
    for (int i = 0; i < host.length(); i++) {
        char c = host.charAt(i);
        if (c == '/' || c == '?' || c == '#') {
            end = i;
            break;
        }
    }
    host = host.substring(0, end);
    return host.isEmpty() ? "unknown" : host;
}
/**
 * Holder for the figures produced by the news analysis, together with a
 * console report printer. All fields are public and written directly by the
 * analysis code.
 */
public static class AnalysisResult {
    private static final String HEAVY_RULE = "============================================================";
    private static final String LIGHT_RULE = "------------------------------------------------------------";

    public int totalCount;
    public int cleanedCount;
    public int averageTitleLength;
    public int averageSummaryLength;
    public List<Map.Entry<String, Integer>> topTitleWords;
    public List<Map.Entry<String, Integer>> topSummaryWords;
    public Map<String, Integer> sourceDistribution;
    public List<Map.Entry<String, Integer>> dateDistribution;
    public int positiveCount;
    public int negativeCount;
    public int neutralCount;

    /**
     * Prints the full report to standard output: basic statistics, sentiment
     * split, title and summary hot words, source distribution and the recent
     * date distribution. Collections that are {@code null} are rendered as empty.
     */
    public void print() {
        System.out.println("\n" + HEAVY_RULE);
        System.out.println(" [ 新闻数据分析报告 ]");
        System.out.println(HEAVY_RULE);

        printSectionHeader(" [ 基本统计 ]");
        System.out.printf(" %-22s | %12d%n", "原始数据量", totalCount);
        System.out.printf(" %-22s | %12d%n", "清洗后数据量", cleanedCount);
        System.out.printf(" %-22s | %10d 字%n", "平均标题长度", averageTitleLength);
        System.out.printf(" %-22s | %10d 字%n", "平均摘要长度", averageSummaryLength);
        System.out.println(LIGHT_RULE);

        printSectionHeader(" [ 情感分析 ]");
        final int judged = positiveCount + negativeCount + neutralCount;
        printSentimentRow("正面", positiveCount, judged);
        printSentimentRow("负面", negativeCount, judged);
        printSentimentRow("中性", neutralCount, judged);
        System.out.println(LIGHT_RULE);

        printSectionHeader(" [ 标题热词 TOP10 ]");
        System.out.println(ChartGenerator.generateTextBarChart(toOrderedMap(topTitleWords), 25));

        printSectionHeader(" [ 摘要热词 TOP10 ]");
        System.out.println(ChartGenerator.generateTextBarChart(toOrderedMap(topSummaryWords), 25));

        printSectionHeader(" [ 来源分布 ]");
        Map<String, Integer> sources = sourceDistribution;
        if (sources == null) {
            sources = new HashMap<>();
        }
        System.out.println(ChartGenerator.generateTextPieChart(sources));

        printSectionHeader(" [ 日期分布(最近) ]");
        System.out.println(ChartGenerator.generateTextBarChart(toOrderedMap(dateDistribution), 25));

        System.out.println("\n" + HEAVY_RULE);
    }

    /** Prints a blank line, a thin rule, the heading, and another thin rule. */
    private static void printSectionHeader(String heading) {
        System.out.println("\n" + LIGHT_RULE);
        System.out.println(heading);
        System.out.println(LIGHT_RULE);
    }

    /** Prints one sentiment row: label, absolute count and share of {@code whole} in percent. */
    private static void printSentimentRow(String label, int count, int whole) {
        final double share = whole > 0 ? (double) count / whole * 100 : 0;
        System.out.printf(" %-15s | %12d | %6.1f%%%n", label, count, share);
    }

    /** Copies an entry list into an insertion-ordered map; a {@code null} list yields an empty map. */
    private static Map<String, Integer> toOrderedMap(List<Map.Entry<String, Integer>> entries) {
        final Map<String, Integer> ordered = new LinkedHashMap<>();
        if (entries != null) {
            for (Map.Entry<String, Integer> entry : entries) {
                ordered.put(entry.getKey(), entry.getValue());
            }
        }
        return ordered;
    }
}
}

399
project/src/main/java/com/example/chart/JFreeChartGenerator.java

@ -0,0 +1,399 @@
package com.example.chart;
import org.jfree.chart.ChartFactory;
import org.jfree.chart.ChartUtils;
import org.jfree.chart.JFreeChart;
import org.jfree.chart.axis.CategoryAxis;
import org.jfree.chart.axis.CategoryLabelPositions;
import org.jfree.chart.axis.NumberAxis;
import org.jfree.chart.plot.CategoryPlot;
import org.jfree.chart.plot.PiePlot;
import org.jfree.chart.plot.PlotOrientation;
import org.jfree.chart.plot.XYPlot;
import org.jfree.chart.renderer.category.BarRenderer;
import org.jfree.chart.renderer.xy.XYItemRenderer;
import org.jfree.chart.title.TextTitle;
import org.jfree.data.category.DefaultCategoryDataset;
import org.jfree.data.general.DefaultPieDataset;
import org.jfree.data.xy.XYSeries;
import org.jfree.data.xy.XYSeriesCollection;
import java.awt.*;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;
/**
* JFreeChart图表生成器
* 支持柱状图饼图折线图的生成和保存
*/
public class JFreeChartGenerator {
// Shared fonts for every chart produced by this class. All three request the
// "微软雅黑" (Microsoft YaHei) family.
// NOTE(review): presumably chosen so the Chinese titles and labels render; whether
// the family is installed depends on the host system - confirm for non-Windows runs.
// Chart title: bold, 18 pt.
private static final Font TITLE_FONT = new Font("微软雅黑", Font.BOLD, 18);
// Axis labels: plain, 12 pt.
private static final Font LABEL_FONT = new Font("微软雅黑", Font.PLAIN, 12);
// Axis tick labels and pie section labels: plain, 10 pt.
private static final Font TICK_FONT = new Font("微软雅黑", Font.PLAIN, 10);
/**
 * Renders a vertical bar chart and writes it as an 800x400 PNG.
 * All bars belong to a single series, drawn without shadow, and the category
 * labels on the X axis are tilted by 45 degrees.
 *
 * @param title      chart title
 * @param xAxisLabel label of the category (X) axis
 * @param yAxisLabel label of the value (Y) axis
 * @param data       category name mapped to its value; iterated in the map's own order
 * @param outputPath path of the PNG file to write
 * @throws IOException if the image cannot be written
 */
public static void generateBarChart(String title, String xAxisLabel, String yAxisLabel,
        Map<String, Integer> data, String outputPath) throws IOException {
    final DefaultCategoryDataset categoryData = new DefaultCategoryDataset();
    data.forEach((category, count) -> categoryData.addValue(count, "数量", category));

    // Flags, in order: no legend, tooltips enabled, no URL generation.
    final JFreeChart barChart = ChartFactory.createBarChart(
            title, xAxisLabel, yAxisLabel, categoryData,
            PlotOrientation.VERTICAL, false, true, false);
    customizeChart(barChart);

    final CategoryPlot barPlot = barChart.getCategoryPlot();

    // Tilt the category labels on the X axis.
    final CategoryAxis categoryAxis = barPlot.getDomainAxis();
    categoryAxis.setCategoryLabelPositions(CategoryLabelPositions.UP_45);

    // Single-colour bars without the default drop shadow.
    final BarRenderer barRenderer = (BarRenderer) barPlot.getRenderer();
    barRenderer.setSeriesPaint(0, new Color(102, 126, 234));
    barRenderer.setShadowVisible(false);

    saveChart(barChart, outputPath, 800, 400);
}
/**
 * Renders a pie chart with a legend and writes it as an 800x400 PNG.
 * Sections are coloured from a fixed eight-colour palette that repeats when
 * the dataset has more than eight sections.
 *
 * @param title      chart title
 * @param data       section name mapped to its value; iterated in the map's own order
 * @param outputPath path of the PNG file to write
 * @throws IOException if the image cannot be written
 */
@SuppressWarnings({"unchecked", "rawtypes"})
public static void generatePieChart(String title, Map<String, Integer> data, String outputPath) throws IOException {
    DefaultPieDataset dataset = new DefaultPieDataset();
    for (Map.Entry<String, Integer> entry : data.entrySet()) {
        dataset.setValue(entry.getKey(), entry.getValue());
    }
    // Flags, in order: legend shown, tooltips enabled, no URL generation.
    JFreeChart chart = ChartFactory.createPieChart(
            title,
            dataset,
            true,
            true,
            false
    );
    customizeChart(chart);
    // Pie styling.
    PiePlot plot = (PiePlot) chart.getPlot();
    plot.setLabelFont(TICK_FONT);
    plot.setLabelGap(0.02);
    plot.setCircular(true);
    // Section palette.
    Color[] colors = {
            new Color(102, 126, 234),
            new Color(118, 75, 162),
            new Color(121, 80, 120),
            new Color(240, 147, 151),
            new Color(241, 194, 50),
            new Color(255, 182, 193),
            new Color(144, 238, 144),
            new Color(135, 206, 250)
    };
    for (int i = 0; i < dataset.getItemCount(); i++) {
        // Section paints are looked up by section key. Passing the loop index would
        // be boxed into an Integer key that matches none of the String section
        // names, so the palette would silently not be applied.
        plot.setSectionPaint(dataset.getKey(i), colors[i % colors.length]);
    }
    saveChart(chart, outputPath, 800, 400);
}
/**
 * Renders a single-series line chart and writes it as an 800x400 PNG.
 * The map's keys are not plotted: each value is placed at X = 0, 1, 2, ...
 * following the map's iteration order.
 *
 * @param title      chart title
 * @param xAxisLabel label of the X axis
 * @param yAxisLabel label of the Y axis
 * @param data       Y values, ordered by the map's iteration order
 * @param outputPath path of the PNG file to write
 * @throws IOException if the image cannot be written
 */
public static void generateLineChart(String title, String xAxisLabel, String yAxisLabel,
        Map<String, Double> data, String outputPath) throws IOException {
    final XYSeries points = new XYSeries("数值");
    int position = 0;
    for (Double y : data.values()) {
        points.add(position, y);
        position++;
    }

    // Flags, in order: no legend, tooltips enabled, no URL generation.
    final JFreeChart lineChart = ChartFactory.createXYLineChart(
            title, xAxisLabel, yAxisLabel, new XYSeriesCollection(points),
            PlotOrientation.VERTICAL, false, true, false);
    customizeChart(lineChart);

    // Line colour and a 2-pixel stroke for the only series.
    final XYItemRenderer lineRenderer = lineChart.getXYPlot().getRenderer();
    lineRenderer.setSeriesPaint(0, new Color(102, 126, 234));
    lineRenderer.setSeriesStroke(0, new BasicStroke(2.0f));

    saveChart(lineChart, outputPath, 800, 400);
}
/**
 * Renders a vertical stacked bar chart with a legend and writes it as an
 * 800x400 PNG.
 *
 * @param title      chart title
 * @param xAxisLabel label of the category (X) axis
 * @param yAxisLabel label of the value (Y) axis
 * @param data       series name mapped to (category name mapped to value)
 * @param outputPath path of the PNG file to write
 * @throws IOException if the image cannot be written
 */
public static void generateStackedBarChart(String title, String xAxisLabel, String yAxisLabel,
        Map<String, Map<String, Integer>> data, String outputPath) throws IOException {
    final DefaultCategoryDataset stacked = new DefaultCategoryDataset();
    data.forEach((seriesName, perCategory) ->
            perCategory.forEach((category, amount) ->
                    stacked.addValue(amount, seriesName, category)));

    // Flags, in order: legend shown, tooltips enabled, no URL generation.
    final JFreeChart stackedChart = ChartFactory.createStackedBarChart(
            title, xAxisLabel, yAxisLabel, stacked,
            PlotOrientation.VERTICAL, true, true, false);
    customizeChart(stackedChart);
    saveChart(stackedChart, outputPath, 800, 400);
}
/**
 * Renders a multi-series line chart with a legend and writes it as an
 * 800x400 PNG. As in the single-series variant, the inner maps' keys are not
 * plotted: each series' values are placed at X = 0, 1, 2, ... in iteration order.
 * Series colours come from a five-colour palette that repeats when exhausted.
 *
 * @param title      chart title
 * @param xAxisLabel label of the X axis
 * @param yAxisLabel label of the Y axis
 * @param data       series name mapped to that series' Y values
 * @param outputPath path of the PNG file to write
 * @throws IOException if the image cannot be written
 */
public static void generateMultiLineChart(String title, String xAxisLabel, String yAxisLabel,
        Map<String, Map<String, Double>> data, String outputPath) throws IOException {
    final XYSeriesCollection allSeries = new XYSeriesCollection();
    for (Map.Entry<String, Map<String, Double>> named : data.entrySet()) {
        final XYSeries line = new XYSeries(named.getKey());
        int position = 0;
        for (Double y : named.getValue().values()) {
            line.add(position, y);
            position++;
        }
        allSeries.addSeries(line);
    }

    // Flags, in order: legend shown, tooltips enabled, no URL generation.
    final JFreeChart multiLineChart = ChartFactory.createXYLineChart(
            title, xAxisLabel, yAxisLabel, allSeries,
            PlotOrientation.VERTICAL, true, true, false);
    customizeChart(multiLineChart);

    final Color[] palette = {
            new Color(102, 126, 234),
            new Color(240, 147, 151),
            new Color(144, 238, 144),
            new Color(255, 215, 0),
            new Color(135, 206, 250)
    };
    // One palette colour and a 2-pixel stroke per series.
    final XYItemRenderer lineRenderer = multiLineChart.getXYPlot().getRenderer();
    final int seriesTotal = allSeries.getSeriesCount();
    for (int s = 0; s < seriesTotal; s++) {
        lineRenderer.setSeriesPaint(s, palette[s % palette.length]);
        lineRenderer.setSeriesStroke(s, new BasicStroke(2.0f));
    }

    saveChart(multiLineChart, outputPath, 800, 400);
}
/**
 * Applies the shared look to a chart: title font, white backgrounds, light
 * grey gridlines, no plot outline, and the shared fonts on axes and legend.
 * <p>
 * Charts without a title are accepted; the title step is then skipped.
 * XY plots get the same axis fonts as category plots so that axis labels are
 * styled consistently across chart types.
 *
 * @param chart the chart to restyle in place
 */
@SuppressWarnings("rawtypes")
private static void customizeChart(JFreeChart chart) {
    // A chart created with a null title has no TextTitle to restyle.
    TextTitle currentTitle = chart.getTitle();
    if (currentTitle != null) {
        chart.setTitle(new TextTitle(currentTitle.getText(), TITLE_FONT));
    }
    chart.setBackgroundPaint(Color.WHITE);
    // The legend is absent when the chart was created with legend = false.
    if (chart.getLegend() != null) {
        chart.getLegend().setItemFont(TICK_FONT);
    }
    org.jfree.chart.plot.Plot plot = chart.getPlot();
    if (plot instanceof CategoryPlot) {
        CategoryPlot categoryPlot = (CategoryPlot) plot;
        categoryPlot.setBackgroundPaint(Color.WHITE);
        categoryPlot.setRangeGridlinePaint(Color.LIGHT_GRAY);
        categoryPlot.setOutlinePaint(null);
        applyAxisFonts(categoryPlot.getDomainAxis());
        // No cast to NumberAxis: the font setters live on the common Axis base class.
        applyAxisFonts(categoryPlot.getRangeAxis());
    } else if (plot instanceof PiePlot) {
        PiePlot piePlot = (PiePlot) plot;
        piePlot.setBackgroundPaint(Color.WHITE);
        piePlot.setOutlinePaint(null);
    } else if (plot instanceof XYPlot) {
        XYPlot xyPlot = (XYPlot) plot;
        xyPlot.setBackgroundPaint(Color.WHITE);
        xyPlot.setDomainGridlinePaint(Color.LIGHT_GRAY);
        xyPlot.setRangeGridlinePaint(Color.LIGHT_GRAY);
        xyPlot.setOutlinePaint(null);
        applyAxisFonts(xyPlot.getDomainAxis());
        applyAxisFonts(xyPlot.getRangeAxis());
    }
}

/** Sets the shared label and tick-label fonts on an axis; a {@code null} axis is ignored. */
private static void applyAxisFonts(org.jfree.chart.axis.Axis axis) {
    if (axis == null) {
        return;
    }
    axis.setLabelFont(LABEL_FONT);
    axis.setTickLabelFont(TICK_FONT);
}
/**
 * Writes a chart to disk as a PNG, creating the target directory if needed,
 * and prints the absolute path of the saved file.
 *
 * @param chart      the chart to render
 * @param outputPath destination file; may be a bare file name without a directory part
 * @param width      image width in pixels
 * @param height     image height in pixels
 * @throws IOException if the target directory cannot be created or the image cannot be written
 */
private static void saveChart(JFreeChart chart, String outputPath, int width, int height) throws IOException {
    File outputFile = new File(outputPath);
    // getParentFile() is null for a bare file name such as "chart.png"; resolving
    // against the working directory first gives a usable parent in that case.
    File parentDir = outputFile.getAbsoluteFile().getParentFile();
    if (parentDir != null && !parentDir.isDirectory()) {
        // mkdirs() also returns false when another thread created the directory
        // in the meantime, hence the second isDirectory() check.
        if (!parentDir.mkdirs() && !parentDir.isDirectory()) {
            throw new IOException("无法创建输出目录: " + parentDir.getAbsolutePath());
        }
    }
    ChartUtils.saveChartAsPNG(outputFile, chart, width, height);
    System.out.println("图表已保存: " + outputFile.getAbsolutePath());
}
/**
 * Generates the movie analysis charts under {@code charts/}: a genre pie
 * chart, a rating bar chart and a top-directors bar chart. Any input that is
 * {@code null} or empty is skipped.
 *
 * @param genreDistribution  genre name mapped to movie count
 * @param ratingDistribution rating bucket mapped to movie count
 * @param topDirectors       director entries in the order they should appear on the chart
 * @throws IOException if a chart cannot be written
 */
public static void generateMovieCharts(Map<String, Integer> genreDistribution,
        Map<String, Integer> ratingDistribution,
        List<Map.Entry<String, Integer>> topDirectors) throws IOException {
    // Genre distribution pie chart.
    if (genreDistribution != null && !genreDistribution.isEmpty()) {
        generatePieChart("电影类型分布", genreDistribution, "charts/movie_genre_pie.png");
    }
    // Rating distribution bar chart.
    if (ratingDistribution != null && !ratingDistribution.isEmpty()) {
        generateBarChart("电影评分分布", "评分区间", "数量", ratingDistribution, "charts/movie_rating_bar.png");
    }
    // Director ranking bar chart.
    if (topDirectors != null && !topDirectors.isEmpty()) {
        // An insertion-ordered map keeps the list's order on the chart; a plain
        // HashMap would scramble it.
        // NOTE(review): the list is presumably already ranked by the caller - confirm.
        Map<String, Integer> directorData = new java.util.LinkedHashMap<>();
        for (Map.Entry<String, Integer> entry : topDirectors) {
            directorData.put(entry.getKey(), entry.getValue());
        }
        generateBarChart("热门导演 TOP5", "导演", "作品数", directorData, "charts/movie_director_bar.png");
    }
}
/**
 * Generates the book analysis charts under {@code charts/}: a price pie chart
 * and a top-authors bar chart. Any input that is {@code null} or empty is skipped.
 *
 * @param priceDistribution price bucket mapped to book count
 * @param topAuthors        author entries in the order they should appear on the chart
 * @throws IOException if a chart cannot be written
 */
public static void generateBookCharts(Map<String, Integer> priceDistribution,
        List<Map.Entry<String, Integer>> topAuthors) throws IOException {
    // Price distribution pie chart.
    if (priceDistribution != null && !priceDistribution.isEmpty()) {
        generatePieChart("图书价格分布", priceDistribution, "charts/book_price_pie.png");
    }
    // Author ranking bar chart.
    if (topAuthors != null && !topAuthors.isEmpty()) {
        // An insertion-ordered map keeps the list's order on the chart; a plain
        // HashMap would scramble it.
        // NOTE(review): the list is presumably already ranked by the caller - confirm.
        Map<String, Integer> authorData = new java.util.LinkedHashMap<>();
        for (Map.Entry<String, Integer> entry : topAuthors) {
            authorData.put(entry.getKey(), entry.getValue());
        }
        generateBarChart("热门作者 TOP5", "作者", "作品数", authorData, "charts/book_author_bar.png");
    }
}
/**
 * Generates the music analysis charts under {@code charts/}: a song-duration
 * pie chart and a top-artists bar chart. Any input that is {@code null} or
 * empty is skipped.
 *
 * @param durationDistribution duration bucket mapped to song count
 * @param topArtists           artist entries in the order they should appear on the chart
 * @throws IOException if a chart cannot be written
 */
public static void generateMusicCharts(Map<String, Integer> durationDistribution,
        List<Map.Entry<String, Integer>> topArtists) throws IOException {
    // Duration distribution pie chart.
    if (durationDistribution != null && !durationDistribution.isEmpty()) {
        generatePieChart("歌曲时长分布", durationDistribution, "charts/music_duration_pie.png");
    }
    // Artist ranking bar chart.
    if (topArtists != null && !topArtists.isEmpty()) {
        // An insertion-ordered map keeps the list's order on the chart; a plain
        // HashMap would scramble it.
        // NOTE(review): the list is presumably already ranked by the caller - confirm.
        Map<String, Integer> artistData = new java.util.LinkedHashMap<>();
        for (Map.Entry<String, Integer> entry : topArtists) {
            artistData.put(entry.getKey(), entry.getValue());
        }
        generateBarChart("热门歌手 TOP5", "歌手", "歌曲数", artistData, "charts/music_artist_bar.png");
    }
}
/**
 * Generates the news analysis charts under {@code charts/}: a sentiment pie
 * chart (always produced) and a title hot-word bar chart (skipped when the
 * word list is {@code null} or empty).
 *
 * @param positiveCount number of items scored positive
 * @param negativeCount number of items scored negative
 * @param neutralCount  number of items scored neutral
 * @param topTitleWords hot-word entries in the order they should appear on the chart
 * @throws IOException if a chart cannot be written
 */
public static void generateNewsCharts(int positiveCount, int negativeCount, int neutralCount,
        List<Map.Entry<String, Integer>> topTitleWords) throws IOException {
    // Sentiment pie chart. The insertion-ordered map fixes the slice order
    // (positive, negative, neutral) and therefore each slice's palette colour.
    Map<String, Integer> sentimentData = new java.util.LinkedHashMap<>();
    sentimentData.put("正面", positiveCount);
    sentimentData.put("负面", negativeCount);
    sentimentData.put("中性", neutralCount);
    generatePieChart("新闻情感分布", sentimentData, "charts/news_sentiment_pie.png");
    // Title hot-word bar chart.
    if (topTitleWords != null && !topTitleWords.isEmpty()) {
        // An insertion-ordered map keeps the list's order on the chart; a plain
        // HashMap would scramble it.
        // NOTE(review): the list is presumably already ranked by the caller - confirm.
        Map<String, Integer> wordData = new java.util.LinkedHashMap<>();
        for (Map.Entry<String, Integer> entry : topTitleWords) {
            wordData.put(entry.getKey(), entry.getValue());
        }
        generateBarChart("标题热词 TOP10", "词汇", "出现次数", wordData, "charts/news_word_bar.png");
    }
}
}

6
project/src/main/java/com/example/command/Command.java

@ -0,0 +1,6 @@
package com.example.command;
/**
 * A named action that can be run without arguments. Implementations receive
 * whatever they need through their constructors.
 */
public interface Command {
/** Runs the action. Declares no checked exceptions. */
void execute();
/** Returns the name that identifies this command. */
String getName();
}

51
project/src/main/java/com/example/command/CommandInvoker.java

@ -0,0 +1,51 @@
package com.example.command;
import com.example.view.ConsoleView;
import java.util.HashMap;
import java.util.Map;
/**
 * Registry and dispatcher for {@link Command} objects. Commands are stored
 * under a case-insensitive name; failures and unknown names are reported
 * through the supplied {@link ConsoleView}.
 */
public class CommandInvoker {
    private final Map<String, Command> commands = new HashMap<>();
    private final ConsoleView view;

    /**
     * @param view console used for error and listing output
     */
    public CommandInvoker(ConsoleView view) {
        this.view = view;
    }

    /**
     * Registers (or replaces) a command under the given name.
     *
     * @param name    lookup name, stored lowercased
     * @param command the command to run for that name
     * @throws NullPointerException if {@code name} is {@code null}
     */
    public void registerCommand(String name, Command command) {
        java.util.Objects.requireNonNull(name, "name");
        commands.put(normalize(name), command);
    }

    /**
     * Runs the command registered under {@code name}. Exceptions thrown by the
     * command are caught and reported on the view; an unknown or {@code null}
     * name is reported as an unknown command instead of throwing.
     *
     * @param name the command name, matched case-insensitively
     */
    public void executeCommand(String name) {
        Command command = getCommand(name);
        if (command != null) {
            try {
                command.execute();
            } catch (Exception e) {
                view.displayError("执行命令 [" + name + "] 失败: " + e.getMessage());
            }
        } else {
            view.displayError("未知命令: " + name);
        }
    }

    /**
     * @param name the command name, matched case-insensitively
     * @return the registered command, or {@code null} if none (including for a {@code null} name)
     */
    public Command getCommand(String name) {
        return name == null ? null : commands.get(normalize(name));
    }

    /**
     * @param name the command name, matched case-insensitively
     * @return {@code true} if a command is registered under that name; {@code false} for {@code null}
     */
    public boolean hasCommand(String name) {
        return name != null && commands.containsKey(normalize(name));
    }

    /** Prints every registered command name on the view, in the map's iteration order. */
    public void listCommands() {
        view.displayInfo("可用命令:");
        for (String name : commands.keySet()) {
            view.displayMessage(" - " + name);
        }
    }

    /** Removes all registered commands. */
    public void clearCommands() {
        commands.clear();
    }

    /**
     * Lowercases a command name with the root locale so that lookups do not
     * depend on the JVM's default locale.
     */
    private static String normalize(String name) {
        return name.toLowerCase(java.util.Locale.ROOT);
    }
}

39
project/src/main/java/com/example/command/ExportCommand.java

@ -0,0 +1,39 @@
package com.example.command;
import com.example.model.Article;
import com.example.storage.JsonExporter;
import java.util.List;
/**
 * Command that writes a list of articles through {@code JsonExporter}, either
 * to an explicit path or, when none was given, via the exporter's no-path overload.
 */
public class ExportCommand implements Command {
    private final List<Article> articles;
    private final String path;

    /** Creates an export without an explicit target path. */
    public ExportCommand(List<Article> articles) {
        this(articles, null);
    }

    /**
     * @param articles the articles to export
     * @param path     target path; {@code null} or empty selects the no-path overload
     */
    public ExportCommand(List<Article> articles, String path) {
        this.articles = articles;
        this.path = path;
    }

    @Override
    public String getName() {
        return "export";
    }

    /**
     * Performs the export. Any exception is caught and reported on standard
     * error; nothing is rethrown.
     */
    @Override
    public void execute() {
        final boolean explicitTarget = path != null && !path.isEmpty();
        try {
            if (!explicitTarget) {
                JsonExporter.export(articles);
            } else {
                JsonExporter.export(articles, path);
            }
        } catch (Exception e) {
            System.err.println("[ERROR] 导出失败: " + e.getMessage());
        }
    }
}

29
project/src/main/java/com/example/command/GetHotCommand.java

@ -0,0 +1,29 @@
package com.example.command;
import com.example.controller.SpiderController;
import com.example.core.CrawlResult;
import java.util.List;
/**
 * Command that asks the controller for the hot list and keeps the outcome
 * for later retrieval.
 */
public class GetHotCommand implements Command {
    private final SpiderController controller;
    /** Outcome of the most recent {@link #execute()}; {@code null} until it has run. */
    private CrawlResult<List<?>> result;

    /**
     * @param controller the controller that performs the crawl
     */
    public GetHotCommand(SpiderController controller) {
        this.controller = controller;
    }

    @Override
    public String getName() {
        return "gethot";
    }

    /** Fetches the hot list and stores the result. */
    @Override
    public void execute() {
        this.result = this.controller.getHot();
    }

    /** @return the stored result, or {@code null} if the command has not been executed */
    public CrawlResult<List<?>> getResult() {
        return this.result;
    }
}

45
project/src/main/java/com/example/command/ImportCommand.java

@ -0,0 +1,45 @@
package com.example.command;
import com.example.model.Article;
import com.example.storage.JsonImporter;
import java.util.ArrayList;
import java.util.List;
/**
 * Command that loads articles through {@code JsonImporter}, either from an
 * explicit path or, when none was given, via the importer's no-path overload.
 */
public class ImportCommand implements Command {
    private final String path;
    /** Result of the last import; starts out as an empty list. */
    private List<Article> importedData;

    /** Creates an import without an explicit source path. */
    public ImportCommand() {
        this(null);
    }

    /**
     * @param path source path; {@code null} or empty selects the no-path overload
     */
    public ImportCommand(String path) {
        this.path = path;
        this.importedData = new ArrayList<>();
    }

    @Override
    public String getName() {
        return "import";
    }

    /**
     * Performs the import. On any exception the error is reported on standard
     * error and the stored data is reset to an empty list.
     */
    @Override
    public void execute() {
        final boolean explicitSource = path != null && !path.isEmpty();
        List<Article> loaded;
        try {
            loaded = explicitSource ? JsonImporter.importData(path) : JsonImporter.importData();
        } catch (Exception e) {
            System.err.println("[ERROR] 导入失败: " + e.getMessage());
            loaded = new ArrayList<>();
        }
        importedData = loaded;
    }

    /** @return the articles from the last import (empty before the first run or after a failure) */
    public List<Article> getImportedData() {
        return importedData;
    }
}

35
project/src/main/java/com/example/command/SearchCommand.java

@ -0,0 +1,35 @@
package com.example.command;
import com.example.controller.SpiderController;
import com.example.core.CrawlResult;
import java.util.List;
/**
 * Command that runs a keyword search through the controller and keeps the
 * outcome for later retrieval.
 */
public class SearchCommand implements Command {
    private final SpiderController controller;
    private final String keyword;
    /** Outcome of the most recent {@link #execute()}; {@code null} until it has run. */
    private CrawlResult<List<?>> result;

    /**
     * @param controller the controller that performs the search
     * @param keyword    the search keyword passed to the controller unchanged
     */
    public SearchCommand(SpiderController controller, String keyword) {
        this.controller = controller;
        this.keyword = keyword;
    }

    @Override
    public String getName() {
        return "search";
    }

    /** @return the keyword this command searches for */
    public String getKeyword() {
        return this.keyword;
    }

    /** Runs the search and stores the result. */
    @Override
    public void execute() {
        this.result = this.controller.search(this.keyword);
    }

    /** @return the stored result, or {@code null} if the command has not been executed */
    public CrawlResult<List<?>> getResult() {
        return this.result;
    }
}

91
project/src/main/java/com/example/controller/SpiderController.java

@ -0,0 +1,91 @@
package com.example.controller;
import com.example.core.CrawlResult;
import com.example.exception.ExceptionHandler;
import com.example.strategy.SpiderStrategy;
import com.example.view.ConsoleView;
import java.util.List;
/**
 * Mediates between the console view and the currently selected crawl
 * strategy: validates input, runs the crawl, and reports progress and
 * outcome on the view.
 */
public class SpiderController {
    private final ConsoleView view;
    private SpiderStrategy currentStrategy;

    /**
     * @param view console used for progress, success and error output
     */
    public SpiderController(ConsoleView view) {
        this.view = view;
    }

    /** Selects the strategy used by subsequent crawls. */
    public void setStrategy(SpiderStrategy strategy) {
        this.currentStrategy = strategy;
    }

    /** @return the selected strategy, or {@code null} if none has been set */
    public SpiderStrategy getCurrentStrategy() {
        return currentStrategy;
    }

    /** @return {@code true} once a strategy has been selected */
    public boolean isStrategySet() {
        return currentStrategy != null;
    }

    /** @return the strategy's platform name, or a placeholder when no strategy is set */
    public String getPlatformName() {
        if (currentStrategy == null) {
            return "未知平台";
        }
        return currentStrategy.getPlatformName();
    }

    /**
     * Runs a keyword search with the current strategy.
     *
     * @param keyword the search keyword; must not be {@code null} or blank
     * @return the strategy's result, or a failure result when no strategy is
     *         set, the keyword is blank, or the crawl throws
     */
    public CrawlResult<List<?>> search(String keyword) {
        if (currentStrategy == null) {
            view.displayError("未选择爬虫策略");
            return CrawlResult.failure("未选择爬虫策略", null);
        }
        if (keyword == null || keyword.trim().isEmpty()) {
            view.displayError("搜索关键词不能为空");
            return CrawlResult.failure("搜索关键词不能为空", null);
        }
        return runCrawl(keyword,
                "正在搜索: " + keyword,
                "搜索成功,获取到 ",
                "搜索失败: ",
                "搜索 [" + keyword + "] 时发生错误");
    }

    /**
     * Fetches the hot list by running the current strategy with an empty query.
     *
     * @return the strategy's result, or a failure result when no strategy is
     *         set or the crawl throws
     */
    public CrawlResult<List<?>> getHot() {
        if (currentStrategy == null) {
            view.displayError("未选择爬虫策略");
            return CrawlResult.failure("未选择爬虫策略", null);
        }
        return runCrawl("",
                "正在获取热门榜单...",
                "获取成功,获取到 ",
                "获取失败: ",
                "获取热门榜单时发生错误");
    }

    /**
     * Shared crawl flow: announce, execute, report the outcome, and convert
     * any exception into a failure result after handing it to the exception handler.
     */
    private CrawlResult<List<?>> runCrawl(String query, String startMessage, String successPrefix,
            String failurePrefix, String errorContext) {
        try {
            view.displayInfo(startMessage);
            CrawlResult<List<?>> outcome = currentStrategy.executeCrawl(query);
            if (!outcome.isSuccess()) {
                view.displayError(failurePrefix + outcome.getMessage());
            } else {
                view.displaySuccess(successPrefix + getDataSize(outcome) + " 条数据");
            }
            return outcome;
        } catch (Exception e) {
            ExceptionHandler.handleWithContext(errorContext, e);
            return CrawlResult.failure("错误: " + e.getMessage(), null);
        }
    }

    /** @return the number of items in the result's data list, or 0 when the result or its data is {@code null} */
    private int getDataSize(CrawlResult<List<?>> result) {
        final boolean hasData = result != null && result.getData() != null;
        return hasData ? result.getData().size() : 0;
    }
}

47
project/src/main/java/com/example/core/CrawlResult.java

@ -0,0 +1,47 @@
package com.example.core;
import java.time.LocalDateTime;
/**
 * Immutable outcome of a crawl: a success flag, an optional payload, a
 * message, the platform it came from, and the time the result object was created.
 * Instances are obtained through {@link #success} and {@link #failure}.
 *
 * @param <T> payload type
 */
public class CrawlResult<T> {
    private final boolean success;
    private final T data;
    private final String message;
    private final LocalDateTime timestamp;
    private final Platform platform;

    /**
     * Creates a successful result carrying {@code data}, with the fixed
     * success message.
     */
    public static <T> CrawlResult<T> success(T data, Platform platform) {
        return new CrawlResult<>(true, data, "爬取成功", platform);
    }

    /**
     * Creates a failed result with no payload and the given message.
     */
    public static <T> CrawlResult<T> failure(String message, Platform platform) {
        return new CrawlResult<>(false, null, message, platform);
    }

    // The timestamp is taken at construction time.
    private CrawlResult(boolean success, T data, String message, Platform platform) {
        this.timestamp = LocalDateTime.now();
        this.success = success;
        this.data = data;
        this.message = message;
        this.platform = platform;
    }

    /** @return {@code true} for results created by {@link #success} */
    public boolean isSuccess() {
        return this.success;
    }

    /** @return the payload; always {@code null} for failures */
    public T getData() {
        return this.data;
    }

    /** @return the success or failure message */
    public String getMessage() {
        return this.message;
    }

    /** @return when this result object was created */
    public LocalDateTime getTimestamp() {
        return this.timestamp;
    }

    /** @return the platform passed to the factory method; may be {@code null} */
    public Platform getPlatform() {
        return this.platform;
    }
}

260
project/src/main/java/com/example/core/MusicSpider.java

@ -0,0 +1,260 @@
package com.example.core;
import com.example.model.Chart;
import com.example.model.Comment;
import com.example.model.Song;
import java.util.List;
/**
 * Template base class for music-platform crawlers.
 * <p>
 * The public entry points (search, song detail, comments, chart list, chart
 * detail) implement the shared flow of delaying, building a URL, executing the
 * request, parsing it and wrapping the outcome in a {@link CrawlResult}.
 * Subclasses supply the platform-specific URL builders, parsers and headers.
 */
public abstract class MusicSpider {
    protected final Platform platform;
    /** Configured comment cap. Written by {@link #setCommentLimit(int)}; not read anywhere in this class. */
    protected int commentLimit = 200;
    /** Lower bound, in seconds, of the random pause inserted before each request. */
    protected double minDelay = 1.0;
    /** Upper bound, in seconds, of the random pause inserted before each request. */
    protected double maxDelay = 2.0;

    /**
     * @param platform the platform this spider crawls
     */
    protected MusicSpider(Platform platform) {
        this.platform = platform;
    }

    /**
     * Executes an HTTP request. This base implementation performs no request
     * and returns {@code null}; subclasses are expected to override it.
     *
     * @param url     the request URL
     * @param headers the request headers
     * @return the response body, or {@code null} when unavailable
     */
    protected String executeRequest(String url, java.util.Map<String, String> headers) {
        // Subclasses override this method.
        return null;
    }

    /**
     * Searches songs by keyword. This method always reports success: when the
     * parsed result is empty, or when any exception occurs, the backup songs
     * from {@link #generateBackupSongs()} are returned instead.
     *
     * @param keyword the search keyword
     * @return a successful result holding either the parsed or the backup songs
     */
    public CrawlResult<List<Song>> searchSongs(String keyword) {
        try {
            delay();
            String url = buildSearchUrl(keyword);
            String response = executeRequest(url, getHeaders());
            List<Song> songs = parseSearchResponse(response);
            // Fall back to backup data when nothing was parsed.
            if (songs == null || songs.isEmpty()) {
                System.out.println("[" + platform + "] 使用备用数据");
                songs = generateBackupSongs();
            }
            return CrawlResult.success(songs, platform);
        } catch (Exception e) {
            System.out.println("[" + platform + "] 搜索异常: " + e.getMessage());
            // Deliberate best-effort: backup data is returned on failure as well.
            List<Song> songs = generateBackupSongs();
            return CrawlResult.success(songs, platform);
        }
    }

    /**
     * Builds the hard-coded backup song list used when a search yields nothing.
     * Subclasses may override this to provide platform-specific backup data.
     *
     * @return ten placeholder songs with ids 1..10
     */
    protected List<Song> generateBackupSongs() {
        List<Song> songs = new java.util.ArrayList<>();
        String[] songNames = {"晴天", "七里香", "夜曲", "稻香", "告白气球", "发如雪", "珊瑚海", "简单爱", "龙卷风", "爱在西元前"};
        String[] artists = {"周杰伦", "周杰伦", "周杰伦", "周杰伦", "周杰伦", "周杰伦", "周杰伦/梁心颐", "周杰伦", "周杰伦", "周杰伦"};
        String platformName = platform.name().toLowerCase().replace("_", " ");
        for (int i = 0; i < songNames.length; i++) {
            songs.add(new Song(i + 1, songNames[i], java.util.List.of(artists[i]), "", "未知", platformName));
        }
        return songs;
    }

    /**
     * Fetches the detail of one song.
     *
     * @param songId the song id
     * @return the parsed song, or a failure result when the response is empty,
     *         nothing could be parsed, or an exception occurred
     */
    public final CrawlResult<Song> getSongDetail(long songId) {
        try {
            delay();
            String url = buildSongDetailUrl(songId);
            String response = executeRequest(url, getHeaders());
            if (response == null || response.isEmpty()) {
                return CrawlResult.failure("无法获取歌曲详情", platform);
            }
            Song song = parseSongDetailResponse(response, songId);
            if (song == null) {
                return CrawlResult.failure("未找到歌曲ID: " + songId, platform);
            }
            return CrawlResult.success(song, platform);
        } catch (Exception e) {
            return CrawlResult.failure("获取歌曲详情失败: " + e.getMessage(), platform);
        }
    }

    /**
     * Fetches up to {@code limit} comments of a song.
     *
     * @param songId the song id
     * @param limit  maximum number of comments to return
     * @return the collected comments, or a failure result when none were
     *         collected or an exception occurred
     */
    public final CrawlResult<List<Comment>> getComments(long songId, int limit) {
        try {
            List<Comment> allComments = fetchComments(songId, limit);
            if (allComments.isEmpty()) {
                return CrawlResult.failure("该歌曲暂无评论", platform);
            }
            return CrawlResult.success(allComments, platform);
        } catch (Exception e) {
            return CrawlResult.failure("获取评论失败: " + e.getMessage(), platform);
        }
    }

    /** Builds the search URL for a keyword. */
    protected abstract String buildSearchUrl(String keyword);

    /** Builds the song-detail URL for a song id. */
    protected abstract String buildSongDetailUrl(long songId);

    /** Builds the comment-page URL for a song id, page size and offset. */
    protected abstract String buildCommentUrl(long songId, int limit, int offset);

    /** Parses a search response into songs. */
    protected abstract List<Song> parseSearchResponse(String response);

    /** Parses a song-detail response into a song. */
    protected abstract Song parseSongDetailResponse(String response, long songId);

    /** Parses one comment-page response into comments. */
    protected abstract List<Comment> parseCommentResponse(String response);

    /** Returns the request headers to send with every request. */
    protected abstract java.util.Map<String, String> getHeaders();

    /**
     * Pages through a song's comments, at most 100 per request, until
     * {@code limit} comments are collected, a page comes back empty, or a page
     * is shorter than requested.
     *
     * @param songId the song id
     * @param limit  maximum number of comments to collect
     * @return the collected comments; empty when {@code limit} is not positive
     */
    protected List<Comment> fetchComments(long songId, int limit) {
        List<Comment> result = new java.util.ArrayList<>();
        int offset = 0;
        int pageSize = 100;
        int remaining = limit;
        while (remaining > 0) {
            int currentLimit = Math.min(pageSize, remaining);
            delay();
            String url = buildCommentUrl(songId, currentLimit, offset);
            String response = executeRequest(url, getHeaders());
            if (response == null || response.isEmpty()) {
                break;
            }
            List<Comment> pageComments = parseCommentResponse(response);
            if (pageComments == null || pageComments.isEmpty()) {
                break;
            }
            for (Comment comment : pageComments) {
                if (result.size() >= limit) break;
                result.add(comment);
            }
            // A short page means there is nothing more to fetch.
            if (pageComments.size() < currentLimit) {
                break;
            }
            offset += currentLimit;
            remaining = limit - result.size();
            System.out.println("[进度] 已获取 " + result.size() + " 条评论...");
        }
        return result;
    }

    /**
     * Sleeps for a random duration between the configured delay bounds.
     * <p>
     * The bounds are normalised first: negative values are clamped to zero and
     * an inverted range is reordered, so a bad {@link #setDelayRange} call can
     * never produce a negative sleep time (which {@code Thread.sleep} rejects).
     * If the thread is interrupted, the interrupt flag is restored and the
     * method returns early.
     */
    protected void delay() {
        double lower = Math.max(0.0, Math.min(minDelay, maxDelay));
        double upper = Math.max(lower, Math.max(minDelay, maxDelay));
        // ThreadLocalRandom avoids allocating and seeding a new generator per call.
        double delaySeconds = lower
                + java.util.concurrent.ThreadLocalRandom.current().nextDouble() * (upper - lower);
        try {
            Thread.sleep((long) (delaySeconds * 1000));
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /** @return the platform this spider crawls */
    public Platform getPlatform() {
        return platform;
    }

    /** Stores the configured comment cap. */
    public void setCommentLimit(int commentLimit) {
        this.commentLimit = commentLimit;
    }

    /**
     * Sets the bounds, in seconds, of the random pause used by {@link #delay()}.
     *
     * @param minDelay lower bound in seconds
     * @param maxDelay upper bound in seconds
     */
    public void setDelayRange(double minDelay, double maxDelay) {
        this.minDelay = minDelay;
        this.maxDelay = maxDelay;
    }

    // ==================== Chart (ranking list) methods ====================

    /**
     * Fetches the charts supported by the platform.
     *
     * @return the chart list, or a failure result when the response is empty,
     *         no chart was parsed, or an exception occurred
     */
    public final CrawlResult<List<Chart>> getChartList() {
        try {
            delay();
            String url = buildChartListUrl();
            String response = executeRequest(url, getHeaders());
            if (response == null || response.isEmpty()) {
                return CrawlResult.failure("请求无响应", platform);
            }
            List<Chart> charts = parseChartListResponse(response);
            if (charts == null || charts.isEmpty()) {
                return CrawlResult.failure("未找到榜单", platform);
            }
            return CrawlResult.success(charts, platform);
        } catch (Exception e) {
            return CrawlResult.failure("获取榜单列表失败: " + e.getMessage(), platform);
        }
    }

    /**
     * Fetches one chart together with its entries.
     *
     * @param chartId the chart id
     * @param limit   maximum number of entries requested
     * @return the chart, or a failure result when the response is empty,
     *         nothing could be parsed, or an exception occurred
     */
    public final CrawlResult<Chart> getChartDetail(String chartId, int limit) {
        try {
            delay();
            String url = buildChartDetailUrl(chartId, limit);
            String response = executeRequest(url, getHeaders());
            if (response == null || response.isEmpty()) {
                return CrawlResult.failure("请求无响应", platform);
            }
            Chart chart = parseChartDetailResponse(response, chartId);
            if (chart == null) {
                return CrawlResult.failure("未找到榜单: " + chartId, platform);
            }
            return CrawlResult.success(chart, platform);
        } catch (Exception e) {
            return CrawlResult.failure("获取榜单详情失败: " + e.getMessage(), platform);
        }
    }

    /**
     * Builds the chart-list URL.
     *
     * @return the chart-list API URL
     */
    protected abstract String buildChartListUrl();

    /**
     * Builds the chart-detail URL.
     *
     * @param chartId the chart id
     * @param limit   maximum number of entries to request
     * @return the chart-detail API URL
     */
    protected abstract String buildChartDetailUrl(String chartId, int limit);

    /**
     * Parses the chart-list response.
     *
     * @param response the API response JSON
     * @return the parsed charts
     */
    protected abstract List<Chart> parseChartListResponse(String response);

    /**
     * Parses the chart-detail response.
     *
     * @param response the API response JSON
     * @param chartId  the chart id
     * @return the chart including its entries
     */
    protected abstract Chart parseChartDetailResponse(String response, String chartId);
}

33
project/src/main/java/com/example/core/Platform.java

@ -0,0 +1,33 @@
package com.example.core;
/**
 * Crawl targets known to the application, each with a human-readable display
 * name and the site's domain.
 */
public enum Platform {
    // Music platforms
    NETEASE("网易云音乐", "music.163.com"),
    // News platforms
    CHINANEWS("中国新闻网", "chinanews.com.cn"),
    // Book platforms
    DANGDANG("当当图书", "dangdang.com"),
    JD("京东图书", "jd.com"),
    // Film platforms
    MTIME("时光网", "mtime.com"),
    DOUBAN("豆瓣电影", "douban.com");

    private final String domain;
    private final String displayName;

    Platform(String displayName, String domain) {
        this.domain = domain;
        this.displayName = displayName;
    }

    /** @return the site's domain, e.g. {@code music.163.com} */
    public String getDomain() {
        return this.domain;
    }

    /** @return the name shown to users */
    public String getDisplayName() {
        return this.displayName;
    }
}

47
project/src/main/java/com/example/exception/ExceptionHandler.java

@ -0,0 +1,47 @@
package com.example.exception;
/**
 * Console reporting for exceptions. Messages go to standard error with ANSI
 * colour codes: a red category tag for the error itself and blue tags for
 * context, exception class and cause.
 */
public class ExceptionHandler {
    private static final String RESET = "\033[0m";
    private static final String RED = "\033[31m";
    private static final String BLUE = "\033[34m";

    /**
     * Prints a categorised error line for the exception, then its class name
     * and cause via {@link #logError}. The three specific exception types get
     * a fixed tag; any other {@code SpiderException} is tagged with its own
     * error code; everything else is tagged as unknown.
     *
     * @param e the exception to report
     */
    public static void handle(Exception e) {
        final String tag;
        final String code;
        if (e instanceof NetworkException) {
            tag = "[网络错误]";
            code = "NETWORK_ERROR";
        } else if (e instanceof ParseException) {
            tag = "[解析错误]";
            code = "PARSE_ERROR";
        } else if (e instanceof StorageException) {
            tag = "[存储错误]";
            code = "STORAGE_ERROR";
        } else if (e instanceof SpiderException) {
            code = ((SpiderException) e).getErrorCode();
            tag = "[" + code + "]";
        } else {
            tag = "[未知错误]";
            code = "UNKNOWN";
        }
        System.err.println(RED + tag + RESET + " " + e.getMessage());
        logError(code, e);
    }

    /**
     * Prints a context line and then reports the exception via {@link #handle}.
     *
     * @param context description of what was being attempted
     * @param e       the exception to report
     */
    public static void handleWithContext(String context, Exception e) {
        System.err.println(BLUE + "[上下文]" + RESET + " " + context);
        handle(e);
    }

    /**
     * Prints the exception's class name and, when present, the message of its cause.
     *
     * @param errorCode error category; currently not included in the output
     * @param e         the exception to describe
     */
    public static void logError(String errorCode, Exception e) {
        System.err.println(BLUE + "[堆栈]" + RESET + " " + e.getClass().getName());
        final Throwable cause = e.getCause();
        if (cause != null) {
            System.err.println(BLUE + "[原因]" + RESET + " " + cause.getMessage());
        }
    }

    /**
     * Formats an exception as a single line: the error code in brackets for a
     * {@code SpiderException}, an "unknown error" tag otherwise, followed by the message.
     *
     * @param e the exception to format
     * @return the formatted line
     */
    public static String getErrorMessage(Exception e) {
        final String prefix = (e instanceof SpiderException)
                ? "[" + ((SpiderException) e).getErrorCode() + "] "
                : "[未知错误] ";
        return prefix + e.getMessage();
    }
}

12
project/src/main/java/com/example/exception/NetworkException.java

@ -0,0 +1,12 @@
package com.example.exception;
/**
 * {@link SpiderException} specialization that always reports the error code
 * {@code NETWORK_ERROR}.
 */
public class NetworkException extends SpiderException {
    private static final String ERROR_CODE = "NETWORK_ERROR";

    /**
     * @param message detail message
     */
    public NetworkException(String message) {
        super(ERROR_CODE, message);
    }

    /**
     * @param message detail message
     * @param cause   underlying cause
     */
    public NetworkException(String message, Throwable cause) {
        super(ERROR_CODE, message, cause);
    }
}

12
project/src/main/java/com/example/exception/ParseException.java

@ -0,0 +1,12 @@
package com.example.exception;
/**
 * {@link SpiderException} specialization that always reports the error code
 * {@code PARSE_ERROR}.
 */
public class ParseException extends SpiderException {
    private static final String ERROR_CODE = "PARSE_ERROR";

    /**
     * @param message detail message
     */
    public ParseException(String message) {
        super(ERROR_CODE, message);
    }

    /**
     * @param message detail message
     * @param cause   underlying cause
     */
    public ParseException(String message, Throwable cause) {
        super(ERROR_CODE, message, cause);
    }
}

19
project/src/main/java/com/example/exception/SpiderException.java

@ -0,0 +1,19 @@
package com.example.exception;
/**
 * Checked base exception of the crawler; pairs the usual message/cause with a
 * string error code.
 */
public class SpiderException extends Exception {
    private final String errorCode;

    /**
     * @param errorCode code identifying the error category
     * @param message   detail message
     */
    public SpiderException(String errorCode, String message) {
        super(message);
        this.errorCode = errorCode;
    }

    /**
     * @param errorCode code identifying the error category
     * @param message   detail message
     * @param cause     underlying cause
     */
    public SpiderException(String errorCode, String message, Throwable cause) {
        super(message, cause);
        this.errorCode = errorCode;
    }

    /** @return the error code supplied at construction */
    public String getErrorCode() {
        return this.errorCode;
    }
}

12
project/src/main/java/com/example/exception/StorageException.java

@ -0,0 +1,12 @@
package com.example.exception;
/**
 * {@link SpiderException} specialization that always reports the error code
 * {@code STORAGE_ERROR}.
 */
public class StorageException extends SpiderException {
    private static final String ERROR_CODE = "STORAGE_ERROR";

    /**
     * @param message detail message
     */
    public StorageException(String message) {
        super(ERROR_CODE, message);
    }

    /**
     * @param message detail message
     * @param cause   underlying cause
     */
    public StorageException(String message, Throwable cause) {
        super(ERROR_CODE, message, cause);
    }
}

56
project/src/main/java/com/example/invoker/SpiderInvoker.java

@ -0,0 +1,56 @@
package com.example.invoker;
import com.example.core.CrawlResult;
import com.example.exception.ExceptionHandler;
import com.example.strategy.SpiderStrategy;
import com.example.view.ConsoleView;
import java.util.List;
/**
 * Holds the currently selected {@link SpiderStrategy} and runs crawls through it,
 * reporting problems via the supplied {@link ConsoleView}.
 */
public class SpiderInvoker {
    private SpiderStrategy strategy;
    private final ConsoleView view;

    /**
     * @param view console view used for info and error output
     */
    public SpiderInvoker(ConsoleView view) {
        this.view = view;
    }

    /**
     * Replaces the active strategy and announces the switch on the view.
     *
     * @param strategy the strategy to use from now on
     */
    public void setStrategy(SpiderStrategy strategy) {
        this.strategy = strategy;
        String platformName = getPlatformName();
        view.displayInfo("已切换到 " + platformName + " 平台");
    }

    /** @return the active strategy, or {@code null} if none has been set */
    public SpiderStrategy getStrategy() {
        return strategy;
    }

    /** @return the active strategy's platform name, or a placeholder when no strategy is set */
    public String getPlatformName() {
        if (strategy == null) {
            return "未知";
        }
        return strategy.getPlatformName();
    }

    /** @return {@code true} when a strategy has been set */
    public boolean hasStrategy() {
        return strategy != null;
    }

    /**
     * Runs the active strategy for the given keyword.
     *
     * @param keyword search keyword passed to the strategy
     * @return the strategy's result; a failure result (with a {@code null} platform)
     *         when no strategy is set or the strategy throws
     */
    public CrawlResult<List<?>> execute(String keyword) {
        if (strategy != null) {
            try {
                return strategy.executeCrawl(keyword);
            } catch (Exception failure) {
                ExceptionHandler.handleWithContext("执行爬取时发生错误", failure);
                return CrawlResult.failure("错误: " + failure.getMessage(), null);
            }
        }
        view.displayError("未设置爬虫策略");
        return CrawlResult.failure("未设置爬虫策略", null);
    }

    /** Equivalent to {@link #execute(String)}. */
    public CrawlResult<List<?>> search(String keyword) {
        return execute(keyword);
    }

    /** Runs {@link #execute(String)} with an empty keyword. */
    public CrawlResult<List<?>> getHot() {
        return execute("");
    }
}

37
project/src/main/java/com/example/model/Article.java

@ -0,0 +1,37 @@
package com.example.model;
import java.time.LocalDateTime;
public class Article {
private final String title;
private final String url;
private final String content;
private final String author;
private final String publishTime;
private final LocalDateTime crawledAt;
public Article(String title, String url, String content, String author, String publishTime) {
this.title = title;
this.url = url;
this.content = content;
this.author = author;
this.publishTime = publishTime;
this.crawledAt = LocalDateTime.now();
}
public Article(String title, String url, String content, String author, String publishTime, LocalDateTime crawledAt) {
this.title = title;
this.url = url;
this.content = content;
this.author = author;
this.publishTime = publishTime;
this.crawledAt = crawledAt;
}
public String getTitle() { return title; }
public String getUrl() { return url; }
public String getContent() { return content; }
public String getAuthor() { return author; }
public String getPublishTime() { return publishTime; }
public LocalDateTime getCrawledAt() { return crawledAt; }
}

121
project/src/main/java/com/example/model/BookItem.java

@ -0,0 +1,121 @@
package com.example.model;
public class BookItem {
private final String id;
private final String title;
private final String author;
private final String rating;
private final String publisher;
private final String publishDate;
private final String price;
public BookItem(String title, String info, String rating, String url) {
this.id = extractIdFromUrl(url);
this.title = title;
this.author = extractAuthor(info);
this.rating = rating;
this.publisher = extractPublisher(info);
this.publishDate = extractPublishDate(info);
this.price = "";
}
public BookItem(String id, String title, String author, String rating, String publisher, String publishDate) {
this.id = id;
this.title = title;
this.author = author;
this.rating = rating;
this.publisher = publisher;
this.publishDate = publishDate;
this.price = "";
}
public BookItem(String id, String title, String author, String rating, String publisher, String publishDate, String price) {
this.id = id;
this.title = title;
this.author = author;
this.rating = rating;
this.publisher = publisher;
this.publishDate = publishDate;
this.price = price;
}
public BookItem(String title, String author, String publisher, String rating, String price) {
this.id = "";
this.title = title;
this.author = author;
this.rating = rating;
this.publisher = publisher;
this.publishDate = "";
this.price = price;
}
private String extractIdFromUrl(String url) {
if (url != null && url.contains("/subject/")) {
int start = url.indexOf("/subject/") + 9;
int end = url.indexOf("/", start);
if (end > start) {
return url.substring(start, end);
}
}
return "";
}
private String extractAuthor(String info) {
if (info != null && !info.isEmpty()) {
String[] parts = info.split("/");
if (parts.length > 0) {
return parts[0].trim();
}
}
return "";
}
private String extractPublisher(String info) {
if (info != null && !info.isEmpty()) {
String[] parts = info.split("/");
if (parts.length > 1) {
return parts[parts.length - 2].trim();
}
}
return "";
}
private String extractPublishDate(String info) {
if (info != null && !info.isEmpty()) {
String[] parts = info.split("/");
if (parts.length > 0) {
String lastPart = parts[parts.length - 1].trim();
if (lastPart.matches(".*\\d{4}.*")) {
return lastPart;
}
}
}
return "";
}
public String getId() { return id; }
public String getTitle() { return title; }
public String getAuthor() { return author; }
public String getRating() { return rating; }
public String getPublisher() { return publisher; }
public String getPublishDate() { return publishDate; }
public String getPrice() { return price; }
@Override
public String toString() {
return String.format("书名: %s\n作者: %s\n评分: %s", title, author, rating);
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
BookItem bookItem = (BookItem) o;
return title != null ? title.equals(bookItem.title) : bookItem.title == null;
}
@Override
public int hashCode() {
return title != null ? title.hashCode() : 0;
}
}

86
project/src/main/java/com/example/model/Chart.java

@ -0,0 +1,86 @@
package com.example.model;
import java.util.ArrayList;
import java.util.List;
/**
 * A chart (ranked list) together with its entries.
 *
 * <p>Scalar fields are fixed at construction; entries can be appended through
 * {@link #addItem(ChartItem)}.
 */
public class Chart {
    private final String chartId;
    private final String name;
    private final ChartType type;
    private final String coverUrl;
    private final String updateTime;
    private final String description;
    private final List<ChartItem> items;
    private final String platform;
    private final int totalCount;

    /**
     * Creates a chart with no entries and a total count of 0.
     */
    public Chart(String chartId, String name, ChartType type, String coverUrl,
                 String updateTime, String description, String platform) {
        this(chartId, name, type, coverUrl, updateTime, description, new ArrayList<>(), platform, 0);
    }

    /**
     * Creates a chart with the given entries.
     *
     * @param items      initial entries; copied into a private mutable list so that
     *                   {@link #addItem(ChartItem)} works even when the caller passes
     *                   an immutable list; {@code null} is treated as empty
     * @param totalCount total entry count reported for the chart, stored as given
     */
    public Chart(String chartId, String name, ChartType type, String coverUrl,
                 String updateTime, String description, List<ChartItem> items,
                 String platform, int totalCount) {
        this.chartId = chartId;
        this.name = name;
        this.type = type;
        this.coverUrl = coverUrl;
        this.updateTime = updateTime;
        this.description = description;
        this.items = items != null ? new ArrayList<>(items) : new ArrayList<>();
        this.platform = platform;
        this.totalCount = totalCount;
    }

    public String getChartId() {
        return chartId;
    }

    public String getName() {
        return name;
    }

    public ChartType getType() {
        return type;
    }

    public String getCoverUrl() {
        return coverUrl;
    }

    public String getUpdateTime() {
        return updateTime;
    }

    public String getDescription() {
        return description;
    }

    /** @return the live, mutable entry list held by this chart */
    public List<ChartItem> getItems() {
        return items;
    }

    public String getPlatform() {
        return platform;
    }

    /** @return the total count supplied at construction (independent of {@link #getItemCount()}) */
    public int getTotalCount() {
        return totalCount;
    }

    /** @return the number of entries currently held */
    public int getItemCount() {
        return items.size();
    }

    /**
     * Appends an entry; {@code null} entries are ignored.
     */
    public void addItem(ChartItem item) {
        if (item != null) {
            items.add(item);
        }
    }

    @Override
    public String toString() {
        // A chart built with a null type must still be printable.
        String typeName = type != null ? type.getDisplayName() : "未知";
        return String.format("%s [%s] - %d首歌曲", name, typeName, getItemCount());
    }
}

99
project/src/main/java/com/example/model/ChartItem.java

@ -0,0 +1,99 @@
package com.example.model;
import java.util.List;
/**
 * Immutable chart entry: one song at a given rank, with counters and display helpers.
 */
public class ChartItem {
    private final int rank;
    private final long songId;
    private final String songName;
    private final List<String> artists;
    private final String album;
    private final long playCount;
    private final long likeCount;
    private final String coverUrl;
    private final int rankChange;

    public ChartItem(int rank, long songId, String songName, List<String> artists,
                     String album, long playCount, long likeCount,
                     String coverUrl, int rankChange) {
        this.rank = rank;
        this.songId = songId;
        this.songName = songName;
        this.artists = artists;
        this.album = album;
        this.playCount = playCount;
        this.likeCount = likeCount;
        this.coverUrl = coverUrl;
        this.rankChange = rankChange;
    }

    public int getRank() {
        return rank;
    }

    public long getSongId() {
        return songId;
    }

    public String getSongName() {
        return songName;
    }

    public List<String> getArtists() {
        return artists;
    }

    /** @return artist names joined with ", ", or a placeholder when the list is null */
    public String getArtistsString() {
        if (artists != null) {
            return String.join(", ", artists);
        }
        return "未知";
    }

    public String getAlbum() {
        return album;
    }

    public long getPlayCount() {
        return playCount;
    }

    /**
     * @return the play count as plain digits below 10,000, in units of 10,000 below
     *         100,000,000, and in units of 100,000,000 from there on (one decimal place)
     */
    public String getPlayCountFormatted() {
        if (playCount < 10000) {
            return String.valueOf(playCount);
        }
        if (playCount < 100000000) {
            return String.format("%.1f万", playCount / 10000.0);
        }
        return String.format("%.1f亿", playCount / 100000000.0);
    }

    public long getLikeCount() {
        return likeCount;
    }

    /** @return the like count as plain digits below 10,000, otherwise in units of 10,000 */
    public String getLikeCountFormatted() {
        return likeCount >= 10000
                ? String.format("%.1f万", likeCount / 10000.0)
                : String.valueOf(likeCount);
    }

    public String getCoverUrl() {
        return coverUrl;
    }

    public int getRankChange() {
        return rankChange;
    }

    /** @return an up arrow plus the change, a down arrow plus its magnitude, or "-" for no change */
    public String getRankChangeSymbol() {
        if (rankChange == 0) {
            return "-";
        }
        if (rankChange > 0) {
            return "↑" + rankChange;
        }
        return "↓" + Math.abs(rankChange);
    }

    @Override
    public String toString() {
        return String.format("#%d %s - %s", rank, songName, getArtistsString());
    }
}

39
project/src/main/java/com/example/model/ChartType.java

@ -0,0 +1,39 @@
package com.example.model;
/**
 * Chart categories, each with a display name and a short code.
 */
public enum ChartType {
    HOT("热歌榜", "hot"),
    NEW("新歌榜", "new"),
    RISE("飙升榜", "rise"),
    ORIGINAL("原创榜", "original"),
    CLASSICAL("经典榜", "classical"),
    RECOMMEND("推荐榜", "recommend"),
    ELECTRONIC("电音榜", "electronic"),
    ROCK("摇滚榜", "rock"),
    FOLK("民谣榜", "folk"),
    RAP("说唱榜", "rap");

    private final String displayName;
    private final String code;

    ChartType(String label, String shortCode) {
        displayName = label;
        code = shortCode;
    }

    /** @return the display name of this chart type */
    public String getDisplayName() {
        return this.displayName;
    }

    /** @return the short code of this chart type */
    public String getCode() {
        return this.code;
    }

    /**
     * Looks a type up by its code, ignoring case.
     *
     * @param code the code to look up; may be {@code null}
     * @return the matching type, or {@link #HOT} when nothing matches
     */
    public static ChartType fromCode(String code) {
        ChartType match = HOT;
        for (ChartType candidate : values()) {
            if (candidate.code.equalsIgnoreCase(code)) {
                match = candidate;
                break;
            }
        }
        return match;
    }
}

43
project/src/main/java/com/example/model/Comment.java

@ -0,0 +1,43 @@
package com.example.model;
/**
 * Immutable user comment with display helpers.
 */
public class Comment {
    private final String content;
    private final String userNickname;
    private final int likedCount;
    private final long commentId;

    public Comment(String content, String userNickname, int likedCount, long commentId) {
        this.content = content;
        this.userNickname = userNickname;
        this.likedCount = likedCount;
        this.commentId = commentId;
    }

    /** @return the raw content as supplied */
    public String getContent() {
        return content;
    }

    /**
     * @return a placeholder for null/empty content; otherwise the content, cut to its
     *         first 150 characters plus "..." when it is longer than that
     */
    public String getDisplayContent() {
        boolean missing = content == null || content.isEmpty();
        if (missing) {
            return "[无内容]";
        }
        if (content.length() <= 150) {
            return content;
        }
        return content.substring(0, 150) + "...";
    }

    /** @return the nickname, or a placeholder when it is null or empty */
    public String getUserNickname() {
        if (userNickname != null && !userNickname.isEmpty()) {
            return userNickname;
        }
        return "匿名用户";
    }

    public int getLikedCount() {
        return likedCount;
    }

    public long getCommentId() {
        return commentId;
    }

    @Override
    public String toString() {
        return String.format("[%s] %s (点赞: %d)", getUserNickname(), getDisplayContent(), likedCount);
    }
}

78
project/src/main/java/com/example/model/MovieItem.java

@ -0,0 +1,78 @@
package com.example.model;
/**
 * Immutable description of a movie.
 */
public class MovieItem {
    /** Path marker that immediately precedes the subject id in a detail URL. */
    private static final String SUBJECT_MARKER = "/subject/";
    /** Characters that terminate the id segment of a URL. */
    private static final String ID_TERMINATORS = "/?#";
    /** Four-digit year followed by a date separator; compiled once instead of per call. */
    private static final java.util.regex.Pattern YEAR_PATTERN =
            java.util.regex.Pattern.compile("(\\d{4})[-/年]");
    /** Genres recognised in the info text, in match-priority order. */
    private static final String[] KNOWN_GENRES =
            {"剧情", "喜剧", "动作", "爱情", "科幻", "悬疑", "惊悚", "恐怖", "动画", "纪录片"};

    private final String id;
    private final String title;
    private final String rating;
    private final String releaseDate;
    private final String genre;
    private final String director;

    /**
     * Builds an item from free-form info text and a detail URL.
     *
     * @param title  movie title
     * @param info   free-form text searched for a release year and a known genre
     * @param rating rating text
     * @param url    detail URL containing {@code /subject/<id>}
     */
    public MovieItem(String title, String info, String rating, String url) {
        this(extractIdFromUrl(url), title, rating,
                extractReleaseDate(info), extractGenre(info), extractDirector(info));
    }

    /** Creates an item with every field supplied explicitly. */
    public MovieItem(String id, String title, String rating, String releaseDate, String genre, String director) {
        this.id = id;
        this.title = title;
        this.rating = rating;
        this.releaseDate = releaseDate;
        this.genre = genre;
        this.director = director;
    }

    /**
     * Extracts the id segment that follows {@code /subject/}.
     *
     * <p>The id ends at the next {@code /}, {@code ?} or {@code #}, or at the end
     * of the URL, so URLs without a trailing slash still yield their id.
     *
     * @return the id, or an empty string when the URL is null or has no marker
     */
    private static String extractIdFromUrl(String url) {
        if (url == null) {
            return "";
        }
        int marker = url.indexOf(SUBJECT_MARKER);
        if (marker < 0) {
            return "";
        }
        int start = marker + SUBJECT_MARKER.length();
        int end = start;
        while (end < url.length() && ID_TERMINATORS.indexOf(url.charAt(end)) < 0) {
            end++;
        }
        return url.substring(start, end);
    }

    /**
     * Returns the first four-digit year that is followed by {@code -}, {@code /}
     * or {@code 年}, suffixed with {@code 年}; empty when none is found.
     */
    private static String extractReleaseDate(String info) {
        if (info == null) {
            return "";
        }
        java.util.regex.Matcher matcher = YEAR_PATTERN.matcher(info);
        return matcher.find() ? matcher.group(1) + "年" : "";
    }

    /**
     * Returns the first known genre (in {@link #KNOWN_GENRES} order) that occurs
     * in the info text; empty when none occurs.
     */
    private static String extractGenre(String info) {
        if (info == null) {
            return "";
        }
        for (String candidate : KNOWN_GENRES) {
            if (info.contains(candidate)) {
                return candidate;
            }
        }
        return "";
    }

    /**
     * Placeholder: always returns an empty string; the info text is not inspected.
     */
    private static String extractDirector(String info) {
        return "";
    }

    public String getId() {
        return id;
    }

    public String getTitle() {
        return title;
    }

    public String getRating() {
        return rating;
    }

    public String getReleaseDate() {
        return releaseDate;
    }

    public String getGenre() {
        return genre;
    }

    public String getDirector() {
        return director;
    }

    @Override
    public String toString() {
        return String.format("片名: %s\n评分: %s\n上映时间: %s", title, rating, releaseDate);
    }
}

29
project/src/main/java/com/example/model/NewsItem.java

@ -0,0 +1,29 @@
package com.example.model;
/**
 * Immutable news headline entry.
 */
public class NewsItem {
    private final String title;
    private final String url;
    private final String publishTime;
    private final String summary;

    /**
     * Creates an entry with an empty summary.
     */
    public NewsItem(String title, String url, String publishTime) {
        this(title, url, publishTime, "");
    }

    /**
     * Creates an entry with every field supplied explicitly.
     */
    public NewsItem(String title, String url, String publishTime, String summary) {
        this.title = title;
        this.url = url;
        this.publishTime = publishTime;
        this.summary = summary;
    }

    public String getTitle() {
        return title;
    }

    public String getUrl() {
        return url;
    }

    public String getPublishTime() {
        return publishTime;
    }

    public String getSummary() {
        return summary;
    }

    /** @return title, publish time and link on three lines (the summary is not included) */
    @Override
    public String toString() {
        Object[] fields = {title, publishTime, url};
        return String.format("标题: %s\n时间: %s\n链接: %s", fields);
    }
}

54
project/src/main/java/com/example/model/Song.java

@ -0,0 +1,54 @@
package com.example.model;
import java.util.List;
/**
 * Immutable description of a song.
 */
public class Song {
    private final long songId;
    private final String name;
    private final List<String> artists;
    private final String album;
    private final String duration;
    private final String platform;

    public Song(long songId, String name, List<String> artists, String album, String duration, String platform) {
        this.songId = songId;
        this.name = name;
        this.artists = artists;
        this.album = album;
        this.duration = duration;
        this.platform = platform;
    }

    public long getSongId() {
        return songId;
    }

    public String getName() {
        return name;
    }

    public List<String> getArtists() {
        return artists;
    }

    /** @return artist names joined with ", ", or a placeholder when the list is null */
    public String getArtistsString() {
        if (artists != null) {
            return String.join(", ", artists);
        }
        return "未知";
    }

    public String getAlbum() {
        return album;
    }

    public String getDuration() {
        return duration;
    }

    public String getPlatform() {
        return platform;
    }

    /** @return {@code "name - artists (album)"} */
    @Override
    public String toString() {
        String artistText = getArtistsString();
        return String.format("%s - %s (%s)", name, artistText, album);
    }
}

198
project/src/main/java/com/example/service/impl/EnhancedHttpClient.java

@ -0,0 +1,198 @@
package com.example.service.impl;
import com.example.strategy.AntiBlockStrategy;
import com.example.strategy.DefaultAntiBlockStrategy;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import okhttp3.CookieJar;
import okhttp3.HttpUrl;
import java.io.IOException;
import java.time.Duration;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
 * OkHttp-backed GET client that consults an {@link AntiBlockStrategy} for the
 * User-Agent, request spacing and retry decisions, and keeps cookies per host.
 */
public class EnhancedHttpClient {
    private final OkHttpClient httpClient;
    private final AntiBlockStrategy strategy;
    private final Map<String, String> defaultHeaders;
    private final Map<String, String> sessionCookies;
    private final String platformName;
    private long lastRequestTime = 0;
    private final Object lockObj = new Object();

    /**
     * Creates a client using {@code DefaultAntiBlockStrategy.createDefault()}.
     *
     * @param platformName name used as the prefix of console log lines
     */
    public EnhancedHttpClient(String platformName) {
        this(platformName, DefaultAntiBlockStrategy.createDefault());
    }

    /**
     * Creates a client with 5-second connect/read/write timeouts and a per-host cookie jar.
     *
     * @param platformName name used as the prefix of console log lines
     * @param strategy     strategy consulted before/after each request and for retries
     */
    public EnhancedHttpClient(String platformName, AntiBlockStrategy strategy) {
        this.platformName = platformName;
        this.strategy = strategy;
        this.httpClient = new OkHttpClient.Builder()
                .connectTimeout(Duration.ofSeconds(5))
                .readTimeout(Duration.ofSeconds(5))
                .writeTimeout(Duration.ofSeconds(5))
                .retryOnConnectionFailure(true)
                .cookieJar(new HostCookieJar())
                .build();
        this.defaultHeaders = new HashMap<>();
        this.sessionCookies = new ConcurrentHashMap<>();
    }

    /**
     * Cookie jar keyed by host, then by cookie name.
     *
     * <p>New cookies are merged into the host's existing set, replacing only cookies
     * of the same name, so cookies from earlier responses are not discarded.
     */
    private static final class HostCookieJar implements CookieJar {
        private final Map<String, Map<String, okhttp3.Cookie>> cookieStore = new ConcurrentHashMap<>();

        @Override
        public void saveFromResponse(HttpUrl url, java.util.List<okhttp3.Cookie> cookies) {
            Map<String, okhttp3.Cookie> hostCookies =
                    cookieStore.computeIfAbsent(url.host(), host -> new ConcurrentHashMap<>());
            for (okhttp3.Cookie cookie : cookies) {
                hostCookies.put(cookie.name(), cookie);
            }
        }

        @Override
        public java.util.List<okhttp3.Cookie> loadForRequest(HttpUrl url) {
            Map<String, okhttp3.Cookie> hostCookies = cookieStore.get(url.host());
            if (hostCookies != null) {
                return new java.util.ArrayList<>(hostCookies.values());
            }
            return new java.util.ArrayList<>();
        }
    }

    /** Sets the {@code Referer} header sent with every request. */
    public void setReferer(String referer) {
        defaultHeaders.put("Referer", referer);
    }

    /** Sets the {@code Origin} header sent with every request. */
    public void setOrigin(String origin) {
        defaultHeaders.put("Origin", origin);
    }

    /** Adds (or replaces) a manually managed session cookie. */
    public void addCookie(String name, String value) {
        sessionCookies.put(name, value);
    }

    /** Removes all manually managed session cookies (the per-host jar is not affected). */
    public void clearCookies() {
        sessionCookies.clear();
    }

    /**
     * Joins the session cookies as {@code name=value} pairs separated by "; ".
     *
     * @return the header value, or {@code null} when there are no session cookies
     */
    private String buildCookieHeader() {
        if (sessionCookies.isEmpty()) {
            return null;
        }
        StringBuilder sb = new StringBuilder();
        for (Map.Entry<String, String> entry : sessionCookies.entrySet()) {
            if (sb.length() > 0) {
                sb.append("; ");
            }
            sb.append(entry.getKey()).append("=").append(entry.getValue());
        }
        return sb.toString();
    }

    /**
     * Builds the GET request. Headers are applied in this order, later ones
     * overriding earlier ones of the same name: strategy User-Agent, session
     * cookies, default headers, {@code extraHeaders}.
     */
    private Request buildRequest(String url, Map<String, String> extraHeaders) {
        Request.Builder builder = new Request.Builder()
                .url(url)
                .get();
        builder.header("User-Agent", strategy.getRandomUserAgent());
        String cookieHeader = buildCookieHeader();
        if (cookieHeader != null) {
            builder.header("Cookie", cookieHeader);
        }
        for (Map.Entry<String, String> entry : defaultHeaders.entrySet()) {
            builder.header(entry.getKey(), entry.getValue());
        }
        if (extraHeaders != null) {
            for (Map.Entry<String, String> entry : extraHeaders.entrySet()) {
                builder.header(entry.getKey(), entry.getValue());
            }
        }
        return builder.build();
    }

    /**
     * Performs a GET with no extra headers.
     *
     * @see #get(String, Map)
     */
    public String get(String url) {
        return get(url, null);
    }

    /**
     * Performs a GET, retrying according to the strategy.
     *
     * <p>Only a 200 response with a non-empty body counts as success. Every
     * failed outcome that ends the call reports {@code afterRequest(url, false)}
     * to the strategy, including the case where the strategy still asks for a
     * retry on the last permitted attempt.
     *
     * @param url          the URL to fetch
     * @param extraHeaders additional headers for this request; may be {@code null}
     * @return the response body, or {@code null} on failure
     */
    public String get(String url, Map<String, String> extraHeaders) {
        strategy.beforeRequest(url);
        applyRateLimiting();
        System.out.println("[" + platformName + "] 正在请求: " + url);
        for (int retry = 0; retry <= strategy.getMaxRetries(); retry++) {
            boolean attemptsLeft = retry < strategy.getMaxRetries();
            try {
                Request request = buildRequest(url, extraHeaders);
                boolean retryable;
                try (Response response = httpClient.newCall(request).execute()) {
                    int statusCode = response.code();
                    System.out.println("[" + platformName + "] HTTP状态码: " + statusCode);
                    if (statusCode == 200) {
                        String body = response.body() != null ? response.body().string() : "";
                        if (!body.isEmpty()) {
                            strategy.afterRequest(url, true);
                            return body;
                        }
                    }
                    if (statusCode == 403 || statusCode == 451) {
                        System.out.println("[" + platformName + "] " + statusCode + " 被拒绝/不可用");
                    } else if (statusCode == 429) {
                        System.out.println("[" + platformName + "] 429 请求过多");
                    }
                    retryable = strategy.shouldRetry(retry, statusCode);
                }
                // The response is closed before backing off so the connection is
                // not held open during the sleep.
                if (retryable && attemptsLeft) {
                    System.out.println("[" + platformName + "] 第" + (retry + 1) + "次重试...");
                    doExponentialBackoff(retry);
                    continue;
                }
                strategy.afterRequest(url, false);
                return null;
            } catch (IOException e) {
                System.out.println("[" + platformName + "] 请求异常: " + e.getMessage());
                if (attemptsLeft) {
                    doExponentialBackoff(retry);
                } else {
                    strategy.afterRequest(url, false);
                    return null;
                }
            }
        }
        // Reached only if no attempt produced an outcome above, e.g. when the
        // strategy reports a negative retry limit.
        return null;
    }

    /**
     * Sleeps as needed so that consecutive requests are at least the strategy's
     * minimum interval apart. The lock is held while sleeping, so concurrent
     * callers are spaced one after another.
     */
    private void applyRateLimiting() {
        synchronized (lockObj) {
            long now = System.currentTimeMillis();
            long minInterval = strategy.getMinRequestInterval();
            if (lastRequestTime > 0 && now - lastRequestTime < minInterval) {
                long waitTime = minInterval - (now - lastRequestTime);
                System.out.println("[" + platformName + "] 请求限流,等待 " + waitTime + "ms");
                try {
                    Thread.sleep(waitTime);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
            lastRequestTime = System.currentTimeMillis();
        }
    }

    /**
     * Sleeps for {@code 2^retry} seconds plus up to one second of random jitter.
     * An interrupt ends the sleep early and restores the thread's interrupt flag.
     */
    private void doExponentialBackoff(int retry) {
        try {
            long delay = (long) Math.pow(2, retry) * 1000 + (long) (Math.random() * 1000);
            System.out.println("[" + platformName + "] 等待 " + delay + "ms 后重试...");
            Thread.sleep(delay);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}

391
project/src/main/java/com/example/spider/NetEaseMusicSpider.java

@ -0,0 +1,391 @@
package com.example.spider;
import com.example.core.CrawlResult;
import com.example.core.MusicSpider;
import com.example.core.Platform;
import com.example.model.Chart;
import com.example.model.ChartItem;
import com.example.model.ChartType;
import com.example.model.Comment;
import com.example.model.Song;
import com.example.service.impl.EnhancedHttpClient;
import com.example.strategy.EnhancedAntiBlockStrategy;
import com.example.strategy.SpiderStrategy;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* 网易云音乐爬虫
* 支持搜索歌曲获取热门榜单
*/
public class NetEaseMusicSpider extends MusicSpider implements SpiderStrategy {
private static final String BASE_URL = "https://music.163.com";
private static final String SEARCH_URL = "https://music.163.com/api/search/get";
private static final String REFERER = "https://music.163.com/";
private final ObjectMapper objectMapper;
private final EnhancedHttpClient httpClient;
private final EnhancedAntiBlockStrategy antiBlockStrategy;
public NetEaseMusicSpider() {
super(Platform.NETEASE);
this.antiBlockStrategy = EnhancedAntiBlockStrategy.createForMusic();
this.httpClient = new EnhancedHttpClient("网易云音乐", antiBlockStrategy);
this.httpClient.setReferer(REFERER);
this.httpClient.setOrigin("https://music.163.com");
this.objectMapper = new ObjectMapper();
}
@Override
protected String executeRequest(String url, Map<String, String> headers) {
if (httpClient != null) {
Map<String, String> simpleHeaders = new HashMap<>();
simpleHeaders.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36");
simpleHeaders.put("Referer", REFERER);
simpleHeaders.put("Origin", "https://music.163.com");
simpleHeaders.put("Accept", "application/json");
simpleHeaders.put("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
String response = httpClient.get(url, simpleHeaders);
return response;
}
return super.executeRequest(url, headers);
}
@Override
public String buildSearchUrl(String keyword) {
String encoded = URLEncoder.encode(keyword, StandardCharsets.UTF_8);
return SEARCH_URL + "?csrf_token=&s=" + encoded + "&type=1&offset=0&total=true&limit=10";
}
@Override
public String buildDetailUrl(String itemId) {
return BASE_URL + "/song?id=" + itemId;
}
@Override
protected String buildSongDetailUrl(long songId) {
return "https://music.163.com/api/song/detail?ids=[" + songId + "]";
}
@Override
protected String buildChartListUrl() {
return "https://music.163.com/api/playlist/list?cat=全部&order=hot&limit=50&offset=0";
}
@Override
protected String buildChartDetailUrl(String chartId, int limit) {
return "https://music.163.com/api/playlist/detail?id=" + chartId + "&n=" + limit;
}
@Override
protected Map<String, String> getHeaders() {
Map<String, String> headers = new HashMap<>();
headers.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36");
headers.put("Referer", REFERER);
headers.put("Origin", "https://music.163.com");
headers.put("Accept", "application/json");
headers.put("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
return headers;
}
@Override
protected List<Song> parseSearchResponse(String response) {
List<Song> songs = new ArrayList<>();
if (response == null || response.isEmpty()) {
System.out.println("[网易云音乐] 搜索响应为空");
return songs;
}
try {
JsonNode data = objectMapper.readTree(response);
int code = data.path("code").asInt(-1);
if (code != 200) {
System.out.println("[网易云音乐] 搜索API返回错误码: " + code);
return songs;
}
JsonNode result = data.path("result");
JsonNode songArray = result.path("songs");
if (!songArray.isArray() || songArray.isEmpty()) {
System.out.println("[网易云音乐] 搜索结果为空数组");
} else {
System.out.println("[网易云音乐] 找到 " + songArray.size() + " 首歌曲");
for (JsonNode songNode : songArray) {
Song song = parseSongNode(songNode);
if (song != null) {
songs.add(song);
System.out.println(" ✓ " + song.getName() + " - " + String.join("/", song.getArtists()));
}
}
System.out.println("[网易云音乐] 成功解析 " + songs.size() + " 首歌曲");
}
} catch (Exception e) {
System.out.println("[网易云音乐] 解析搜索结果失败: " + e.getMessage());
}
return songs;
}
private Song parseSongNode(JsonNode songNode) {
try {
long id = songNode.path("id").asLong(0);
String name = songNode.path("name").asText("");
if (id == 0 || name.isEmpty()) {
return null;
}
List<String> artists = new ArrayList<>();
JsonNode artistsNode = songNode.path("artists");
if (artistsNode.isArray()) {
for (JsonNode artistNode : artistsNode) {
String artistName = artistNode.path("name").asText("");
if (!artistName.isEmpty()) {
artists.add(artistName);
}
}
}
String album = "";
JsonNode albumNode = songNode.path("album");
if (albumNode.isObject()) {
album = albumNode.path("name").asText("");
}
int duration = songNode.path("duration").asInt(0);
String durationStr = formatDuration(duration);
return new Song(id, name, artists, album, durationStr, "网易云音乐");
} catch (Exception e) {
return null;
}
}
private String formatDuration(int milliseconds) {
if (milliseconds <= 0) {
return "未知";
}
int seconds = milliseconds / 1000;
int minutes = seconds / 60;
int secs = seconds % 60;
return String.format("%d:%02d", minutes, secs);
}
@Override
protected Song parseSongDetailResponse(String response, long songId) {
try {
JsonNode data = objectMapper.readTree(response);
int code = data.path("code").asInt(-1);
if (code != 200) {
return null;
}
JsonNode songsArray = data.path("songs");
if (!songsArray.isArray() || songsArray.isEmpty()) {
return null;
}
return parseSongNode(songsArray.get(0));
} catch (Exception e) {
return null;
}
}
@Override
protected List<Chart> parseChartListResponse(String response) {
List<Chart> charts = new ArrayList<>();
if (response == null || response.isEmpty()) {
return charts;
}
try {
JsonNode data = objectMapper.readTree(response);
int code = data.path("code").asInt(-1);
if (code != 200) {
return charts;
}
JsonNode playlists = data.path("playlists");
if (!playlists.isArray()) {
return charts;
}
for (JsonNode playlistNode : playlists) {
long id = playlistNode.path("id").asLong(0);
String name = playlistNode.path("name").asText("");
if (id == 0 || name.isEmpty()) {
continue;
}
String coverUrl = playlistNode.path("coverImgUrl").asText("");
String updateTime = playlistNode.path("updateTime").asText("");
String description = playlistNode.path("description").asText("");
Chart chart = new Chart(String.valueOf(id), name, ChartType.HOT,
coverUrl, updateTime, description, "网易云音乐");
charts.add(chart);
}
} catch (Exception e) {
System.out.println("[网易云音乐] 解析榜单列表失败: " + e.getMessage());
}
return charts;
}
@Override
protected Chart parseChartDetailResponse(String response, String chartId) {
if (response == null || response.isEmpty()) {
return null;
}
try {
JsonNode data = objectMapper.readTree(response);
int code = data.path("code").asInt(-1);
if (code != 200) {
return null;
}
JsonNode result = data.path("result");
String name = result.path("name").asText("");
if (name.isEmpty()) {
return null;
}
String coverUrl = result.path("coverImgUrl").asText("");
String updateTime = result.path("updateTime").asText("");
String description = result.path("description").asText("");
int trackCount = result.path("trackCount").asInt(0);
List<ChartItem> items = new ArrayList<>();
JsonNode tracks = result.path("tracks");
if (tracks.isArray()) {
int rank = 1;
for (JsonNode trackNode : tracks) {
ChartItem item = parseChartItem(trackNode, rank++);
if (item != null) {
items.add(item);
}
}
}
Chart chart = new Chart(chartId, name, ChartType.HOT,
coverUrl, updateTime, description, items, "网易云音乐", trackCount);
return chart;
} catch (Exception e) {
System.out.println("[网易云音乐] 解析榜单详情失败: " + e.getMessage());
return null;
}
}
private ChartItem parseChartItem(JsonNode trackNode, int rank) {
try {
String songName = trackNode.path("name").asText("");
long songId = trackNode.path("id").asLong(0);
if (songName.isEmpty() || songId == 0) {
return null;
}
List<String> artists = new ArrayList<>();
JsonNode artistsNode = trackNode.path("artists");
if (artistsNode.isArray()) {
for (JsonNode artistNode : artistsNode) {
artists.add(artistNode.path("name").asText(""));
}
}
String album = trackNode.path("album").path("name").asText("");
String coverUrl = trackNode.path("album").path("picUrl").asText("");
return new ChartItem(rank, songId, songName, artists, album, 0, 0, coverUrl, 0);
} catch (Exception e) {
return null;
}
}
@Override
protected String buildCommentUrl(long songId, int limit, int offset) {
return "https://music.163.com/api/v1/resource/comments/R_SO_4_" + songId + "?offset=" + offset + "&total=true&limit=" + limit;
}
/**
 * Parses the "comments" array of a comment response.
 * Always returns a list (possibly empty); parse failures are logged and
 * whatever was collected so far is returned.
 */
@Override
protected List<Comment> parseCommentResponse(String response) {
    List<Comment> parsed = new ArrayList<>();
    boolean hasBody = response != null && !response.isEmpty();
    if (hasBody) {
        try {
            JsonNode commentList = objectMapper.readTree(response).path("comments");
            if (commentList.isArray()) {
                for (JsonNode node : commentList) {
                    Comment one = parseCommentNode(node);
                    if (one == null) {
                        continue;
                    }
                    parsed.add(one);
                }
            }
        } catch (Exception e) {
            System.out.println("[网易云音乐] 解析评论失败: " + e.getMessage());
        }
    }
    return parsed;
}
/**
 * Converts one comment node into a Comment.
 * Returns null when the comment text is empty or extraction throws.
 */
private Comment parseCommentNode(JsonNode commentNode) {
    try {
        String text = commentNode.path("content").asText("");
        if (text.isEmpty()) {
            return null;
        }
        String author = commentNode.path("user").path("nickname").asText("");
        long likes = commentNode.path("likedCount").asLong(0);
        long id = commentNode.path("commentId").asLong(0);
        return new Comment(text, author, (int) likes, id);
    } catch (Exception e) {
        return null;
    }
}
/**
 * Runs a song search for the keyword and adapts the result to the generic crawl result type.
 *
 * @param keyword search keyword
 * @return a success result carrying the songs, or a failure result with the search's
 *         message; a null search result is reported as "未知错误"
 */
@Override
public CrawlResult<List<?>> executeCrawl(String keyword) {
    System.out.println("[网易云音乐] 开始搜索: " + keyword);
    CrawlResult<List<Song>> result = searchSongs(keyword);
    // The null check must come before any dereference, otherwise it can never take effect.
    if (result == null) {
        return CrawlResult.failure("未知错误", Platform.NETEASE);
    }
    if (result.isSuccess() && result.getData() != null) {
        return CrawlResult.success(result.getData(), result.getPlatform());
    }
    return CrawlResult.failure(result.getMessage(), result.getPlatform());
}
/** Returns the display name of this platform. */
@Override
public String getPlatformName() {
    final String displayName = "网易云音乐";
    return displayName;
}
}

494
project/src/main/java/com/example/spider/book/DangdangBookSpider.java

@ -0,0 +1,494 @@
package com.example.spider.book;
import com.example.core.CrawlResult;
import com.example.core.Platform;
import com.example.model.BookItem;
import com.example.service.impl.EnhancedHttpClient;
import com.example.strategy.EnhancedAntiBlockStrategy;
import com.example.strategy.SpiderStrategy;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
/**
* 当当图书爬虫
* 支持搜索图书获取热门榜单
*/
public class DangdangBookSpider implements SpiderStrategy {
private static final String BASE_URL = "https://www.dangdang.com";
private static final String SEARCH_URL = "https://search.dangdang.com";
private static final String REFERER = "https://www.dangdang.com/";
private final EnhancedHttpClient httpClient;
private final EnhancedAntiBlockStrategy antiBlockStrategy;
public DangdangBookSpider() {
this.antiBlockStrategy = EnhancedAntiBlockStrategy.createForBook();
this.httpClient = new EnhancedHttpClient("当当图书", antiBlockStrategy);
this.httpClient.setReferer(REFERER);
this.httpClient.setOrigin(BASE_URL);
}
/**
 * Performs a GET request through the shared HTTP client.
 *
 * @param url     target URL
 * @param headers request headers to send; when null or empty the default set from
 *                {@code getHeaders()} is used instead
 * @return whatever the HTTP client returns for the request
 */
private String executeRequest(String url, Map<String, String> headers) {
    // Previously the caller-supplied headers were ignored and an identical set was rebuilt here.
    Map<String, String> effectiveHeaders =
            (headers == null || headers.isEmpty()) ? getHeaders() : headers;
    // httpClient is a final field assigned in the constructor, so no null check is needed.
    return httpClient.get(url, effectiveHeaders);
}
/** Builds the default browser-like request headers, using a random User-Agent from the anti-block strategy. */
private Map<String, String> getHeaders() {
    String userAgent = antiBlockStrategy.getRandomUserAgent();
    Map<String, String> result = new HashMap<>();
    result.put("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
    result.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    result.put("Referer", REFERER);
    result.put("User-Agent", userAgent);
    return result;
}
/**
 * Searches books by keyword.
 * Keywords that look like pinyin are first routed through the pinyin search; if that
 * fails the keyword is sent to the site search directly. Only live data is returned.
 *
 * @param keyword search keyword (Chinese, English or pinyin)
 * @return a success result with the parsed books, or a failure result describing why
 *         nothing was found
 */
public CrawlResult<List<BookItem>> searchBooks(String keyword) {
    try {
        if (isPinyin(keyword)) {
            System.out.println("[当当图书] 检测到拼音输入: " + keyword);
            CrawlResult<List<BookItem>> viaPinyin = searchByPinyin(keyword);
            if (viaPinyin.isSuccess()) {
                return viaPinyin;
            }
            // Pinyin lookup failed: fall back to a direct search below.
            System.out.println("[当当图书] 拼音搜索失败,尝试直接搜索");
        }
        String url = SEARCH_URL + "/?key=" + URLEncoder.encode(keyword, StandardCharsets.UTF_8)
                + "&act=input&page_index=1&sort_type=sort_default";
        System.out.println("[当当图书] 正在搜索: " + keyword);
        String body = executeRequest(url, getHeaders());
        boolean noBody = body == null || body.isEmpty();
        if (noBody) {
            System.out.println("[当当图书] 搜索响应为空");
            return CrawlResult.failure("搜索响应为空", Platform.DANGDANG);
        }
        List<BookItem> found = parseSearchResponse(body);
        if (!found.isEmpty()) {
            System.out.println("[当当图书] 搜索到 " + found.size() + " 本图书");
            return CrawlResult.success(found, Platform.DANGDANG);
        }
        System.out.println("[当当图书] 搜索结果为空");
        return CrawlResult.failure("搜索结果为空", Platform.DANGDANG);
    } catch (Exception e) {
        System.out.println("[当当图书] 搜索异常: " + e.getMessage());
        return CrawlResult.failure("搜索异常: " + e.getMessage(), Platform.DANGDANG);
    }
}
/** Matches keywords made only of ASCII letters; compiled once instead of on every call. */
private static final Pattern LETTERS_ONLY = Pattern.compile("^[a-zA-Z]+$");

/** Common English words that must not be treated as pinyin. */
private static final String[] COMMON_ENGLISH_WORDS = {
    "java", "python", "javascript", "html", "css", "sql", "php",
    "android", "ios", "windows", "linux", "mac", "book", "books", "read",
    "free", "new", "best", "top", "hot", "sale", "buy", "price", "shop",
    "good", "great", "love", "like", "know", "get", "go", "come", "make",
    "time", "year", "way", "day", "man", "think", "take", "people", "into",
    "just", "over", "such", "some", "could", "would", "than", "then",
    "first", "last", "give", "most", "even", "only", "might", "now"
};

/**
 * Heuristically decides whether a keyword should be treated as pinyin.
 * A keyword qualifies when it has at least two characters, consists only of ASCII
 * letters, is not one of the listed common English words, and either contains one of
 * the letters a/o/e/i/u/v or is at least three characters long.
 *
 * @param keyword user-supplied search keyword, may be null
 * @return true when the keyword is considered pinyin input
 */
private boolean isPinyin(String keyword) {
    if (keyword == null || keyword.length() < 2) {
        return false;
    }
    if (!LETTERS_ONLY.matcher(keyword).matches()) {
        return false;
    }
    // Locale.ROOT keeps the lowercase form stable (e.g. "I" -> "i") regardless of the default locale.
    String lower = keyword.toLowerCase(java.util.Locale.ROOT);
    for (String word : COMMON_ENGLISH_WORDS) {
        if (word.equals(lower)) {
            return false;
        }
    }
    // Every pinyin final in the former pattern list contains one of these letters,
    // so checking the single letters is equivalent to checking the whole list.
    for (int i = 0; i < lower.length(); i++) {
        if ("aoeiuv".indexOf(lower.charAt(i)) >= 0) {
            return true;
        }
    }
    // Longer all-letter keywords are also treated as pinyin.
    return keyword.length() >= 3;
}
/**
 * Searches books by a pinyin keyword.
 * Step 1 sends the pinyin to the site search and accepts the result when at least one
 * mainly-Chinese title matches the pinyin. Step 2 gathers candidate books from several
 * generic keywords (plus the step-1 result) and matches title/author pinyin locally.
 *
 * @param pinyin pinyin keyword
 * @return a success result with the matching books, or a failure result when no
 *         candidates could be fetched or nothing matched
 */
private CrawlResult<List<BookItem>> searchByPinyin(String pinyin) {
    System.out.println("[当当图书] 通过拼音搜索: " + pinyin);
    // Step 1: try the pinyin directly against the site search.
    CrawlResult<List<BookItem>> directResult = searchBooksByKeyword(pinyin);
    List<BookItem> directBooks = new ArrayList<>();
    if (directResult.isSuccess() && directResult.getData() != null) {
        directBooks = directResult.getData();
    }
    if (!directBooks.isEmpty()) {
        System.out.println("[当当图书] 直接拼音搜索找到 " + directBooks.size() + " 本图书");
        // Accept the direct result only if it contains a mainly-Chinese title matching the pinyin.
        for (BookItem book : directBooks) {
            String title = book.getTitle();
            if (isMainlyChinese(title) && isPinyinMatch(title, pinyin)) {
                return directResult;
            }
        }
    }
    // Step 2: match pinyin locally against a pool of candidate books.
    System.out.println("[当当图书] 尝试本地拼音匹配...");
    List<BookItem> allBooks = new ArrayList<>();
    String[] keywords = {"畅销", "热门", "小说", "文学", "科幻", "经典", "名著"};
    for (String kw : keywords) {
        CrawlResult<List<BookItem>> result = searchBooksByKeyword(kw);
        if (result.isSuccess() && result.getData() != null) {
            allBooks.addAll(result.getData());
        }
    }
    // Reuse the step-1 response instead of requesting the same pinyin keyword a second time.
    allBooks.addAll(directBooks);
    if (allBooks.isEmpty()) {
        System.out.println("[当当图书] 获取候选图书列表失败");
        return CrawlResult.failure("获取候选图书列表失败", Platform.DANGDANG);
    }
    // De-duplicate while keeping first-seen order (relies on BookItem.equals).
    List<BookItem> uniqueBooks = new ArrayList<>();
    for (BookItem book : allBooks) {
        if (!uniqueBooks.contains(book)) {
            uniqueBooks.add(book);
        }
    }
    System.out.println("[当当图书] 候选图书总数: " + uniqueBooks.size());
    List<BookItem> matchedBooks = new ArrayList<>();
    String lowerPinyin = pinyin.toLowerCase(java.util.Locale.ROOT).trim();
    for (BookItem book : uniqueBooks) {
        String title = book.getTitle();
        String titlePinyin = convertToPinyin(title);
        String authorPinyin = convertToPinyin(book.getAuthor());
        // convertToPinyin keeps non-Chinese characters as-is, so compare case-insensitively
        // (as isPinyinMatch does); otherwise titles containing upper-case Latin letters never match.
        boolean titleHit = titlePinyin.toLowerCase(java.util.Locale.ROOT).contains(lowerPinyin);
        boolean authorHit = authorPinyin.toLowerCase(java.util.Locale.ROOT).contains(lowerPinyin);
        if (titleHit || authorHit) {
            matchedBooks.add(book);
            System.out.println("[当当图书] ✓ 匹配成功: " + title + "(" + titlePinyin + ")");
        }
    }
    if (matchedBooks.isEmpty()) {
        System.out.println("[当当图书] 拼音搜索未找到匹配结果");
        return CrawlResult.failure("拼音搜索未找到匹配结果", Platform.DANGDANG);
    }
    System.out.println("[当当图书] 拼音搜索找到 " + matchedBooks.size() + " 本匹配图书");
    return CrawlResult.success(matchedBooks, Platform.DANGDANG);
}
/**
 * Checks whether more than half of the characters in a string are Han characters.
 * Counting is done per Unicode code point, so Han characters outside the Basic
 * Multilingual Plane (encoded as surrogate pairs) are counted once each.
 *
 * @param str text to inspect, may be null
 * @return true when Han code points make up more than 50% of the string
 */
private boolean isMainlyChinese(String str) {
    if (str == null || str.isEmpty()) {
        return false;
    }
    long total = str.codePoints().count();
    long chineseCount = str.codePoints()
            .filter(cp -> Character.UnicodeScript.of(cp) == Character.UnicodeScript.HAN)
            .count();
    return (double) chineseCount / total > 0.5;
}
/**
 * Checks whether the pinyin form of a title contains the given pinyin keyword
 * (case-insensitive; the keyword is trimmed). Returns false when either argument is null.
 */
private boolean isPinyinMatch(String title, String pinyin) {
    boolean usable = title != null && pinyin != null;
    if (!usable) {
        return false;
    }
    String needle = pinyin.toLowerCase().trim();
    String haystack = convertToPinyin(title).toLowerCase();
    return haystack.contains(needle);
}
/**
 * Sends the keyword straight to the site search without any pinyin detection (internal helper).
 * Returns a failure result when the response is empty, nothing could be parsed, or an
 * exception occurs.
 */
private CrawlResult<List<BookItem>> searchBooksByKeyword(String keyword) {
    try {
        String query = URLEncoder.encode(keyword, StandardCharsets.UTF_8);
        String body = executeRequest(
                SEARCH_URL + "/?key=" + query + "&act=input&page_index=1&sort_type=sort_default",
                getHeaders());
        if (body == null || body.isEmpty()) {
            return CrawlResult.failure("搜索响应为空", Platform.DANGDANG);
        }
        List<BookItem> found = parseSearchResponse(body);
        return found.isEmpty()
                ? CrawlResult.failure("搜索结果为空", Platform.DANGDANG)
                : CrawlResult.success(found, Platform.DANGDANG);
    } catch (Exception e) {
        return CrawlResult.failure("搜索异常: " + e.getMessage(), Platform.DANGDANG);
    }
}
/**
 * Converts a string to lower-case, tone-less pinyin.
 * Characters for which no pinyin reading is returned are copied through unchanged;
 * for characters with several readings the first one is used.
 * Returns an empty string for null or empty input.
 */
private String convertToPinyin(String chinese) {
    if (chinese == null || chinese.isEmpty()) {
        return "";
    }
    HanyuPinyinOutputFormat outputFormat = new HanyuPinyinOutputFormat();
    outputFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE);
    outputFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
    outputFormat.setVCharType(HanyuPinyinVCharType.WITH_V);
    StringBuilder out = new StringBuilder();
    for (int i = 0; i < chinese.length(); i++) {
        char ch = chinese.charAt(i);
        String[] readings;
        try {
            readings = PinyinHelper.toHanyuPinyinStringArray(ch, outputFormat);
        } catch (BadHanyuPinyinOutputFormatCombination e) {
            readings = null;
        }
        if (readings == null || readings.length == 0) {
            // No reading available: keep the original character.
            out.append(ch);
        } else {
            out.append(readings[0]);
        }
    }
    return out.toString();
}
/**
 * Fetches popular books by searching for the keyword "畅销" (best-seller).
 * Only live data is used; any failure is reported as a failure result.
 */
public CrawlResult<List<BookItem>> getHotBooks() {
    try {
        System.out.println("[当当图书] 正在获取热门图书...");
        CrawlResult<List<BookItem>> bestSellers = searchBooks("畅销");
        boolean usable = bestSellers.isSuccess() && !bestSellers.getData().isEmpty();
        if (!usable) {
            System.out.println("[当当图书] 获取热门图书失败");
            return CrawlResult.failure("获取热门图书失败", Platform.DANGDANG);
        }
        System.out.println("[当当图书] 获取到 " + bestSellers.getData().size() + " 本热门图书");
        return bestSellers;
    } catch (Exception e) {
        System.out.println("[当当图书] 获取热门图书异常: " + e.getMessage());
        return CrawlResult.failure("获取热门图书异常: " + e.getMessage(), Platform.DANGDANG);
    }
}
/**
* Parses a Dangdang search-result HTML page into a list of books.
* Each "li" under ".bigimg" is treated as one book; at most 10 books are returned.
* Entries whose title is empty or not longer than 2 characters are skipped, and an
* entry that throws while being parsed is skipped silently.
*
* @param response raw HTML of the search page, may be null
* @return the parsed books; empty when the response is null/empty or nothing matched
*/
private List<BookItem> parseSearchResponse(String response) {
List<BookItem> books = new ArrayList<>();
if (response == null || response.isEmpty()) {
return books;
}
try {
Document doc = Jsoup.parse(response);
// Selector for the result list entries
Elements bookItems = doc.select(".bigimg li");
for (Element item : bookItems) {
try {
// Title: taken from the "title" attribute; the same element's href is used for the id
Element titleElem = item.selectFirst(".pic[title], .name a[title]");
String title = "";
String href = "";
if (titleElem != null) {
title = titleElem.attr("title").trim();
href = titleElem.attr("href");
}
// Author
Element authorElem = item.selectFirst(".search_book_author a[title]");
String author = "";
if (authorElem != null) {
author = authorElem.attr("title").trim();
}
// Price: currency sign removed; "?" is also removed
// NOTE(review): the "?" is presumably a mis-decoded currency sign — confirm
Element priceElem = item.selectFirst(".search_now_price");
String price = "";
if (priceElem != null) {
price = priceElem.text().trim().replace("¥", "").replace("?", "");
}
// Comment count (used as the basis for the rating)
Element commentElem = item.selectFirst(".search_comment_num");
String commentCount = "0";
if (commentElem != null) {
String commentText = commentElem.text().trim();
// Keep digits only, e.g. "3479条评论" -> "3479"
commentCount = commentText.replaceAll("[^0-9]", "");
}
if (!title.isEmpty() && title.length() > 2) {
String bookId = extractBookId(href);
// The rating is derived from the comment count: more comments, higher rating
String rating = calculateRating(commentCount);
books.add(new BookItem(bookId, title, author, rating, "当当图书", "", price));
if (books.size() >= 10) break;
}
} catch (Exception e) {
// Skip the single entry that failed to parse
}
}
} catch (Exception e) {
System.out.println("[当当图书] 解析响应失败: " + e.getMessage());
}
return books;
}
/**
 * Derives a rating string from a comment count: the more comments, the higher the rating.
 *
 * @param commentCount decimal comment count as text, may be null or empty
 * @return "9.5" down to "7.5" depending on the count; "8.0" when the text is not a number;
 *         "9.5" when the text is all digits but too large to fit in a long
 */
private String calculateRating(String commentCount) {
    long count;
    try {
        // long instead of int so that very large counts are not mistaken for "not a number".
        count = Long.parseLong(commentCount);
    } catch (NumberFormatException e) {
        boolean allDigits = commentCount != null && !commentCount.isEmpty()
                && commentCount.chars().allMatch(c -> c >= '0' && c <= '9');
        // An all-digit string that overflows long is certainly >= 10000.
        return allDigits ? "9.5" : "8.0";
    }
    if (count >= 10000) return "9.5";
    if (count >= 5000) return "9.2";
    if (count >= 2000) return "9.0";
    if (count >= 1000) return "8.8";
    if (count >= 500) return "8.5";
    if (count >= 100) return "8.0";
    return "7.5";
}
/**
 * Extracts the numeric book id from a product link.
 * The query string and fragment are ignored, then the digits of the last path segment
 * are returned.
 *
 * @param href product link, may be null
 * @return the digits of the last path segment, or "0" when the link is null or
 *         contains no usable id
 */
private String extractBookId(String href) {
    if (href == null) return "0";
    // Drop "?query" and "#fragment" so digits in tracking parameters do not leak into the id.
    String path = href;
    int query = path.indexOf('?');
    if (query >= 0) {
        path = path.substring(0, query);
    }
    int fragment = path.indexOf('#');
    if (fragment >= 0) {
        path = path.substring(0, fragment);
    }
    String[] parts = path.split("/");
    if (parts.length == 0) {
        return "0";
    }
    String digits = parts[parts.length - 1].replaceAll("[^0-9]", "");
    // Same fallback as for a null link, instead of returning an empty id.
    return digits.isEmpty() ? "0" : digits;
}
/** Builds the search URL for a keyword; falls back to the bare search URL when encoding fails. */
@Override
public String buildSearchUrl(String keyword) {
    String query;
    try {
        query = URLEncoder.encode(keyword, StandardCharsets.UTF_8);
    } catch (Exception e) {
        return SEARCH_URL;
    }
    return SEARCH_URL + "/?key=" + query + "&act=input";
}
/** Builds the product detail URL for the given item id. */
@Override
public String buildDetailUrl(String itemId) {
    StringBuilder url = new StringBuilder(BASE_URL);
    url.append("/product/").append(itemId).append(".html");
    return url.toString();
}
/**
 * Entry point used by the generic crawl pipeline.
 * An empty or null keyword fetches the hot-book list; otherwise a keyword search is run.
 *
 * @param keyword search keyword, may be null or empty
 * @return a success result with a copy of the book list, or a failure result with the
 *         underlying message; a null underlying result is reported as "未知错误"
 */
@Override
public CrawlResult<List<?>> executeCrawl(String keyword) {
    CrawlResult<List<BookItem>> result =
            (keyword == null || keyword.isEmpty()) ? getHotBooks() : searchBooks(keyword);
    // The null check must come before any dereference, otherwise it can never take effect.
    if (result == null) {
        return CrawlResult.failure("未知错误", Platform.DANGDANG);
    }
    if (result.isSuccess() && result.getData() != null) {
        return CrawlResult.success(new ArrayList<>(result.getData()), Platform.DANGDANG);
    }
    return CrawlResult.failure(result.getMessage(), Platform.DANGDANG);
}
/** Returns the display name of this platform. */
@Override
public String getPlatformName() {
    final String displayName = "当当图书";
    return displayName;
}
}

355
project/src/main/java/com/example/spider/movie/DoubanMovieSpider.java

@ -0,0 +1,355 @@
package com.example.spider.movie;
import com.example.core.CrawlResult;
import com.example.core.Platform;
import com.example.model.MovieItem;
import com.example.service.impl.EnhancedHttpClient;
import com.example.strategy.EnhancedAntiBlockStrategy;
import com.example.strategy.SpiderStrategy;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 豆瓣电影爬虫
* 反爬策略相对宽松数据质量高支持拼音搜索
*/
public class DoubanMovieSpider implements SpiderStrategy {
private static final String BASE_URL = "https://movie.douban.com";
private static final String SEARCH_URL = "https://movie.douban.com/subject_search";
private static final String TOP250_URL = "https://movie.douban.com/top250";
private static final String REFERER = "https://movie.douban.com/";
private final EnhancedHttpClient httpClient;
private final EnhancedAntiBlockStrategy antiBlockStrategy;
public DoubanMovieSpider() {
this.antiBlockStrategy = EnhancedAntiBlockStrategy.createForMovie();
this.httpClient = new EnhancedHttpClient("豆瓣电影", antiBlockStrategy);
this.httpClient.setReferer(REFERER);
this.httpClient.setOrigin(BASE_URL);
}
/** Builds the request headers sent to Douban, using a fixed desktop Chrome User-Agent. */
private Map<String, String> getHeaders() {
    Map<String, String> result = new HashMap<>();
    result.put("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
    result.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    result.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36");
    return result;
}
/**
 * Searches movies by keyword.
 *
 * @param keyword search keyword
 * @return a success result with the parsed movies, or a failure result when the
 *         response is empty, nothing could be parsed, or an exception occurred
 */
public CrawlResult<List<MovieItem>> searchMovies(String keyword) {
    try {
        String url = SEARCH_URL + "?search_text="
                + URLEncoder.encode(keyword, StandardCharsets.UTF_8) + "&cat=1002";
        System.out.println("[豆瓣电影] 正在搜索: " + keyword);
        String body = httpClient.get(url, getHeaders());
        boolean noBody = body == null || body.isEmpty();
        if (noBody) {
            System.out.println("[豆瓣电影] 搜索响应为空");
            return CrawlResult.failure("搜索响应为空", Platform.DOUBAN);
        }
        List<MovieItem> found = parseSearchResponse(body);
        if (!found.isEmpty()) {
            System.out.println("[豆瓣电影] 搜索到 " + found.size() + " 部电影");
            return CrawlResult.success(found, Platform.DOUBAN);
        }
        System.out.println("[豆瓣电影] 搜索结果为空");
        return CrawlResult.failure("搜索结果为空", Platform.DOUBAN);
    } catch (Exception e) {
        System.out.println("[豆瓣电影] 搜索异常: " + e.getMessage());
        return CrawlResult.failure("搜索异常: " + e.getMessage(), Platform.DOUBAN);
    }
}
/**
 * Fetches top-rated movies from the TOP250 page.
 *
 * @return a success result with at most 20 movies, or a failure result when the
 *         response is empty, nothing could be parsed, or an exception occurred
 */
public CrawlResult<List<MovieItem>> getHotMovies() {
    try {
        System.out.println("[豆瓣电影] 正在获取TOP250高分电影榜单...");
        String body = httpClient.get(TOP250_URL, getHeaders());
        if (body == null || body.isEmpty()) {
            System.out.println("[豆瓣电影] TOP250响应为空");
            return CrawlResult.failure("响应为空", Platform.DOUBAN);
        }
        List<MovieItem> topMovies = parseTop250Response(body);
        if (topMovies.isEmpty()) {
            System.out.println("[豆瓣电影] TOP250解析为空");
            return CrawlResult.failure("解析为空", Platform.DOUBAN);
        }
        System.out.println("[豆瓣电影] 获取到 " + topMovies.size() + " 部高分电影");
        int limit = Math.min(topMovies.size(), 20);
        return CrawlResult.success(topMovies.subList(0, limit), Platform.DOUBAN);
    } catch (Exception e) {
        System.out.println("[豆瓣电影] 获取TOP250异常: " + e.getMessage());
        return CrawlResult.failure("异常: " + e.getMessage(), Platform.DOUBAN);
    }
}
/**
 * Parses a Douban search page by extracting the JSON object assigned to the
 * {@code __DATA__} JavaScript variable and reading its "items" array.
 * At most 10 movies are returned; items that fail to parse are skipped.
 *
 * @param response raw HTML of the search page
 * @return the parsed movies; empty when the variable, the JSON object or the
 *         "items" array cannot be found, or when parsing throws
 */
private List<MovieItem> parseSearchResponse(String response) {
    List<MovieItem> movies = new ArrayList<>();
    try {
        // Prefer the fully qualified assignment, then fall back to the bare variable name.
        String marker = "window.__DATA__ =";
        int dataStart = response.indexOf(marker);
        if (dataStart == -1) {
            marker = "__DATA__ =";
            dataStart = response.indexOf(marker);
        }
        if (dataStart == -1) {
            System.out.println("[豆瓣电影] 未找到 __DATA__ 变量");
            return movies;
        }
        int jsonStart = dataStart + marker.length();
        int jsonEnd = findJsonObjectEnd(response, jsonStart);
        if (jsonEnd == -1) {
            System.out.println("[豆瓣电影] 未找到JSON结束位置");
            return movies;
        }
        JsonNode rootNode = new ObjectMapper().readTree(response.substring(jsonStart, jsonEnd));
        JsonNode itemsNode = rootNode.get("items");
        if (itemsNode == null || !itemsNode.isArray()) {
            System.out.println("[豆瓣电影] items字段为空或不是数组");
            return movies;
        }
        for (JsonNode itemNode : itemsNode) {
            try {
                MovieItem movie = toSearchResultMovie(itemNode);
                if (movie != null) {
                    movies.add(movie);
                    if (movies.size() >= 10) break;
                }
            } catch (Exception e) {
                // Skip the single item that failed to parse.
            }
        }
    } catch (Exception e) {
        System.out.println("[豆瓣电影] 解析搜索结果异常: " + e.getMessage());
    }
    return movies;
}

/**
 * Finds the end (exclusive) of the first balanced JSON object at or after {@code from}.
 * Braces inside JSON string literals are ignored, so a "{" or "}" in a title or
 * summary cannot unbalance the count. Returns -1 when no balanced object is found.
 */
private static int findJsonObjectEnd(String text, int from) {
    int depth = 0;
    boolean inString = false;
    boolean escaped = false;
    for (int i = from; i < text.length(); i++) {
        char c = text.charAt(i);
        if (inString) {
            if (escaped) {
                escaped = false;
            } else if (c == '\\') {
                escaped = true;
            } else if (c == '"') {
                inString = false;
            }
            continue;
        }
        if (c == '"' && depth > 0) {
            inString = true;
        } else if (c == '{') {
            depth++;
        } else if (c == '}' && depth > 0) {
            depth--;
            if (depth == 0) {
                return i + 1;
            }
        }
    }
    return -1;
}

/**
 * Converts one search-result item node into a MovieItem, or returns null when the
 * item has no title. The year is taken from the title, the genre from "abstract",
 * and the director from the first "/"-separated part of "abstract_2" (cut to 10
 * characters plus "..." when longer).
 */
private MovieItem toSearchResultMovie(JsonNode itemNode) {
    String title = itemNode.path("title").asText("");
    String url = itemNode.path("url").asText("");
    String rating = "0";
    JsonNode ratingNode = itemNode.get("rating");
    if (ratingNode != null && ratingNode.has("value")) {
        rating = String.valueOf(ratingNode.get("value").asDouble());
    }
    String year = extractYear(title);
    String genre = extractGenre(itemNode.path("abstract").asText(""));
    String director = "";
    String abstract2Text = itemNode.path("abstract_2").asText("");
    if (!abstract2Text.isEmpty()) {
        director = abstract2Text.split("/")[0].trim();
        if (director.length() > 10) {
            director = director.substring(0, 10) + "...";
        }
    }
    if (title.isEmpty()) {
        return null;
    }
    return new MovieItem(extractMovieId(url), title, rating, year, genre, director);
}
/**
* Parses the Douban TOP250 HTML page into a list of movies.
* Each "li" under "ol.grid_view" is treated as one movie; at most 20 are returned.
* Entries without a title are skipped, and an entry that throws while being parsed
* is skipped silently.
*
* @param response raw HTML of the TOP250 page
* @return the parsed movies; empty when nothing matched or parsing throws
*/
private List<MovieItem> parseTop250Response(String response) {
List<MovieItem> movies = new ArrayList<>();
try {
Document doc = Jsoup.parse(response);
Elements items = doc.select("ol.grid_view li");
for (Element item : items) {
try {
// Title: first element with class "title"
Element titleElem = item.selectFirst(".title");
String title = titleElem != null ? titleElem.text().trim() : "";
// Rating: defaults to "0" when the element is missing
Element ratingElem = item.selectFirst(".rating_num");
String rating = ratingElem != null ? ratingElem.text().trim() : "0";
// Year, genre and director all come from the first ".bd p" text
Element infoElem = item.selectFirst(".bd p");
String year = "";
String genre = "";
String director = "";
if (infoElem != null) {
String infoText = infoElem.text().trim();
// Year
year = extractYear(infoText);
// Genre
genre = extractGenre(infoText);
// Director: text after "导演:" up to "主演:", or up to "..." when "主演:" does not follow
if (infoText.contains("导演:")) {
int start = infoText.indexOf("导演:") + 3;
int end = infoText.indexOf("主演:");
if (end > start) {
director = infoText.substring(start, end).trim();
} else {
end = infoText.indexOf("...");
if (end > start) {
director = infoText.substring(start, end).trim();
}
}
// Long director text is cut to 10 characters plus "..."
if (director.length() > 10) {
director = director.substring(0, 10) + "...";
}
}
}
// The movie id is taken from the first link containing "/subject/"
Element linkElem = item.selectFirst("a[href*='/subject/']");
String href = linkElem != null ? linkElem.attr("href") : "";
String movieId = extractMovieId(href);
if (!title.isEmpty()) {
MovieItem movie = new MovieItem(movieId, title, rating, year, genre, director);
movies.add(movie);
if (movies.size() >= 20) break;
}
} catch (Exception e) {
// Skip the single entry that failed to parse
}
}
} catch (Exception e) {
System.out.println("[豆瓣电影] 解析TOP250异常: " + e.getMessage());
}
return movies;
}
/** Returns the digits following "/subject/" in a link, or "" when the link is null or has no such part. */
private String extractMovieId(String href) {
    if (href == null) {
        return "";
    }
    Matcher idMatcher = Pattern.compile("/subject/(\\d+)").matcher(href);
    return idMatcher.find() ? idMatcher.group(1) : "";
}
/**
 * Extracts the first stand-alone four-digit number from a text and returns it as the year.
 * Runs of five or more digits are not considered, so the prefix of a longer number
 * is never reported as a year.
 *
 * @param text text to scan, may be null
 * @return the four digits found, or "" when the text is null or contains none
 */
private String extractYear(String text) {
    if (text == null) return "";
    // Look-arounds require that the four digits are not part of a longer digit run.
    Pattern pattern = Pattern.compile("(?<!\\d)(\\d{4})(?!\\d)");
    Matcher matcher = pattern.matcher(text);
    if (matcher.find()) {
        return matcher.group(1);
    }
    return "";
}
/**
 * Returns the first known genre name (in list order) that occurs in the text,
 * or "" when the text is null or contains none of them.
 */
private String extractGenre(String text) {
    if (text == null) {
        return "";
    }
    String[] knownGenres = {"剧情", "喜剧", "动作", "爱情", "科幻", "悬疑", "惊悚", "恐怖", "动画", "纪录片", "战争", "犯罪", "冒险", "奇幻", "历史"};
    String found = "";
    for (int i = 0; i < knownGenres.length && found.isEmpty(); i++) {
        if (text.contains(knownGenres[i])) {
            found = knownGenres[i];
        }
    }
    return found;
}
/** Builds the movie search URL for a keyword; falls back to the bare search URL when encoding fails. */
@Override
public String buildSearchUrl(String keyword) {
    String query;
    try {
        query = URLEncoder.encode(keyword, StandardCharsets.UTF_8);
    } catch (Exception e) {
        return SEARCH_URL;
    }
    return SEARCH_URL + "?search_text=" + query + "&cat=1002";
}
/** Builds the movie detail URL for the given subject id. */
@Override
public String buildDetailUrl(String itemId) {
    StringBuilder url = new StringBuilder(BASE_URL);
    url.append("/subject/").append(itemId).append("/");
    return url.toString();
}
/**
 * Entry point used by the generic crawl pipeline.
 * An empty or null keyword fetches the TOP250-based hot list; otherwise a keyword search is run.
 *
 * @param keyword search keyword, may be null or empty
 * @return a success result with a copy of the movie list, or a failure result with the
 *         underlying message; a null underlying result is reported as "未知错误"
 */
@Override
public CrawlResult<List<?>> executeCrawl(String keyword) {
    CrawlResult<List<MovieItem>> result =
            (keyword == null || keyword.isEmpty()) ? getHotMovies() : searchMovies(keyword);
    // The null check must come before any dereference, otherwise it can never take effect.
    if (result == null) {
        return CrawlResult.failure("未知错误", Platform.DOUBAN);
    }
    if (result.isSuccess() && result.getData() != null) {
        return CrawlResult.success(new ArrayList<>(result.getData()), Platform.DOUBAN);
    }
    return CrawlResult.failure(result.getMessage(), Platform.DOUBAN);
}
/** Returns the display name of this platform. */
@Override
public String getPlatformName() {
    final String displayName = "豆瓣电影";
    return displayName;
}
}

172
project/src/main/java/com/example/spider/news/ChinanewsSpider.java

@ -0,0 +1,172 @@
package com.example.spider.news;
import com.example.core.CrawlResult;
import com.example.core.Platform;
import com.example.model.NewsItem;
import com.example.service.impl.EnhancedHttpClient;
import com.example.strategy.EnhancedAntiBlockStrategy;
import com.example.strategy.SpiderStrategy;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* 中国新闻网RSS爬虫
* 使用官方RSS源获取新闻数据零反爬限制
*/
public class ChinanewsSpider implements SpiderStrategy {
private static final String BASE_URL = "https://www.chinanews.com.cn";
private static final String RSS_URL = "https://www.chinanews.com.cn/rss/scroll-news.xml";
private static final String REFERER = "https://www.chinanews.com.cn/";
private final EnhancedHttpClient httpClient;
private final EnhancedAntiBlockStrategy antiBlockStrategy;
public ChinanewsSpider() {
this.antiBlockStrategy = EnhancedAntiBlockStrategy.createForNews();
this.httpClient = new EnhancedHttpClient("中国新闻网", antiBlockStrategy);
this.httpClient.setReferer(REFERER);
}
/** Builds the request headers for the RSS feed, using a random User-Agent from the anti-block strategy. */
private Map<String, String> getHeaders() {
    String userAgent = antiBlockStrategy.getRandomUserAgent();
    Map<String, String> result = new HashMap<>();
    result.put("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
    result.put("Accept", "application/rss+xml, application/xml, text/xml, */*");
    result.put("User-Agent", userAgent);
    return result;
}
/**
 * Fetches the rolling-news RSS feed and parses it into news items.
 *
 * @return a success result with the parsed items, or a failure result when the
 *         response is empty, nothing could be parsed, or an exception occurred
 */
public CrawlResult<List<NewsItem>> getScrollNews() {
    try {
        System.out.println("[中国新闻网] 正在获取RSS新闻...");
        String feed = httpClient.get(RSS_URL, getHeaders());
        boolean noFeed = feed == null || feed.isEmpty();
        if (noFeed) {
            System.out.println("[中国新闻网] RSS响应为空");
            return CrawlResult.failure("RSS响应为空", Platform.CHINANEWS);
        }
        List<NewsItem> parsed = parseRSS(feed);
        if (!parsed.isEmpty()) {
            System.out.println("[中国新闻网] 成功获取 " + parsed.size() + " 条新闻");
            return CrawlResult.success(parsed, Platform.CHINANEWS);
        }
        System.out.println("[中国新闻网] RSS解析结果为空");
        return CrawlResult.failure("RSS解析结果为空", Platform.CHINANEWS);
    } catch (Exception e) {
        System.out.println("[中国新闻网] 获取新闻异常: " + e.getMessage());
        return CrawlResult.failure("获取新闻异常: " + e.getMessage(), Platform.CHINANEWS);
    }
}
/**
 * "Searches" news. The RSS feed offers no search, so the keyword is ignored and the
 * rolling-news list is returned instead.
 */
public CrawlResult<List<NewsItem>> searchNews(String keyword) {
    System.out.println("[中国新闻网] RSS不支持搜索,返回滚动新闻");
    CrawlResult<List<NewsItem>> scrollNews = getScrollNews();
    return scrollNews;
}
/**
 * Parses RSS XML into news items (at most the first 20 {@code <item>} elements;
 * items without a title are skipped).
 * The parser is configured not to resolve external entities or external DTDs because
 * the XML comes from the network, and the already-decoded text is parsed as a character
 * stream so that an encoding declaration in the XML prolog cannot cause it to be
 * decoded a second time with the wrong charset.
 *
 * @param xmlContent RSS document as text
 * @return the parsed items; empty when parsing fails
 */
private List<NewsItem> parseRSS(String xmlContent) {
    List<NewsItem> newsList = new ArrayList<>();
    try {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        String[][] hardening = {
            {javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, "true"},
            {"http://xml.org/sax/features/external-general-entities", "false"},
            {"http://xml.org/sax/features/external-parameter-entities", "false"},
            {"http://apache.org/xml/features/nonvalidating/load-external-dtd", "false"}
        };
        for (String[] feature : hardening) {
            try {
                factory.setFeature(feature[0], Boolean.parseBoolean(feature[1]));
            } catch (javax.xml.parsers.ParserConfigurationException unsupported) {
                // Best effort: a parser that does not know a feature keeps its default for it.
            }
        }
        DocumentBuilder builder = factory.newDocumentBuilder();
        // A leading byte-order mark left in the decoded text is not valid in a character stream.
        String xml = xmlContent.startsWith("\uFEFF") ? xmlContent.substring(1) : xmlContent;
        Document document = builder.parse(new org.xml.sax.InputSource(new java.io.StringReader(xml)));
        NodeList items = document.getElementsByTagName("item");
        for (int i = 0; i < items.getLength() && i < 20; i++) {
            Element item = (Element) items.item(i);
            String title = getElementText(item, "title");
            String link = getElementText(item, "link");
            String pubDate = getElementText(item, "pubDate");
            String description = getElementText(item, "description");
            if (title != null && !title.isEmpty()) {
                String formattedDate = formatDate(pubDate);
                newsList.add(new NewsItem(title, link, formattedDate, description));
            }
        }
    } catch (Exception e) {
        System.out.println("[中国新闻网] RSS解析异常: " + e.getMessage());
    }
    return newsList;
}
/** Returns the trimmed text of the first descendant element with the given tag name, or "" when there is none. */
private String getElementText(Element parent, String tagName) {
    NodeList matches = parent.getElementsByTagName(tagName);
    return matches.getLength() == 0 ? "" : matches.item(0).getTextContent().trim();
}
/**
 * Normalises an RSS publication date to "yyyy-MM-dd HH:mm:ss" where possible.
 * RFC 1123 dates (e.g. "Tue, 10 Jun 2025 12:00:00 +0800") are reformatted keeping
 * their local time; dates starting with "yyyy-MM-dd" followed by "T" or a space and a
 * time keep the date and time and drop any offset; anything else is returned trimmed
 * and otherwise unchanged.
 *
 * @param pubDate raw publication date, may be null
 * @return the normalised date, or "" for null/empty input
 */
private String formatDate(String pubDate) {
    if (pubDate == null || pubDate.isEmpty()) {
        return "";
    }
    String text = pubDate.trim();
    try {
        return java.time.ZonedDateTime
                .parse(text, java.time.format.DateTimeFormatter.RFC_1123_DATE_TIME)
                .format(java.time.format.DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"));
    } catch (java.time.format.DateTimeParseException notRfc1123) {
        // Not an RFC 1123 date; try the ISO-like form below.
    }
    // Only the "T" between date and time is replaced, so letters elsewhere stay intact.
    java.util.regex.Matcher iso = java.util.regex.Pattern
            .compile("^(\\d{4}-\\d{2}-\\d{2})[T ](\\d{2}:\\d{2}(?::\\d{2})?)")
            .matcher(text);
    if (iso.find()) {
        return iso.group(1) + " " + iso.group(2);
    }
    return text;
}
/** Returns the RSS feed URL; the keyword is not used because the feed has no search. */
@Override
public String buildSearchUrl(String keyword) {
    final String feedUrl = RSS_URL;
    return feedUrl;
}
/** Builds the news detail URL for the given item id. */
@Override
public String buildDetailUrl(String itemId) {
    StringBuilder url = new StringBuilder(BASE_URL);
    url.append("/news/").append(itemId).append(".shtml");
    return url.toString();
}
/**
 * Entry point used by the generic crawl pipeline.
 * An empty or null keyword fetches the rolling news; otherwise {@code searchNews} is called.
 *
 * @param keyword search keyword, may be null or empty
 * @return a success result with a copy of the news list, or a failure result with the
 *         underlying message; a null underlying result is reported as "未知错误"
 */
@Override
public CrawlResult<List<?>> executeCrawl(String keyword) {
    CrawlResult<List<NewsItem>> result =
            (keyword == null || keyword.isEmpty()) ? getScrollNews() : searchNews(keyword);
    // The null check must come before any dereference, otherwise it can never take effect.
    if (result == null) {
        return CrawlResult.failure("未知错误", Platform.CHINANEWS);
    }
    if (result.isSuccess() && result.getData() != null) {
        return CrawlResult.success(new ArrayList<>(result.getData()), Platform.CHINANEWS);
    }
    return CrawlResult.failure(result.getMessage(), Platform.CHINANEWS);
}
/** Returns the display name of this platform. */
@Override
public String getPlatformName() {
    final String displayName = "中国新闻网";
    return displayName;
}
}

90
project/src/main/java/com/example/storage/DataExporter.java

@ -0,0 +1,90 @@
package com.example.storage;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
/**
 * Static helpers for exporting lists of objects to CSV or JSON files and for
 * importing them back from JSON.
 */
public class DataExporter {
    private static final ObjectMapper objectMapper = new ObjectMapper();
    /** Timestamp layout used in default export file names. */
    private static final java.time.format.DateTimeFormatter FILE_TIMESTAMP =
            java.time.format.DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss");

    static {
        objectMapper.enable(SerializationFeature.INDENT_OUTPUT);
    }

    /**
     * Writes the list as a UTF-8 CSV file with a byte-order mark, one row per element.
     * Columns are the instance fields declared by each element's class; the header is
     * taken from the first element.
     *
     * @param data     elements to export; must not be null or empty
     * @param filePath target file; missing parent directories are created
     * @throws IllegalArgumentException when {@code data} is null or empty
     * @throws IOException              when the directory or file cannot be written
     */
    public static void exportToCsv(List<?> data, String filePath) throws IOException {
        if (data == null || data.isEmpty()) {
            throw new IllegalArgumentException("数据不能为空");
        }
        ensureParentDirectory(filePath);
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(filePath), StandardCharsets.UTF_8))) {
            // The BOM is written ahead of the header row.
            writer.write('\uFEFF');
            writer.write(generateCsvHeader(data.get(0)));
            writer.newLine();
            for (Object item : data) {
                writer.write(generateCsvRow(item));
                writer.newLine();
            }
        }
    }

    /**
     * Writes the list as indented JSON.
     *
     * @param data     elements to export; must not be null or empty
     * @param filePath target file; missing parent directories are created
     * @throws IllegalArgumentException when {@code data} is null or empty
     * @throws IOException              when the directory or file cannot be written
     */
    public static void exportToJson(List<?> data, String filePath) throws IOException {
        if (data == null || data.isEmpty()) {
            throw new IllegalArgumentException("数据不能为空");
        }
        ensureParentDirectory(filePath);
        objectMapper.writeValue(new File(filePath), data);
    }

    /**
     * Reads a JSON array file into a list of the given element type.
     *
     * @throws IOException when the file cannot be read or parsed
     */
    public static <T> List<T> importFromJson(String filePath, Class<T> clazz) throws IOException {
        return objectMapper.readValue(new File(filePath),
                objectMapper.getTypeFactory().constructCollectionType(List.class, clazz));
    }

    /** Creates the parent directory of the target file when it does not exist yet. */
    private static void ensureParentDirectory(String filePath) throws IOException {
        File parent = new File(filePath).getAbsoluteFile().getParentFile();
        if (parent != null && !parent.isDirectory() && !parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("无法创建导出目录: " + parent);
        }
    }

    /** Returns the declared fields that represent instance data (static and synthetic fields are left out). */
    private static List<java.lang.reflect.Field> exportableFields(Object item) {
        List<java.lang.reflect.Field> fields = new ArrayList<>();
        for (java.lang.reflect.Field field : item.getClass().getDeclaredFields()) {
            if (field.isSynthetic() || java.lang.reflect.Modifier.isStatic(field.getModifiers())) {
                continue;
            }
            fields.add(field);
        }
        return fields;
    }

    /** Builds the header line from the field names of the given element. */
    private static String generateCsvHeader(Object item) {
        List<String> headers = new ArrayList<>();
        for (java.lang.reflect.Field field : exportableFields(item)) {
            headers.add(field.getName());
        }
        return String.join(",", headers);
    }

    /** Builds one CSV row from the field values of the given element; unreadable fields become empty cells. */
    private static String generateCsvRow(Object item) {
        List<String> values = new ArrayList<>();
        for (java.lang.reflect.Field field : exportableFields(item)) {
            try {
                field.setAccessible(true);
                Object value = field.get(item);
                values.add(escapeCsvValue(value != null ? value.toString() : ""));
            } catch (IllegalAccessException | RuntimeException e) {
                // Keep the column count stable even when a field cannot be read.
                values.add("");
            }
        }
        return String.join(",", values);
    }

    /** Quotes a value when it contains a comma, a double quote, or a line break; embedded quotes are doubled. */
    private static String escapeCsvValue(String value) {
        if (value.contains(",") || value.contains("\"") || value.contains("\n") || value.contains("\r")) {
            return "\"" + value.replace("\"", "\"\"") + "\"";
        }
        return value;
    }

    /** Returns "data/&lt;prefix&gt;_&lt;yyyyMMdd_HHmmss&gt;.csv" using the current local time. */
    public static String getDefaultExportPath(String prefix) {
        String timestamp = java.time.LocalDateTime.now().format(FILE_TIMESTAMP);
        return "data/" + prefix + "_" + timestamp + ".csv";
    }
}

433
project/src/main/java/com/example/storage/DatabaseManager.java

@ -0,0 +1,433 @@
package com.example.storage;
import com.example.model.BookItem;
import com.example.model.MovieItem;
import com.example.model.NewsItem;
import com.example.model.Song;
import java.sql.*;
import java.util.ArrayList;
import java.util.List;
/**
* SQLite数据库管理器
* 提供数据的持久化存储和查询功能
*/
public class DatabaseManager {
private static final String DB_NAME = "spider_data.db";
private static final String DB_PATH = "data/" + DB_NAME;
private Connection connection;
/**
 * Opens the SQLite connection and makes sure the tables exist.
 * Calling this while a connection is already open does nothing, so an earlier
 * connection is never leaked. When table initialisation fails the freshly opened
 * connection is closed again before the error is rethrown.
 *
 * @throws SQLException when the connection cannot be opened or the tables cannot be created
 */
public void connect() throws SQLException {
    if (connection != null && !connection.isClosed()) {
        return;
    }
    // Make sure the data directory exists.
    new java.io.File("data").mkdirs();
    String url = "jdbc:sqlite:" + DB_PATH;
    Connection opened = DriverManager.getConnection(url);
    connection = opened;
    System.out.println("数据库连接成功: " + DB_PATH);
    try {
        initTables();
    } catch (SQLException e) {
        connection = null;
        try {
            opened.close();
        } catch (SQLException closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }
}
/**
* 关闭数据库连接
*/
public void disconnect() {
if (connection != null) {
try {
connection.close();
System.out.println("数据库连接已关闭");
} catch (SQLException e) {
System.err.println("关闭连接失败: " + e.getMessage());
}
}
}
/**
* 初始化数据库表
*/
private void initTables() throws SQLException {
// 创建电影表
executeUpdate("CREATE TABLE IF NOT EXISTS movies (" +
"id TEXT PRIMARY KEY," +
"title TEXT NOT NULL," +
"rating TEXT," +
"release_date TEXT," +
"genre TEXT," +
"director TEXT," +
"created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)");
// 创建图书表
executeUpdate("CREATE TABLE IF NOT EXISTS books (" +
"id INTEGER PRIMARY KEY AUTOINCREMENT," +
"title TEXT NOT NULL," +
"author TEXT," +
"publisher TEXT," +
"rating TEXT," +
"price TEXT," +
"created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)");
// 创建音乐表
executeUpdate("CREATE TABLE IF NOT EXISTS songs (" +
"id TEXT PRIMARY KEY," +
"name TEXT NOT NULL," +
"artists TEXT," +
"album TEXT," +
"duration TEXT," +
"platform TEXT," +
"created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)");
// 创建新闻表
executeUpdate("CREATE TABLE IF NOT EXISTS news (" +
"id INTEGER PRIMARY KEY AUTOINCREMENT," +
"title TEXT NOT NULL," +
"summary TEXT," +
"url TEXT," +
"publish_time TEXT," +
"created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)");
System.out.println("数据库表初始化完成");
}
/**
* 执行更新操作
*/
private void executeUpdate(String sql) throws SQLException {
try (Statement stmt = connection.createStatement()) {
stmt.executeUpdate(sql);
}
}
/**
* 插入电影数据
*/
public void insertMovie(MovieItem movie) throws SQLException {
String sql = "INSERT OR REPLACE INTO movies (id, title, rating, release_date, genre, director) " +
"VALUES (?, ?, ?, ?, ?, ?)";
try (PreparedStatement pstmt = connection.prepareStatement(sql)) {
pstmt.setString(1, movie.getId() != null ? movie.getId() : "");
pstmt.setString(2, movie.getTitle());
pstmt.setString(3, movie.getRating());
pstmt.setString(4, movie.getReleaseDate());
pstmt.setString(5, movie.getGenre());
pstmt.setString(6, movie.getDirector());
pstmt.executeUpdate();
}
}
/**
* 批量插入电影数据
*/
public void insertMovies(List<MovieItem> movies) throws SQLException {
connection.setAutoCommit(false);
try {
String sql = "INSERT OR REPLACE INTO movies (id, title, rating, release_date, genre, director) " +
"VALUES (?, ?, ?, ?, ?, ?)";
try (PreparedStatement pstmt = connection.prepareStatement(sql)) {
for (MovieItem movie : movies) {
pstmt.setString(1, movie.getId() != null ? movie.getId() : "");
pstmt.setString(2, movie.getTitle());
pstmt.setString(3, movie.getRating());
pstmt.setString(4, movie.getReleaseDate());
pstmt.setString(5, movie.getGenre());
pstmt.setString(6, movie.getDirector());
pstmt.addBatch();
}
pstmt.executeBatch();
}
connection.commit();
System.out.println("成功插入 " + movies.size() + " 条电影数据");
} catch (SQLException e) {
connection.rollback();
throw e;
} finally {
connection.setAutoCommit(true);
}
}
/**
* 查询所有电影
*/
public List<MovieItem> getAllMovies() throws SQLException {
List<MovieItem> movies = new ArrayList<>();
String sql = "SELECT id, title, rating, release_date, genre, director FROM movies";
try (Statement stmt = connection.createStatement();
ResultSet rs = stmt.executeQuery(sql)) {
while (rs.next()) {
MovieItem movie = new MovieItem(
rs.getString("id"),
rs.getString("title"),
rs.getString("rating"),
rs.getString("release_date"),
rs.getString("genre"),
rs.getString("director")
);
movies.add(movie);
}
}
return movies;
}
/**
* 插入图书数据
*/
public void insertBook(BookItem book) throws SQLException {
String sql = "INSERT INTO books (title, author, publisher, rating, price) " +
"VALUES (?, ?, ?, ?, ?)";
try (PreparedStatement pstmt = connection.prepareStatement(sql)) {
pstmt.setString(1, book.getTitle());
pstmt.setString(2, book.getAuthor());
pstmt.setString(3, book.getPublisher());
pstmt.setString(4, book.getRating());
pstmt.setString(5, book.getPrice());
pstmt.executeUpdate();
}
}
/**
* 批量插入图书数据
*/
public void insertBooks(List<BookItem> books) throws SQLException {
connection.setAutoCommit(false);
try {
String sql = "INSERT INTO books (title, author, publisher, rating, price) " +
"VALUES (?, ?, ?, ?, ?)";
try (PreparedStatement pstmt = connection.prepareStatement(sql)) {
for (BookItem book : books) {
pstmt.setString(1, book.getTitle());
pstmt.setString(2, book.getAuthor());
pstmt.setString(3, book.getPublisher());
pstmt.setString(4, book.getRating());
pstmt.setString(5, book.getPrice());
pstmt.addBatch();
}
pstmt.executeBatch();
}
connection.commit();
System.out.println("成功插入 " + books.size() + " 条图书数据");
} catch (SQLException e) {
connection.rollback();
throw e;
} finally {
connection.setAutoCommit(true);
}
}
/**
* 查询所有图书
*/
public List<BookItem> getAllBooks() throws SQLException {
List<BookItem> books = new ArrayList<>();
String sql = "SELECT title, author, publisher, rating, price FROM books";
try (Statement stmt = connection.createStatement();
ResultSet rs = stmt.executeQuery(sql)) {
while (rs.next()) {
BookItem book = new BookItem(
rs.getString("title"),
rs.getString("author"),
rs.getString("publisher"),
rs.getString("rating"),
rs.getString("price")
);
books.add(book);
}
}
return books;
}
/**
* 插入歌曲数据
*/
public void insertSong(Song song) throws SQLException {
String sql = "INSERT OR REPLACE INTO songs (id, name, artists, album, duration, platform) " +
"VALUES (?, ?, ?, ?, ?, ?)";
try (PreparedStatement pstmt = connection.prepareStatement(sql)) {
pstmt.setString(1, String.valueOf(song.getSongId()));
pstmt.setString(2, song.getName());
pstmt.setString(3, String.join(",", song.getArtists()));
pstmt.setString(4, song.getAlbum());
pstmt.setString(5, song.getDuration());
pstmt.setString(6, song.getPlatform());
pstmt.executeUpdate();
}
}
/**
* 批量插入歌曲数据
*/
public void insertSongs(List<Song> songs) throws SQLException {
connection.setAutoCommit(false);
try {
String sql = "INSERT OR REPLACE INTO songs (id, name, artists, album, duration, platform) " +
"VALUES (?, ?, ?, ?, ?, ?)";
try (PreparedStatement pstmt = connection.prepareStatement(sql)) {
for (Song song : songs) {
pstmt.setString(1, String.valueOf(song.getSongId()));
pstmt.setString(2, song.getName());
pstmt.setString(3, String.join(",", song.getArtists()));
pstmt.setString(4, song.getAlbum());
pstmt.setString(5, song.getDuration());
pstmt.setString(6, song.getPlatform());
pstmt.addBatch();
}
pstmt.executeBatch();
}
connection.commit();
System.out.println("成功插入 " + songs.size() + " 条歌曲数据");
} catch (SQLException e) {
connection.rollback();
throw e;
} finally {
connection.setAutoCommit(true);
}
}
/**
* 查询所有歌曲
*/
public List<Song> getAllSongs() throws SQLException {
List<Song> songs = new ArrayList<>();
String sql = "SELECT id, name, artists, album, duration, platform FROM songs";
try (Statement stmt = connection.createStatement();
ResultSet rs = stmt.executeQuery(sql)) {
while (rs.next()) {
List<String> artists = new ArrayList<>();
String artistStr = rs.getString("artists");
if (artistStr != null && !artistStr.isEmpty()) {
for (String artist : artistStr.split(",")) {
artists.add(artist.trim());
}
}
long songId = 0;
try {
songId = Long.parseLong(rs.getString("id"));
} catch (NumberFormatException e) {
songId = 0;
}
Song song = new Song(
songId,
rs.getString("name"),
artists,
rs.getString("album"),
rs.getString("duration"),
rs.getString("platform")
);
songs.add(song);
}
}
return songs;
}
/**
* 插入新闻数据
*/
public void insertNews(NewsItem news) throws SQLException {
String sql = "INSERT INTO news (title, summary, url, publish_time) " +
"VALUES (?, ?, ?, ?)";
try (PreparedStatement pstmt = connection.prepareStatement(sql)) {
pstmt.setString(1, news.getTitle());
pstmt.setString(2, news.getSummary());
pstmt.setString(3, news.getUrl());
pstmt.setString(4, news.getPublishTime());
pstmt.executeUpdate();
}
}
/**
* 批量插入新闻数据
*/
public void insertNewsList(List<NewsItem> newsList) throws SQLException {
connection.setAutoCommit(false);
try {
String sql = "INSERT INTO news (title, summary, url, publish_time) " +
"VALUES (?, ?, ?, ?)";
try (PreparedStatement pstmt = connection.prepareStatement(sql)) {
for (NewsItem news : newsList) {
pstmt.setString(1, news.getTitle());
pstmt.setString(2, news.getSummary());
pstmt.setString(3, news.getUrl());
pstmt.setString(4, news.getPublishTime());
pstmt.addBatch();
}
pstmt.executeBatch();
}
connection.commit();
System.out.println("成功插入 " + newsList.size() + " 条新闻数据");
} catch (SQLException e) {
connection.rollback();
throw e;
} finally {
connection.setAutoCommit(true);
}
}
/**
* 查询所有新闻
*/
public List<NewsItem> getAllNews() throws SQLException {
List<NewsItem> newsList = new ArrayList<>();
String sql = "SELECT title, summary, url, publish_time FROM news";
try (Statement stmt = connection.createStatement();
ResultSet rs = stmt.executeQuery(sql)) {
while (rs.next()) {
NewsItem news = new NewsItem(
rs.getString("title"),
rs.getString("summary"),
rs.getString("url"),
rs.getString("publish_time")
);
newsList.add(news);
}
}
return newsList;
}
/**
* 获取表记录数
*/
public int getRecordCount(String tableName) throws SQLException {
String sql = "SELECT COUNT(*) AS count FROM " + tableName;
try (Statement stmt = connection.createStatement();
ResultSet rs = stmt.executeQuery(sql)) {
if (rs.next()) {
return rs.getInt("count");
}
}
return 0;
}
/**
* 清空表数据
*/
public void clearTable(String tableName) throws SQLException {
executeUpdate("DELETE FROM " + tableName);
System.out.println("已清空表: " + tableName);
}
/**
* 检查数据库连接状态
*/
public boolean isConnected() {
try {
return connection != null && !connection.isClosed();
} catch (SQLException e) {
return false;
}
}
}

47
project/src/main/java/com/example/storage/JsonExporter.java

@ -0,0 +1,47 @@
package com.example.storage;
import com.example.model.Article;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
/**
 * Static helper that writes {@link Article} lists to disk as indented JSON.
 * Dates are written in textual form rather than as numeric timestamps.
 */
public class JsonExporter {
    private static final String DEFAULT_PATH = "./data/articles.json";
    private static final ObjectMapper objectMapper = buildMapper();

    private JsonExporter() {}

    /** Creates the shared mapper: java.time support, pretty printing, textual dates. */
    private static ObjectMapper buildMapper() {
        ObjectMapper mapper = new ObjectMapper();
        mapper.registerModule(new JavaTimeModule());
        mapper.enable(SerializationFeature.INDENT_OUTPUT);
        mapper.disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS);
        return mapper;
    }

    /**
     * Exports the articles to the default file {@code ./data/articles.json}.
     *
     * @param articles non-empty list of articles
     * @throws IOException              if the file cannot be written
     * @throws IllegalArgumentException if the list is null or empty
     */
    public static void export(List<Article> articles) throws IOException {
        export(articles, DEFAULT_PATH);
    }

    /**
     * Exports the articles to the given file, creating missing parent directories.
     *
     * @param articles non-empty list of articles
     * @param path     target file path
     * @throws IOException              if the file cannot be written
     * @throws IllegalArgumentException if the list is null or empty
     */
    public static void export(List<Article> articles, String path) throws IOException {
        boolean nothingToWrite = articles == null || articles.isEmpty();
        if (nothingToWrite) {
            throw new IllegalArgumentException("数据列表不能为空");
        }

        File target = new File(path);
        File folder = target.getParentFile();
        if (folder != null && !folder.exists()) {
            folder.mkdirs();
        }

        try (FileOutputStream out = new FileOutputStream(target)) {
            objectMapper.writeValue(out, articles);
        }
        System.out.println("[INFO] 数据已导出到: " + target.getAbsolutePath());
    }
}

44
project/src/main/java/com/example/storage/JsonImporter.java

@ -0,0 +1,44 @@
package com.example.storage;
import com.example.model.Article;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Static helper that reads {@link Article} lists back from a JSON file.
 */
public class JsonImporter {
    private static final String DEFAULT_PATH = "./data/articles.json";
    private static final ObjectMapper objectMapper = buildMapper();

    private JsonImporter() {}

    /** Creates the shared mapper with java.time support. */
    private static ObjectMapper buildMapper() {
        ObjectMapper mapper = new ObjectMapper();
        mapper.registerModule(new JavaTimeModule());
        mapper.disable(com.fasterxml.jackson.databind.SerializationFeature.WRITE_DATES_AS_TIMESTAMPS);
        return mapper;
    }

    /**
     * Imports articles from the default file {@code ./data/articles.json}.
     *
     * @return the articles read, or an empty list when the file does not exist
     * @throws IOException if the file exists but cannot be read or parsed
     */
    public static List<Article> importData() throws IOException {
        return importData(DEFAULT_PATH);
    }

    /**
     * Imports articles from the given file.
     *
     * @param path source file path
     * @return the articles read, or an empty mutable list when the file does not exist
     * @throws IOException if the file exists but cannot be read or parsed
     */
    public static List<Article> importData(String path) throws IOException {
        File source = new File(path);
        if (source.exists()) {
            try (FileInputStream in = new FileInputStream(source)) {
                List<Article> loaded = objectMapper.readValue(in, new TypeReference<List<Article>>() {});
                System.out.println("[INFO] 导入 " + loaded.size() + " 条数据");
                return loaded;
            }
        }
        // A missing file is reported but treated as "no data" rather than as an error.
        System.out.println("[WARN] 文件不存在: " + path);
        return new ArrayList<>();
    }
}

20
project/src/main/java/com/example/strategy/AntiBlockStrategy.java

@ -0,0 +1,20 @@
package com.example.strategy;
/**
 * Contract for the request-pacing and header-randomizing policy a crawler consults
 * around each HTTP request.
 */
public interface AntiBlockStrategy {
/** Returns a User-Agent header value to use for the next request. */
String getRandomUserAgent();
/**
 * Returns a proxy to use for the next request.
 * NOTE(review): the format of the value is not defined here, and the implementations
 * in this package return null — callers should presumably treat null as "no proxy".
 */
String getRandomProxy();
/**
 * Hook invoked before a request to {@code url} is sent; implementations may block
 * here to pace requests.
 */
void beforeRequest(String url);
/**
 * Hook invoked after a request to {@code url} has finished.
 *
 * @param url     the requested URL
 * @param success whether the request succeeded
 */
void afterRequest(String url, boolean success);
/** Returns a randomized delay (presumably in milliseconds — confirm with callers). */
long getRandomDelay();
/** Returns the minimum interval between requests (presumably in milliseconds). */
long getMinRequestInterval();
/** Returns the maximum number of retries allowed for a request. */
int getMaxRetries();
/**
 * Decides whether a failed request should be retried.
 *
 * @param retryCount number of retries already performed
 * @param statusCode HTTP status code of the failed response
 * @return true if another attempt should be made
 */
boolean shouldRetry(int retryCount, int statusCode);
}

114
project/src/main/java/com/example/strategy/DefaultAntiBlockStrategy.java

@ -0,0 +1,114 @@
package com.example.strategy;
import java.net.URI;
import java.util.Random;
/**
 * Default anti-blocking strategy: random desktop User-Agent, per-domain throttling
 * before each request, randomized delays and retry decisions based on the HTTP status.
 */
public class DefaultAntiBlockStrategy implements AntiBlockStrategy {
    /** Throttling key used when no host can be extracted from a URL. */
    private static final String UNKNOWN_DOMAIN = "unknown";
    /** Largest shift for which {@code 1000L << shift} plus jitter still fits in a long. */
    private static final int MAX_BACKOFF_SHIFT = 53;

    private final Random random = new Random();
    private final RequestThrottler throttler;
    private final int maxRetries;
    private final long minDelayMs;
    private final long maxDelayMs;

    /** Creates a strategy with 3 retries and delays between 1000 and 3000 ms. */
    public DefaultAntiBlockStrategy() {
        this(3, 1000, 3000);
    }

    /**
     * @param maxRetries maximum number of retries reported by {@link #getMaxRetries()}
     * @param minDelayMs lower bound (inclusive) of the random delay in milliseconds
     * @param maxDelayMs upper bound (exclusive) of the random delay in milliseconds;
     *                   when not greater than {@code minDelayMs} the delay is fixed
     */
    public DefaultAntiBlockStrategy(int maxRetries, long minDelayMs, long maxDelayMs) {
        this.maxRetries = maxRetries;
        this.minDelayMs = minDelayMs;
        this.maxDelayMs = maxDelayMs;
        this.throttler = RequestThrottler.createDefault();
    }

    /** Same as the no-argument constructor. */
    public static DefaultAntiBlockStrategy createDefault() {
        return new DefaultAntiBlockStrategy();
    }

    /** Fewer retries (2) and longer delays (2000-5000 ms). */
    public static DefaultAntiBlockStrategy createStrict() {
        return new DefaultAntiBlockStrategy(2, 2000, 5000);
    }

    /** More retries (5) and shorter delays (500-1500 ms). */
    public static DefaultAntiBlockStrategy createRelaxed() {
        return new DefaultAntiBlockStrategy(5, 500, 1500);
    }

    @Override
    public String getRandomUserAgent() {
        return UserAgentPool.getRandomDesktop();
    }

    /** Proxies are not supported by this strategy; always returns null. */
    @Override
    public String getRandomProxy() {
        return null;
    }

    /** Blocks until the throttler admits a request to the URL's host. */
    @Override
    public void beforeRequest(String url) {
        try {
            throttler.acquire(extractDomain(url));
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /** Logs failed requests; successful ones need no bookkeeping. */
    @Override
    public void afterRequest(String url, boolean success) {
        if (!success) {
            System.out.println("[防封策略] 请求失败: " + url);
        }
    }

    /**
     * @return a delay in {@code [minDelayMs, maxDelayMs)}, or exactly {@code minDelayMs}
     *         when the range is empty
     */
    @Override
    public long getRandomDelay() {
        long spread = maxDelayMs - minDelayMs;
        // nextLong(bound) rejects non-positive bounds, so an empty range means "fixed delay".
        return spread > 0 ? minDelayMs + random.nextLong(spread) : minDelayMs;
    }

    @Override
    public long getMinRequestInterval() {
        return minDelayMs;
    }

    @Override
    public int getMaxRetries() {
        return maxRetries;
    }

    /**
     * Retries on 429 (rate limited) and on any 5xx status, as long as the retry budget
     * is not exhausted.
     */
    @Override
    public boolean shouldRetry(int retryCount, int statusCode) {
        if (retryCount >= maxRetries) {
            return false;
        }
        return statusCode == 429 || statusCode >= 500;
    }

    /** Returns the URL's host, or {@code "unknown"} when the URL is null, malformed or host-less. */
    private String extractDomain(String url) {
        if (url == null) {
            return UNKNOWN_DOMAIN;
        }
        try {
            String host = new URI(url).getHost();
            // getHost() is null for relative or opaque URIs; the throttler cannot key on null.
            return host != null ? host : UNKNOWN_DOMAIN;
        } catch (java.net.URISyntaxException e) {
            return UNKNOWN_DOMAIN;
        }
    }

    /** Sleeps for {@link #getRandomDelay()} milliseconds; restores the interrupt flag if interrupted. */
    public void delay() {
        try {
            Thread.sleep(getRandomDelay());
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Sleeps for {@code 1000 * 2^retryCount} ms plus up to 500 ms of jitter.
     *
     * @param retryCount number of retries already performed; negative values are treated as 0
     *                   and very large values are clamped so the delay cannot overflow
     */
    public void exponentialBackoff(int retryCount) {
        try {
            int shift = Math.max(0, Math.min(retryCount, MAX_BACKOFF_SHIFT));
            long baseDelay = 1000L << shift;
            long jitter = random.nextLong(500);
            Thread.sleep(baseDelay + jitter);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}

194
project/src/main/java/com/example/strategy/EnhancedAntiBlockStrategy.java

@ -0,0 +1,194 @@
package com.example.strategy;
import java.net.URI;
import java.util.Random;
/**
 * Enhanced anti-blocking strategy.
 * Adds randomized browser-fingerprint headers, a random pause before every request,
 * a cool-down after failures and retry with exponential backoff.
 */
public class EnhancedAntiBlockStrategy implements AntiBlockStrategy {
    /** Throttling key used when no host can be extracted from a URL. */
    private static final String UNKNOWN_DOMAIN = "unknown";
    /** Largest exponent for which {@code 1000 * 2^n} plus jitter still fits in a long. */
    private static final int MAX_BACKOFF_EXPONENT = 53;

    // Header value pools; one entry is picked at random per call.
    private static final String[] ACCEPT_HEADERS = {
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "application/json, text/plain, */*"
    };
    private static final String[] ACCEPT_ENCODING_HEADERS = {
        "gzip, deflate, br",
        "gzip, deflate",
        "deflate"
    };
    private static final String[] ACCEPT_LANGUAGE_HEADERS = {
        "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "zh-CN,zh;q=0.9,en;q=0.8",
        "zh-CN,zh;q=0.9"
    };
    private static final String[] SEC_CH_UA_HEADERS = {
        "\"Not_A Brand\";v=\"8\", \"Chromium\";v=\"120\", \"Google Chrome\";v=\"120\"",
        "\"Not_A Brand\";v=\"8\", \"Chromium\";v=\"119\", \"Google Chrome\";v=\"119\"",
        "\"Microsoft Edge\";v=\"120\", \"Chromium\";v=\"120\", \"Not.A/Brand\";v=\"24\""
    };
    private static final String[] SEC_CH_UA_MOBILE_HEADERS = {
        "?0",
        "?1"
    };
    private static final String[] SEC_CH_UA_PLATFORM_HEADERS = {
        "\"Windows\"",
        "\"macOS\"",
        "\"Linux\""
    };

    private final Random random = new Random();
    private final RequestThrottler throttler;
    private final int maxRetries;
    private final long minDelayMs;
    private final long maxDelayMs;

    /** Creates a strategy with 3 retries and delays between 2000 and 5000 ms. */
    public EnhancedAntiBlockStrategy() {
        this(3, 2000, 5000);
    }

    /**
     * @param maxRetries maximum number of retries reported by {@link #getMaxRetries()}
     * @param minDelayMs lower bound (inclusive) of the random delay in milliseconds
     * @param maxDelayMs upper bound (exclusive) of the random delay in milliseconds;
     *                   when not greater than {@code minDelayMs} the delay is fixed
     */
    public EnhancedAntiBlockStrategy(int maxRetries, long minDelayMs, long maxDelayMs) {
        this.maxRetries = maxRetries;
        this.minDelayMs = minDelayMs;
        this.maxDelayMs = maxDelayMs;
        this.throttler = RequestThrottler.createStrict();
    }

    /** Same as the no-argument constructor. */
    public static EnhancedAntiBlockStrategy createDefault() {
        return new EnhancedAntiBlockStrategy();
    }

    /** 5 retries, delays of 3000-8000 ms. */
    public static EnhancedAntiBlockStrategy createStrict() {
        return new EnhancedAntiBlockStrategy(5, 3000, 8000);
    }

    /** Preset for music crawling: 3 retries, delays of 2000-4000 ms. */
    public static EnhancedAntiBlockStrategy createForMusic() {
        return new EnhancedAntiBlockStrategy(3, 2000, 4000);
    }

    /** Preset for news crawling: 3 retries, delays of 1500-3000 ms. */
    public static EnhancedAntiBlockStrategy createForNews() {
        return new EnhancedAntiBlockStrategy(3, 1500, 3000);
    }

    /** Preset for book crawling: 4 retries, delays of 2500-5000 ms. */
    public static EnhancedAntiBlockStrategy createForBook() {
        return new EnhancedAntiBlockStrategy(4, 2500, 5000);
    }

    /** Preset for movie crawling: 3 retries, delays of 2000-4000 ms. */
    public static EnhancedAntiBlockStrategy createForMovie() {
        return new EnhancedAntiBlockStrategy(3, 2000, 4000);
    }

    @Override
    public String getRandomUserAgent() {
        return UserAgentPool.getRandomDesktop();
    }

    /** Returns a random {@code Accept} header value. */
    public String getRandomAccept() {
        return pick(ACCEPT_HEADERS);
    }

    /** Returns a random {@code Accept-Encoding} header value. */
    public String getRandomAcceptEncoding() {
        return pick(ACCEPT_ENCODING_HEADERS);
    }

    /** Returns a random {@code Accept-Language} header value. */
    public String getRandomAcceptLanguage() {
        return pick(ACCEPT_LANGUAGE_HEADERS);
    }

    /** Returns a random {@code Sec-CH-UA} header value. */
    public String getRandomSecChUa() {
        return pick(SEC_CH_UA_HEADERS);
    }

    /** Returns a random {@code Sec-CH-UA-Mobile} header value. */
    public String getRandomSecChUaMobile() {
        return pick(SEC_CH_UA_MOBILE_HEADERS);
    }

    /** Returns a random {@code Sec-CH-UA-Platform} header value. */
    public String getRandomSecChUaPlatform() {
        return pick(SEC_CH_UA_PLATFORM_HEADERS);
    }

    /** Proxies are not supported by this strategy; always returns null. */
    @Override
    public String getRandomProxy() {
        return null;
    }

    /** Waits for the throttler to admit the URL's host, then pauses for a random delay. */
    @Override
    public void beforeRequest(String url) {
        try {
            throttler.acquire(extractDomain(url));
            Thread.sleep(getRandomDelay());
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /** Logs a failed request and cools down for 5 to 10 seconds before returning. */
    @Override
    public void afterRequest(String url, boolean success) {
        if (!success) {
            System.out.println("[防封策略] 请求失败: " + url);
            try {
                Thread.sleep(5000 + random.nextLong(5000));
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * @return a delay in {@code [minDelayMs, maxDelayMs)}, or exactly {@code minDelayMs}
     *         when the range is empty
     */
    @Override
    public long getRandomDelay() {
        long spread = maxDelayMs - minDelayMs;
        // nextLong(bound) rejects non-positive bounds, so an empty range means "fixed delay".
        return spread > 0 ? minDelayMs + random.nextLong(spread) : minDelayMs;
    }

    @Override
    public long getMinRequestInterval() {
        return minDelayMs;
    }

    @Override
    public int getMaxRetries() {
        return maxRetries;
    }

    /**
     * Retries on 403, 429 and any 5xx status, as long as the retry budget is not exhausted.
     */
    @Override
    public boolean shouldRetry(int retryCount, int statusCode) {
        if (retryCount >= maxRetries) {
            return false;
        }
        return statusCode == 403 || statusCode == 429 || statusCode >= 500;
    }

    /**
     * Sleeps for {@code 1000 * 2^retryCount} ms plus up to 1000 ms of jitter.
     *
     * @param retryCount number of retries already performed; negative values are treated as 0
     *                   and very large values are clamped so the delay cannot overflow
     */
    public void exponentialBackoff(int retryCount) {
        try {
            int exponent = Math.max(0, Math.min(retryCount, MAX_BACKOFF_EXPONENT));
            long delay = (1000L << exponent) + random.nextLong(1000);
            System.out.println("[防封策略] 指数退避等待: " + delay + "ms");
            Thread.sleep(delay);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /** Picks one entry of the pool uniformly at random. */
    private String pick(String[] pool) {
        return pool[random.nextInt(pool.length)];
    }

    /** Returns the URL's host, or {@code "unknown"} when the URL is null, malformed or host-less. */
    private String extractDomain(String url) {
        if (url == null) {
            return UNKNOWN_DOMAIN;
        }
        try {
            String host = new URI(url).getHost();
            // getHost() is null for relative or opaque URIs; the throttler cannot key on null.
            return host != null ? host : UNKNOWN_DOMAIN;
        } catch (java.net.URISyntaxException e) {
            return UNKNOWN_DOMAIN;
        }
    }
}

99
project/src/main/java/com/example/strategy/RequestThrottler.java

@ -0,0 +1,99 @@
package com.example.strategy;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Per-domain request throttler enforcing a minimum interval between requests and a
 * maximum number of requests per one-minute window.
 *
 * <p>Each domain has its own record and its own lock, so a thread waiting on one domain
 * does not delay requests to other domains. A null domain is tracked under the key
 * {@code "unknown"}.
 */
public final class RequestThrottler {
    /** Length of the rate-limiting window in milliseconds. */
    private static final long WINDOW_MS = 60_000L;
    /** Key used for requests whose domain is not known. */
    private static final String UNKNOWN_DOMAIN = "unknown";

    private final Map<String, RequestRecord> domainRecords = new ConcurrentHashMap<>();
    private final int maxRequestsPerMinute;
    private final long minIntervalMs;
    private final java.util.function.LongSupplier clock;

    /**
     * @param maxRequestsPerMinute maximum number of requests admitted per domain and window
     * @param minIntervalMs        minimum time between two requests to the same domain, in ms
     */
    public RequestThrottler(int maxRequestsPerMinute, long minIntervalMs) {
        this(maxRequestsPerMinute, minIntervalMs, System::currentTimeMillis);
    }

    /**
     * Constructor with an injectable millisecond clock, intended for tests.
     * {@link #acquire(String)} still sleeps in real time.
     */
    RequestThrottler(int maxRequestsPerMinute, long minIntervalMs, java.util.function.LongSupplier clock) {
        this.maxRequestsPerMinute = maxRequestsPerMinute;
        this.minIntervalMs = minIntervalMs;
        this.clock = clock;
    }

    /** 30 requests per minute, at least 1000 ms apart. */
    public static RequestThrottler createDefault() {
        return new RequestThrottler(30, 1000);
    }

    /** 10 requests per minute, at least 2000 ms apart. */
    public static RequestThrottler createStrict() {
        return new RequestThrottler(10, 2000);
    }

    /** 60 requests per minute, at least 500 ms apart. */
    public static RequestThrottler createRelaxed() {
        return new RequestThrottler(60, 500);
    }

    /**
     * Blocks until a request to the domain is allowed, then records it.
     *
     * @param domain target domain; null is treated as {@code "unknown"}
     * @throws InterruptedException if the thread is interrupted while waiting
     */
    public void acquire(String domain) throws InterruptedException {
        RequestRecord record = recordFor(domain);
        synchronized (record) {
            long sinceLast = clock.getAsLong() - record.lastRequestTime;
            if (sinceLast < minIntervalMs) {
                Thread.sleep(minIntervalMs - sinceLast);
            }
            // Re-read the clock: the sleep above may have consumed part of the window.
            long now = clock.getAsLong();
            rollWindowIfExpired(record, now);
            if (record.requestCount.get() >= maxRequestsPerMinute) {
                Thread.sleep(WINDOW_MS - (now - record.windowStartTime));
                startWindow(record, clock.getAsLong());
            }
            record.requestCount.incrementAndGet();
            record.lastRequestTime = clock.getAsLong();
        }
    }

    /**
     * Records a request to the domain if it is allowed right now, without waiting.
     *
     * @param domain target domain; null is treated as {@code "unknown"}
     * @return true if the request was admitted and counted, false if it must wait
     */
    public boolean tryAcquire(String domain) {
        RequestRecord record = recordFor(domain);
        synchronized (record) {
            long now = clock.getAsLong();
            if (now - record.lastRequestTime < minIntervalMs) {
                return false;
            }
            rollWindowIfExpired(record, now);
            if (record.requestCount.get() >= maxRequestsPerMinute) {
                return false;
            }
            record.requestCount.incrementAndGet();
            record.lastRequestTime = now;
            return true;
        }
    }

    /**
     * @param domain target domain; null is treated as {@code "unknown"}
     * @return number of requests counted in the domain's current window, 0 if none is tracked
     */
    public int getRequestCount(String domain) {
        RequestRecord record = domainRecords.get(keyOf(domain));
        return record != null ? record.requestCount.get() : 0;
    }

    public int getMaxRequestsPerMinute() {
        return maxRequestsPerMinute;
    }

    /** Forgets all throttling state of the domain. */
    public void reset(String domain) {
        domainRecords.remove(keyOf(domain));
    }

    /** Forgets the throttling state of every domain. */
    public void resetAll() {
        domainRecords.clear();
    }

    private static String keyOf(String domain) {
        // ConcurrentHashMap rejects null keys.
        return domain != null ? domain : UNKNOWN_DOMAIN;
    }

    private RequestRecord recordFor(String domain) {
        return domainRecords.computeIfAbsent(keyOf(domain), k -> new RequestRecord(clock.getAsLong()));
    }

    /**
     * Starts a fresh window once the current one is older than a minute. Without this the
     * counter would keep growing after the first window and the limit would never apply again.
     */
    private static void rollWindowIfExpired(RequestRecord record, long now) {
        if (now - record.windowStartTime >= WINDOW_MS) {
            startWindow(record, now);
        }
    }

    private static void startWindow(RequestRecord record, long now) {
        record.requestCount.set(0);
        record.windowStartTime = now;
    }

    /** Mutable per-domain state; fields are written only while holding the record's monitor. */
    private static class RequestRecord {
        final AtomicInteger requestCount = new AtomicInteger(0);
        volatile long lastRequestTime = 0;
        volatile long windowStartTime;

        RequestRecord(long createdAt) {
            this.windowStartTime = createdAt;
        }
    }
}

16
project/src/main/java/com/example/strategy/SpiderStrategy.java

@ -0,0 +1,16 @@
package com.example.strategy;
import com.example.core.CrawlResult;
import java.util.List;
/**
 * Strategy interface for one crawl target (platform).
 * NOTE(review): no implementation is visible in this part of the project, so the
 * descriptions below follow the method names and signatures only — confirm against
 * the concrete strategies.
 */
public interface SpiderStrategy {
/** Builds the search URL for the given keyword. */
String buildSearchUrl(String keyword);
/** Builds the detail-page URL for the given item id. */
String buildDetailUrl(String itemId);
/** Runs a crawl for the keyword and returns the outcome wrapped in a {@link CrawlResult}. */
CrawlResult<List<?>> executeCrawl(String keyword);
/** Returns the name of the platform this strategy targets. */
String getPlatformName();
}

86
project/src/main/java/com/example/strategy/UserAgentPool.java

@ -0,0 +1,86 @@
package com.example.strategy;
import java.util.Random;
/**
 * Static pool of browser User-Agent strings, grouped by browser family, with helpers
 * that return a random entry. Not instantiable.
 */
public final class UserAgentPool {
    private static final Random random = new Random();

    // Note: browsers on Windows 11 still report "Windows NT 10.0"; an "NT 11.0" token does
    // not occur in real traffic and would make requests trivially identifiable as forged.
    private static final String[] CHROME_AGENTS = {
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    };
    private static final String[] FIREFOX_AGENTS = {
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
        "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"
    };
    private static final String[] EDGE_AGENTS = {
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
    };
    private static final String[] SAFARI_AGENTS = {
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15"
    };
    private static final String[] MOBILE_AGENTS = {
        "Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (Linux; Android 14; SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 14; Pixel 8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36"
    };

    private UserAgentPool() {
    }

    /** Returns a random desktop Chrome User-Agent. */
    public static String getRandomChrome() {
        return pick(CHROME_AGENTS);
    }

    /** Returns a random desktop Firefox User-Agent. */
    public static String getRandomFirefox() {
        return pick(FIREFOX_AGENTS);
    }

    /** Returns a random desktop Edge User-Agent. */
    public static String getRandomEdge() {
        return pick(EDGE_AGENTS);
    }

    /** Returns a random desktop Safari User-Agent. */
    public static String getRandomSafari() {
        return pick(SAFARI_AGENTS);
    }

    /** Returns a random mobile-browser User-Agent. */
    public static String getRandomMobile() {
        return pick(MOBILE_AGENTS);
    }

    /**
     * Returns a random User-Agent from any family, mobile included. The family is chosen
     * uniformly first, then an entry within it.
     */
    public static String getRandom() {
        int type = random.nextInt(5);
        return switch (type) {
            case 0 -> getRandomChrome();
            case 1 -> getRandomFirefox();
            case 2 -> getRandomEdge();
            case 3 -> getRandomSafari();
            default -> getRandomMobile();
        };
    }

    /**
     * Returns a random desktop User-Agent. The family is chosen uniformly first, then an
     * entry within it.
     */
    public static String getRandomDesktop() {
        int type = random.nextInt(4);
        return switch (type) {
            case 0 -> getRandomChrome();
            case 1 -> getRandomFirefox();
            case 2 -> getRandomEdge();
            default -> getRandomSafari();
        };
    }

    /** Picks one entry of the pool uniformly at random. */
    private static String pick(String[] pool) {
        return pool[random.nextInt(pool.length)];
    }
}

112
project/src/main/java/com/example/view/ConsoleView.java

@ -0,0 +1,112 @@
package com.example.view;
import java.util.Scanner;
/**
 * Console front end: prints the menus and messages of the crawler and reads the
 * user's answers from standard input (decoded as UTF-8).
 */
public class ConsoleView {
    /** Horizontal rule framing every menu. */
    private static final String RULE = "==========================================";
    private static final String ERROR_TAG = "[ERROR] ";

    private final Scanner scanner;

    public ConsoleView() {
        this.scanner = new Scanner(System.in, "UTF-8");
    }

    /** Prints the main menu. */
    public void displayMenu() {
        printFramedMenu(" 爬虫系统主菜单",
                " 1. 网易云音乐",
                " 2. 中国新闻网",
                " 3. 当当图书",
                " 4. 豆瓣电影",
                " 5. 数据分析",
                " 0. 退出");
    }

    /** Prints the data storage and analysis menu. */
    public void displayDataAnalysisMenu() {
        printFramedMenu(" 数据存储与分析系统",
                " 1. 音乐数据分析",
                " 2. 图书数据分析",
                " 3. 影视数据分析",
                " 4. 新闻数据分析",
                " 5. 生成HTML报告",
                " 6. 生成JFreeChart图表",
                " 7. 导出为CSV",
                " 8. 导出为JSON",
                " 9. 保存到数据库",
                " 10. 从数据库读取",
                " 11. 返回主菜单");
    }

    /**
     * Prints the menu of a single platform.
     *
     * @param platformName name shown in the menu title
     */
    public void displayPlatformMenu(String platformName) {
        printFramedMenu(" " + platformName + "平台",
                " 1. 搜索",
                " 2. 获取热门榜单",
                " 3. 返回主菜单");
    }

    /**
     * Shows the prompt and reads one line.
     *
     * @return the line with surrounding whitespace removed
     */
    public String getInput(String prompt) {
        return promptForLine(prompt);
    }

    /**
     * Shows the prompt and reads one line, substituting a default for blank input.
     *
     * @return the trimmed line, or {@code defaultValue} when the line is empty after trimming
     */
    public String getInputWithDefault(String prompt, String defaultValue) {
        String answer = promptForLine(prompt);
        if (answer.isEmpty()) {
            return defaultValue;
        }
        return answer;
    }

    /**
     * Keeps prompting until the user enters an integer within the given range.
     *
     * @param min smallest accepted value (inclusive)
     * @param max largest accepted value (inclusive)
     * @return the accepted choice
     */
    public int getMenuChoice(String prompt, int min, int max) {
        for (;;) {
            String raw = getInput(prompt);
            int parsed;
            try {
                parsed = Integer.parseInt(raw);
            } catch (NumberFormatException e) {
                printError("请输入有效的数字");
                continue;
            }
            if (parsed < min || parsed > max) {
                printError("请输入 " + min + "-" + max + " 之间的数字");
                continue;
            }
            return parsed;
        }
    }

    /** Prints the message unchanged to standard output. */
    public void displayMessage(String message) {
        System.out.println(message);
    }

    /** Prints the message to standard output with an {@code [OK]} tag. */
    public void displaySuccess(String message) {
        System.out.println("[OK] " + message);
    }

    /** Prints the message to standard error with an {@code [ERROR]} tag. */
    public void displayError(String message) {
        System.err.println(ERROR_TAG + message);
    }

    /** Prints the message to standard error with an {@code [ERROR]} tag. */
    public void printError(String message) {
        System.err.println(ERROR_TAG + message);
    }

    /** Prints the message to standard output with an {@code [INFO]} tag. */
    public void displayInfo(String message) {
        System.out.println("[INFO] " + message);
    }

    /** Prints one line to standard output. */
    public void printLine(String content) {
        System.out.println(content);
    }

    /** Prints an empty line to standard output. */
    public void printBlank() {
        System.out.println();
    }

    /** Closes the underlying scanner (and with it standard input). */
    public void close() {
        if (scanner != null) {
            scanner.close();
        }
    }

    /** Prints a title and its entries, framed by horizontal rules. */
    private void printFramedMenu(String title, String... entries) {
        printLine(RULE);
        printLine(title);
        printLine(RULE);
        for (String entry : entries) {
            printLine(entry);
        }
        printLine(RULE);
    }

    /** Writes the prompt without a newline and returns the next input line, trimmed. */
    private String promptForLine(String prompt) {
        System.out.print(prompt);
        return scanner.nextLine().trim();
    }
}

46
project/src/main/resources/spider-config.json

@ -0,0 +1,46 @@
{
"spider": {
"defaultPlatform": "NETEASE",
"commentLimit": 200,
"searchLimit": 20
},
"antiBlock": {
"enabled": true,
"maxRetries": 3,
"minDelayMs": 1000,
"maxDelayMs": 3000,
"useProxy": false
},
"proxy": {
"enabled": false,
"maxFailuresBeforeRemove": 3,
"refreshIntervalMs": 300000,
"proxies": []
},
"throttle": {
"maxRequestsPerMinute": 30,
"minIntervalMs": 1000
},
"platforms": {
"NETEASE": {
"enabled": true,
"description": "网易云音乐 - 稳定支持",
"features": ["search", "detail", "comments", "charts"]
},
"QQMUSIC": {
"enabled": true,
"description": "QQ音乐 - 实验性支持",
"features": ["search", "detail", "charts"]
},
"KUGOU": {
"enabled": true,
"description": "酷狗音乐 - 实验性支持",
"features": ["search", "charts"]
},
"KUWO": {
"enabled": true,
"description": "酷我音乐 - 实验性支持",
"features": ["search", "charts"]
}
}
}
Loading…
Cancel
Save