50 changed files with 3823 additions and 0 deletions
Binary file not shown.
@ -0,0 +1,4 @@ |
|||||
|
*.jar |
||||
|
*.jar |
||||
|
*.class |
||||
|
*.log |
||||
@ -0,0 +1,273 @@ |
|||||
|
[ { |
||||
|
"title" : "7月1日起施行 超龄劳动者迎来权益保障新规", |
||||
|
"url" : "http://society.people.com.cn/n1/2026/0525/c1008-40727022.html", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "经港珠澳大桥出入境港澳单牌车总量突破1000万辆次", |
||||
|
"url" : "http://gba.people.cn/n1/2026/0525/c42272-40726946.html", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "外交部谈美伊谈判", |
||||
|
"url" : "http://world.people.com.cn/n1/2026/0525/c1002-40726926.html", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "重庆发布今年首个地质灾害红色预警", |
||||
|
"url" : "http://society.people.com.cn/n1/2026/0525/c1008-40726849.html", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "重庆发布今年首个地质灾害红色预警", |
||||
|
"url" : "http://cq.people.com.cn/n2/2026/0525/c365401-41590405.html", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "账号管理规范", |
||||
|
"url" : "https://blog.csdn.net/blogdevteam/article/details/126135357", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "代码产出暴涨250%,Claude Code已100%由自己编写!CC 之父 Boris 最新对话:我现在只负责写提示词", |
||||
|
"url" : "https://blog.csdn.net/dQCFKyQDXYm3F8rB0/article/details/161325096", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "我们公司全员把 Cursor 换成了自研的 全开源AtomCode", |
||||
|
"url" : "https://blog.csdn.net/jiangtao/article/details/161373705", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "与菲尔兹奖得主Timothy Gowers对话:整个数学研究的范式将被AI改变", |
||||
|
"url" : "https://blog.csdn.net/jzagi/article/details/161327725", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "AI又“翻车”!Gemini狂删2.8万行代码、系统宕机33分钟,还伪造沟通记录谎称“已恢复正常”", |
||||
|
"url" : "https://blog.csdn.net/csdnnews/article/details/161325101", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "开源项目“离谱的死亡方式”", |
||||
|
"url" : "https://blog.csdn.net/csdnnews/article/details/161325111", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "“DeepSeek崩了”又冲上热搜;特斯拉FSD中文名改为“特斯拉辅助驾驶”:价格依旧为6.4万元;苹果WWDC26将成库克告别秀 | 极客头条", |
||||
|
"url" : "https://blog.csdn.net/weixin_39786569/article/details/161394638", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "“超级Agent”大梦初醒:任务一长就“飘”、动辄陷入“无限探索”?一场对话复盘工业级智能体的真实痛点与终局 | AI进化论", |
||||
|
"url" : "https://blog.csdn.net/dQCFKyQDXYm3F8rB0/article/details/161294914", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "从全网群嘲到让学术界颤抖!OpenAI 攻破 80 年数学悬案,菲尔兹奖得主预言灵验:AI正将人类逐出科研循环", |
||||
|
"url" : "https://blog.csdn.net/dQCFKyQDXYm3F8rB0/article/details/161294921", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "雷军直言“输给特斯拉不丢人”;传Manus创始人计划融资10亿美元回购公司 | 极客头条", |
||||
|
"url" : "https://blog.csdn.net/weixin_39786569/article/details/161313996", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "GitHub遭入侵,黑客开价5万美元卖源码!员工装了个VS Code插件,致3800个内部仓库被盗", |
||||
|
"url" : "https://blog.csdn.net/csdnnews/article/details/161294926", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "Chaterm — 开源SRE副驾驶,让你与服务器直接对话! 服务器 14.7K 查看详情", |
||||
|
"url" : "https://blog.csdn.net/coderroad/article/details/157735374", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "拆箱开源版Coze:Agent核心三件套大公开,48小时揽下9K Star 人工智能 47.5K 查看详情", |
||||
|
"url" : "https://blog.csdn.net/coderroad/article/details/149722641", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "MinIO:开源对象存储解决方案的领先者 开源 67.6K 查看详情", |
||||
|
"url" : "https://blog.csdn.net/coderroad/article/details/149424765", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "LocalSend:比 AirDrop 更自由!这款神器让文件传输不再受限 https 64.1K 查看详情", |
||||
|
"url" : "https://blog.csdn.net/coderroad/article/details/149356472", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "Excalidraw:一款轻量、高效、极具手感的在线白板工具 产品经理 56.7K 查看详情", |
||||
|
"url" : "https://blog.csdn.net/coderroad/article/details/149249425", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "star31.6k,Aider:让代码编写如虎添翼的终端神器 人工智能 66.5K 查看详情", |
||||
|
"url" : "https://blog.csdn.net/coderroad/article/details/149169547", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "用Rust编写的开源支付解决方案——Hyperswitch rust 63.6K 查看详情", |
||||
|
"url" : "https://blog.csdn.net/coderroad/article/details/149066439", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "Langflow:这个拖拽式AI工作流神器正在颠覆传统编程 人工智能 76.9K 查看详情", |
||||
|
"url" : "https://blog.csdn.net/coderroad/article/details/148900678", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "一键抠图有多强?19Kstar 的 Rembg 开源神器 python 58.7K 查看详情", |
||||
|
"url" : "https://blog.csdn.net/coderroad/article/details/148851428", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "CHATERM AI:开启云资源氛围管理新篇章! 人工智能 70.3K 查看详情", |
||||
|
"url" : "https://blog.csdn.net/coderroad/article/details/148769366", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "CSDN会员推广伙伴招募:分销返佣 + 资源互换,诚邀合作", |
||||
|
"url" : "https://blog.csdn.net/blogdevteam/article/details/160479095", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "深入解析进程:从PCB到僵尸进程", |
||||
|
"url" : "https://blog.csdn.net/2401_86275172/article/details/160566166", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "【功能跃升】Claude Code v2.1.145:开放 --json 脚本接口,打通 tmux 状态栏,超大文件智能截断", |
||||
|
"url" : "https://blog.csdn.net/Rthan/article/details/161241670", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "【读书笔记】《幸福关系的七段旅程》", |
||||
|
"url" : "https://blog.csdn.net/Chandler2017/article/details/160967281", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "Spring 核心原理:IoC/DI 与 Bean 生命周期全景解析", |
||||
|
"url" : "https://blog.csdn.net/2401_88151415/article/details/161253437", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "鸿蒙 PC 跨设备拖拽:实现原理 + 实战代码", |
||||
|
"url" : "https://blog.csdn.net/qq_36478920/article/details/161291953", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "volatile 的底层原理及应用场景", |
||||
|
"url" : "https://blog.csdn.net/tongluowan007/article/details/161230327", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "ROS开发专栏---ROS2humble安装详细教程---适配Ubuntu 22.04", |
||||
|
"url" : "https://blog.csdn.net/weixin_61186812/article/details/161054923", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "2026年全国青少年信息素养大赛算法应用主题赛(C++赛项-初赛-赛前冲刺模拟卷2:文末附答案和解析)", |
||||
|
"url" : "https://blog.csdn.net/weixin_66461496/article/details/161206019", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "系统分析师 备考知识点整理", |
||||
|
"url" : "https://blog.csdn.net/david_232656/article/details/161291901", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "Linux之文件", |
||||
|
"url" : "https://blog.csdn.net/bksczm/article/details/161055964", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "Python 数据分析基础入门:《Excel Python:飞速搞定数据分析与处理》学习笔记系列(附录 C 高级 Python 概念)", |
||||
|
"url" : "https://blog.csdn.net/m0_67558301/article/details/161324964", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "【LE Audio】CAP精讲[8]:CCID绑定术,打通音频流与控制的任督二脉", |
||||
|
"url" : "https://blog.csdn.net/weixin_37800531/article/details/161135741", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "Codex Mac版安装教程(AppStore无法下载解决)", |
||||
|
"url" : "https://blog.csdn.net/weixin_41961749/article/details/161110569", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "应用层中的UDP协议原理", |
||||
|
"url" : "https://blog.csdn.net/2503_90262217/article/details/161200229", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "【AI】Git、Node.js 一站式保姆级安装指南", |
||||
|
"url" : "https://blog.csdn.net/2401_87342824/article/details/161199150", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "Re: Linux系统篇(十八)进程篇·三:深度硬核!全面起底 Linux 进程状态变化与内核链表动态解绑", |
||||
|
"url" : "https://blog.csdn.net/Z2314246476/article/details/161076726", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "本周 GitHub 最热项目全解析!Star History 2026年第20周(5月8日-14日)排行榜深度盘点", |
||||
|
"url" : "https://blog.csdn.net/yanceyxin/article/details/161130991", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "Google I/O 2026深度解读:AI Agent时代全面到来,从“大模型时代“到“智能体时代“的历史性跨越", |
||||
|
"url" : "https://blog.csdn.net/shaobingj126/article/details/161307384", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "c#基础知识合集07 方法值传递 引用传递 ref参数 out输出参数 in参数 参数列表", |
||||
|
"url" : "https://blog.csdn.net/2603_96051737/article/details/161256831", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "谷歌辞职、创业失败、重读神经科学,她说 AI 时代最危险的事是外包你的思考 | 万有引力", |
||||
|
"url" : "https://blog.csdn.net/tangxiaoyin/article/details/161428871", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "传字节向Seed员工开放「豆包股」认购权;滴滴出行App大规模故障,官方致歉;小米MiMo-V2.5系列API永久降价:最高降99% | 极客头条", |
||||
|
"url" : "https://blog.csdn.net/weixin_39786569/article/details/161446737", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "华为韬定律刷屏,程序员真正该读懂的信号是什么? | 硅基时间", |
||||
|
"url" : "https://blog.csdn.net/csdnnews/article/details/161432746", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "一位10年Android老兵选择「逆行」:“如果未来只剩AI写代码,那就把我落下吧!”", |
||||
|
"url" : "https://blog.csdn.net/csdnnews/article/details/161432759", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "告别繁琐预处理!MindSpeed LLM推出Train_from_HF功能,实现加载即训练", |
||||
|
"url" : "https://blog.csdn.net/csdnnews/article/details/161426770", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "MindSpeed LLM结合Agent-Skills适配Mamba3模型,解锁SSM模型新潜能", |
||||
|
"url" : "https://blog.csdn.net/csdnnews/article/details/161427107", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "高性能计算:鲲鹏软硬协同定义AI4S 计算新范式", |
||||
|
"url" : "https://blog.csdn.net/csdnnews/article/details/161426451", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "AI公司烧不起Token了!国产Agent杀出,逼近Opus 4.6还免费,天工AI发布SkyClaw-v1.0:面向真实工作流的百万上下文 Agent 模型", |
||||
|
"url" : "https://blog.csdn.net/csdnnews/article/details/161422508", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "2026年618大促7000元内演唱会手机推荐:Find X9s Pro领衔,远摄防抖清晰度全解析", |
||||
|
"url" : "https://blog.csdn.net/2601_95822891/article/details/161261185", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "Python运算符:身份运算符(is/is not)与双等号的区别", |
||||
|
"url" : "https://blog.csdn.net/AIRoses/article/details/161410239", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "Codex 与 Claude Code 安装配置教程", |
||||
|
"url" : "https://blog.csdn.net/weixin_45888077/article/details/161401615", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "初识java(十一):继承", |
||||
|
"url" : "https://blog.csdn.net/2502_93282244/article/details/161372118", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "我那台在抽屉里躺了三年的旧手机,被我改造成了全天候私人云盘", |
||||
|
"url" : "https://blog.csdn.net/SDFsoul/article/details/161278737", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "【必看】2026年 {计算题} |专项解析 ~ H:动态规划 & 图论", |
||||
|
"url" : "https://blog.csdn.net/weixin_42115157/article/details/161057408", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "FreeRTOS——按键控制任务的挂起和恢复", |
||||
|
"url" : "https://blog.csdn.net/weixin_64611877/article/details/161456747", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "【c++笔记】类和对象流食般投喂(中)", |
||||
|
"url" : "https://blog.csdn.net/dj_798/article/details/160994229", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "C++的IO流", |
||||
|
"url" : "https://blog.csdn.net/suimingtao/article/details/160892078", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "Java——标准序列化机制", |
||||
|
"url" : "https://blog.csdn.net/cold___play/article/details/161107932", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "1.6T光模块将成AI数据中心主流", |
||||
|
"url" : "https://blog.csdn.net/m0_75253087/article/details/160956039", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "通用程序无缺陷保证的不可能性:停机问题与哥德尔不完备定理的双轨论证 —— 兼论“边界情况不可穷举”的形式化含义", |
||||
|
"url" : "https://blog.csdn.net/qq_43689451/article/details/161271922", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "新书速览|信息与通信工程综合实验:自动目标识别专题", |
||||
|
"url" : "https://blog.csdn.net/quanzhankaifaqua/article/details/161193290", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "深入理解 OSI 七层网络模型:从原理到实践", |
||||
|
"url" : "https://blog.csdn.net/2603_95882547/article/details/161140630", |
||||
|
"content" : "" |
||||
|
} ] |
||||
@ -0,0 +1,185 @@ |
|||||
|
[ { |
||||
|
"title" : "7月1日起施行 超龄劳动者迎来权益保障新规", |
||||
|
"url" : "http://society.people.com.cn/n1/2026/0525/c1008-40727022.html", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "经港珠澳大桥出入境港澳单牌车总量突破1000万辆次", |
||||
|
"url" : "http://gba.people.cn/n1/2026/0525/c42272-40726946.html", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "外交部谈美伊谈判", |
||||
|
"url" : "http://world.people.com.cn/n1/2026/0525/c1002-40726926.html", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "重庆发布今年首个地质灾害红色预警", |
||||
|
"url" : "http://society.people.com.cn/n1/2026/0525/c1008-40726849.html", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "重庆发布今年首个地质灾害红色预警", |
||||
|
"url" : "http://cq.people.com.cn/n2/2026/0525/c365401-41590405.html", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "账号管理规范", |
||||
|
"url" : "https://blog.csdn.net/blogdevteam/article/details/126135357", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "代码产出暴涨250%,Claude Code已100%由自己编写!CC 之父 Boris 最新对话:我现在只负责写提示词", |
||||
|
"url" : "https://blog.csdn.net/dQCFKyQDXYm3F8rB0/article/details/161325096", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "我们公司全员把 Cursor 换成了自研的 全开源AtomCode", |
||||
|
"url" : "https://blog.csdn.net/jiangtao/article/details/161373705", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "与菲尔兹奖得主Timothy Gowers对话:整个数学研究的范式将被AI改变", |
||||
|
"url" : "https://blog.csdn.net/jzagi/article/details/161327725", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "AI又“翻车”!Gemini狂删2.8万行代码、系统宕机33分钟,还伪造沟通记录谎称“已恢复正常”", |
||||
|
"url" : "https://blog.csdn.net/csdnnews/article/details/161325101", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "开源项目“离谱的死亡方式”", |
||||
|
"url" : "https://blog.csdn.net/csdnnews/article/details/161325111", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "“DeepSeek崩了”又冲上热搜;特斯拉FSD中文名改为“特斯拉辅助驾驶”:价格依旧为6.4万元;苹果WWDC26将成库克告别秀 | 极客头条", |
||||
|
"url" : "https://blog.csdn.net/weixin_39786569/article/details/161394638", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "“超级Agent”大梦初醒:任务一长就“飘”、动辄陷入“无限探索”?一场对话复盘工业级智能体的真实痛点与终局 | AI进化论", |
||||
|
"url" : "https://blog.csdn.net/dQCFKyQDXYm3F8rB0/article/details/161294914", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "从全网群嘲到让学术界颤抖!OpenAI 攻破 80 年数学悬案,菲尔兹奖得主预言灵验:AI正将人类逐出科研循环", |
||||
|
"url" : "https://blog.csdn.net/dQCFKyQDXYm3F8rB0/article/details/161294921", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "雷军直言“输给特斯拉不丢人”;传Manus创始人计划融资10亿美元回购公司 | 极客头条", |
||||
|
"url" : "https://blog.csdn.net/weixin_39786569/article/details/161313996", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "GitHub遭入侵,黑客开价5万美元卖源码!员工装了个VS Code插件,致3800个内部仓库被盗", |
||||
|
"url" : "https://blog.csdn.net/csdnnews/article/details/161294926", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "Chaterm — 开源SRE副驾驶,让你与服务器直接对话! 服务器 14.7K 查看详情", |
||||
|
"url" : "https://blog.csdn.net/coderroad/article/details/157735374", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "拆箱开源版Coze:Agent核心三件套大公开,48小时揽下9K Star 人工智能 47.5K 查看详情", |
||||
|
"url" : "https://blog.csdn.net/coderroad/article/details/149722641", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "MinIO:开源对象存储解决方案的领先者 开源 67.6K 查看详情", |
||||
|
"url" : "https://blog.csdn.net/coderroad/article/details/149424765", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "LocalSend:比 AirDrop 更自由!这款神器让文件传输不再受限 https 64.1K 查看详情", |
||||
|
"url" : "https://blog.csdn.net/coderroad/article/details/149356472", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "Excalidraw:一款轻量、高效、极具手感的在线白板工具 产品经理 56.7K 查看详情", |
||||
|
"url" : "https://blog.csdn.net/coderroad/article/details/149249425", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "star31.6k,Aider:让代码编写如虎添翼的终端神器 人工智能 66.5K 查看详情", |
||||
|
"url" : "https://blog.csdn.net/coderroad/article/details/149169547", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "用Rust编写的开源支付解决方案——Hyperswitch rust 63.6K 查看详情", |
||||
|
"url" : "https://blog.csdn.net/coderroad/article/details/149066439", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "Langflow:这个拖拽式AI工作流神器正在颠覆传统编程 人工智能 76.9K 查看详情", |
||||
|
"url" : "https://blog.csdn.net/coderroad/article/details/148900678", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "一键抠图有多强?19Kstar 的 Rembg 开源神器 python 58.7K 查看详情", |
||||
|
"url" : "https://blog.csdn.net/coderroad/article/details/148851428", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "CHATERM AI:开启云资源氛围管理新篇章! 人工智能 70.3K 查看详情", |
||||
|
"url" : "https://blog.csdn.net/coderroad/article/details/148769366", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "CSDN会员推广伙伴招募:分销返佣 + 资源互换,诚邀合作", |
||||
|
"url" : "https://blog.csdn.net/blogdevteam/article/details/160479095", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "深入解析进程:从PCB到僵尸进程", |
||||
|
"url" : "https://blog.csdn.net/2401_86275172/article/details/160566166", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "【功能跃升】Claude Code v2.1.145:开放 --json 脚本接口,打通 tmux 状态栏,超大文件智能截断", |
||||
|
"url" : "https://blog.csdn.net/Rthan/article/details/161241670", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "【读书笔记】《幸福关系的七段旅程》", |
||||
|
"url" : "https://blog.csdn.net/Chandler2017/article/details/160967281", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "Spring 核心原理:IoC/DI 与 Bean 生命周期全景解析", |
||||
|
"url" : "https://blog.csdn.net/2401_88151415/article/details/161253437", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "鸿蒙 PC 跨设备拖拽:实现原理 + 实战代码", |
||||
|
"url" : "https://blog.csdn.net/qq_36478920/article/details/161291953", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "volatile 的底层原理及应用场景", |
||||
|
"url" : "https://blog.csdn.net/tongluowan007/article/details/161230327", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "ROS开发专栏---ROS2humble安装详细教程---适配Ubuntu 22.04", |
||||
|
"url" : "https://blog.csdn.net/weixin_61186812/article/details/161054923", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "2026年全国青少年信息素养大赛算法应用主题赛(C++赛项-初赛-赛前冲刺模拟卷2:文末附答案和解析)", |
||||
|
"url" : "https://blog.csdn.net/weixin_66461496/article/details/161206019", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "系统分析师 备考知识点整理", |
||||
|
"url" : "https://blog.csdn.net/david_232656/article/details/161291901", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "Linux之文件", |
||||
|
"url" : "https://blog.csdn.net/bksczm/article/details/161055964", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "Python 数据分析基础入门:《Excel Python:飞速搞定数据分析与处理》学习笔记系列(附录 C 高级 Python 概念)", |
||||
|
"url" : "https://blog.csdn.net/m0_67558301/article/details/161324964", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "【LE Audio】CAP精讲[8]:CCID绑定术,打通音频流与控制的任督二脉", |
||||
|
"url" : "https://blog.csdn.net/weixin_37800531/article/details/161135741", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "Codex Mac版安装教程(AppStore无法下载解决)", |
||||
|
"url" : "https://blog.csdn.net/weixin_41961749/article/details/161110569", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "应用层中的UDP协议原理", |
||||
|
"url" : "https://blog.csdn.net/2503_90262217/article/details/161200229", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "【AI】Git、Node.js 一站式保姆级安装指南", |
||||
|
"url" : "https://blog.csdn.net/2401_87342824/article/details/161199150", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "Re: Linux系统篇(十八)进程篇·三:深度硬核!全面起底 Linux 进程状态变化与内核链表动态解绑", |
||||
|
"url" : "https://blog.csdn.net/Z2314246476/article/details/161076726", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "本周 GitHub 最热项目全解析!Star History 2026年第20周(5月8日-14日)排行榜深度盘点", |
||||
|
"url" : "https://blog.csdn.net/yanceyxin/article/details/161130991", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "Google I/O 2026深度解读:AI Agent时代全面到来,从“大模型时代“到“智能体时代“的历史性跨越", |
||||
|
"url" : "https://blog.csdn.net/shaobingj126/article/details/161307384", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "c#基础知识合集07 方法值传递 引用传递 ref参数 out输出参数 in参数 参数列表", |
||||
|
"url" : "https://blog.csdn.net/2603_96051737/article/details/161256831", |
||||
|
"content" : "" |
||||
|
} ] |
||||
@ -0,0 +1,29 @@ |
|||||
|
{ |
||||
|
"metadata": { |
||||
|
"exportTime": "2026-05-31T12:00:00", |
||||
|
"totalCount": 3, |
||||
|
"source": "CLI Crawler v1.0", |
||||
|
"exportMode": "STANDARD", |
||||
|
"version": "1.0" |
||||
|
}, |
||||
|
"articles": [ |
||||
|
{ |
||||
|
"title": "测试文章1", |
||||
|
"url": "https://example.com/article1", |
||||
|
"content": "这是测试内容1", |
||||
|
"crawledAt": "2026-05-31T10:00:00" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "测试文章2", |
||||
|
"url": "https://example.com/article2", |
||||
|
"content": "这是测试内容2", |
||||
|
"crawledAt": "2026-05-31T11:00:00" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "测试文章3", |
||||
|
"url": "https://example.com/article3", |
||||
|
"content": "这是测试内容3", |
||||
|
"crawledAt": "2026-05-31T12:00:00" |
||||
|
} |
||||
|
] |
||||
|
} |
||||
@ -0,0 +1,54 @@ |
|||||
|
# Simple Import/Export Test - Minimal Version
#
# Drives the packaged CLI jar through import, list, export, a second import
# and a final list, echoing whatever the jar prints. After the export step it
# only reports whether the export file exists and mentions the "crawledAt"
# and "metadata" fields; nothing is asserted and no exit code is set.
# Paths are relative, so this presumably has to be run from the project root.
$env:JAVA_HOME = "C:\Program Files\Java\latest\jdk-25"
$APP_JAR = "target\datacollect-cli-0.1.0-jar-with-dependencies.jar"
$SAMPLE = "data\sample_test.json"
$EXPORT = "data\export_result.json"

$rule = "========================================"

# Prints a three-line cyan banner: rule, caption, rule.
function Show-Banner([string]$Caption) {
    foreach ($text in $rule, $Caption, $rule) {
        Write-Host $text -ForegroundColor Cyan
    }
}

# Prints the step heading, runs the jar with the given arguments (stderr is
# merged into stdout), echoes the captured output, then prints a blank line.
function Invoke-Step([string]$Heading, [string[]]$JarArgs) {
    Write-Host $Heading -ForegroundColor Yellow
    $output = & java -jar $APP_JAR @JarArgs 2>&1
    Write-Host $output
    Write-Host ""
}

Show-Banner "Import/Export Test - Minimal"
Write-Host ""

Invoke-Step "[TEST 1] Import" @("import", $SAMPLE)
Invoke-Step "[TEST 2] List" @("list")
Invoke-Step "[TEST 3] Export" @("export", $EXPORT, "--format", "json")

Write-Host "[TEST 4] Check Export File" -ForegroundColor Yellow
if (Test-Path $EXPORT) {
    Write-Host "[SUCCESS] File created!" -ForegroundColor Green
    $exported = Get-Content $EXPORT -Raw
    Write-Host "Length: $($exported.Length) chars" -ForegroundColor Cyan

    # Plain substring-style checks; a missing field is simply not reported.
    foreach ($field in "crawledAt", "metadata") {
        if ($exported -match $field) {
            Write-Host "[SUCCESS] $field field found!" -ForegroundColor Green
        }
    }
}
Write-Host ""

Invoke-Step "[TEST 5] Import Again (Duplicate)" @("import", $SAMPLE)
Invoke-Step "[TEST 6] Final List" @("list")

Show-Banner "TEST COMPLETED"
||||
@ -0,0 +1,67 @@ |
|||||
|
<!--
  Maven build for the datacollect CLI.
  Produces target/datacollect-cli-0.1.0-jar-with-dependencies.jar during the
  package phase via the assembly plugin, with com.example.datacollect.Main as
  the manifest entry point.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.example</groupId>
    <artifactId>datacollect-cli</artifactId>
    <version>0.1.0</version>
    <properties>
        <!-- Pin the source/resource encoding so the build does not depend on
             the platform default charset of the machine running Maven. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>11</maven.compiler.source>
        <maven.compiler.target>11</maven.compiler.target>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.17.2</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>2.0.9</version>
        </dependency>
        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-classic</artifactId>
            <version>1.4.14</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>2.16.1</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
            </plugin>
            <!-- Builds the self-contained "jar-with-dependencies" artifact
                 that the test scripts launch with `java -jar`. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.3.0</version>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.example.datacollect.Main</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
||||
@ -0,0 +1,56 @@ |
|||||
|
@echo off
rem Import/Export feature test for the datacollect CLI jar.
rem Runs import, list, export, a duplicate import and a final list, printing
rem the jar's output for manual inspection. Paths are relative, so this
rem presumably has to be started from the project root.

rem Keep the variables below local to this script so they do not overwrite
rem JAVA_HOME (or anything else) in the calling cmd session.
setlocal

set "JAVA_HOME=C:\Program Files\Java\latest\jdk-25"
set "APP_JAR=target\datacollect-cli-0.1.0-jar-with-dependencies.jar"
set "SAMPLE=data\sample_test.json"
set "EXPORT=data\export_result.json"

echo ========================================
echo Import/Export Feature Test
echo ========================================
echo.

echo [TEST 1] Import sample JSON file
echo Command: import %SAMPLE%
java -jar "%APP_JAR%" import "%SAMPLE%"
echo.
echo.

echo [TEST 2] List articles
echo Command: list
java -jar "%APP_JAR%" list
echo.
echo.

echo [TEST 3] Export to JSON
echo Command: export %EXPORT% --format json
java -jar "%APP_JAR%" export "%EXPORT%" --format json
echo.
echo.

rem The preview below prints the first 20 lines of the export file.
echo [TEST 4] Check exported file
if exist "%EXPORT%" (
    echo [SUCCESS] Export file created
    echo.
    echo First 20 lines of exported file:
    powershell -Command "Get-Content '%EXPORT%' | Select-Object -First 20"
) else (
    echo [ERROR] Export file NOT created
)
echo.
echo.

echo [TEST 5] Test duplicate import
echo Command: import %SAMPLE% (again)
java -jar "%APP_JAR%" import "%SAMPLE%"
echo.
echo.

echo [TEST 6] Final list
echo Command: list
java -jar "%APP_JAR%" list
echo.
echo.

echo ========================================
echo Tests completed! Check output above.
echo ========================================
||||
@ -0,0 +1,117 @@ |
|||||
|
# Simple Import/Export Test
#
# Drives the packaged CLI jar through import -> list -> export -> duplicate
# import -> list and reports:
#   * the article count parsed from the "Total: <n>" line of each `list` run,
#   * whether the export file exists and mentions "crawledAt" and "metadata".
# The summary reports success only when both counts are 3 and every export
# check passed. Paths are relative, so this presumably has to be run from the
# project root.
#
# NOTE(review): under Windows PowerShell 5.1, stderr output of a native
# command merged with 2>&1 may surface as a terminating error while
# $ErrorActionPreference is "Stop" - confirm on the target shell.
$ErrorActionPreference = "Stop"
$env:JAVA_HOME = "C:\Program Files\Java\latest\jdk-25"
$APP_JAR = "target\datacollect-cli-0.1.0-jar-with-dependencies.jar"
$TEST_FILE = "data\sample_test.json"
$EXPORT_FILE = "data\export_result.json"

# Returns the number from the last "Total: <n>" line in the captured output,
# or 0 when no such line is present.
function Get-TotalCount($Output) {
    $total = 0
    foreach ($line in ($Output -split "`n")) {
        if ($line -match "Total: (\d+)") {
            $total = [int]$Matches[1]
        }
    }
    return $total
}

Write-Host "========================================" -ForegroundColor Cyan
Write-Host "Import/Export Feature Test" -ForegroundColor Cyan
Write-Host "========================================" -ForegroundColor Cyan
Write-Host ""

# Step 1: Import sample data
# The sub-command and its operands are passed as separate arguments; quoting
# them together would hand java a single "import <path>" argument.
Write-Host "[TEST 1] Import sample JSON file" -ForegroundColor Yellow
Write-Host "Command: import $TEST_FILE" -ForegroundColor Gray
$result = & java -jar $APP_JAR import $TEST_FILE 2>&1
Write-Host $result -ForegroundColor Green
Write-Host ""

# Step 2: List articles
Write-Host "[TEST 2] List articles after import" -ForegroundColor Yellow
Write-Host "Command: list" -ForegroundColor Gray
$result = & java -jar $APP_JAR list 2>&1
Write-Host $result -ForegroundColor Green

# Extract count
$count1 = Get-TotalCount $result
Write-Host "Article count: $count1" -ForegroundColor Cyan
Write-Host ""

# Step 3: Export to new file
Write-Host "[TEST 3] Export to new JSON file" -ForegroundColor Yellow
Write-Host "Command: export $EXPORT_FILE --format json" -ForegroundColor Gray
$result = & java -jar $APP_JAR export $EXPORT_FILE --format json 2>&1
Write-Host $result -ForegroundColor Green
Write-Host ""

# Step 4: Check exported file
# $exportChecksPassed is cleared by any failed check so the summary cannot
# claim success when the export is missing or incomplete.
$exportChecksPassed = $true
Write-Host "[TEST 4] Verify exported JSON file" -ForegroundColor Yellow
if (Test-Path $EXPORT_FILE) {
    Write-Host "[SUCCESS] Export file created" -ForegroundColor Green
    $content = Get-Content $EXPORT_FILE -Raw
    Write-Host "File size: $($content.Length) characters" -ForegroundColor Cyan

    # Check for crawledAt
    if ($content -match "crawledAt") {
        Write-Host "[SUCCESS] crawledAt field found in exported JSON" -ForegroundColor Green
    } else {
        Write-Host "[ERROR] crawledAt field NOT found" -ForegroundColor Red
        $exportChecksPassed = $false
    }

    # Check for metadata
    if ($content -match "metadata") {
        Write-Host "[SUCCESS] metadata field found" -ForegroundColor Green
    } else {
        Write-Host "[ERROR] metadata field NOT found" -ForegroundColor Red
        $exportChecksPassed = $false
    }
} else {
    Write-Host "[ERROR] Export file NOT created" -ForegroundColor Red
    $exportChecksPassed = $false
}
Write-Host ""

# Step 5: Test duplicate import
Write-Host "[TEST 5] Test duplicate import (should skip duplicates)" -ForegroundColor Yellow
Write-Host "Command: import $TEST_FILE (again)" -ForegroundColor Gray
$result = & java -jar $APP_JAR import $TEST_FILE 2>&1
Write-Host $result -ForegroundColor Green

# Step 6: List and verify no duplication
Write-Host "[TEST 6] Verify no duplication" -ForegroundColor Yellow
Write-Host "Command: list" -ForegroundColor Gray
$result = & java -jar $APP_JAR list 2>&1
Write-Host $result -ForegroundColor Green

$count2 = Get-TotalCount $result
Write-Host "Article count after second import: $count2" -ForegroundColor Cyan
Write-Host ""

# Summary
Write-Host "========================================" -ForegroundColor Cyan
Write-Host "TEST SUMMARY" -ForegroundColor Cyan
Write-Host "========================================" -ForegroundColor Cyan
if ($count1 -eq 3 -and $count2 -eq 3 -and $exportChecksPassed) {
    Write-Host "[SUCCESS] All tests passed!" -ForegroundColor Green
    Write-Host "- Import: Successfully imported 3 articles" -ForegroundColor White
    Write-Host "- Export: Successfully exported to JSON" -ForegroundColor White
    Write-Host "- Duplicate: Correctly skipped duplicate articles" -ForegroundColor White
    Write-Host "- crawledAt field: Present in exported JSON" -ForegroundColor White
} else {
    Write-Host "[PARTIAL] Some tests may have issues" -ForegroundColor Yellow
    Write-Host "First import count: $count1" -ForegroundColor White
    Write-Host "Second import count: $count2" -ForegroundColor White
    Write-Host "Export checks passed: $exportChecksPassed" -ForegroundColor White
}
Write-Host ""

# Show exported file content (truncated to the first 1000 characters)
Write-Host "========================================" -ForegroundColor Cyan
Write-Host "EXPORTED JSON CONTENT (Preview)" -ForegroundColor Cyan
Write-Host "========================================" -ForegroundColor Cyan
if (Test-Path $EXPORT_FILE) {
    $exportContent = Get-Content $EXPORT_FILE -Raw
    if ($exportContent.Length -gt 1000) {
        Write-Host ($exportContent.Substring(0, 1000) + "...") -ForegroundColor White
    } else {
        Write-Host $exportContent -ForegroundColor White
    }
}
||||
@ -0,0 +1,60 @@ |
|||||
|
package com.example.datacollect; |
||||
|
|
||||
|
import com.example.datacollect.controller.CrawlerController; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.repository.PersistenceManager; |
||||
|
import com.example.datacollect.strategy.StrategyFactory; |
||||
|
import com.example.datacollect.util.JsonExporter; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
|
||||
|
public class Main { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(Main.class); |
||||
|
|
||||
|
public static void main(String[] args) { |
||||
|
try (ConsoleView view = new ConsoleView(); |
||||
|
ArticleRepository repository = new ArticleRepository(); |
||||
|
PersistenceManager persistenceManager = new PersistenceManager(repository)) { |
||||
|
|
||||
|
logger.info("Starting CLI Crawler application"); |
||||
|
|
||||
|
JsonExporter jsonExporter = new JsonExporter(repository); |
||||
|
StrategyFactory strategyFactory = new StrategyFactory(); |
||||
|
|
||||
|
loadSession(persistenceManager, view, repository); |
||||
|
|
||||
|
CrawlerController controller = new CrawlerController(view, repository, strategyFactory, persistenceManager, jsonExporter); |
||||
|
|
||||
|
view.printSuccess("Welcome to CLI Crawler (w10_3)! Type help for commands."); |
||||
|
logger.info("Application initialized successfully"); |
||||
|
|
||||
|
while (true) { |
||||
|
try { |
||||
|
controller.handle(view.readLine()); |
||||
|
} catch (Exception e) { |
||||
|
view.printError("Error: " + e.getMessage()); |
||||
|
logger.error("Error in main loop: {}", e.getMessage(), e); |
||||
|
} |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
logger.error("Fatal error in application: {}", e.getMessage(), e); |
||||
|
System.err.println("Fatal error: " + e.getMessage()); |
||||
|
System.exit(1); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private static void loadSession(PersistenceManager persistenceManager, ConsoleView view, ArticleRepository repository) { |
||||
|
try { |
||||
|
persistenceManager.load();/* 加载会话 */ |
||||
|
if (repository.size() > 0) {/* 如果有文章 */ |
||||
|
view.printInfo("Loaded " + repository.size() + " articles from previous session");/* 打印加载的文章数量 */ |
||||
|
} |
||||
|
} catch (IOException e) { |
||||
|
view.printError("Warning: Failed to load previous session: " + e.getMessage()); |
||||
|
logger.warn("Failed to load previous session: {}", e.getMessage(), e); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,103 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.exception.NetworkException; |
||||
|
import com.example.datacollect.exception.ParseException; |
||||
|
import com.example.datacollect.model.Article; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.strategy.CrawlStrategy; |
||||
|
import com.example.datacollect.strategy.StrategyFactory; |
||||
|
import com.example.datacollect.util.RetryUtils; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.List; |
||||
|
import java.util.concurrent.Callable; |
||||
|
|
||||
|
public class AnalyzeCommand implements Command { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(AnalyzeCommand.class); |
||||
|
private final ConsoleView view; |
||||
|
private final StrategyFactory strategyFactory; |
||||
|
|
||||
|
public AnalyzeCommand(ConsoleView view, StrategyFactory strategyFactory) { |
||||
|
this.view = view; |
||||
|
this.strategyFactory = strategyFactory; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "analyze"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
if (args.length < 2) { |
||||
|
view.printError("Usage: analyze <url>"); |
||||
|
logger.warn("Invalid command: missing URL argument"); |
||||
|
return; |
||||
|
} |
||||
|
String url = args[1]; |
||||
|
logger.info("Analyze command executed for URL: {}", url); |
||||
|
|
||||
|
try { |
||||
|
CrawlStrategy strategy = strategyFactory.getStrategy(url); |
||||
|
if (strategy == null) { |
||||
|
view.printError("No strategy found for: " + url); |
||||
|
logger.error("No strategy found for URL: {}", url); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
Callable<Document> fetchTask = () -> { |
||||
|
logger.debug("Fetching document from: {}", url); |
||||
|
try { |
||||
|
return Jsoup.connect(url) |
||||
|
.userAgent("Mozilla/5.0") |
||||
|
.timeout(5000) |
||||
|
.get(); |
||||
|
} catch (IOException e) { |
||||
|
throw new NetworkException("Failed to connect to " + url + ": " + e.getMessage(), e); |
||||
|
} |
||||
|
}; |
||||
|
|
||||
|
Document doc = RetryUtils.executeWithRetry(fetchTask); |
||||
|
logger.info("Successfully fetched document from: {}", url); |
||||
|
|
||||
|
List<Article> articles = strategy.parse(url, doc); |
||||
|
logger.info("Parsed {} articles for analysis", articles.size()); |
||||
|
|
||||
|
int total = articles.size(); |
||||
|
int totalTitleLen = 0; |
||||
|
int totalContentLen = 0; |
||||
|
|
||||
|
for (Article a : articles) { |
||||
|
totalTitleLen += a.getTitle() == null ? 0 : a.getTitle().length(); |
||||
|
totalContentLen += a.getContent() == null ? 0 : a.getContent().length(); |
||||
|
} |
||||
|
|
||||
|
view.printInfo("===== 分析统计结果 ====="); |
||||
|
view.printInfo("文章总数:" + total + " 篇"); |
||||
|
view.printInfo("标题总长度:" + totalTitleLen); |
||||
|
view.printInfo("内容总长度:" + totalContentLen); |
||||
|
if (total > 0) { |
||||
|
view.printInfo("平均标题长度:" + (totalTitleLen / total)); |
||||
|
view.printInfo("平均内容长度:" + (totalContentLen / total)); |
||||
|
} |
||||
|
view.printInfo("======================"); |
||||
|
view.printSuccess("分析完成(数据未保存)"); |
||||
|
|
||||
|
logger.info("Analysis completed: {} articles analyzed", total); |
||||
|
} catch (NetworkException e) { |
||||
|
view.printError("Network error: " + e.getMessage()); |
||||
|
logger.error("Network error while analyzing {}: {}", url, e.getMessage(), e); |
||||
|
} catch (ParseException e) { |
||||
|
view.printError("Parse error: " + e.getMessage()); |
||||
|
logger.error("Parse error while analyzing {}: {}", url, e.getMessage(), e); |
||||
|
} catch (Exception e) { |
||||
|
view.printError("分析失败:" + e.getMessage()); |
||||
|
logger.error("Unexpected error while analyzing {}: {}", url, e.getMessage(), e); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,8 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
|
||||
|
public interface Command { |
||||
|
String getName(); |
||||
|
void execute(String[] args, ArticleRepository repository); |
||||
|
} |
||||
@ -0,0 +1,114 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.exception.NetworkException; |
||||
|
import com.example.datacollect.exception.ParseException; |
||||
|
import com.example.datacollect.exception.UrlFormatException; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.strategy.CrawlStrategy; |
||||
|
import com.example.datacollect.strategy.StrategyFactory; |
||||
|
import com.example.datacollect.util.RetryUtils; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.net.MalformedURLException; |
||||
|
import java.net.URL; |
||||
|
import java.util.concurrent.Callable; |
||||
|
|
||||
|
public class CrawlCommand implements Command { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class); |
||||
|
private final ConsoleView view; |
||||
|
private final StrategyFactory strategyFactory; |
||||
|
|
||||
|
public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) { |
||||
|
this.view = view; |
||||
|
this.strategyFactory = strategyFactory; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "crawl"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
if (args == null || args.length < 2) { |
||||
|
view.printError("用法: crawl <url>"); |
||||
|
logger.warn("无效命令: 缺少URL参数"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
String url = args[1]; |
||||
|
if (url == null || url.trim().isEmpty()) { |
||||
|
view.printError("错误: URL不能为空"); |
||||
|
logger.error("无效参数: URL为空"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
try { |
||||
|
new URL(url); |
||||
|
} catch (MalformedURLException e) { |
||||
|
logger.error("无效URL格式: {}", url, e); |
||||
|
throw new UrlFormatException("无效的URL格式: " + url, url, e); |
||||
|
} |
||||
|
|
||||
|
logger.info("开始爬取: {}", url); |
||||
|
|
||||
|
CrawlStrategy strategy = strategyFactory.getStrategy(url); |
||||
|
if (strategy == null) { |
||||
|
view.printError("未找到策略: " + url); |
||||
|
logger.error("未找到URL对应的策略: {}", url); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
try { |
||||
|
view.printInfo("正在爬取: " + url); |
||||
|
|
||||
|
Callable<Document> fetchTask = () -> { |
||||
|
logger.debug("正在获取文档: {}", url); |
||||
|
try { |
||||
|
return Jsoup.connect(url) |
||||
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") |
||||
|
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8") |
||||
|
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8") |
||||
|
.header("Accept-Encoding", "gzip, deflate, br") |
||||
|
.header("Connection", "keep-alive") |
||||
|
.header("Referer", url) |
||||
|
.header("Cache-Control", "max-age=0") |
||||
|
.timeout(15000) |
||||
|
.followRedirects(true) |
||||
|
.get(); |
||||
|
} catch (IOException e) { |
||||
|
throw new NetworkException("连接失败: " + e.getMessage(), e); |
||||
|
} |
||||
|
}; |
||||
|
|
||||
|
Document doc = RetryUtils.executeWithRetry(fetchTask); |
||||
|
logger.info("成功获取文档: {}", url); |
||||
|
|
||||
|
var articles = strategy.parse(url, doc); |
||||
|
logger.info("解析文章数: {}", articles.size()); |
||||
|
|
||||
|
repository.addAll(articles); |
||||
|
logger.info("成功添加 {} 篇文章到仓库", articles.size()); |
||||
|
|
||||
|
view.printSuccess("爬取完成,共 " + articles.size() + " 篇文章。"); |
||||
|
logger.info("成功从 {} 爬取 {} 篇文章", url, articles.size()); |
||||
|
} catch (NetworkException e) { |
||||
|
view.printError(e.getMessage()); |
||||
|
logger.error("爬取 {} 时网络错误: {}", url, e.getMessage(), e); |
||||
|
} catch (ParseException e) { |
||||
|
view.printError("解析错误: " + e.getMessage()); |
||||
|
logger.error("爬取 {} 时解析错误: {}", url, e.getMessage(), e); |
||||
|
} catch (UrlFormatException e) { |
||||
|
view.printError("URL格式错误: " + e.getMessage()); |
||||
|
logger.error("爬取 {} 时URL格式错误: {}", url, e.getMessage(), e); |
||||
|
} catch (Exception e) { |
||||
|
view.printError("爬取失败: " + e.getMessage()); |
||||
|
logger.error("爬取 {} 时发生未知错误: {}", url, e.getMessage(), e); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,42 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.repository.PersistenceManager; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
|
||||
|
public class ExitCommand implements Command { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(ExitCommand.class); |
||||
|
private final ConsoleView view; |
||||
|
private final PersistenceManager persistenceManager; |
||||
|
|
||||
|
public ExitCommand(ConsoleView view, PersistenceManager persistenceManager) { |
||||
|
this.view = view; |
||||
|
this.persistenceManager = persistenceManager; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "exit"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
logger.info("Exit command executed, saving data before shutdown"); |
||||
|
|
||||
|
try { |
||||
|
persistenceManager.save();/* 保存数据到持久化管理器 */ |
||||
|
view.printInfo("Saved " + repository.size() + " articles"); |
||||
|
logger.info("Successfully saved {} articles before exit", repository.size()); |
||||
|
} catch (IOException e) { |
||||
|
view.printError("Warning: Failed to save data: " + e.getMessage()); |
||||
|
logger.error("Failed to save data on exit: {}", e.getMessage(), e); |
||||
|
} |
||||
|
|
||||
|
view.printSuccess("Bye!"); |
||||
|
System.exit(0); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,66 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.exception.ExportException; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.repository.PersistenceManager; |
||||
|
import com.example.datacollect.util.JsonExporter; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.nio.file.Paths; |
||||
|
|
||||
|
public class ExportCommand implements Command { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(ExportCommand.class); |
||||
|
private final ConsoleView view; |
||||
|
private final PersistenceManager persistenceManager; |
||||
|
private final JsonExporter jsonExporter; |
||||
|
|
||||
|
public ExportCommand(ConsoleView view, PersistenceManager persistenceManager, JsonExporter jsonExporter) { |
||||
|
this.view = view; |
||||
|
this.persistenceManager = persistenceManager; |
||||
|
this.jsonExporter = jsonExporter; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "export"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
String filePath = null; |
||||
|
String format = "json"; |
||||
|
|
||||
|
if (args.length < 2) { |
||||
|
view.printError("Usage: export <file_path> [--format json]"); |
||||
|
logger.warn("Invalid command: missing file path argument"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
filePath = args[1]; |
||||
|
|
||||
|
for (int i = 2; i < args.length; i++) { |
||||
|
if (args[i].equals("--format") && i + 1 < args.length) { |
||||
|
format = args[i + 1].toLowerCase(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
logger.info("导出请求: 文件={}, 格式={}", filePath, format); |
||||
|
|
||||
|
try { |
||||
|
if ("json".equals(format)) { |
||||
|
jsonExporter.exportToFile(Paths.get(filePath)); |
||||
|
view.printSuccess("Successfully exported " + repository.size() + " articles to " + filePath); |
||||
|
logger.info("Exported {} articles to {}", repository.size(), filePath); |
||||
|
} else { |
||||
|
view.printError("Unsupported format: " + format + ". Only 'json' is supported."); |
||||
|
logger.warn("Unsupported format: {}", format); |
||||
|
} |
||||
|
} catch (ExportException e) { |
||||
|
view.printError("Export failed: " + e.getMessage()); |
||||
|
logger.error("Export error: {}", e.getMessage(), e); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,33 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
public class HelpCommand implements Command { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(HelpCommand.class); |
||||
|
private final ConsoleView view; |
||||
|
|
||||
|
public HelpCommand(ConsoleView view) { |
||||
|
this.view = view; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "help"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
logger.info("Help command executed"); |
||||
|
view.printInfo("Commands:"); |
||||
|
view.printInfo(" crawl <url> - Crawl articles from URL"); |
||||
|
view.printInfo(" list - List all articles"); |
||||
|
view.printInfo(" export <file> - Export articles to JSON file"); |
||||
|
view.printInfo(" import <file> - Import articles from JSON file"); |
||||
|
view.printInfo(" analyze <url> - Analyze URL structure"); |
||||
|
view.printInfo(" help - Show this help"); |
||||
|
view.printInfo(" exit - Exit and save data"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,71 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.exception.ImportException; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.repository.PersistenceManager; |
||||
|
import com.example.datacollect.util.JsonImporter; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
public class ImportCommand implements Command { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(ImportCommand.class); |
||||
|
private final ConsoleView view; |
||||
|
private final PersistenceManager persistenceManager; |
||||
|
|
||||
|
public ImportCommand(ConsoleView view, PersistenceManager persistenceManager) { |
||||
|
this.view = view; |
||||
|
this.persistenceManager = persistenceManager; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "import"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
if (args.length < 2) { |
||||
|
view.printError("Usage: import <file_path>"); |
||||
|
logger.warn("Invalid command: missing file path argument"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
String filePath = args[1]; |
||||
|
|
||||
|
try { |
||||
|
int beforeCount = repository.size(); |
||||
|
JsonImporter.ImportResult result = persistenceManager.importWithReport(filePath); |
||||
|
int afterCount = repository.size(); |
||||
|
|
||||
|
StringBuilder message = new StringBuilder(); |
||||
|
message.append("Import completed:\n"); |
||||
|
message.append(" - Total found: ").append(result.getTotalFound()).append("\n"); |
||||
|
message.append(" - Imported: ").append(result.getImported()).append("\n"); |
||||
|
message.append(" - Skipped (duplicates): ").append(result.getSkipped()).append("\n"); |
||||
|
message.append(" - Invalid: ").append(result.getInvalid()).append("\n"); |
||||
|
message.append(" - Overwritten: ").append(result.getOverwritten()).append("\n"); |
||||
|
message.append(" - Repository total: ").append(afterCount); |
||||
|
|
||||
|
if (!result.getErrors().isEmpty()) { |
||||
|
message.append("\n - Errors: ").append(result.getErrors().size()); |
||||
|
for (int i = 0; i < Math.min(3, result.getErrors().size()); i++) { |
||||
|
message.append("\n ").append(i + 1).append(". ").append(result.getErrors().get(i)); |
||||
|
} |
||||
|
if (result.getErrors().size() > 3) { |
||||
|
message.append("\n ... and ").append(result.getErrors().size() - 3).append(" more errors"); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
view.printSuccess(message.toString()); |
||||
|
logger.info("Import result: {}", result.getSummary()); |
||||
|
|
||||
|
} catch (ImportException e) { |
||||
|
view.printError("Import failed: " + e.getMessage()); |
||||
|
logger.error("Import error: {}", e.getMessage(), e); |
||||
|
} catch (Exception e) { |
||||
|
view.printError("Import failed: " + e.getMessage()); |
||||
|
logger.error("Import error: {}", e.getMessage(), e); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,26 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
public class ListCommand implements Command { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(ListCommand.class); |
||||
|
private final ConsoleView view; |
||||
|
|
||||
|
public ListCommand(ConsoleView view) { |
||||
|
this.view = view; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "list"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
logger.info("List command executed, showing {} articles", repository.size()); |
||||
|
view.display(repository.getAll()); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,71 @@ |
|||||
|
package com.example.datacollect.controller; |
||||
|
|
||||
|
import com.example.datacollect.command.AnalyzeCommand; |
||||
|
import com.example.datacollect.command.Command; |
||||
|
import com.example.datacollect.command.CrawlCommand; |
||||
|
import com.example.datacollect.command.ExitCommand; |
||||
|
import com.example.datacollect.command.ExportCommand; |
||||
|
import com.example.datacollect.command.HelpCommand; |
||||
|
import com.example.datacollect.command.ImportCommand; |
||||
|
import com.example.datacollect.command.ListCommand; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.repository.PersistenceManager; |
||||
|
import com.example.datacollect.strategy.StrategyFactory; |
||||
|
import com.example.datacollect.util.JsonExporter; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
import java.util.HashMap; |
||||
|
import java.util.Map; |
||||
|
|
||||
|
public class CrawlerController { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(CrawlerController.class); |
||||
|
private final Map<String, Command> commands = new HashMap<>();/* 命令映射表 */ |
||||
|
private final ConsoleView view;/* 控制台视图 */ |
||||
|
private final ArticleRepository repository;/* 文章仓库 */ |
||||
|
|
||||
|
public CrawlerController(ConsoleView view, ArticleRepository repository, |
||||
|
StrategyFactory strategyFactory, PersistenceManager persistenceManager, JsonExporter jsonExporter) { |
||||
|
this.view = view; |
||||
|
this.repository = repository; |
||||
|
register(new HelpCommand(view)); |
||||
|
register(new ListCommand(view)); |
||||
|
register(new CrawlCommand(view, strategyFactory)); |
||||
|
register(new ExitCommand(view, persistenceManager)); |
||||
|
register(new AnalyzeCommand(view, strategyFactory)); |
||||
|
register(new ExportCommand(view, persistenceManager, jsonExporter)); |
||||
|
register(new ImportCommand(view, persistenceManager)); |
||||
|
logger.info("CrawlerController initialized with {} commands", commands.size()); |
||||
|
} |
||||
|
|
||||
|
private void register(Command command) {/* 注册命令 */ |
||||
|
commands.put(command.getName(), command);/* 将命令添加到映射表 */ |
||||
|
logger.debug("Registered command: {}", command.getName());/* 记录注册的命令 */ |
||||
|
} |
||||
|
|
||||
|
public void handle(String input) {/* 处理用户输入 */ |
||||
|
String text = input == null ? "" : input.trim();/* 处理空输入 */ |
||||
|
if (text.isEmpty()) { |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
String[] args = text.split("\\s+");/* 解析命令行参数 */ |
||||
|
String cmdName = args[0].toLowerCase();/* 提取命令名称并转换为小写 */ |
||||
|
|
||||
|
logger.debug("Processing command: {}", cmdName); |
||||
|
|
||||
|
Command command = commands.get(cmdName);/* 获取命令对象 */ |
||||
|
if (command == null) { |
||||
|
view.printError("Unknown command: " + cmdName); |
||||
|
logger.warn("Unknown command attempted: {}", cmdName); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
try { |
||||
|
command.execute(args, repository);/* 执行命令 */ |
||||
|
} catch (Exception e) { |
||||
|
view.printError("Command execution failed: " + e.getMessage()); |
||||
|
logger.error("Error executing command {}: {}", cmdName, e.getMessage(), e); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,10 @@ |
|||||
|
package com.example.datacollect.exception; |
||||
|
|
||||
|
/**
 * Checked base exception of the crawler.
 */
public class CrawlerException extends Exception {

    /**
     * @param message description of the failure
     */
    public CrawlerException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception
     */
    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }
}
||||
@ -0,0 +1,56 @@ |
|||||
|
package com.example.datacollect.exception; |
||||
|
|
||||
|
/**
 * Checked exception for an article that already exists. Optionally carries
 * the duplicate URL and the index of the existing entry; both are appended
 * to {@link #getMessage()} when present.
 */
public class DuplicateArticleException extends Exception {
    /** URL that was found twice, or null when not supplied. */
    private final String duplicateUrl;
    /** Index of the already stored entry, or null when not supplied. */
    private final Integer existingIndex;

    /**
     * @param message description of the failure
     */
    public DuplicateArticleException(String message) {
        super(message);
        this.duplicateUrl = null;
        this.existingIndex = null;
    }

    /**
     * @param message      description of the failure
     * @param duplicateUrl URL that was found twice
     */
    public DuplicateArticleException(String message, String duplicateUrl) {
        super(message);
        this.duplicateUrl = duplicateUrl;
        this.existingIndex = null;
    }

    /**
     * @param message       description of the failure
     * @param duplicateUrl  URL that was found twice
     * @param existingIndex index of the already stored entry
     */
    public DuplicateArticleException(String message, String duplicateUrl, Integer existingIndex) {
        super(message);
        this.duplicateUrl = duplicateUrl;
        this.existingIndex = existingIndex;
    }

    /**
     * @param message      description of the failure
     * @param duplicateUrl URL that was found twice
     * @param cause        underlying exception
     */
    public DuplicateArticleException(String message, String duplicateUrl, Throwable cause) {
        super(message, cause);
        this.duplicateUrl = duplicateUrl;
        this.existingIndex = null;
    }

    /**
     * @param message       description of the failure
     * @param duplicateUrl  URL that was found twice
     * @param existingIndex index of the already stored entry
     * @param cause         underlying exception
     */
    public DuplicateArticleException(String message, String duplicateUrl, Integer existingIndex, Throwable cause) {
        super(message, cause);
        this.duplicateUrl = duplicateUrl;
        this.existingIndex = existingIndex;
    }

    /** @return the duplicate URL, or null when not supplied */
    public String getDuplicateUrl() {
        return duplicateUrl;
    }

    /** @return the index of the existing entry, or null when not supplied */
    public Integer getExistingIndex() {
        return existingIndex;
    }

    /**
     * Returns the base message followed by the URL and index details that are
     * present. A null base message is treated as empty when details are
     * appended and is returned unchanged (null) when there are none, so this
     * method never throws.
     */
    @Override
    public String getMessage() {
        String base = super.getMessage();
        if (duplicateUrl == null && existingIndex == null) {
            return base;
        }
        // new StringBuilder((String) null) would throw NullPointerException.
        StringBuilder sb = new StringBuilder(base == null ? "" : base);
        if (duplicateUrl != null) {
            sb.append(" [重复URL: ").append(duplicateUrl).append("]");
        }
        if (existingIndex != null) {
            sb.append(" [已存在位置: ").append(existingIndex).append("]");
        }
        return sb.toString();
    }
}
||||
@ -0,0 +1,63 @@ |
|||||
|
package com.example.datacollect.exception; |
||||
|
|
||||
|
/**
 * Checked exception for a failed export. Optionally carries the target file
 * path and an estimated size in bytes; both are appended to
 * {@link #getMessage()} when present.
 */
public class ExportException extends Exception {
    /** Path of the export target, or null when not supplied. */
    private final String filePath;
    /** Estimated export size in bytes, or null when not supplied. */
    private final Long estimatedSize;

    /**
     * @param message description of the failure
     */
    public ExportException(String message) {
        super(message);
        this.filePath = null;
        this.estimatedSize = null;
    }

    /**
     * @param message  description of the failure
     * @param filePath path of the export target
     */
    public ExportException(String message, String filePath) {
        super(message);
        this.filePath = filePath;
        this.estimatedSize = null;
    }

    /**
     * @param message       description of the failure
     * @param filePath      path of the export target
     * @param estimatedSize estimated export size in bytes
     */
    public ExportException(String message, String filePath, Long estimatedSize) {
        super(message);
        this.filePath = filePath;
        this.estimatedSize = estimatedSize;
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception
     */
    public ExportException(String message, Throwable cause) {
        super(message, cause);
        this.filePath = null;
        this.estimatedSize = null;
    }

    /**
     * @param message  description of the failure
     * @param filePath path of the export target
     * @param cause    underlying exception
     */
    public ExportException(String message, String filePath, Throwable cause) {
        super(message, cause);
        this.filePath = filePath;
        this.estimatedSize = null;
    }

    /** @return the export target path, or null when not supplied */
    public String getFilePath() {
        return filePath;
    }

    /** @return the estimated size in bytes, or null when not supplied */
    public Long getEstimatedSize() {
        return estimatedSize;
    }

    /**
     * Returns the base message followed by the file and size details that are
     * present. A null base message is treated as empty when details are
     * appended and is returned unchanged (null) when there are none, so this
     * method never throws.
     */
    @Override
    public String getMessage() {
        String base = super.getMessage();
        if (filePath == null && estimatedSize == null) {
            return base;
        }
        // new StringBuilder((String) null) would throw NullPointerException.
        StringBuilder sb = new StringBuilder(base == null ? "" : base);
        if (filePath != null) {
            sb.append(" [文件: ").append(filePath).append("]");
        }
        if (estimatedSize != null) {
            sb.append(" [预估大小: ").append(formatSize(estimatedSize)).append("]");
        }
        return sb.toString();
    }

    /** Formats a byte count as B, KB, MB or GB (1024-based, two decimals above B). */
    private static String formatSize(long size) {
        if (size < 1024) return size + " B";
        if (size < 1024 * 1024) return String.format("%.2f KB", size / 1024.0);
        if (size < 1024 * 1024 * 1024) return String.format("%.2f MB", size / (1024.0 * 1024));
        return String.format("%.2f GB", size / (1024.0 * 1024 * 1024));
    }
}
||||
@ -0,0 +1,56 @@ |
|||||
|
package com.example.datacollect.exception; |
||||
|
|
||||
|
/**
 * Checked exception for a failed import. Optionally carries the source file
 * path and a line number; both are appended to {@link #getMessage()} when
 * present.
 */
public class ImportException extends Exception {
    /** Path of the imported file, or null when not supplied. */
    private final String filePath;
    /** Line number associated with the failure, or null when not supplied. */
    private final Integer lineNumber;

    /**
     * @param message description of the failure
     */
    public ImportException(String message) {
        super(message);
        this.filePath = null;
        this.lineNumber = null;
    }

    /**
     * @param message  description of the failure
     * @param filePath path of the imported file
     */
    public ImportException(String message, String filePath) {
        super(message);
        this.filePath = filePath;
        this.lineNumber = null;
    }

    /**
     * @param message    description of the failure
     * @param filePath   path of the imported file
     * @param lineNumber line number associated with the failure
     */
    public ImportException(String message, String filePath, Integer lineNumber) {
        super(message);
        this.filePath = filePath;
        this.lineNumber = lineNumber;
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception
     */
    public ImportException(String message, Throwable cause) {
        super(message, cause);
        this.filePath = null;
        this.lineNumber = null;
    }

    /**
     * @param message  description of the failure
     * @param filePath path of the imported file
     * @param cause    underlying exception
     */
    public ImportException(String message, String filePath, Throwable cause) {
        super(message, cause);
        this.filePath = filePath;
        this.lineNumber = null;
    }

    /** @return the imported file path, or null when not supplied */
    public String getFilePath() {
        return filePath;
    }

    /** @return the line number, or null when not supplied */
    public Integer getLineNumber() {
        return lineNumber;
    }

    /**
     * Returns the base message followed by the file and line details that are
     * present. A null base message is treated as empty when details are
     * appended and is returned unchanged (null) when there are none, so this
     * method never throws.
     */
    @Override
    public String getMessage() {
        String base = super.getMessage();
        if (filePath == null && lineNumber == null) {
            return base;
        }
        // new StringBuilder((String) null) would throw NullPointerException.
        StringBuilder sb = new StringBuilder(base == null ? "" : base);
        if (filePath != null) {
            sb.append(" [文件: ").append(filePath).append("]");
        }
        if (lineNumber != null) {
            sb.append(" [行号: ").append(lineNumber).append("]");
        }
        return sb.toString();
    }
}
||||
@ -0,0 +1,10 @@ |
|||||
|
package com.example.datacollect.exception; |
||||
|
|
||||
|
public class NetworkException extends CrawlerException { |
||||
|
public NetworkException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
public NetworkException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,10 @@ |
|||||
|
package com.example.datacollect.exception; |
||||
|
|
||||
|
public class ParseException extends CrawlerException { |
||||
|
public ParseException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
public ParseException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,30 @@ |
|||||
|
package com.example.datacollect.exception; |
||||
|
|
||||
|
/**
 * Unchecked exception for a URL that could not be accepted. Optionally
 * carries the offending URL.
 */
public class UrlFormatException extends RuntimeException {

    /** The rejected URL, or null when not supplied. */
    private final String invalidUrl;

    /**
     * @param message description of the failure
     */
    public UrlFormatException(String message) {
        this(message, (String) null);
    }

    /**
     * @param message    description of the failure
     * @param invalidUrl the rejected URL
     */
    public UrlFormatException(String message, String invalidUrl) {
        super(message);
        this.invalidUrl = invalidUrl;
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception
     */
    public UrlFormatException(String message, Throwable cause) {
        this(message, null, cause);
    }

    /**
     * @param message    description of the failure
     * @param invalidUrl the rejected URL
     * @param cause      underlying exception
     */
    public UrlFormatException(String message, String invalidUrl, Throwable cause) {
        super(message, cause);
        this.invalidUrl = invalidUrl;
    }

    /** @return the rejected URL, or null when not supplied */
    public String getInvalidUrl() {
        return invalidUrl;
    }
}
||||
@ -0,0 +1,72 @@ |
|||||
|
package com.example.datacollect.exception; |
||||
|
|
||||
|
/**
 * Checked exception describing a failed validation. It can carry the field
 * name, the rejected value and the rule that was violated; whichever of those
 * are present get appended to the message by {@link #getMessage()}.
 */
public class ValidationException extends Exception {

    /** Maximum number of characters of the rejected value shown in the message. */
    private static final int MAX_DISPLAY_LENGTH = 50;

    private final String fieldName;
    private final String invalidValue;
    private final String validationRule;

    /**
     * @param message description of the failure
     */
    public ValidationException(String message) {
        this(message, null, null, null);
    }

    /**
     * @param message   description of the failure
     * @param fieldName name of the field that failed validation
     */
    public ValidationException(String message, String fieldName) {
        this(message, fieldName, null, null);
    }

    /**
     * @param message      description of the failure
     * @param fieldName    name of the field that failed validation
     * @param invalidValue the rejected value
     */
    public ValidationException(String message, String fieldName, String invalidValue) {
        this(message, fieldName, invalidValue, null);
    }

    /**
     * @param message        description of the failure
     * @param fieldName      name of the field that failed validation
     * @param invalidValue   the rejected value
     * @param validationRule the rule that was violated
     */
    public ValidationException(String message, String fieldName, String invalidValue, String validationRule) {
        super(message);
        this.fieldName = fieldName;
        this.invalidValue = invalidValue;
        this.validationRule = validationRule;
    }

    /**
     * @param message description of the failure
     * @param cause   the underlying exception
     */
    public ValidationException(String message, Throwable cause) {
        super(message, cause);
        this.fieldName = null;
        this.invalidValue = null;
        this.validationRule = null;
    }

    /** @return the field name, or null if none was recorded */
    public String getFieldName() {
        return fieldName;
    }

    /** @return the full (untruncated) rejected value, or null if none was recorded */
    public String getInvalidValue() {
        return invalidValue;
    }

    /** @return the violated rule, or null if none was recorded */
    public String getValidationRule() {
        return validationRule;
    }

    /**
     * Returns the base message followed by the field / value / rule context
     * that is present. The value is shortened to at most
     * {@value #MAX_DISPLAY_LENGTH} characters followed by "...".
     *
     * <p>A null base message is tolerated: {@code new StringBuilder((String) null)}
     * throws a NullPointerException, which would otherwise surface while the
     * exception is being logged or printed.
     *
     * @return the decorated message, or the raw (possibly null) base message
     *         when no context is attached
     */
    @Override
    public String getMessage() {
        String base = super.getMessage();
        if (fieldName == null && invalidValue == null && validationRule == null) {
            return base;
        }

        StringBuilder sb = new StringBuilder();
        if (base != null) {
            sb.append(base);
        }
        if (fieldName != null) {
            sb.append(" [字段: ").append(fieldName).append("]");
        }
        if (invalidValue != null) {
            sb.append(" [值: ").append(abbreviate(invalidValue)).append("]");
        }
        if (validationRule != null) {
            sb.append(" [规则: ").append(validationRule).append("]");
        }
        return sb.toString();
    }

    /** Shortens long values for display without cutting a surrogate pair in half. */
    private static String abbreviate(String value) {
        if (value.length() <= MAX_DISPLAY_LENGTH) {
            return value;
        }
        int end = MAX_DISPLAY_LENGTH;
        if (Character.isHighSurrogate(value.charAt(end - 1))) {
            end--; // keep the pair intact rather than emitting a lone surrogate
        }
        return value.substring(0, end) + "...";
    }
}
||||
@ -0,0 +1,99 @@ |
|||||
|
package com.example.datacollect.model; |
||||
|
|
||||
|
import com.fasterxml.jackson.annotation.JsonCreator; |
||||
|
import com.fasterxml.jackson.annotation.JsonProperty; |
||||
|
import java.time.LocalDateTime; |
||||
|
|
||||
|
public class Article { |
||||
|
private String title; |
||||
|
private String url; |
||||
|
private String content; |
||||
|
private LocalDateTime crawledAt; |
||||
|
|
||||
|
public Article() { |
||||
|
this.crawledAt = LocalDateTime.now(); |
||||
|
} |
||||
|
|
||||
|
public Article(String title, String url, String content) { |
||||
|
setTitle(title); |
||||
|
setUrl(url); |
||||
|
setContent(content); |
||||
|
this.crawledAt = LocalDateTime.now(); |
||||
|
} |
||||
|
|
||||
|
@JsonCreator |
||||
|
public Article(@JsonProperty("title") String title, |
||||
|
@JsonProperty("url") String url, |
||||
|
@JsonProperty("content") String content, |
||||
|
@JsonProperty("crawledAt") LocalDateTime crawledAt) { |
||||
|
setTitle(title); |
||||
|
setUrl(url); |
||||
|
setContent(content); |
||||
|
this.crawledAt = crawledAt != null ? crawledAt : LocalDateTime.now(); |
||||
|
} |
||||
|
|
||||
|
public String getTitle() { |
||||
|
return title; |
||||
|
} |
||||
|
|
||||
|
public void setTitle(String title) { |
||||
|
if (title == null) { |
||||
|
throw new IllegalArgumentException("Title cannot be null"); |
||||
|
} |
||||
|
if (title.trim().isEmpty()) { |
||||
|
throw new IllegalArgumentException("Title cannot be empty"); |
||||
|
} |
||||
|
if (title.length() > 500) { |
||||
|
throw new IllegalArgumentException("Title cannot exceed 500 characters"); |
||||
|
} |
||||
|
this.title = title.trim(); |
||||
|
} |
||||
|
|
||||
|
public String getUrl() { |
||||
|
return url; |
||||
|
} |
||||
|
|
||||
|
public void setUrl(String url) { |
||||
|
if (url == null) { |
||||
|
throw new IllegalArgumentException("URL cannot be null"); |
||||
|
} |
||||
|
if (url.trim().isEmpty()) { |
||||
|
throw new IllegalArgumentException("URL cannot be empty"); |
||||
|
} |
||||
|
if (!url.startsWith("http://") && !url.startsWith("https://")) { |
||||
|
throw new IllegalArgumentException("URL must start with http:// or https://"); |
||||
|
} |
||||
|
this.url = url.trim(); |
||||
|
} |
||||
|
|
||||
|
public String getContent() { |
||||
|
return content; |
||||
|
} |
||||
|
|
||||
|
public void setContent(String content) { |
||||
|
if (content == null) { |
||||
|
this.content = ""; |
||||
|
} else if (content.length() > 10000) { |
||||
|
this.content = content.substring(0, 10000);/* 截断内容到 10000 个字符 */ |
||||
|
} else { |
||||
|
this.content = content; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public LocalDateTime getCrawledAt() { |
||||
|
return crawledAt; |
||||
|
} |
||||
|
|
||||
|
public void setCrawledAt(LocalDateTime crawledAt) { |
||||
|
this.crawledAt = crawledAt; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String toString() { |
||||
|
return "Article{" |
||||
|
+ "title='" + title + '\'' |
||||
|
+ ", url='" + url + '\'' |
||||
|
+ ", crawledAt='" + crawledAt + '\'' |
||||
|
+ '}'; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,172 @@ |
|||||
|
package com.example.datacollect.repository; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import com.example.datacollect.util.JsonSerializer; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.Collections; |
||||
|
import java.util.HashSet; |
||||
|
import java.util.List; |
||||
|
import java.util.Set; |
||||
|
|
||||
|
public class ArticleRepository implements AutoCloseable { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(ArticleRepository.class); |
||||
|
private static final int MAX_TITLE_LENGTH = 500; |
||||
|
private static final int MAX_CONTENT_LENGTH = 10000; |
||||
|
|
||||
|
private final List<Article> articles = new ArrayList<>(); |
||||
|
private final Set<String> urlSet = new HashSet<>(); |
||||
|
|
||||
|
public void add(Article article) { |
||||
|
if (article == null) { |
||||
|
logger.error("Attempted to add null article"); |
||||
|
throw new IllegalArgumentException("Article cannot be null"); |
||||
|
} |
||||
|
|
||||
|
String title = article.getTitle(); |
||||
|
String url = article.getUrl(); |
||||
|
String content = article.getContent(); |
||||
|
|
||||
|
if (title == null || title.trim().isEmpty()) { |
||||
|
logger.warn("Attempted to add article with empty title"); |
||||
|
throw new IllegalArgumentException("Article title cannot be null or empty"); |
||||
|
} |
||||
|
|
||||
|
if (url == null || url.trim().isEmpty()) { |
||||
|
logger.warn("Attempted to add article with empty URL"); |
||||
|
throw new IllegalArgumentException("Article URL cannot be null or empty"); |
||||
|
} |
||||
|
|
||||
|
if (title.length() > MAX_TITLE_LENGTH) { |
||||
|
logger.warn("Article title too long: {} characters (max: {})", title.length(), MAX_TITLE_LENGTH); |
||||
|
throw new IllegalArgumentException("Article title exceeds maximum length of " + MAX_TITLE_LENGTH); |
||||
|
} |
||||
|
|
||||
|
if (content != null && content.length() > MAX_CONTENT_LENGTH) { |
||||
|
logger.warn("Article content too long: {} characters (max: {})", content.length(), MAX_CONTENT_LENGTH); |
||||
|
content = content.substring(0, MAX_CONTENT_LENGTH); |
||||
|
} |
||||
|
|
||||
|
if (!url.startsWith("http://") && !url.startsWith("https://")) { |
||||
|
logger.warn("Invalid URL format: {}", url); |
||||
|
throw new IllegalArgumentException("Article URL must start with http:// or https://"); |
||||
|
} |
||||
|
|
||||
|
if (urlSet.contains(url)) { |
||||
|
logger.warn("Duplicate article URL detected: {}", url); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
Article validatedArticle = new Article(title.trim(), url.trim(), content != null ? content.trim() : ""); |
||||
|
articles.add(validatedArticle); |
||||
|
urlSet.add(url); |
||||
|
logger.debug("Added article: {}", title); |
||||
|
} |
||||
|
|
||||
|
public void addAll(List<Article> articleList) { |
||||
|
if (articleList == null) { |
||||
|
logger.error("Attempted to add null article list"); |
||||
|
throw new IllegalArgumentException("Article list cannot be null"); |
||||
|
} |
||||
|
|
||||
|
int successCount = 0; |
||||
|
int skipCount = 0; |
||||
|
|
||||
|
for (Article article : articleList) { |
||||
|
if (article != null) { |
||||
|
try { |
||||
|
add(article); |
||||
|
successCount++; |
||||
|
} catch (IllegalArgumentException e) { |
||||
|
logger.warn("Skipped invalid article: {}", e.getMessage()); |
||||
|
skipCount++; |
||||
|
} |
||||
|
} else { |
||||
|
logger.warn("Skipped null article in list"); |
||||
|
skipCount++; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
logger.info("Added {} articles, skipped {} invalid articles", successCount, skipCount); |
||||
|
} |
||||
|
|
||||
|
public List<Article> getAll() { |
||||
|
logger.debug("Retrieving all articles, total: {}", articles.size()); |
||||
|
return Collections.unmodifiableList(articles); |
||||
|
} |
||||
|
|
||||
|
public int size() { |
||||
|
return articles.size(); |
||||
|
} |
||||
|
|
||||
|
public void clear() { |
||||
|
int count = articles.size(); |
||||
|
articles.clear(); |
||||
|
urlSet.clear(); |
||||
|
logger.info("Cleared repository, removed {} articles", count); |
||||
|
} |
||||
|
|
||||
|
public void remove(Article article) { |
||||
|
if (article == null) { |
||||
|
logger.warn("Attempted to remove null article"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
String url = article.getUrl(); |
||||
|
if (url != null && urlSet.contains(url)) { |
||||
|
articles.remove(article); |
||||
|
urlSet.remove(url); |
||||
|
logger.debug("Removed article: {}", article.getTitle()); |
||||
|
} else { |
||||
|
logger.warn("Article not found in repository: {}", url); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public Article findByUrl(String url) { |
||||
|
if (url == null || url.trim().isEmpty()) { |
||||
|
logger.debug("findByUrl called with null or empty URL"); |
||||
|
return null; |
||||
|
} |
||||
|
|
||||
|
for (Article article : articles) { |
||||
|
if (article.getUrl().equals(url)) { |
||||
|
logger.debug("Found article by URL: {}", url); |
||||
|
return article; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
logger.debug("No article found with URL: {}", url); |
||||
|
return null; |
||||
|
} |
||||
|
|
||||
|
public boolean containsUrl(String url) { |
||||
|
return url != null && urlSet.contains(url); |
||||
|
} |
||||
|
|
||||
|
public void saveToJson(String filePath) throws IOException { |
||||
|
JsonSerializer.writeToFile(articles, filePath); |
||||
|
logger.info("Saved {} articles to JSON file: {}", articles.size(), filePath); |
||||
|
} |
||||
|
|
||||
|
public void loadFromJson(String filePath) throws IOException { |
||||
|
List<Article> loadedArticles = JsonSerializer.readListFromFile(filePath, Article.class); |
||||
|
addAll(loadedArticles); |
||||
|
logger.info("Loaded {} articles from JSON file: {}", loadedArticles.size(), filePath); |
||||
|
} |
||||
|
|
||||
|
public String toJsonString() { |
||||
|
return JsonSerializer.serialize(articles); |
||||
|
} |
||||
|
|
||||
|
public String toJsonStringCompact() { |
||||
|
return JsonSerializer.serializeCompact(articles); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void close() { |
||||
|
logger.debug("ArticleRepository closed"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,182 @@ |
|||||
|
package com.example.datacollect.repository; |
||||
|
|
||||
|
import com.example.datacollect.exception.ExportException; |
||||
|
import com.example.datacollect.exception.ImportException; |
||||
|
import com.example.datacollect.model.Article; |
||||
|
import com.example.datacollect.util.JsonExporter; |
||||
|
import com.example.datacollect.util.JsonImporter; |
||||
|
import com.fasterxml.jackson.databind.ObjectMapper; |
||||
|
import com.fasterxml.jackson.databind.SerializationFeature; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.io.BufferedWriter; |
||||
|
import java.io.IOException; |
||||
|
import java.nio.charset.StandardCharsets; |
||||
|
import java.nio.file.Files; |
||||
|
import java.nio.file.Path; |
||||
|
import java.nio.file.Paths; |
||||
|
import java.time.LocalDateTime; |
||||
|
import java.time.format.DateTimeFormatter; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
import java.util.concurrent.atomic.AtomicBoolean; |
||||
|
|
||||
|
public class PersistenceManager implements AutoCloseable { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(PersistenceManager.class); |
||||
|
private static final String DEFAULT_BACKUP_DIR = "data"; |
||||
|
private static final String DEFAULT_BACKUP_FILE = "articles.json"; |
||||
|
private static final String BACKUP_FILE_PATTERN = "articles_%s.json"; |
||||
|
private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss"); |
||||
|
|
||||
|
private final ObjectMapper objectMapper; |
||||
|
private final Path backupDir; |
||||
|
private final Path backupFile; |
||||
|
private final ArticleRepository repository; |
||||
|
private final AtomicBoolean autoSaveEnabled; |
||||
|
private final JsonExporter jsonExporter; |
||||
|
private final JsonImporter jsonImporter; |
||||
|
|
||||
|
public PersistenceManager(ArticleRepository repository) { |
||||
|
this(repository, DEFAULT_BACKUP_DIR); |
||||
|
} |
||||
|
|
||||
|
public PersistenceManager(ArticleRepository repository, String backupDir) { |
||||
|
this.repository = repository; |
||||
|
this.backupDir = Paths.get(backupDir); |
||||
|
this.backupFile = this.backupDir.resolve(DEFAULT_BACKUP_FILE); |
||||
|
this.autoSaveEnabled = new AtomicBoolean(true); |
||||
|
|
||||
|
this.objectMapper = new ObjectMapper(); |
||||
|
this.objectMapper.enable(SerializationFeature.INDENT_OUTPUT); |
||||
|
|
||||
|
this.jsonExporter = new JsonExporter(repository); |
||||
|
this.jsonImporter = new JsonImporter(repository); |
||||
|
|
||||
|
ensureBackupDirExists(); |
||||
|
logger.info("PersistenceManager initialized with backup directory: {}", backupDir); |
||||
|
} |
||||
|
|
||||
|
private void ensureBackupDirExists() { |
||||
|
try { |
||||
|
if (!Files.exists(backupDir)) { |
||||
|
Files.createDirectories(backupDir); |
||||
|
logger.debug("Created backup directory: {}", backupDir); |
||||
|
} |
||||
|
} catch (IOException e) { |
||||
|
logger.error("Failed to create backup directory: {}", e.getMessage(), e); |
||||
|
throw new RuntimeException("Failed to create backup directory", e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public void save() throws IOException { |
||||
|
if (!autoSaveEnabled.get()) { |
||||
|
logger.debug("Auto-save is disabled, skipping save"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
List<Article> articles = repository.getAll(); |
||||
|
|
||||
|
try (BufferedWriter writer = Files.newBufferedWriter(backupFile, StandardCharsets.UTF_8)) { |
||||
|
objectMapper.writeValue(writer, articles); |
||||
|
logger.info("Successfully saved {} articles to {}", articles.size(), backupFile); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public void load() throws IOException { |
||||
|
if (!Files.exists(backupFile)) { |
||||
|
logger.info("No backup file found at {}, starting fresh", backupFile); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
try (var reader = Files.newBufferedReader(backupFile, StandardCharsets.UTF_8)) { |
||||
|
List<Article> articles = objectMapper.readValue(reader, |
||||
|
objectMapper.getTypeFactory().constructCollectionType(List.class, Article.class)); |
||||
|
|
||||
|
if (articles != null && !articles.isEmpty()) { |
||||
|
repository.addAll(articles); |
||||
|
logger.info("Successfully loaded {} articles from {}", articles.size(), backupFile); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public void exportTo(String filePath) throws IOException { |
||||
|
try { |
||||
|
JsonExporter.ExportOptions options = new JsonExporter.ExportOptions(); |
||||
|
options.setMode(JsonExporter.ExportMode.MINIMAL); |
||||
|
options.setIncludeMetadata(true); |
||||
|
jsonExporter.exportToFile(Paths.get(filePath), options); |
||||
|
} catch (ExportException e) { |
||||
|
throw new IOException("Export failed: " + e.getMessage(), e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public void importFrom(String filePath) throws IOException { |
||||
|
try { |
||||
|
JsonImporter.ImportOptions options = new JsonImporter.ImportOptions(); |
||||
|
options.setDuplicateStrategy(JsonImporter.DuplicateStrategy.SKIP); |
||||
|
jsonImporter.importFromFile(Paths.get(filePath), options); |
||||
|
} catch (ImportException e) { |
||||
|
throw new IOException("Import failed: " + e.getMessage(), e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public void createSnapshot() throws IOException { |
||||
|
String timestamp = LocalDateTime.now().format(DATE_FORMATTER); |
||||
|
Path snapshotFile = backupDir.resolve(String.format(BACKUP_FILE_PATTERN, timestamp)); |
||||
|
|
||||
|
try { |
||||
|
JsonExporter.ExportOptions options = new JsonExporter.ExportOptions(); |
||||
|
options.setMode(JsonExporter.ExportMode.STANDARD); |
||||
|
options.setIncludeMetadata(true); |
||||
|
jsonExporter.exportToFile(snapshotFile, options); |
||||
|
logger.info("Created snapshot: {} ({} articles)", snapshotFile, repository.size()); |
||||
|
} catch (ExportException e) { |
||||
|
throw new IOException("Failed to create snapshot: " + e.getMessage(), e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public List<String> listSnapshots() throws IOException { |
||||
|
List<String> snapshots = new ArrayList<>(); |
||||
|
|
||||
|
if (Files.exists(backupDir)) { |
||||
|
try (var stream = Files.list(backupDir)) { |
||||
|
stream.filter(path -> { |
||||
|
String fileName = path.getFileName().toString(); |
||||
|
return fileName.startsWith("articles_") && fileName.endsWith(".json") && !fileName.equals(DEFAULT_BACKUP_FILE); |
||||
|
}).forEach(path -> snapshots.add(path.toString())); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return snapshots; |
||||
|
} |
||||
|
|
||||
|
public void setAutoSaveEnabled(boolean enabled) { |
||||
|
autoSaveEnabled.set(enabled); |
||||
|
logger.info("Auto-save {} {}", enabled ? "enabled" : "disabled"); |
||||
|
} |
||||
|
|
||||
|
public boolean isAutoSaveEnabled() { |
||||
|
return autoSaveEnabled.get(); |
||||
|
} |
||||
|
|
||||
|
public String getBackupFilePath() { |
||||
|
return backupFile.toString(); |
||||
|
} |
||||
|
|
||||
|
public JsonImporter.ImportResult importWithReport(String filePath) throws ImportException { |
||||
|
JsonImporter.ImportOptions options = new JsonImporter.ImportOptions(); |
||||
|
options.setDuplicateStrategy(JsonImporter.DuplicateStrategy.SKIP); |
||||
|
return jsonImporter.importFromFile(Paths.get(filePath), options); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void close() { |
||||
|
try { |
||||
|
save(); |
||||
|
logger.info("PersistenceManager closed, data saved"); |
||||
|
} catch (IOException e) { |
||||
|
logger.error("Failed to save data on close: {}", e.getMessage(), e); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,11 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.exception.ParseException; |
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public interface CrawlStrategy { |
||||
|
List<Article> parse(String url, Document doc) throws ParseException; |
||||
|
boolean supports(String url); |
||||
|
} |
||||
@ -0,0 +1,115 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.exception.ParseException; |
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.HashSet; |
||||
|
import java.util.List; |
||||
|
import java.util.Set; |
||||
|
|
||||
|
public class CsdnStrategy implements CrawlStrategy { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(CsdnStrategy.class); |
||||
|
|
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
return url.contains("csdn.net"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(String url, Document doc) throws ParseException { |
||||
|
logger.info("Starting to parse CSDN: {}", url); |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
Set<String> seenUrls = new HashSet<>(); |
||||
|
|
||||
|
try { |
||||
|
Elements links = doc.select("a[href*='/article/details/']"); |
||||
|
logger.debug("Found {} article links", links.size()); |
||||
|
|
||||
|
if (links.isEmpty()) { |
||||
|
links = doc.select("a[href*='csdn.net/article/']"); |
||||
|
logger.debug("Trying alternative selector, found {} items", links.size()); |
||||
|
} |
||||
|
|
||||
|
if (links.isEmpty()) { |
||||
|
links = doc.select("a.title, a.article-title, .article-item a, .list-item a"); |
||||
|
logger.debug("Trying fallback selectors, found {} items", links.size()); |
||||
|
} |
||||
|
|
||||
|
for (Element link : links) { |
||||
|
try { |
||||
|
String href = link.attr("href"); |
||||
|
if (href == null || href.isEmpty()) { |
||||
|
continue; |
||||
|
} |
||||
|
|
||||
|
String articleUrl = link.attr("abs:href"); |
||||
|
if (articleUrl == null || articleUrl.isEmpty()) { |
||||
|
if (!href.startsWith("http")) { |
||||
|
if (!href.startsWith("//")) { |
||||
|
articleUrl = "https://" + (href.startsWith("/") ? "" : "/") + href; |
||||
|
} else { |
||||
|
articleUrl = "https:" + href; |
||||
|
} |
||||
|
} else { |
||||
|
articleUrl = href; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (!articleUrl.contains("csdn.net")) { |
||||
|
continue; |
||||
|
} |
||||
|
|
||||
|
if (seenUrls.contains(articleUrl)) { |
||||
|
continue; |
||||
|
} |
||||
|
seenUrls.add(articleUrl); |
||||
|
|
||||
|
String title = link.text().trim(); |
||||
|
|
||||
|
if (title.isEmpty() || title.length() < 5) { |
||||
|
Element titleEl = link.selectFirst("span, h3, h4, .title"); |
||||
|
if (titleEl != null) { |
||||
|
title = titleEl.text().trim(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (title.isEmpty() || title.length() < 5) { |
||||
|
continue; |
||||
|
} |
||||
|
|
||||
|
String content = ""; |
||||
|
Element parent = link.parent(); |
||||
|
if (parent != null) { |
||||
|
Element desc = parent.selectFirst("p.description, .desc, .summary"); |
||||
|
if (desc != null) { |
||||
|
content = desc.text().trim(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
Article article = new Article(title, articleUrl, content); |
||||
|
articles.add(article); |
||||
|
logger.debug("Parsed article: {}", title); |
||||
|
|
||||
|
} catch (Exception e) { |
||||
|
logger.debug("Skipping link due to error: {}", e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (articles.isEmpty()) { |
||||
|
logger.warn("No articles found. CSDN page structure may have changed."); |
||||
|
} |
||||
|
|
||||
|
logger.info("Successfully parsed {} articles from CSDN", articles.size()); |
||||
|
return articles; |
||||
|
} catch (Exception e) { |
||||
|
logger.error("Failed to parse CSDN page: {}", e.getMessage(), e); |
||||
|
throw new ParseException("Failed to parse CSDN: " + e.getMessage(), e); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,77 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.exception.ParseException; |
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
/* HNU News 策略 |
||||
|
- 添加 logger 成员 |
||||
|
- 添加异常处理 |
||||
|
- 实现防御性编程 */ |
||||
|
public class HnuNewsStrategy implements CrawlStrategy { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(HnuNewsStrategy.class); |
||||
|
|
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
return url.contains("news.hnu.edu.cn");/* 支持 HNU News 网站 */ |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(String url, Document doc) throws ParseException { |
||||
|
logger.info("Starting to parse HNU News: {}", url); |
||||
|
List<Article> articles = new ArrayList<>();/* 存储储解析后的文章 */ |
||||
|
|
||||
|
try { |
||||
|
Elements listItems = doc.select("ul.list11 li");/* 选择文章列表项 */ |
||||
|
logger.debug("Found {} list items", listItems.size());/* 记录找到的列表项数量 */ |
||||
|
|
||||
|
for (Element li : listItems) { |
||||
|
try { |
||||
|
Element link = li.selectFirst("a");/* 选择列表项中的链接 */ |
||||
|
if (link == null) { |
||||
|
logger.warn("No link found in list item");/* 记录未找到链接 */ |
||||
|
continue; |
||||
|
} |
||||
|
|
||||
|
String articleUrl = link.attr("href");/* 获取链接的 href 属性值 */ |
||||
|
if (!articleUrl.startsWith("http")) { |
||||
|
articleUrl = "https://news.hnu.edu.cn" + articleUrl.replace("..", "");/* 补全相对路径 */ |
||||
|
} |
||||
|
|
||||
|
String title = "";/* 存储文章标题 */ |
||||
|
Element titleEl = link.selectFirst("h4.l2.h4s2");/* 选择标题元素 */ |
||||
|
if (titleEl != null) { |
||||
|
title = titleEl.text().trim();/* 提取标题文本并移除首尾空格 */ |
||||
|
} |
||||
|
|
||||
|
String content = "";/* 存储文章内容 */ |
||||
|
Element contentEl = link.selectFirst("p.l3.ps3");/* 选择内容元素 */ |
||||
|
if (contentEl != null) { |
||||
|
content = contentEl.text().trim();/* 提取内容文本并移除首尾空格 */ |
||||
|
} |
||||
|
|
||||
|
if (!title.isEmpty()) { |
||||
|
Article article = new Article(title, articleUrl, content);/* 创建文章对象 */ |
||||
|
articles.add(article);/* 将文章添加到列表 */ |
||||
|
} else { |
||||
|
logger.warn("Empty title found, skipping article"); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
logger.error("Error parsing individual article: {}", e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
logger.info("Successfully parsed {} articles from HNU News", articles.size()); |
||||
|
return articles; |
||||
|
} catch (Exception e) { |
||||
|
logger.error("Failed to parse HNU News page: {}", e.getMessage(), e); |
||||
|
throw new ParseException("Failed to parse HNU News: " + e.getMessage(), e); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,83 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.exception.ParseException; |
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
/* 人民网策略类 */ |
||||
|
public class PeopleStrategy implements CrawlStrategy { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(PeopleStrategy.class); |
||||
|
|
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
return url.contains("people.com.cn");/* 检查URL是否包含people.com.cn */ |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(String url, Document doc) throws ParseException { |
||||
|
logger.info("Starting to parse People's Daily News: {}", url); |
||||
|
List<Article> articles = new ArrayList<>();/* 初始化文章列表 */ |
||||
|
|
||||
|
try { |
||||
|
Elements newsItems = doc.select("div.w1000, div.news-item, li.list_item");/* 选择新闻容器 */ |
||||
|
logger.debug("Found {} news containers", newsItems.size()); |
||||
|
|
||||
|
if (newsItems.isEmpty()) { |
||||
|
newsItems = doc.select("a[href*='/n1/']");/* 选择替代选择器 */ |
||||
|
logger.debug("Trying alternative selector, found {} items", newsItems.size()); |
||||
|
} |
||||
|
|
||||
|
for (Element item : newsItems) { |
||||
|
try { |
||||
|
Element link = item.selectFirst("a");/* 选择链接元素 */ |
||||
|
if (link == null) { |
||||
|
link = item.tagName().equals("a") ? item : null;/* 检查是否为链接元素 */ |
||||
|
} |
||||
|
|
||||
|
if (link == null) { |
||||
|
logger.warn("No link found in news item"); |
||||
|
continue; |
||||
|
} |
||||
|
|
||||
|
String articleUrl = link.attr("href");/* 获取链接URL */ |
||||
|
if (!articleUrl.startsWith("http")) {/* 检查是否为绝对URL */ |
||||
|
if (articleUrl.startsWith("/")) { |
||||
|
articleUrl = "https://www.people.com.cn" + articleUrl; |
||||
|
} else { |
||||
|
articleUrl = "https://www.people.com.cn/" + articleUrl; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
String title = link.text().trim();/* 获取标题文本 */ |
||||
|
|
||||
|
String content = "";/* 初始化内容文本 */ |
||||
|
Element contentEl = item.selectFirst("p, div.ed, div.summary");/* 选择内容元素 */ |
||||
|
if (contentEl != null) { |
||||
|
content = contentEl.text().trim();/* 获取内容文本 */ |
||||
|
} |
||||
|
|
||||
|
if (!title.isEmpty() && title.length() > 5) { |
||||
|
Article article = new Article(title, articleUrl, content);/* 创建文章对象 */ |
||||
|
articles.add(article);/* 添加文章到列表 */ |
||||
|
logger.debug("Parsed article: {}", title);/* 记录解析文章 */ |
||||
|
} else { |
||||
|
logger.warn("Invalid title found, skipping article");/* 记录无效标题 */ |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
logger.error("Error parsing individual article: {}", e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
logger.info("Successfully parsed {} articles from People's Daily News", articles.size()); |
||||
|
return articles; |
||||
|
} catch (Exception e) { |
||||
|
logger.error("Failed to parse People's Daily News page: {}", e.getMessage(), e); |
||||
|
throw new ParseException("Failed to parse People's Daily News: " + e.getMessage(), e); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,35 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class StrategyFactory { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(StrategyFactory.class); |
||||
|
private final List<CrawlStrategy> strategies = new ArrayList<>(); |
||||
|
|
||||
|
public StrategyFactory() { |
||||
|
strategies.add(new HnuNewsStrategy()); |
||||
|
strategies.add(new YouthStrategy()); |
||||
|
strategies.add(new PeopleStrategy()); |
||||
|
strategies.add(new CsdnStrategy()); |
||||
|
logger.info("Initialized StrategyFactory with {} strategies", strategies.size()); |
||||
|
} |
||||
|
|
||||
|
public CrawlStrategy getStrategy(String url) { |
||||
|
for (CrawlStrategy s : strategies) { |
||||
|
if (s.supports(url)) { |
||||
|
logger.debug("Found strategy {} for URL: {}", s.getClass().getSimpleName(), url); |
||||
|
return s; |
||||
|
} |
||||
|
} |
||||
|
logger.warn("No strategy found for URL: {}", url); |
||||
|
return null; |
||||
|
} |
||||
|
|
||||
|
public void register(CrawlStrategy strategy) { |
||||
|
strategies.add(strategy); |
||||
|
logger.info("Registered new strategy: {}", strategy.getClass().getSimpleName()); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,112 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.exception.ParseException; |
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
/* 青年网新闻解析策略*/ |
||||
|
public class YouthStrategy implements CrawlStrategy { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(YouthStrategy.class); |
||||
|
|
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
return url.contains("youth.cn");/* 检查URL是否包含青年网域名 */ |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(String url, Document doc) throws ParseException { |
||||
|
logger.info("Starting to parse Youth News: {}", url); |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
|
||||
|
try { |
||||
|
Elements newsItems = doc.select("div.news-item, div.article-item, li.news-list-item, div.list-item, ul.list li, .news-list li"); |
||||
|
logger.debug("Found {} news items with primary selectors", newsItems.size()); |
||||
|
|
||||
|
if (newsItems.isEmpty()) { |
||||
|
newsItems = doc.select("a[href*='/n1/'], a[href*='/gn/'], a[href*='/qy/'], a[href*='/jj/']"); |
||||
|
logger.debug("Trying alternative selector (news category links), found {} items", newsItems.size()); |
||||
|
} |
||||
|
|
||||
|
if (newsItems.isEmpty()) { |
||||
|
newsItems = doc.select("a[href$='.html']"); |
||||
|
logger.debug("Trying fallback selector (html links), found {} items", newsItems.size()); |
||||
|
} |
||||
|
|
||||
|
for (Element item : newsItems) { |
||||
|
try { |
||||
|
Element link = item.selectFirst("a"); |
||||
|
if (link == null) { |
||||
|
link = item.tagName().equals("a") ? item : null; |
||||
|
} |
||||
|
|
||||
|
if (link == null) { |
||||
|
logger.debug("No link found in item, skipping"); |
||||
|
continue; |
||||
|
} |
||||
|
|
||||
|
String articleUrl = link.attr("href"); |
||||
|
|
||||
|
if (!articleUrl.startsWith("http")) { |
||||
|
if (articleUrl.startsWith("/")) { |
||||
|
articleUrl = "https://www.youth.cn" + articleUrl; |
||||
|
} else { |
||||
|
articleUrl = "https://www.youth.cn/" + articleUrl; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
String title = link.text().trim(); |
||||
|
|
||||
|
if (title.isEmpty()) { |
||||
|
Element titleEl = link.selectFirst("span, h3, h4, .title"); |
||||
|
if (titleEl != null) { |
||||
|
title = titleEl.text().trim(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (title.isEmpty()) { |
||||
|
Element parentTitle = item.selectFirst("span, h3, h4, .title, .news-title"); |
||||
|
if (parentTitle != null) { |
||||
|
title = parentTitle.text().trim(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (title.isEmpty()) { |
||||
|
logger.debug("Empty title found, skipping"); |
||||
|
continue; |
||||
|
} |
||||
|
|
||||
|
String content = ""; |
||||
|
Element contentEl = item.selectFirst("p.summary, p.desc, div.brief, .summary, .desc"); |
||||
|
if (contentEl != null) { |
||||
|
content = contentEl.text().trim(); |
||||
|
} |
||||
|
|
||||
|
if (!title.isEmpty() && title.length() > 5) { |
||||
|
Article article = new Article(title, articleUrl, content); |
||||
|
articles.add(article); |
||||
|
logger.debug("Parsed article: {}", title); |
||||
|
} else { |
||||
|
logger.debug("Invalid title found (length: {}), skipping article", title.length()); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
logger.debug("Error parsing individual article: {}", e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (articles.isEmpty()) { |
||||
|
logger.warn("No articles found. Youth.cn page structure may have changed."); |
||||
|
} |
||||
|
|
||||
|
logger.info("Successfully parsed {} articles from Youth News", articles.size()); |
||||
|
return articles; |
||||
|
} catch (Exception e) { |
||||
|
logger.error("Failed to parse Youth News page: {}", e.getMessage(), e); |
||||
|
throw new ParseException("Failed to parse Youth News: " + e.getMessage(), e); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,261 @@ |
|||||
|
package com.example.datacollect.util; |
||||
|
|
||||
|
import com.example.datacollect.exception.ExportException; |
||||
|
import com.example.datacollect.model.Article; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.fasterxml.jackson.databind.ObjectMapper; |
||||
|
import com.fasterxml.jackson.databind.SerializationFeature; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.io.BufferedWriter; |
||||
|
import java.io.IOException; |
||||
|
import java.nio.charset.StandardCharsets; |
||||
|
import java.nio.file.Files; |
||||
|
import java.nio.file.Path; |
||||
|
import java.time.LocalDateTime; |
||||
|
import java.time.format.DateTimeFormatter; |
||||
|
import java.util.*; |
||||
|
import java.util.stream.Collectors; |
||||
|
|
||||
|
public class JsonExporter { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(JsonExporter.class); |
||||
|
private static final String VERSION = "1.0"; |
||||
|
private static final DateTimeFormatter EXPORT_TIME_FORMAT = DateTimeFormatter.ISO_LOCAL_DATE_TIME; |
||||
|
|
||||
|
public enum ExportMode { |
||||
|
STANDARD, |
||||
|
COMPACT, |
||||
|
MINIMAL |
||||
|
} |
||||
|
|
||||
|
public static class ExportOptions { |
||||
|
private ExportMode mode = ExportMode.STANDARD; |
||||
|
private String filterKeyword; |
||||
|
private LocalDateTime startDate; |
||||
|
private LocalDateTime endDate; |
||||
|
private boolean includeMetadata = true; |
||||
|
|
||||
|
public ExportOptions() {} |
||||
|
|
||||
|
public ExportMode getMode() { |
||||
|
return mode; |
||||
|
} |
||||
|
|
||||
|
public void setMode(ExportMode mode) { |
||||
|
this.mode = mode; |
||||
|
} |
||||
|
|
||||
|
public String getFilterKeyword() { |
||||
|
return filterKeyword; |
||||
|
} |
||||
|
|
||||
|
public void setFilterKeyword(String filterKeyword) { |
||||
|
this.filterKeyword = filterKeyword; |
||||
|
} |
||||
|
|
||||
|
public LocalDateTime getStartDate() { |
||||
|
return startDate; |
||||
|
} |
||||
|
|
||||
|
public void setStartDate(LocalDateTime startDate) { |
||||
|
this.startDate = startDate; |
||||
|
} |
||||
|
|
||||
|
public LocalDateTime getEndDate() { |
||||
|
return endDate; |
||||
|
} |
||||
|
|
||||
|
public void setEndDate(LocalDateTime endDate) { |
||||
|
this.endDate = endDate; |
||||
|
} |
||||
|
|
||||
|
public boolean isIncludeMetadata() { |
||||
|
return includeMetadata; |
||||
|
} |
||||
|
|
||||
|
public void setIncludeMetadata(boolean includeMetadata) { |
||||
|
this.includeMetadata = includeMetadata; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static class ExportMetadata { |
||||
|
private String exportTime; |
||||
|
private int totalCount; |
||||
|
private String source; |
||||
|
private String exportMode; |
||||
|
private String version; |
||||
|
|
||||
|
public ExportMetadata() {} |
||||
|
|
||||
|
public String getExportTime() { |
||||
|
return exportTime; |
||||
|
} |
||||
|
|
||||
|
public void setExportTime(String exportTime) { |
||||
|
this.exportTime = exportTime; |
||||
|
} |
||||
|
|
||||
|
public int getTotalCount() { |
||||
|
return totalCount; |
||||
|
} |
||||
|
|
||||
|
public void setTotalCount(int totalCount) { |
||||
|
this.totalCount = totalCount; |
||||
|
} |
||||
|
|
||||
|
public String getSource() { |
||||
|
return source; |
||||
|
} |
||||
|
|
||||
|
public void setSource(String source) { |
||||
|
this.source = source; |
||||
|
} |
||||
|
|
||||
|
public String getExportMode() { |
||||
|
return exportMode; |
||||
|
} |
||||
|
|
||||
|
public void setExportMode(String exportMode) { |
||||
|
this.exportMode = exportMode; |
||||
|
} |
||||
|
|
||||
|
public String getVersion() { |
||||
|
return version; |
||||
|
} |
||||
|
|
||||
|
public void setVersion(String version) { |
||||
|
this.version = version; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private final ArticleRepository repository; |
||||
|
private final ObjectMapper objectMapper; |
||||
|
|
||||
|
public JsonExporter(ArticleRepository repository) { |
||||
|
this.repository = repository; |
||||
|
this.objectMapper = new ObjectMapper(); |
||||
|
} |
||||
|
|
||||
|
public void exportToFile(Path targetPath) throws ExportException { |
||||
|
exportToFile(targetPath, new ExportOptions()); |
||||
|
} |
||||
|
|
||||
|
public void exportToFile(Path targetPath, ExportOptions options) throws ExportException { |
||||
|
logger.info("开始导出到文件: {}, 模式: {}", targetPath, options.getMode()); |
||||
|
|
||||
|
validateTargetPath(targetPath); |
||||
|
|
||||
|
try { |
||||
|
List<Article> articles = getFilteredArticles(options); |
||||
|
logger.debug("过滤后待导出文章数: {}", articles.size()); |
||||
|
|
||||
|
String json = generateJson(articles, options); |
||||
|
|
||||
|
try (BufferedWriter writer = Files.newBufferedWriter(targetPath, StandardCharsets.UTF_8)) { |
||||
|
writer.write(json); |
||||
|
} |
||||
|
|
||||
|
logger.info("成功导出 {} 篇文章到: {}", articles.size(), targetPath); |
||||
|
} catch (IOException e) { |
||||
|
logger.error("导出文件失败: {}", e.getMessage(), e); |
||||
|
throw new ExportException("无法写入导出文件: " + e.getMessage(), targetPath.toString(), e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public String exportToString() throws ExportException { |
||||
|
return exportToString(new ExportOptions()); |
||||
|
} |
||||
|
|
||||
|
public String exportToString(ExportOptions options) throws ExportException { |
||||
|
List<Article> articles = getFilteredArticles(options); |
||||
|
return generateJson(articles, options); |
||||
|
} |
||||
|
|
||||
|
private List<Article> getFilteredArticles(ExportOptions options) { |
||||
|
List<Article> articles = repository.getAll(); |
||||
|
|
||||
|
if (options.getFilterKeyword() != null && !options.getFilterKeyword().trim().isEmpty()) { |
||||
|
String keyword = options.getFilterKeyword().toLowerCase(); |
||||
|
articles = articles.stream() |
||||
|
.filter(a -> a.getTitle().toLowerCase().contains(keyword) |
||||
|
|| a.getContent().toLowerCase().contains(keyword)) |
||||
|
.collect(Collectors.toList()); |
||||
|
logger.debug("关键词过滤后剩余文章数: {}", articles.size()); |
||||
|
} |
||||
|
|
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
private String generateJson(List<Article> articles, ExportOptions options) throws ExportException { |
||||
|
try { |
||||
|
Map<String, Object> output = new LinkedHashMap<>(); |
||||
|
|
||||
|
if (options.isIncludeMetadata() && options.getMode() != ExportMode.MINIMAL) { |
||||
|
ExportMetadata metadata = new ExportMetadata(); |
||||
|
metadata.setExportTime(LocalDateTime.now().format(EXPORT_TIME_FORMAT)); |
||||
|
metadata.setTotalCount(articles.size()); |
||||
|
metadata.setSource("CLI Crawler v" + VERSION); |
||||
|
metadata.setExportMode(options.getMode().name()); |
||||
|
metadata.setVersion(VERSION); |
||||
|
output.put("metadata", metadata); |
||||
|
} |
||||
|
|
||||
|
output.put("articles", articles); |
||||
|
|
||||
|
if (options.getMode() == ExportMode.STANDARD) { |
||||
|
return objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(output); |
||||
|
} else { |
||||
|
return objectMapper.writeValueAsString(output); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
logger.error("生成JSON失败: {}", e.getMessage(), e); |
||||
|
throw new ExportException("无法生成JSON: " + e.getMessage(), e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void validateTargetPath(Path targetPath) throws ExportException { |
||||
|
if (targetPath == null) { |
||||
|
throw new ExportException("导出路径不能为空"); |
||||
|
} |
||||
|
|
||||
|
Path parent = targetPath.getParent(); |
||||
|
if (parent != null && !Files.exists(parent)) { |
||||
|
try { |
||||
|
Files.createDirectories(parent); |
||||
|
logger.info("创建导出目录: {}", parent); |
||||
|
} catch (IOException e) { |
||||
|
throw new ExportException("无法创建导出目录: " + parent, e); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public List<Path> exportWithSnapshots(String baseDir) throws ExportException { |
||||
|
logger.info("开始批量导出快照到目录: {}", baseDir); |
||||
|
|
||||
|
List<Path> exportedFiles = new ArrayList<>(); |
||||
|
Path basePath = Path.of(baseDir); |
||||
|
|
||||
|
try { |
||||
|
if (!Files.exists(basePath)) { |
||||
|
Files.createDirectories(basePath); |
||||
|
} |
||||
|
|
||||
|
ExportOptions standardOptions = new ExportOptions(); |
||||
|
standardOptions.setMode(ExportMode.STANDARD); |
||||
|
standardOptions.setIncludeMetadata(true); |
||||
|
|
||||
|
String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")); |
||||
|
Path snapshotPath = basePath.resolve("snapshot_" + timestamp + ".json"); |
||||
|
exportToFile(snapshotPath, standardOptions); |
||||
|
exportedFiles.add(snapshotPath); |
||||
|
|
||||
|
logger.info("批量导出完成,共导出 {} 个文件", exportedFiles.size()); |
||||
|
} catch (Exception e) { |
||||
|
logger.error("批量导出失败: {}", e.getMessage(), e); |
||||
|
throw new ExportException("批量导出失败: " + e.getMessage(), e); |
||||
|
} |
||||
|
|
||||
|
return exportedFiles; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,386 @@ |
|||||
|
package com.example.datacollect.util; |
||||
|
|
||||
|
import com.example.datacollect.exception.DuplicateArticleException; |
||||
|
import com.example.datacollect.exception.ImportException; |
||||
|
import com.example.datacollect.exception.ValidationException; |
||||
|
import com.example.datacollect.model.Article; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.fasterxml.jackson.databind.ObjectMapper; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.io.BufferedReader; |
||||
|
import java.io.IOException; |
||||
|
import java.nio.charset.StandardCharsets; |
||||
|
import java.nio.file.Files; |
||||
|
import java.nio.file.Path; |
||||
|
import java.util.*; |
||||
|
import java.time.LocalDateTime; |
||||
|
import java.util.regex.Pattern; |
||||
|
|
||||
|
public class JsonImporter { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(JsonImporter.class); |
||||
|
|
||||
|
private static final Pattern URL_PATTERN = Pattern.compile("^https?://.*"); |
||||
|
private static final int MAX_TITLE_LENGTH = 500; |
||||
|
private static final int MAX_CONTENT_LENGTH = 10000; |
||||
|
|
||||
|
public enum DuplicateStrategy { |
||||
|
SKIP, |
||||
|
OVERWRITE, |
||||
|
ERROR |
||||
|
} |
||||
|
|
||||
|
public static class ImportOptions { |
||||
|
private DuplicateStrategy duplicateStrategy = DuplicateStrategy.SKIP; |
||||
|
private boolean validateUrl = true; |
||||
|
private boolean validateTitle = true; |
||||
|
private boolean skipInvalid = true; |
||||
|
private int maxContentLength = MAX_CONTENT_LENGTH; |
||||
|
|
||||
|
public ImportOptions() {} |
||||
|
|
||||
|
public DuplicateStrategy getDuplicateStrategy() { |
||||
|
return duplicateStrategy; |
||||
|
} |
||||
|
|
||||
|
public void setDuplicateStrategy(DuplicateStrategy duplicateStrategy) { |
||||
|
this.duplicateStrategy = duplicateStrategy; |
||||
|
} |
||||
|
|
||||
|
public boolean isValidateUrl() { |
||||
|
return validateUrl; |
||||
|
} |
||||
|
|
||||
|
public void setValidateUrl(boolean validateUrl) { |
||||
|
this.validateUrl = validateUrl; |
||||
|
} |
||||
|
|
||||
|
public boolean isValidateTitle() { |
||||
|
return validateTitle; |
||||
|
} |
||||
|
|
||||
|
public void setValidateTitle(boolean validateTitle) { |
||||
|
this.validateTitle = validateTitle; |
||||
|
} |
||||
|
|
||||
|
public boolean isSkipInvalid() { |
||||
|
return skipInvalid; |
||||
|
} |
||||
|
|
||||
|
public void setSkipInvalid(boolean skipInvalid) { |
||||
|
this.skipInvalid = skipInvalid; |
||||
|
} |
||||
|
|
||||
|
public int getMaxContentLength() { |
||||
|
return maxContentLength; |
||||
|
} |
||||
|
|
||||
|
public void setMaxContentLength(int maxContentLength) { |
||||
|
this.maxContentLength = maxContentLength; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static class ImportResult { |
||||
|
private int totalFound; |
||||
|
private int imported; |
||||
|
private int skipped; |
||||
|
private int invalid; |
||||
|
private int overwritten; |
||||
|
private List<String> errors; |
||||
|
private List<String> warnings; |
||||
|
|
||||
|
public ImportResult() { |
||||
|
this.errors = new ArrayList<>(); |
||||
|
this.warnings = new ArrayList<>(); |
||||
|
} |
||||
|
|
||||
|
public int getTotalFound() { |
||||
|
return totalFound; |
||||
|
} |
||||
|
|
||||
|
public void setTotalFound(int totalFound) { |
||||
|
this.totalFound = totalFound; |
||||
|
} |
||||
|
|
||||
|
public int getImported() { |
||||
|
return imported; |
||||
|
} |
||||
|
|
||||
|
public void setImported(int imported) { |
||||
|
this.imported = imported; |
||||
|
} |
||||
|
|
||||
|
public int getSkipped() { |
||||
|
return skipped; |
||||
|
} |
||||
|
|
||||
|
public void setSkipped(int skipped) { |
||||
|
this.skipped = skipped; |
||||
|
} |
||||
|
|
||||
|
public int getInvalid() { |
||||
|
return invalid; |
||||
|
} |
||||
|
|
||||
|
public void setInvalid(int invalid) { |
||||
|
this.invalid = invalid; |
||||
|
} |
||||
|
|
||||
|
public int getOverwritten() { |
||||
|
return overwritten; |
||||
|
} |
||||
|
|
||||
|
public void setOverwritten(int overwritten) { |
||||
|
this.overwritten = overwritten; |
||||
|
} |
||||
|
|
||||
|
public List<String> getErrors() { |
||||
|
return errors; |
||||
|
} |
||||
|
|
||||
|
public void addError(String error) { |
||||
|
this.errors.add(error); |
||||
|
} |
||||
|
|
||||
|
public List<String> getWarnings() { |
||||
|
return warnings; |
||||
|
} |
||||
|
|
||||
|
public void addWarning(String warning) { |
||||
|
this.warnings.add(warning); |
||||
|
} |
||||
|
|
||||
|
public String getSummary() { |
||||
|
return String.format( |
||||
|
"导入完成: 总共找到=%d, 成功导入=%d, 跳过=%d, 无效=%d, 覆盖=%d, 错误=%d", |
||||
|
totalFound, imported, skipped, invalid, overwritten, errors.size() |
||||
|
); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private final ArticleRepository repository; |
||||
|
private final ObjectMapper objectMapper; |
||||
|
|
||||
|
public JsonImporter(ArticleRepository repository) { |
||||
|
this.repository = repository; |
||||
|
this.objectMapper = new ObjectMapper(); |
||||
|
} |
||||
|
|
||||
|
public ImportResult importFromFile(Path sourcePath) throws ImportException { |
||||
|
return importFromFile(sourcePath, new ImportOptions()); |
||||
|
} |
||||
|
|
||||
|
public ImportResult importFromFile(Path sourcePath, ImportOptions options) throws ImportException { |
||||
|
logger.info("开始从文件导入: {}", sourcePath); |
||||
|
validateSourcePath(sourcePath); |
||||
|
|
||||
|
ImportResult result = new ImportResult(); |
||||
|
|
||||
|
try { |
||||
|
String content = readFileContent(sourcePath); |
||||
|
List<Article> articles = parseArticles(content, result); |
||||
|
result.setTotalFound(articles.size()); |
||||
|
|
||||
|
logger.debug("解析到 {} 篇文章", articles.size()); |
||||
|
|
||||
|
for (int i = 0; i < articles.size(); i++) { |
||||
|
Article article = articles.get(i); |
||||
|
try { |
||||
|
processArticle(article, options, result, i); |
||||
|
} catch (ValidationException e) { |
||||
|
logger.warn("文章验证失败 [位置 {}]: {}", i, e.getMessage()); |
||||
|
result.addError("无效文章 at index " + i + ": " + e.getMessage()); |
||||
|
result.setInvalid(result.getInvalid() + 1); |
||||
|
if (!options.isSkipInvalid()) { |
||||
|
throw new ImportException("文章验证失败: " + e.getMessage(), sourcePath.toString(), i); |
||||
|
} |
||||
|
} catch (DuplicateArticleException e) { |
||||
|
logger.warn("重复文章 [位置 {}]: {}", i, e.getMessage()); |
||||
|
result.setSkipped(result.getSkipped() + 1); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
logger.info("导入完成: {}", result.getSummary()); |
||||
|
|
||||
|
} catch (IOException e) { |
||||
|
logger.error("读取文件失败: {}", e.getMessage(), e); |
||||
|
throw new ImportException("无法读取导入文件: " + e.getMessage(), sourcePath.toString(), e); |
||||
|
} catch (ImportException e) { |
||||
|
throw e; |
||||
|
} catch (Exception e) { |
||||
|
logger.error("导入过程出错: {}", e.getMessage(), e); |
||||
|
throw new ImportException("导入失败: " + e.getMessage(), sourcePath.toString(), e); |
||||
|
} |
||||
|
|
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
public List<Article> parseArticles(String json) throws ImportException { |
||||
|
ImportResult result = new ImportResult(); |
||||
|
return parseArticles(json, result); |
||||
|
} |
||||
|
|
||||
|
private List<Article> parseArticles(String json, ImportResult result) throws ImportException { |
||||
|
try { |
||||
|
Map<String, Object> data = objectMapper.readValue(json, Map.class); |
||||
|
|
||||
|
List<?> articlesList = null; |
||||
|
if (data.containsKey("articles")) { |
||||
|
articlesList = (List<?>) data.get("articles"); |
||||
|
} else if (data.containsKey("data")) { |
||||
|
articlesList = (List<?>) data.get("data"); |
||||
|
} else if (data instanceof List) { |
||||
|
articlesList = (List<?>) data; |
||||
|
} |
||||
|
|
||||
|
if (articlesList == null) { |
||||
|
throw new ImportException("JSON格式错误:未找到 'articles' 或 'data' 字段"); |
||||
|
} |
||||
|
|
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
for (int i = 0; i < articlesList.size(); i++) { |
||||
|
try { |
||||
|
Object item = articlesList.get(i); |
||||
|
if (item instanceof Map) { |
||||
|
Article article = mapToArticle((Map<?, ?>) item, i); |
||||
|
articles.add(article); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
logger.warn("解析第 {} 篇文章失败: {}", i, e.getMessage()); |
||||
|
result.addError("解析失败 at index " + i + ": " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return articles; |
||||
|
} catch (ImportException e) { |
||||
|
throw e; |
||||
|
} catch (Exception e) { |
||||
|
logger.error("JSON解析失败: {}", e.getMessage(), e); |
||||
|
throw new ImportException("JSON解析失败: " + e.getMessage(), e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@SuppressWarnings("unchecked") |
||||
|
private Article mapToArticle(Map<?, ?> map, int index) throws ValidationException { |
||||
|
String title = (String) map.get("title"); |
||||
|
String url = (String) map.get("url"); |
||||
|
String content = (String) map.get("content"); |
||||
|
Object crawledAtObj = map.get("crawledAt"); |
||||
|
LocalDateTime crawledAt = null; |
||||
|
|
||||
|
if (crawledAtObj != null) { |
||||
|
try { |
||||
|
if (crawledAtObj instanceof String) { |
||||
|
crawledAt = LocalDateTime.parse((String) crawledAtObj); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
logger.warn("无法解析 crawledAt 字段: {}, 使用默认值", crawledAtObj); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (title == null || title.trim().isEmpty()) { |
||||
|
throw new ValidationException("标题不能为空", "title", null, "非空字符串"); |
||||
|
} |
||||
|
|
||||
|
if (url == null || url.trim().isEmpty()) { |
||||
|
throw new ValidationException("URL不能为空", "url", null, "非空字符串"); |
||||
|
} |
||||
|
|
||||
|
if (content == null) { |
||||
|
content = ""; |
||||
|
} |
||||
|
|
||||
|
return new Article(title.trim(), url.trim(), content.trim(), crawledAt); |
||||
|
} |
||||
|
|
||||
|
private void processArticle(Article article, ImportOptions options, ImportResult result, int index) |
||||
|
throws ValidationException, DuplicateArticleException { |
||||
|
|
||||
|
if (options.isValidateTitle() && article.getTitle().length() > MAX_TITLE_LENGTH) { |
||||
|
throw new ValidationException( |
||||
|
"标题过长: 最大" + MAX_TITLE_LENGTH + "字符", |
||||
|
"title", |
||||
|
article.getTitle(), |
||||
|
"长度 <= " + MAX_TITLE_LENGTH |
||||
|
); |
||||
|
} |
||||
|
|
||||
|
if (options.isValidateUrl() && !URL_PATTERN.matcher(article.getUrl()).matches()) { |
||||
|
throw new ValidationException( |
||||
|
"URL格式无效: " + article.getUrl(), |
||||
|
"url", |
||||
|
article.getUrl(), |
||||
|
"必须以 http:// 或 https:// 开头" |
||||
|
); |
||||
|
} |
||||
|
|
||||
|
Article existing = repository.findByUrl(article.getUrl()); |
||||
|
if (existing != null) { |
||||
|
switch (options.getDuplicateStrategy()) { |
||||
|
case SKIP: |
||||
|
logger.debug("跳过重复文章: {}", article.getUrl()); |
||||
|
throw new DuplicateArticleException("文章URL已存在: " + article.getUrl(), article.getUrl()); |
||||
|
|
||||
|
case OVERWRITE: |
||||
|
logger.debug("覆盖重复文章: {}", article.getUrl()); |
||||
|
repository.remove(existing); |
||||
|
repository.add(article); |
||||
|
result.setOverwritten(result.getOverwritten() + 1); |
||||
|
result.setImported(result.getImported() + 1); |
||||
|
return; |
||||
|
|
||||
|
case ERROR: |
||||
|
throw new DuplicateArticleException( |
||||
|
"发现重复URL: " + article.getUrl(), |
||||
|
article.getUrl(), |
||||
|
repository.getAll().indexOf(existing) |
||||
|
); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
String content = article.getContent(); |
||||
|
if (content.length() > options.getMaxContentLength()) { |
||||
|
content = content.substring(0, options.getMaxContentLength()); |
||||
|
logger.debug("文章内容已截断到 {} 字符: {}", options.getMaxContentLength(), article.getTitle()); |
||||
|
} |
||||
|
|
||||
|
repository.add(article); |
||||
|
result.setImported(result.getImported() + 1); |
||||
|
logger.debug("成功导入文章: {}", article.getTitle()); |
||||
|
} |
||||
|
|
||||
|
private String readFileContent(Path sourcePath) throws IOException { |
||||
|
StringBuilder content = new StringBuilder(); |
||||
|
try (BufferedReader reader = Files.newBufferedReader(sourcePath, StandardCharsets.UTF_8)) { |
||||
|
String line; |
||||
|
while ((line = reader.readLine()) != null) { |
||||
|
content.append(line).append("\n"); |
||||
|
} |
||||
|
} |
||||
|
return content.toString(); |
||||
|
} |
||||
|
|
||||
|
private void validateSourcePath(Path sourcePath) throws ImportException { |
||||
|
if (sourcePath == null) { |
||||
|
throw new ImportException("导入路径不能为空"); |
||||
|
} |
||||
|
|
||||
|
if (!Files.exists(sourcePath)) { |
||||
|
throw new ImportException("导入文件不存在: " + sourcePath, sourcePath.toString()); |
||||
|
} |
||||
|
|
||||
|
if (!Files.isReadable(sourcePath)) { |
||||
|
throw new ImportException("文件不可读: " + sourcePath, sourcePath.toString()); |
||||
|
} |
||||
|
|
||||
|
try { |
||||
|
long size = Files.size(sourcePath); |
||||
|
if (size > 100 * 1024 * 1024) { |
||||
|
logger.warn("导入文件较大 ({} MB),处理可能较慢", size / (1024 * 1024)); |
||||
|
} |
||||
|
} catch (IOException e) { |
||||
|
logger.warn("无法获取文件大小: {}", e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,81 @@ |
|||||
|
package com.example.datacollect.util; |
||||
|
|
||||
|
import com.fasterxml.jackson.databind.ObjectMapper; |
||||
|
import com.fasterxml.jackson.databind.SerializationFeature; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.io.File; |
||||
|
import java.io.IOException; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class JsonSerializer { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(JsonSerializer.class); |
||||
|
private static final ObjectMapper objectMapper = new ObjectMapper(); |
||||
|
|
||||
|
static { |
||||
|
objectMapper.enable(SerializationFeature.INDENT_OUTPUT); |
||||
|
} |
||||
|
|
||||
|
private JsonSerializer() { |
||||
|
} |
||||
|
|
||||
|
public static <T> String serialize(T obj) { |
||||
|
try { |
||||
|
return objectMapper.writeValueAsString(obj); |
||||
|
} catch (Exception e) { |
||||
|
logger.error("Failed to serialize object", e); |
||||
|
throw new RuntimeException("Failed to serialize object", e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static <T> String serializeCompact(T obj) { |
||||
|
try { |
||||
|
ObjectMapper compactMapper = new ObjectMapper(); |
||||
|
return compactMapper.writeValueAsString(obj); |
||||
|
} catch (Exception e) { |
||||
|
logger.error("Failed to serialize object (compact)", e); |
||||
|
throw new RuntimeException("Failed to serialize object", e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static <T> T deserialize(String json, Class<T> clazz) { |
||||
|
try { |
||||
|
return objectMapper.readValue(json, clazz); |
||||
|
} catch (Exception e) { |
||||
|
logger.error("Failed to deserialize object", e); |
||||
|
throw new RuntimeException("Failed to deserialize object", e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static <T> List<T> deserializeList(String json, Class<T> clazz) { |
||||
|
try { |
||||
|
return objectMapper.readValue(json, |
||||
|
objectMapper.getTypeFactory().constructCollectionType(List.class, clazz)); |
||||
|
} catch (Exception e) { |
||||
|
logger.error("Failed to deserialize list", e); |
||||
|
throw new RuntimeException("Failed to deserialize list", e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static <T> void writeToFile(T obj, String filePath) throws IOException { |
||||
|
File file = new File(filePath); |
||||
|
objectMapper.writeValue(file, obj); |
||||
|
logger.debug("Successfully wrote object to file: {}", filePath); |
||||
|
} |
||||
|
|
||||
|
public static <T> T readFromFile(String filePath, Class<T> clazz) throws IOException { |
||||
|
File file = new File(filePath); |
||||
|
T obj = objectMapper.readValue(file, clazz); |
||||
|
logger.debug("Successfully read object from file: {}", filePath); |
||||
|
return obj; |
||||
|
} |
||||
|
|
||||
|
public static <T> List<T> readListFromFile(String filePath, Class<T> clazz) throws IOException { |
||||
|
File file = new File(filePath); |
||||
|
List<T> list = objectMapper.readValue(file, |
||||
|
objectMapper.getTypeFactory().constructCollectionType(List.class, clazz)); |
||||
|
logger.debug("Successfully read list from file: {}", filePath); |
||||
|
return list; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,39 @@ |
|||||
|
package com.example.datacollect.util; |
||||
|
|
||||
|
import com.example.datacollect.exception.NetworkException; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
import java.util.concurrent.Callable; |
||||
|
|
||||
|
public class RetryUtils { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(RetryUtils.class); |
||||
|
|
||||
|
private static final int MAX_RETRIES = 3; |
||||
|
private static final long BASE_DELAY_MS = 500; |
||||
|
|
||||
|
public static <T> T executeWithRetry(Callable<T> task) throws NetworkException { |
||||
|
Exception lastException = null; |
||||
|
|
||||
|
for (int attempt = 0; attempt <= MAX_RETRIES; attempt++) { |
||||
|
try { |
||||
|
if (attempt > 0) { |
||||
|
long waitTime = BASE_DELAY_MS * (long) Math.pow(2, attempt - 1); |
||||
|
logger.info("重试 {}/{} 次,等待 {} ms", attempt, MAX_RETRIES, waitTime); |
||||
|
Thread.sleep(waitTime); |
||||
|
} |
||||
|
|
||||
|
return task.call(); |
||||
|
} catch (Exception e) { |
||||
|
lastException = e; |
||||
|
logger.warn("第 {} 次尝试失败: {}", attempt + 1, e.getMessage()); |
||||
|
|
||||
|
if (attempt < MAX_RETRIES) { |
||||
|
continue; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
logger.error("所有 {} 次重试均失败", MAX_RETRIES + 1); |
||||
|
throw new NetworkException("网络错误,已重试三次", lastException); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,52 @@ |
|||||
|
package com.example.datacollect.view; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
import java.util.List; |
||||
|
import java.util.Scanner; |
||||
|
|
||||
|
public class ConsoleView implements AutoCloseable { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(ConsoleView.class); |
||||
|
private static final String ANSI_RESET = "\u001B[0m"; |
||||
|
private static final String ANSI_GREEN = "\u001B[32m"; |
||||
|
private static final String ANSI_RED = "\u001B[31m"; |
||||
|
private static final String ANSI_BLUE = "\u001B[34m"; |
||||
|
|
||||
|
private final Scanner scanner = new Scanner(System.in); |
||||
|
|
||||
|
@Override |
||||
|
public void close() { |
||||
|
scanner.close();/* 关闭扫描器,释放资源 */ |
||||
|
logger.debug("ConsoleView closed"); |
||||
|
} |
||||
|
|
||||
|
public String readLine() { |
||||
|
System.out.print("> "); |
||||
|
String input = scanner.nextLine(); |
||||
|
return input;/* 返回用户输入 */ |
||||
|
} |
||||
|
|
||||
|
public void printSuccess(String msg) { |
||||
|
System.out.println(ANSI_GREEN + msg + ANSI_RESET); |
||||
|
} |
||||
|
|
||||
|
public void printError(String msg) { |
||||
|
System.out.println(ANSI_RED + msg + ANSI_RESET); |
||||
|
} |
||||
|
|
||||
|
public void printInfo(String msg) { |
||||
|
System.out.println(ANSI_BLUE + msg + ANSI_RESET); |
||||
|
} |
||||
|
|
||||
|
public void display(List<Article> articles) { |
||||
|
if (articles.isEmpty()) { |
||||
|
printInfo("暂无文章,请先执行 crawl。"); |
||||
|
return; |
||||
|
} |
||||
|
for (int i = 0; i < articles.size(); i++) { |
||||
|
Article a = articles.get(i);/* 获取文章 */ |
||||
|
System.out.println((i + 1) + ". " + a.getTitle() + " | " + a.getUrl());/* 打印文章标题和URL */ |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,25 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<configuration>
    <!-- Single definition of the line layout shared by both appenders. -->
    <property name="LOG_PATTERN"
              value="%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n" />

    <!-- Console output. -->
    <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
        <encoder>
            <pattern>${LOG_PATTERN}</pattern>
        </encoder>
    </appender>

    <!-- Daily-rolling file output; rolled files are kept for 30 days. -->
    <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
        <file>logs/crawler.log</file>
        <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
            <fileNamePattern>logs/crawler.%d{yyyy-MM-dd}.log</fileNamePattern>
            <maxHistory>30</maxHistory>
        </rollingPolicy>
        <encoder>
            <pattern>${LOG_PATTERN}</pattern>
        </encoder>
    </appender>

    <!-- Application packages log at DEBUG; everything else at INFO. -->
    <logger name="com.example.datacollect" level="DEBUG" />

    <root level="INFO">
        <appender-ref ref="CONSOLE" />
        <appender-ref ref="FILE" />
    </root>
</configuration>
||||
@ -0,0 +1,25 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<configuration>
    <!-- Single definition of the line layout shared by both appenders. -->
    <property name="LOG_PATTERN"
              value="%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n" />

    <!-- Console output. -->
    <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
        <encoder>
            <pattern>${LOG_PATTERN}</pattern>
        </encoder>
    </appender>

    <!-- Daily-rolling file output; rolled files are kept for 30 days. -->
    <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
        <file>logs/crawler.log</file>
        <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
            <fileNamePattern>logs/crawler.%d{yyyy-MM-dd}.log</fileNamePattern>
            <maxHistory>30</maxHistory>
        </rollingPolicy>
        <encoder>
            <pattern>${LOG_PATTERN}</pattern>
        </encoder>
    </appender>

    <!-- Application packages log at DEBUG; everything else at INFO. -->
    <logger name="com.example.datacollect" level="DEBUG" />

    <root level="INFO">
        <appender-ref ref="CONSOLE" />
        <appender-ref ref="FILE" />
    </root>
</configuration>
||||
@ -0,0 +1,3 @@ |
|||||
|
artifactId=datacollect-cli |
||||
|
groupId=com.example |
||||
|
version=0.1.0 |
||||
@ -0,0 +1,32 @@ |
|||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\repository\PersistenceManager.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\command\ExitCommand.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\command\CrawlCommand.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\exception\ExportException.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\command\ExportCommand.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\command\ImportCommand.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\exception\ImportException.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\exception\DuplicateArticleException.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\exception\CrawlerException.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\command\Command.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\model\Article.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\strategy\PeopleStrategy.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\exception\NetworkException.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\controller\CrawlerController.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\strategy\StrategyFactory.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\util\JsonImporter.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\strategy\HnuNewsStrategy.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\util\RetryUtils.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\strategy\CrawlStrategy.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\command\ListCommand.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\Main.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\exception\UrlFormatException.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\util\JsonSerializer.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\exception\ParseException.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\strategy\YouthStrategy.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\strategy\CsdnStrategy.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\command\HelpCommand.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\repository\ArticleRepository.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\exception\ValidationException.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\view\ConsoleView.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\util\JsonExporter.java |
||||
|
C:\Users\27687\Desktop\java-cli-期末\src\main\java\com\example\datacollect\command\AnalyzeCommand.java |
||||
@ -0,0 +1,92 @@ |
|||||
|
# Smoke tests for the datacollect CLI.
# Each test pipes one or more command lines to the packaged jar over stdin and
# reports success or failure based on the process exit code or its output.
$ErrorActionPreference = "Continue"

# Runnable jar produced by the build; defined once so every test uses the same path.
$jarPath = "target\datacollect-cli-0.1.0-jar-with-dependencies.jar"

# Pipes the given command lines to the CLI and returns its merged stdout/stderr.
# $LASTEXITCODE is set by the java process and remains readable by the caller.
function Invoke-Cli {
    param([string[]]$Commands)
    $Commands | java -jar $jarPath 2>&1
}

# Prints captured output one line at a time, optionally limited to the first or
# last N lines. Write-Host emits nothing to the pipeline, so the selection has
# to happen BEFORE printing (piping Write-Host into Select-Object does nothing).
function Write-Lines {
    param($Lines, [int]$First = 0, [int]$Last = 0)
    if ($null -eq $Lines) { return }
    $selected = @($Lines)
    if ($First -gt 0) { $selected = @($selected | Select-Object -First $First) }
    if ($Last -gt 0) { $selected = @($selected | Select-Object -Last $Last) }
    foreach ($line in $selected) { Write-Host $line }
}

# Extracts the number from any "Crawled <n> articles." line in the output.
function Get-CrawledCount {
    param($Lines)
    $Lines | Select-String "Crawled" | ForEach-Object { $_.Line -replace "Crawled (\d+) articles\.", '$1' }
}

Write-Host "=== 测试 CLI 爬虫程序 ===" -ForegroundColor Cyan

# Test 1: start the program and show the help text
Write-Host "`n1. 测试帮助命令..." -ForegroundColor Yellow
$helpOutput = Invoke-Cli @("help")
if ($LASTEXITCODE -ne 0) {
    Write-Host "帮助命令执行失败" -ForegroundColor Red
    Write-Lines $helpOutput
} else {
    Write-Host "帮助命令执行成功" -ForegroundColor Green
    Write-Lines $helpOutput -First 15
}

# Test 2: list command (empty list)
Write-Host "`n2. 测试 list 命令(空列表)..." -ForegroundColor Yellow
$listOutput = Invoke-Cli @("list")
if ($LASTEXITCODE -ne 0) {
    Write-Host "list 命令执行失败" -ForegroundColor Red
    Write-Lines $listOutput
} else {
    Write-Host "list 命令执行成功" -ForegroundColor Green
}

# Test 3: Juejin strategy
Write-Host "`n3. 测试 Juejin 策略..." -ForegroundColor Yellow
$juejinOutput = Invoke-Cli @("crawl https://juejin.cn/", "list", "exit")
if ($LASTEXITCODE -ne 0) {
    Write-Host "Juejin 策略测试失败" -ForegroundColor Red
    Write-Lines $juejinOutput -Last 10
} else {
    $articleCount = Get-CrawledCount $juejinOutput
    Write-Host "Juejin 策略测试成功 - 爬取到 $articleCount 篇文章" -ForegroundColor Green
}

# Test 4: HnuNews strategy
Write-Host "`n4. 测试 HnuNews 策略..." -ForegroundColor Yellow
$hnuOutput = Invoke-Cli @("crawl https://news.hnu.edu.cn/", "list", "exit")
if ($LASTEXITCODE -ne 0) {
    Write-Host "HnuNews 策略测试失败" -ForegroundColor Red
    Write-Lines $hnuOutput -Last 10
} else {
    $articleCount = Get-CrawledCount $hnuOutput
    Write-Host "HnuNews 策略测试成功 - 爬取到 $articleCount 篇文章" -ForegroundColor Green
}

# Test 5: export
Write-Host "`n5. 测试导出功能..." -ForegroundColor Yellow
$exportOutput = Invoke-Cli @("crawl https://juejin.cn/", "export test_export.json", "exit")
if (-not (Test-Path "test_export.json")) {
    Write-Host "导出功能测试失败" -ForegroundColor Red
    Write-Lines $exportOutput -Last 10
} else {
    $fileSize = (Get-Item "test_export.json").Length
    Write-Host "导出功能测试成功 - 文件大小: $fileSize 字节" -ForegroundColor Green
    Remove-Item "test_export.json" -Force
}

# Test 6: import (first create a file to import, then load it in a new session)
Write-Host "`n6. 测试导入功能..." -ForegroundColor Yellow
$prepareOutput = Invoke-Cli @("crawl https://juejin.cn/", "export import_test.json", "exit")
if (-not (Test-Path "import_test.json")) {
    # Without the fixture the import result would be meaningless.
    Write-Host "导入功能测试失败" -ForegroundColor Red
    Write-Lines $prepareOutput -Last 10
} else {
    $importOutput = Invoke-Cli @("import import_test.json", "list", "exit")
    if ($LASTEXITCODE -ne 0) {
        Write-Host "导入功能测试失败" -ForegroundColor Red
        Write-Lines $importOutput -Last 10
    } else {
        Write-Host "导入功能测试成功" -ForegroundColor Green
    }
    # Remove the fixture on both outcomes so a failed run leaves nothing behind.
    Remove-Item "import_test.json" -Force -ErrorAction SilentlyContinue
}

# Test 7: unknown command handling
Write-Host "`n7. 测试未知命令处理..." -ForegroundColor Yellow
$unknownOutput = Invoke-Cli @("unknown_command")
if ($unknownOutput -match "Unknown command") {
    Write-Host "未知命令处理测试成功" -ForegroundColor Green
} else {
    Write-Host "未知命令处理测试失败" -ForegroundColor Red
}

# Test 8: session persistence (data survives exit and restart)
Write-Host "`n8. 测试会话持久化..." -ForegroundColor Yellow
Invoke-Cli @("crawl https://juejin.cn/", "exit") | Out-Null
$restoreOutput = Invoke-Cli @("list")
if ($restoreOutput -match "Loaded") {
    Write-Host "会话持久化测试成功" -ForegroundColor Green
} else {
    Write-Host "会话持久化测试失败" -ForegroundColor Red
    Write-Lines $restoreOutput -Last 5
}

Write-Host "`n=== 测试完成 ===" -ForegroundColor Cyan
||||
@ -0,0 +1,17 @@ |
|||||
|
[ { |
||||
|
"title" : "7月1日起施行 超龄劳动者迎来权益保障新规", |
||||
|
"url" : "http://society.people.com.cn/n1/2026/0525/c1008-40727022.html", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "经港珠澳大桥出入境港澳单牌车总量突破1000万辆次", |
||||
|
"url" : "http://gba.people.cn/n1/2026/0525/c42272-40726946.html", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "外交部谈美伊谈判", |
||||
|
"url" : "http://world.people.com.cn/n1/2026/0525/c1002-40726926.html", |
||||
|
"content" : "" |
||||
|
}, { |
||||
|
"title" : "重庆发布今年首个地质灾害红色预警", |
||||
|
"url" : "http://society.people.com.cn/n1/2026/0525/c1008-40726849.html", |
||||
|
"content" : "" |
||||
|
} ] |
||||
@ -0,0 +1,236 @@ |
|||||
|
# Test Script for CLI Crawler - Data Import/Export Features
# Automates the sequence: crawl -> list -> export -> clear -> import ->
# re-import, and checks that article counts survive the round trip without
# duplication.

$ErrorActionPreference = "Stop"
$env:JAVA_HOME = "C:\Program Files\Java\latest\jdk-25"
$APP_JAR = "target\datacollect-cli-0.1.0-jar-with-dependencies.jar"
$TEST_EXPORT_FILE = "data\test_export.json"
$USERPROFILE_PATH = "$env:USERPROFILE\.datacollect"

# Setting JAVA_HOME alone does not change which "java" is launched; use the
# JDK it points to when present, otherwise fall back to the one on PATH.
$JAVA_EXE = "java"
$javaFromHome = Join-Path $env:JAVA_HOME "bin\java.exe"
if (Test-Path $javaFromHome) {
    $JAVA_EXE = $javaFromHome
}

# Prints a three-line section banner.
function Write-Banner {
    param([string]$Title)
    Write-Host "========================================" -ForegroundColor Cyan
    Write-Host $Title -ForegroundColor Cyan
    Write-Host "========================================" -ForegroundColor Cyan
}

# Runs the CLI with a single command argument, echoes its output line by line
# (Write-Host on an array would join all lines with spaces) and returns the
# captured lines to the caller.
function Invoke-CliCommand {
    param([string]$Command)
    # With "Stop", Windows PowerShell can turn redirected native stderr into a
    # terminating error; relax the preference for this function scope only.
    $ErrorActionPreference = "Continue"
    $lines = @(& $JAVA_EXE -jar $APP_JAR $Command 2>&1)
    foreach ($line in $lines) {
        Write-Host $line -ForegroundColor Green
    }
    return $lines
}

# Returns the number from the first "Total: <n> articles" line, or 0 if absent.
function Get-ArticleCount {
    param($Output)
    foreach ($line in @($Output)) {
        if ("$line" -match "Total: (\d+) articles") {
            return [int]$Matches[1]
        }
    }
    return 0
}

Write-Banner "CLI Crawler - Import/Export Test Suite"
Write-Host ""

# Removes persisted application data and any previous export file.
function Clean-Up {
    Write-Host "[CLEANUP] Removing old data files..." -ForegroundColor Yellow
    if (Test-Path $USERPROFILE_PATH) {
        Remove-Item "$USERPROFILE_PATH\*" -Force -Recurse -ErrorAction SilentlyContinue
    }
    if (Test-Path $TEST_EXPORT_FILE) {
        Remove-Item $TEST_EXPORT_FILE -Force -ErrorAction SilentlyContinue
    }
}

# Runs each non-empty line of $Commands as a separate CLI invocation.
function Run-CLI {
    param([string]$Commands)
    $commandsArray = $Commands -split "`n"
    foreach ($cmd in $commandsArray) {
        $cmd = $cmd.Trim()
        if ($cmd -ne "") {
            Write-Host "[CLI] $cmd" -ForegroundColor Gray
            Invoke-CliCommand $cmd | Out-Null
            Write-Host ""
        }
    }
}

# Step 1: Initial Cleanup
Write-Banner "STEP 1: Initial Cleanup"
Clean-Up
Write-Host ""

# Step 2: Crawl some data
Write-Banner "STEP 2: Crawl Data (CSDN)"
Write-Host "Command: crawl https://www.csdn.net/" -ForegroundColor Yellow
$result = Invoke-CliCommand "crawl https://www.csdn.net/"
Write-Host ""
Start-Sleep -Seconds 2

# Step 3: List articles
Write-Banner "STEP 3: List Articles"
Write-Host "Command: list" -ForegroundColor Yellow
$result = Invoke-CliCommand "list"
Write-Host ""
Start-Sleep -Seconds 1

# Step 4: Export to JSON
Write-Banner "STEP 4: Export to JSON"
Write-Host "Command: export data\test_export.json --format json" -ForegroundColor Yellow
$result = Invoke-CliCommand "export data\test_export.json --format json"
Write-Host ""
Start-Sleep -Seconds 1

# Step 5: Check JSON file
Write-Banner "STEP 5: Check Exported JSON File"
if (Test-Path $TEST_EXPORT_FILE) {
    Write-Host "[SUCCESS] JSON file created: $TEST_EXPORT_FILE" -ForegroundColor Green
    Write-Host ""
    Write-Host "JSON File Content Preview (first 1500 chars):" -ForegroundColor Cyan
    $content = Get-Content $TEST_EXPORT_FILE -Raw
    if ($content.Length -gt 1500) {
        Write-Host ($content.Substring(0, 1500) + "...") -ForegroundColor White
    } else {
        Write-Host $content -ForegroundColor White
    }

    # Check for crawledAt field
    if ($content -match "crawledAt") {
        Write-Host ""
        Write-Host "[SUCCESS] crawledAt field found in JSON!" -ForegroundColor Green
    } else {
        Write-Host ""
        Write-Host "[ERROR] crawledAt field NOT found in JSON!" -ForegroundColor Red
    }

    # Check for metadata
    if ($content -match "metadata") {
        Write-Host "[SUCCESS] metadata field found in JSON!" -ForegroundColor Green
    } else {
        Write-Host "[WARNING] metadata field NOT found in JSON!" -ForegroundColor Yellow
    }
} else {
    Write-Host "[ERROR] JSON file NOT created!" -ForegroundColor Red
}
Write-Host ""

# Step 6: Get article count before clear
Write-Banner "STEP 6: Get Article Count Before Clear"
Write-Host "Command: list" -ForegroundColor Yellow
$result = Invoke-CliCommand "list"
$articleCount = Get-ArticleCount $result
Write-Host ""
Write-Host "Current article count: $articleCount" -ForegroundColor Cyan
Write-Host ""
Start-Sleep -Seconds 1

# Step 7: Clear all data
Write-Banner "STEP 7: Clear All Data"
Write-Host "Command: clear" -ForegroundColor Yellow
$result = Invoke-CliCommand "clear"
Write-Host ""
Start-Sleep -Seconds 1

# Step 8: Verify data is cleared
Write-Banner "STEP 8: Verify Data Cleared"
Write-Host "Command: list" -ForegroundColor Yellow
$result = Invoke-CliCommand "list"
Write-Host ""
Start-Sleep -Seconds 1

# Step 9: Import data from JSON
Write-Banner "STEP 9: Import Data from JSON"
Write-Host "Command: import data\test_export.json" -ForegroundColor Yellow
$result = Invoke-CliCommand "import data\test_export.json"
Write-Host ""
Start-Sleep -Seconds 1

# Step 10: Verify data restored
Write-Banner "STEP 10: Verify Data Restored"
Write-Host "Command: list" -ForegroundColor Yellow
$result = Invoke-CliCommand "list"
Write-Host ""

# Count articles after import
$articleCountAfterImport = Get-ArticleCount $result

if ($articleCountAfterImport -eq $articleCount) {
    Write-Host "[SUCCESS] Data restored successfully! Article count matches: $articleCountAfterImport" -ForegroundColor Green
} else {
    Write-Host "[WARNING] Article count mismatch. Before: $articleCount, After: $articleCountAfterImport" -ForegroundColor Yellow
}
Write-Host ""

# Step 11: Test duplicate import (should not duplicate)
Write-Banner "STEP 11: Test Duplicate Import (No Duplication)"
Write-Host "Command: import data\test_export.json (second time)" -ForegroundColor Yellow
$result = Invoke-CliCommand "import data\test_export.json"
Write-Host ""
Start-Sleep -Seconds 1

# Step 12: Final article count
Write-Banner "STEP 12: Final Article Count"
Write-Host "Command: list" -ForegroundColor Yellow
$result = Invoke-CliCommand "list"
Write-Host ""

# Final count
$finalCount = Get-ArticleCount $result

Write-Banner "TEST SUMMARY"
Write-Host "Articles after first import: $articleCountAfterImport" -ForegroundColor White
Write-Host "Articles after second import: $finalCount" -ForegroundColor White
Write-Host ""

if ($finalCount -eq $articleCountAfterImport) {
    Write-Host "[SUCCESS] Duplicate import correctly skipped! No duplication occurred." -ForegroundColor Green
} else {
    Write-Host "[ERROR] Duplicate import created duplicates! Count increased from $articleCountAfterImport to $finalCount" -ForegroundColor Red
}

Write-Host ""
Write-Banner "ALL TESTS COMPLETED"
||||
@ -0,0 +1,2 @@ |
|||||
|
export data/test_standard_export.json |
||||
|
exit |
||||
Loading…
Reference in new issue