diff --git a/project2/crawl2 b/project2/crawl2 deleted file mode 160000 index 9acfa1a..0000000 --- a/project2/crawl2 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 9acfa1a73f1cda0ad3b9f55c0e3041b672a7151d diff --git a/project2/crawl2/.gitignore b/project2/crawl2/.gitignore new file mode 100644 index 0000000..10a5d23 --- /dev/null +++ b/project2/crawl2/.gitignore @@ -0,0 +1,3 @@ +target/ +*.log +*.tmp diff --git a/project2/crawl2/README.md b/project2/crawl2/README.md new file mode 100644 index 0000000..92a128a --- /dev/null +++ b/project2/crawl2/README.md @@ -0,0 +1,55 @@ +# University News Crawler + +Java homework project for crawling: + +- `https://news.hnu.edu.cn/` +- `https://news.csu.edu.cn/` +- `https://news.hunnu.edu.cn/` + +The code demonstrates the required architecture: + +- CLI interactive command line +- MVC: `model`, `view`, `controller` +- Command pattern: `command` package +- Strategy pattern: `strategy` package, one strategy per target website +- Custom exception hierarchy: `exception` package +- File persistence: JSON or CSV output + +## Run + +```powershell +mvn test +mvn exec:java -Dexec.args="crawl --site all --limit 5 --format json --out data/news.json" +``` + +Interactive CLI: + +```powershell +mvn exec:java +``` + +Useful commands: + +```text +help +sites +crawl --site all --limit 10 --format json --out data/news.json +crawl --site hnu --limit 5 --format csv --out data/hnu.csv +exit +``` + +## Output Fields + +Each crawled news item includes: + +- school +- site key +- title +- url +- publish time +- source +- author +- summary +- content preview +- crawled time + diff --git a/project2/crawl2/data/sample-news.json b/project2/crawl2/data/sample-news.json new file mode 100644 index 0000000..bea9e03 --- /dev/null +++ b/project2/crawl2/data/sample-news.json @@ -0,0 +1,67 @@ +[ { + "school" : "湖南大学", + "siteKey" : "hnu", + "title" : "学校举行校庆工作座谈会", + "url" : "https://news.hnu.edu.cn/info/1003/39617.htm", + "publishTime" : "2026-05-29", + "source" : 
"融媒体中心 校庆办 责任编辑:刘嘉欢 最新更新 2026-05-29 学校举行校庆工作座谈会 2026-05-28 我校汤素娥教授在中宣部召开的党的创新理论传播工程推进会上发言 2026-05-27 “大文学观视域下的地方性写作”学术研讨会在我校举行 2026-05-27 学习弘扬袁隆平科技创新精神暨杂交水稻高质量发展座谈会召开 2026-05-27 电气院:首届“电启未来”科技文化节开幕式暨表彰大会召开 2026-05-27 经贸院:学生获全国高校首届数字经济实践成果大赛总决赛一等奖 2026-05-26 2026年校庆教职工羽毛球团体赛举行 2026-05-25 “寻迹百年 定向未来” 湖南大学第十届定向运动大赛举行 版权所有:湖南大学党委宣传部(新闻办公室) | 投稿热线:88822804 88823984 | 邮箱:news@hnu.edu.cn TOP", + "author" : "", + "summary" : "5月27日上午,学校在超算中心报告厅举行校庆工作座谈会。校党委书记邓卫出席座谈会并讲话,校长段献忠主持座谈会,校党委副书记唐珍名介绍校庆筹备工作情况。张大方、刘克利、赵跃宇、蒋昌忠、李红、李树丞、章兢、唐亚阳、王文沐、杨春如、张强、陈收、曹升元、陈伟、杨胜刚、谢赤、彭求实、王红悦、龚明金、周梦君等在长沙的老领导参加会议,结合自身工作和亲身经历,围绕校庆安排、校史文化、学校高质量发展等方面提出意见建...", + "contentPreview" : "5月27日上午,学校在超算中心报告厅举行校庆工作座谈会。校党委书记邓卫出席座谈会并讲话,校长段献忠主持座谈会,校党委副书记唐珍名介绍校庆筹备工作情况。 张大方、刘克利、赵跃宇、蒋昌忠、李红、李树丞、章兢、唐亚阳、王文沐、杨春如、张强、陈收、曹升元、陈伟、杨胜刚、谢赤、彭求实、王红悦、龚明金、周梦君等在长沙的老领导参加会议,结合自身工作和亲身经历,围绕校庆安排、校史文化、学校高质量发展等方面提出意见建议。 邓卫代表学校向出席会议的各位老领导致以诚挚问候和衷心感谢。他表示,今年是岳麓书院创建1050周年暨湖南大学定名100周年,各位老领导经验丰富、人脉广泛、德高望重,希望老领导们在联系校友、方正校史、指导校务等方面发挥积极作用,一如既往...", + "crawledAt" : "2026-05-31T15:10:52.7493755" +}, { + "school" : "湖南大学", + "siteKey" : "hnu", + "title" : "我校汤素娥教授在中宣部召开的党的创新理论传播工程推进会上发言", + "url" : "https://news.hnu.edu.cn/info/1003/39613.htm", + "publishTime" : "2026-05-28", + "source" : "马克思主义学院 责任编辑:文亦佳 最新更新 2026-05-29 学校举行校庆工作座谈会 2026-05-28 我校汤素娥教授在中宣部召开的党的创新理论传播工程推进会上发言 2026-05-27 “大文学观视域下的地方性写作”学术研讨会在我校举行 2026-05-27 学习弘扬袁隆平科技创新精神暨杂交水稻高质量发展座谈会召开 2026-05-27 电气院:首届“电启未来”科技文化节开幕式暨表彰大会召开 2026-05-27 经贸院:学生获全国高校首届数字经济实践成果大赛总决赛一等奖 2026-05-26 2026年校庆教职工羽毛球团体赛举行 2026-05-25 “寻迹百年 定向未来” 湖南大学第十届定向运动大赛举行 版权所有:湖南大学党委宣传部(新闻办公室) | 投稿热线:88822804 88823984 | 邮箱:news@hnu.edu.cn TOP", + "author" : "", + "summary" : "5月26日,中宣部在北京召开党的创新理论传播工程推进会。我校汤素娥教授作为全国高校思政课教师代表,应邀出席会议,并以《努力讲好思政课,推动党的创新理论进学生头脑》为题发言。中央有关部门、各地区党委宣传部、重点理论工作平台负责同志和专家学者、媒体工作者、基层宣讲骨干代表等参会。会议强调,实施党的创新理论传播工程是党中央作出的重大决策部署,是推动党的创新理论大众化普及、精准化传播的战略安排,要把学习宣...", + "contentPreview" : "5月26日,中宣部在北京召开党的创新理论传播工程推进会。我校汤素娥教授作为全国高校思政课教师代表,应邀出席会议,并以《努力讲好思政课,推动党的创新理论进学生头脑》为题发言。 
中央有关部门、各地区党委宣传部、重点理论工作平台负责同志和专家学者、媒体工作者、基层宣讲骨干代表等参会。会议强调,实施党的创新理论传播工程是党中央作出的重大决策部署,是推动党的创新理论大众化普及、精准化传播的战略安排,要把学习宣传贯彻习近平新时代中国特色社会主义思想作为首要政治任务,顺应数字化、网络化、智能化发展趋势,持续推动党的创新理论“飞入寻常百姓家”。 汤素娥教授系我校马克思主义学院青年教师,曾主持多项国家社科基金课题,在《马克思主义研究》等期刊发表高水平...", + "crawledAt" : "2026-05-31T15:10:53.3025883" +}, { + "school" : "中南大学", + "siteKey" : "csu", + "title" : "2026年全国五一劳动奖揭晓 中南大学多个集体、师生上榜", + "url" : "https://news.csu.edu.cn/info/1002/164024.htm", + "publishTime" : "2026-05-01", + "source" : "", + "author" : "—— 本网讯 4月28日,中华全国总工会召开庆祝“五一”国际劳动节暨全国五一劳动奖表彰大会,发布关于表彰2026年全国五一劳动奖的决定。中南大学湘雅三医院党委副书记、院长、主任医师、教授江泓被授予“全国五一劳动奖章”,中南大学湘雅医院临床药理研究所和湘雅二医院肾内科2个集体荣获“全国工人先锋号”。此外,中南大学粉末冶金研究院2022级博士生、湖南湘投金天钛金属股份有限公司副总工程师刘正乔被授予“全国五一劳动奖章”。 江泓,深耕神经病学与罕见病领域三十余年,带领团队建立了国内最大的遗传性共济失调临床与遗传数据库,鉴定出10余种致病基因及修饰基因,开发了致病性预测等生物信息分析工具,发表SCI论文100余篇,获发明专利9项,获湖南省自然科学一等奖1项;牵头制定的首部《遗传性共济失调诊断与治疗专家共识》,推动我国该领域跻身国际先进行列;入选国家“万人计划”科技创新领军人才、科技部中青年科技创新领军人才,担任国家重点研发计划项目首席科学家;主讲国家一流本科课程《神经病学》,主编参编教材专著10余部,获湖南省教学成果特等奖,培养博士、硕士研究生50余人。 中南大学湘雅医院临床药理研究所是全国首家集个体化医学教学、科研与社会服务于一体的单位,由中国工程院院士、我国遗传药理学和药物基因组学奠基人周宏灏教授创建。研究所聚焦遗传药理学,率先提出并施行以基因为导向的个体化药物治疗,在药物反应种族差异等领域实现国际原创突破,累计获国家科技进步奖二等奖等国家、省部级奖20余项,成果发表于《新英格兰医学杂志》《柳叶刀》等国际顶刊。团队研发出全球首个高血压个体化用药基因检测试剂盒,实现成果转化2.2亿元,6个产品获批国家三类医疗器械证,建立了“理论研究—产品研制—教育培训—推广应用”全链条转化体系;研究成果服务于84.2万患者个体化用药,完成检测咨询100万人次;累计培养800余名高层次人才,为我国造就了一支由院士领衔,国家高层次领军人才、国家四青人才等组成的专业队伍。 中南大学湘雅二医院肾内科始建于1983年,是集临床诊疗、科学研究、人才培养、应急救援于一体的国家临床重点专科、国家肾脏病临床医学研究中心核心单位。科室长期深耕肾脏病领域,主编我国首部《腹膜透析》专著,创建了IgA肾病湘雅病理分型,率先提出正常白蛋白尿糖尿病肾病诊断标准和“肾三联”治疗方案;AKI(急性肾损伤)相关研究成果被国际指南引用,获国家科技进步奖2项;研发的“肾复舒”惠及中国及东南亚。科室现有国家级人才2人,近五年培养博士、硕士研究生136人,主编教材专著15部,作为核心成果参与并获国家级教学成果一等奖2项。科室年门诊逾14万人次,年出院9000余人次,危重症患者救治成功率97%以上;5名骨干作为国家首批援鄂医疗队驰援武汉金银潭医院,获全国新冠疫情防控先进个人等荣誉;科室成员担任援塞拉利昂医疗队队长,获湖南省援外医疗工作三等功。 刘正乔深耕钛金属加工一线,始终以国家战略需求为导向,聚焦核电、军工等关键领域“卡脖子”材料难题,带领团队全力攻坚高端钛带卷国产化替代,打破国外长期技术垄断,推动高端钛材国产化,彰显新时代产业工人的担当与风采。她牵头和参与国家、行业标准制修订10项,ISO标准2项,授权发明专利31项,实用新型专利30项,发表论文9篇,是“芙蓉计划省企业科技创新创业团队”负责人、长沙市卓越工程师,担任湖南省高性能钛板带工程技术研究中心副主任,曾获湖南省巾帼建功标兵、湖南省芙蓉百岗明星、全国有色金属标准化先进个人、长沙市五一劳动奖章、长沙五一巾帼奖章等荣誉称号。 
据悉,“全国五一劳动奖章”“全国工人先锋号”是中华全国总工会授予在中国特色社会主义建设中作出突出贡献的劳动者和集体的光荣称号,是中国工人阶级最高奖项之一。今年,全国共有1462名职工获全国五一劳动奖章,1183个集体被授予全国工人先锋号。 (一审:韩艳 二审:唐潇珺 三审:王建湘) 分享: 图说中南 【特别报道... 【聚焦党代... 【聚焦党代... 中南大学原... 新闻排行 友情链接 新华网 | 人民网 | 光明网 | 中新网 | 中青在线 | 中央电视台 | 教育部网站 | 湖南在线 | 中国大学生在线 | 红网 | 校媒网 | 凤凰网 中国记协网 | 清华大学新闻网 | 北大新闻网 | 浙大新闻网 | 复旦新闻网 | 华中大新闻网 | 更多》 QQ:1594252309 EMAIL:xwwz@mail.csu.edu.cn 地址:湖南省长沙市岳麓区 Copyright © 2014-2019 中南大学党委宣传部(新闻中心)版权所有 湘ICP备05005659号-1 站长统计 管理员登陆", + "summary" : "本网讯 4月28日,中华全国总工会召开庆祝“五一”国际劳动节暨全国五一劳动奖表彰大会,发布关于表彰2026年全国五一劳动奖的决定。中南大学湘雅三医院党委副书记、院长、主任医师、教授江泓被授予“全国五一劳动奖章”,中南大学湘雅医院临床药理研究所和湘雅二医院肾内科2个集体荣获“全国工人先锋号”。此外,中南大学粉末冶金研究院2022级博士生、湖南湘投金天钛金属股份有限公司副总工程师刘正乔被授予“全国五一劳动奖章”...", + "contentPreview" : "本网讯 4月28日,中华全国总工会召开庆祝“五一”国际劳动节暨全国五一劳动奖表彰大会,发布关于表彰2026年全国五一劳动奖的决定。中南大学湘雅三医院党委副书记、院长、主任医师、教授江泓被授予“全国五一劳动奖章”,中南大学湘雅医院临床药理研究所和湘雅二医院肾内科2个集体荣获“全国工人先锋号”。此外,中南大学粉末冶金研究院2022级博士生、湖南湘投金天钛金属股份有限公司副总工程师刘正乔被授予“全国五一劳动奖章”。 江泓,深耕神经病学与罕见病领域三十余年,带领团队建立了国内最大的遗传性共济失调临床与遗传数据库,鉴定出10余种致病基因及修饰基因,开发了致病性预测等生物信息分析工具,发表SCI论文100余篇,获发明专利9项,获湖南省自然科学一...", + "crawledAt" : "2026-05-31T15:10:53.7885803" +}, { + "school" : "中南大学", + "siteKey" : "csu", + "title" : "【学习】习近平在加强基础研究座谈会上强调 以更大力度更实举措加强基础研究 进一步打牢科技强国建设根基", + "url" : "https://news.csu.edu.cn/info/1002/164025.htm", + "publishTime" : "2026-04-30", + "source" : "", + "author" : "—— 习近平在加强基础研究座谈会上强调 以更大力度更实举措加强基础研究 进一步打牢科技强国建设根基 蔡奇出席 丁薛祥主持 4月30日上午,中共中央总书记、国家主席、中央军委主席习近平在上海出席加强基础研究座谈会并发表重要讲话。新华社记者 谢环驰 摄 新华社上海4月30日电 中共中央总书记、国家主席、中央军委主席习近平30日上午在上海出席加强基础研究座谈会并发表重要讲话。他强调,基础研究是整个科学体系的源头,是所有技术问题的总机关。要以更大力度、更实举措加强基础研究,提升我国原始创新能力,进一步打牢科技强国建设根基。 中共中央政治局常委、中央办公厅主任蔡奇出席座谈会,中共中央政治局常委、国务院副总理丁薛祥主持座谈会。 座谈会上,科技部部长阴和俊、教育部部长怀进鹏、中国科学院院长侯建国、上海市委书记陈吉宁、北京大学数学科学学院院长刘若川、中国科学院深圳先进技术研究院院长刘陈立、浦江实验室教授乔宇、西部超导材料科技股份有限公司首席科学家张平祥先后发言,就加强基础研究介绍工作情况、提出意见建议。 4月30日上午,中共中央总书记、国家主席、中央军委主席习近平在上海出席加强基础研究座谈会并发表重要讲话。新华社记者 翟健岚 摄 
在听取大家发言后,习近平发表重要讲话。他指出,党的十八大以来,党中央高度重视基础研究,通过优化科研布局、加大投入保障、创新体制机制等,推动我国基础研究水平显著提升。当前,新一轮科技革命和产业变革加速突破,全球科技竞争更加聚焦基础前沿领域,原创性颠覆性创新的重要性日益凸显。我们要抓住机遇、应对挑战,切实把基础研究工作摆上重要日程,持续抓下去,不断抓出新成效。 4月30日上午,中共中央总书记、国家主席、中央军委主席习近平在上海出席加强基础研究座谈会并发表重要讲话。新华社记者 燕雁 摄 习近平强调,要加强统筹谋划和顶层设计,优化基础研究系统布局。坚持“四个面向”战略导向,进一步明确基础研究的主攻方向和重点领域。强化国家科研机构、高水平研究型大学等引领作用,鼓励和规范发展新型研发机构,推动企业主导的产学研用深度融合,打通基础研究、应用开发、成果转化的创新链条。加强基础学科建设,促进应用学科与基础学科协调发展。 习近平指出,要一体推进教育科技人才发展,全方位做好培养、引进、使用工作,壮大基础研究人才队伍。遵循人才成长规律,提高教育质量,源源不断培养基础研究后备力量。优化科教协同育人机制,注重在科研一线发现和培养人才。坚持任务牵引、以老带新,大力扶持青年人才。弘扬科学家精神,加强科普宣传,激发青少年的想象力和探求欲,让投身基础研究成为更多青少年的人生追求。 习近平强调,要加强对基础研究的支持保障。逐步提高基础研究经费占比,形成多元化投入格局。体系化布局建设重大科技基础设施,建设智能化科研平台系统。健全符合基础研究特点的分类评价体系,改善基础研究人员的工作和生活条件,营造开放包容、宽容失败的创新环境。加强科研诚信建设。 习近平指出,要主动融入全球创新网络,深化基础研究国际交流合作,联合开展气候变化、能源环境、生命健康等重大科学问题攻关,积极参与全球科技治理。 丁薛祥主持会议时表示,习近平总书记重要讲话充分肯定我国基础研究取得的成就,全面分析面临的新形势新挑战,对加强基础研究作出战略部署、提出明确要求。讲话高屋建瓴、内涵丰富,具有很强的政治性、思想性、指导性,为加强基础研究指明了前进方向、提供了根本遵循。我们要深学细悟总书记重要讲话精神,准确把握党中央战略意图,增强紧迫感、责任感、使命感,以更加坚定的信心和决心、更加务实的举措和行动,全面加强基础研究,着力提升原始创新能力,为实现高水平科技自立自强、建设科技强国努力奋斗。 尹力、石泰峰、刘国中、张国清、黄坤明出席座谈会。 中央和国家机关有关部门、军队有关单位、部分省市主要负责同志,有关高校、科研机构、国家实验室、企业负责人和科研人员代表等参加座谈会。 (一审:韩艳 二审:唐潇珺 三审:王建湘) 分享: 图说中南 【特别报道... 【聚焦党代... 【聚焦党代... 中南大学原... 
新闻排行 友情链接 新华网 | 人民网 | 光明网 | 中新网 | 中青在线 | 中央电视台 | 教育部网站 | 湖南在线 | 中国大学生在线 | 红网 | 校媒网 | 凤凰网 中国记协网 | 清华大学新闻网 | 北大新闻网 | 浙大新闻网 | 复旦新闻网 | 华中大新闻网 | 更多》 QQ:1594252309 EMAIL:xwwz@mail.csu.edu.cn 地址:湖南省长沙市岳麓区 Copyright © 2014-2019 中南大学党委宣传部(新闻中心)版权所有 湘ICP备05005659号-1 站长统计 管理员登陆", + "summary" : "习近平在加强基础研究座谈会上强调以更大力度更实举措加强基础研究进一步打牢科技强国建设根基蔡奇出席 丁薛祥主持 4月30日上午,中共中央总书记、国家主席、中央军委主席习近平在上海出席加强基础研究座谈会并发表重要讲话。新华社记者 谢环驰 摄新华社上海4月30日电 中共中央总书记、国家主席、中央军委主席习近平30日上午在上海出席加强基础研究座谈会并发表重要讲话。他强调,基础研究是整个科学体系的源头,是所有技...", + "contentPreview" : "习近平在加强基础研究座谈会上强调 以更大力度更实举措加强基础研究 进一步打牢科技强国建设根基 蔡奇出席 丁薛祥主持 4月30日上午,中共中央总书记、国家主席、中央军委主席习近平在上海出席加强基础研究座谈会并发表重要讲话。新华社记者 谢环驰 摄 新华社上海4月30日电 中共中央总书记、国家主席、中央军委主席习近平30日上午在上海出席加强基础研究座谈会并发表重要讲话。他强调,基础研究是整个科学体系的源头,是所有技术问题的总机关。要以更大力度、更实举措加强基础研究,提升我国原始创新能力,进一步打牢科技强国建设根基。 中共中央政治局常委、中央办公厅主任蔡奇出席座谈会,中共中央政治局常委、国务院副总理丁薛祥主持座谈会。 座谈会上,科技部部长阴...", + "crawledAt" : "2026-05-31T15:10:54.0789471" +}, { + "school" : "湖南师范大学", + "siteKey" : "hunnu", + "title" : "校党委理论学习中心组专题学习《中国共产党思想政治工作条例》", + "url" : "https://news.hunnu.edu.cn/info/1005/95374.htm", + "publishTime" : "2026-05-30", + "source" : "湖南师范大学新闻网", + "author" : "吴乐 马铁泉", + "summary" : "(记者 吴乐 马铁泉)5月28日下午,校党委理论学习中心组(扩大)在图书馆报告厅开展思想政治工作专题学习,深入学习领会习近平总书记关于思想政治工作的重要论述和《中国共产党思想政治工作条例》(以下简称《条例》)。校党委副书记、校长刘仲华主持会议。全体在家校领导、党委委员,各二级党组织书记,职能部门主要负责人等参加学习。马克思主义学部党工委书记、主任唐未兵教授以《新时代高校思想政治工作的重要制度保障——认真学习贯彻〈中国共产党思想政治工作条例》为题作专题辅导报告,他从《条例》的主要...", + "contentPreview" : "(记者 吴乐 马铁泉)5月28日下午,校党委理论学习中心组(扩大)在图书馆报告厅开展思想政治工作专题学习,深入学习领会习近平总书记关于思想政治工作的重要论述和《中国共产党思想政治工作条例》(以下简称《条例》)。校党委副书记、校长刘仲华主持会议。全体在家校领导、党委委员,各二级党组织书记,职能部门主要负责人等参加学习。 马克思主义学部党工委书记、主任唐未兵教授以《新时代高校思想政治工作的重要制度保障——认真学习贯彻〈中国共产党思想政治工作条例》为题作专题辅导报告,他从《条例》的主要内容、贯彻落实《条例》的基本要求和路径思考三个方面对《条例》进行了深入解读。 校党委副书记段巍介绍了学校《关于全面构建思想政治工作体系的实施方案》,并部署学...", + "crawledAt" : "2026-05-31T15:10:54.8472187" +}, { + "school" : "湖南师范大学", + "siteKey" : "hunnu", + "title" : "Micro LED共封装光学(CPO)联合研究实验室在我校揭牌", + "url" : "https://news.hunnu.edu.cn/info/1005/95364.htm", + "publishTime" : "2026-05-30", + "source" : "湖南师范大学新闻网", + 
"author" : "黄依依 张艺洋", + "summary" : "(记者 黄依依 张艺洋)5月28日下午,我校与江西兆驰半导体有限公司共建Micro LED共封装光学(CPO)联合研究实验室协议签约暨揭牌仪式在未来技术研究院举行。我校校长、中国工程院院士刘仲华,副校长杨震、潘安练,兆驰集团董事、副总裁金从龙,长沙师元光电科技有限公司董事长翟继鑫出席仪式。刘仲华代表学校向企业嘉宾的到来表示热烈欢迎,对三方携手共建联合实验室表示祝贺。他指出,我校作为综合性师范大学,长期深耕理工科交叉融合发展领域,持续探索特色工科发展新路径。未来技术研究院是学校聚...", + "contentPreview" : "(记者 黄依依 张艺洋)5月28日下午,我校与江西兆驰半导体有限公司共建Micro LED共封装光学(CPO)联合研究实验室协议签约暨揭牌仪式在未来技术研究院举行。我校校长、中国工程院院士刘仲华,副校长杨震、潘安练,兆驰集团董事、副总裁金从龙,长沙师元光电科技有限公司董事长翟继鑫出席仪式。 刘仲华代表学校向企业嘉宾的到来表示热烈欢迎,对三方携手共建联合实验室表示祝贺。他指出,我校作为综合性师范大学,长期深耕理工科交叉融合发展领域,持续探索特色工科发展新路径。未来技术研究院是学校聚焦未来产业布局、打通学科交叉壁垒、推动科创成果落地的核心平台。此次三方联合共建创新实验室,是深化产学研用协同创新的生动实践。学校将全力充分发挥人才队伍、科研...", + "crawledAt" : "2026-05-31T15:10:55.1960342" +} ] \ No newline at end of file diff --git a/project2/crawl2/pom.xml b/project2/crawl2/pom.xml new file mode 100644 index 0000000..3b3eba7 --- /dev/null +++ b/project2/crawl2/pom.xml @@ -0,0 +1,58 @@ + + 4.0.0 + + edu.homework + university-news-crawler + 1.0.0 + University News Crawler + + + 17 + UTF-8 + 5.10.2 + + + + + org.jsoup + jsoup + 1.17.2 + + + com.fasterxml.jackson.core + jackson-databind + 2.17.2 + + + com.fasterxml.jackson.datatype + jackson-datatype-jsr310 + 2.17.2 + + + org.junit.jupiter + junit-jupiter + ${junit.version} + test + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 3.2.5 + + + org.codehaus.mojo + exec-maven-plugin + 3.3.0 + + edu.homework.crawler.Main + + + + + diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/Main.java b/project2/crawl2/src/main/java/edu/homework/crawler/Main.java new file mode 100644 index 0000000..e778fb8 --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/Main.java @@ -0,0 +1,12 @@ +package edu.homework.crawler; + +import edu.homework.crawler.cli.CliApplication; + +public final class Main { + private Main() { + } + + public static void main(String[] args) { + CliApplication.defaultApplication().run(args); + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/cli/CliApplication.java 
b/project2/crawl2/src/main/java/edu/homework/crawler/cli/CliApplication.java new file mode 100644 index 0000000..5bfbc46 --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/cli/CliApplication.java @@ -0,0 +1,99 @@ +package edu.homework.crawler.cli; + +import edu.homework.crawler.command.CommandContext; +import edu.homework.crawler.command.CommandRegistry; +import edu.homework.crawler.command.CrawlCommand; +import edu.homework.crawler.command.ExitCommand; +import edu.homework.crawler.command.HelpCommand; +import edu.homework.crawler.command.SitesCommand; +import edu.homework.crawler.controller.CrawlerController; +import edu.homework.crawler.exception.CrawlException; +import edu.homework.crawler.repository.FileNewsRepository; +import edu.homework.crawler.service.NewsCrawlerService; +import edu.homework.crawler.strategy.SiteRegistry; +import edu.homework.crawler.view.ConsoleView; + +import java.nio.charset.StandardCharsets; +import java.util.Scanner; + +/** Wires the default crawler components and runs commands in one-shot or interactive mode. */ +public class CliApplication { + private final CommandContext context; + private final CommandRegistry commandRegistry; + + public CliApplication(CommandContext context, CommandRegistry commandRegistry) { + this.context = context; + this.commandRegistry = commandRegistry; + } + + public static CliApplication defaultApplication() { + ConsoleView view = new ConsoleView(); + SiteRegistry siteRegistry = SiteRegistry.defaults(); + NewsCrawlerService service = new NewsCrawlerService(siteRegistry); + FileNewsRepository repository = new FileNewsRepository(); + CrawlerController controller = new CrawlerController(service, repository); + + CommandRegistry registry = new CommandRegistry(); + CommandContext context = new CommandContext(controller, view, registry, siteRegistry); + registry.register(new HelpCommand()); + registry.register(new SitesCommand()); + registry.register(new CrawlCommand()); + registry.register(new ExitCommand()); + return new CliApplication(context, registry); + } + + /** + * Executes {@code args} as a single command when present; otherwise runs the interactive prompt loop. + * + * @param args command tokens already split by the launcher; may be empty + */ + public void
run(String[] args) { + if (args.length > 0) { + executeLine(joinArguments(args)); + return; + } + + context.view().printWelcome(); + try (Scanner scanner = new Scanner(System.in, StandardCharsets.UTF_8)) { + while (context.isRunning()) { + context.view().printPrompt(); + if (!scanner.hasNextLine()) { + break; + } + executeLine(scanner.nextLine()); + } + } + } + + /** + * Rebuilds one command line from launcher-split arguments. Arguments containing whitespace are + * wrapped in double quotes so that re-tokenizing the line keeps each of them as a single token. + */ + private static String joinArguments(String[] args) { + StringBuilder line = new StringBuilder(); + for (String arg : args) { + if (line.length() > 0) { + line.append(' '); + } + boolean hasWhitespace = arg.chars().anyMatch(Character::isWhitespace); + line.append(hasWhitespace ? "\"" + arg + "\"" : arg); + } + return line.toString(); + } + + /** Runs one command line and reports failures through the view instead of propagating them. */ + private void executeLine(String line) { + try { + commandRegistry.execute(context, line); + } catch (CrawlException e) { + context.view().printError(e.getMessage()); + if (e.getCause() != null) { + context.view().printError("Cause: " + e.getCause().getMessage()); + } + } catch (RuntimeException e) { + /* Runtime exceptions such as NullPointerException often carry no message; show the type instead of "null". */ + String detail = e.getMessage() != null ? e.getMessage() : e.getClass().getName(); + context.view().printError("Unexpected error: " + detail); + } + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/command/Command.java b/project2/crawl2/src/main/java/edu/homework/crawler/command/Command.java new file mode 100644 index 0000000..09654d2 --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/command/Command.java @@ -0,0 +1,11 @@ +package edu.homework.crawler.command; + +import edu.homework.crawler.exception.CrawlException; + +public interface Command { + String name(); + + String description(); + + void execute(CommandContext context, String[] args) throws CrawlException; +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/command/CommandContext.java b/project2/crawl2/src/main/java/edu/homework/crawler/command/CommandContext.java new file mode 100644 index 0000000..0979c83 --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/command/CommandContext.java @@ -0,0 +1,44 @@ +package edu.homework.crawler.command; + +import edu.homework.crawler.controller.CrawlerController; +import edu.homework.crawler.strategy.SiteRegistry; +import edu.homework.crawler.view.ConsoleView; + +public class CommandContext { + private final CrawlerController controller; + private final ConsoleView view; + private final CommandRegistry 
commandRegistry; + private final SiteRegistry siteRegistry; + private boolean running = true; + + public CommandContext(CrawlerController controller, ConsoleView view, CommandRegistry commandRegistry, SiteRegistry siteRegistry) { + this.controller = controller; + this.view = view; + this.commandRegistry = commandRegistry; + this.siteRegistry = siteRegistry; + } + + public CrawlerController controller() { + return controller; + } + + public ConsoleView view() { + return view; + } + + public CommandRegistry commandRegistry() { + return commandRegistry; + } + + public SiteRegistry siteRegistry() { + return siteRegistry; + } + + public boolean isRunning() { + return running; + } + + public void stop() { + this.running = false; + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/command/CommandRegistry.java b/project2/crawl2/src/main/java/edu/homework/crawler/command/CommandRegistry.java new file mode 100644 index 0000000..bad04e6 --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/command/CommandRegistry.java @@ -0,0 +1,102 @@ +package edu.homework.crawler.command; + +import edu.homework.crawler.exception.CommandException; +import edu.homework.crawler.exception.CrawlException; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +/** Holds the CLI commands by name and parses and dispatches command lines. */ +public class CommandRegistry { + private final Map<String, Command> commands = new LinkedHashMap<>(); + + /** Registers {@code command} under its name, replacing any command previously registered with that name. */ + public void register(Command command) { + commands.put(command.name(), command); + } + + /** Returns the registered commands in insertion order. */ + public Collection<Command> commands() { + return commands.values(); + } + + /** + * Tokenizes {@code line} and runs the command named by its first token; a line without tokens is ignored. + * + * @throws CrawlException if the line is malformed, the command is unknown, or the command itself fails + */ + public void execute(CommandContext context, String line) throws CrawlException { + List<String> tokens = tokenize(line); + if (tokens.isEmpty()) { + return; + } + Command command = commands.get(tokens.get(0)); + if (command == null) { + throw new CommandException("Unknown command: " + tokens.get(0) + ". 
Type help to see commands."); + } + String[] args = tokens.subList(1, tokens.size()).toArray(String[]::new); + command.execute(context, args); + } + + /** + * Splits a command line on whitespace. Double quotes group text that contains whitespace and are + * not part of the resulting token; empty tokens are dropped. + * + * @throws CommandException if a double quote is left unclosed + */ + public List<String> tokenize(String line) throws CommandException { + List<String> tokens = new ArrayList<>(); + StringBuilder current = new StringBuilder(); + boolean inQuotes = false; + for (int i = 0; i < line.length(); i++) { + char ch = line.charAt(i); + if (ch == '"') { + inQuotes = !inQuotes; + continue; + } + if (Character.isWhitespace(ch) && !inQuotes) { + addToken(tokens, current); + } else { + current.append(ch); + } + } + if (inQuotes) { + throw new CommandException("Missing closing quote in command line."); + } + addToken(tokens, current); + return tokens; + } + + /** Moves the pending text, if any, into {@code tokens} and clears the buffer. */ + private void addToken(List<String> tokens, StringBuilder current) { + if (current.length() > 0) { + tokens.add(current.toString()); + current.setLength(0); + } + } + + /** + * Parses {@code --key value} pairs into a map keyed without the leading dashes; a repeated key keeps its last value. + * + * @throws CommandException if an argument does not start with {@code --} or an option has no value + */ + public Map<String, String> parseOptions(String[] args) throws CommandException { + Map<String, String> options = new LinkedHashMap<>(); + List<String> list = Arrays.asList(args); + for (int i = 0; i < list.size(); i++) { + String key = list.get(i); + if (!key.startsWith("--")) { + throw new CommandException("Invalid argument: " + key + ". 
Options must start with --."); + } + if (i + 1 >= list.size() || list.get(i + 1).startsWith("--")) { + throw new CommandException("Missing value for option " + key + "."); + } + options.put(key.substring(2), list.get(++i)); + } + return options; + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/command/CrawlCommand.java b/project2/crawl2/src/main/java/edu/homework/crawler/command/CrawlCommand.java new file mode 100644 index 0000000..63bb50a --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/command/CrawlCommand.java @@ -0,0 +1,55 @@ +package edu.homework.crawler.command; + +import edu.homework.crawler.exception.CommandException; +import edu.homework.crawler.exception.CrawlException; +import edu.homework.crawler.model.CrawlRequest; +import edu.homework.crawler.repository.OutputFormat; + +import java.nio.file.Path; +import java.time.LocalDateTime; +import java.time.format.DateTimeFormatter; +import java.util.Map; + +public class CrawlCommand implements Command { + private static final DateTimeFormatter FILE_TIME = DateTimeFormatter.ofPattern("yyyyMMdd-HHmmss"); + + @Override + public String name() { + return "crawl"; + } + + @Override + public String description() { + return "Crawl news and save to a JSON or CSV file."; + } + + @Override + public void execute(CommandContext context, String[] args) throws CrawlException { + Map<String, String> options = context.commandRegistry().parseOptions(args); + String site = options.getOrDefault("site", "all"); + int limit = parseLimit(options.getOrDefault("limit", "10")); + OutputFormat format = OutputFormat.from(options.getOrDefault("format", "json")); + Path outputPath = Path.of(options.getOrDefault("out", defaultOutput(format))); + + context.view().printInfo("Starting crawl: site=" + site + ", limit=" + limit + ", format=" + format.name().toLowerCase()); + CrawlRequest request = new CrawlRequest(site, limit, format, outputPath); + context.view().printSummary(context.controller().crawl(request)); + } 
+ + private int parseLimit(String value) throws CommandException { + try { + int limit = Integer.parseInt(value); + if (limit <= 0 || limit > 100) { + throw new CommandException("Limit must be between 1 and 100."); + } + return limit; + } catch (NumberFormatException e) { + throw new CommandException("Limit must be a number: " + value); + } + } + + private String defaultOutput(OutputFormat format) { + String suffix = format == OutputFormat.JSON ? ".json" : ".csv"; + return "data/news-" + LocalDateTime.now().format(FILE_TIME) + suffix; + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/command/ExitCommand.java b/project2/crawl2/src/main/java/edu/homework/crawler/command/ExitCommand.java new file mode 100644 index 0000000..559343a --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/command/ExitCommand.java @@ -0,0 +1,21 @@ +package edu.homework.crawler.command; + +import edu.homework.crawler.exception.CrawlException; + +public class ExitCommand implements Command { + @Override + public String name() { + return "exit"; + } + + @Override + public String description() { + return "Exit interactive CLI."; + } + + @Override + public void execute(CommandContext context, String[] args) throws CrawlException { + context.stop(); + context.view().printInfo("Bye."); + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/command/HelpCommand.java b/project2/crawl2/src/main/java/edu/homework/crawler/command/HelpCommand.java new file mode 100644 index 0000000..137b739 --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/command/HelpCommand.java @@ -0,0 +1,28 @@ +package edu.homework.crawler.command; + +import edu.homework.crawler.exception.CrawlException; + +public class HelpCommand implements Command { + @Override + public String name() { + return "help"; + } + + @Override + public String description() { + return "Show command usage."; + } + + @Override + public void execute(CommandContext context, String[] 
args) throws CrawlException { + StringBuilder builder = new StringBuilder(); + builder.append("Commands:\n"); + for (Command command : context.commandRegistry().commands()) { + builder.append(" ").append(command.name()).append(" - ").append(command.description()).append('\n'); + } + builder.append("\nExamples:\n"); + builder.append(" crawl --site all --limit 10 --format json --out data/news.json\n"); + builder.append(" crawl --site hnu --limit 5 --format csv --out data/hnu.csv\n"); + context.view().printHelp(builder.toString()); + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/command/SitesCommand.java b/project2/crawl2/src/main/java/edu/homework/crawler/command/SitesCommand.java new file mode 100644 index 0000000..04fb098 --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/command/SitesCommand.java @@ -0,0 +1,22 @@ +package edu.homework.crawler.command; + +import edu.homework.crawler.exception.CrawlException; + +public class SitesCommand implements Command { + @Override + public String name() { + return "sites"; + } + + @Override + public String description() { + return "List supported websites."; + } + + @Override + public void execute(CommandContext context, String[] args) throws CrawlException { + context.view().printInfo("Supported sites:"); + context.siteRegistry().all().forEach(strategy -> + context.view().printInfo(" - " + strategy.key() + ": " + strategy.schoolName() + " (" + strategy.baseUrl() + ")")); + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/controller/CrawlerController.java b/project2/crawl2/src/main/java/edu/homework/crawler/controller/CrawlerController.java new file mode 100644 index 0000000..791e5f5 --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/controller/CrawlerController.java @@ -0,0 +1,36 @@ +package edu.homework.crawler.controller; + +import edu.homework.crawler.exception.CrawlException; +import edu.homework.crawler.model.CrawlRequest; +import 
edu.homework.crawler.model.CrawlSummary; +import edu.homework.crawler.model.NewsItem; +import edu.homework.crawler.repository.FileNewsRepository; +import edu.homework.crawler.service.NewsCrawlerService; + +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +/** Coordinates one crawl run: fetches items through the service, persists them, and summarizes the result. */ +public class CrawlerController { + private final NewsCrawlerService crawlerService; + private final FileNewsRepository newsRepository; + + public CrawlerController(NewsCrawlerService crawlerService, FileNewsRepository newsRepository) { + this.crawlerService = crawlerService; + this.newsRepository = newsRepository; + } + + /** + * Crawls the requested site key with the requested per-site limit, saves the items, and counts them per site key. + * + * @throws CrawlException propagated from the crawler service or from saving the output file + */ + public CrawlSummary crawl(CrawlRequest request) throws CrawlException { + List<NewsItem> items = crawlerService.crawl(request.siteKey(), request.limitPerSite()); + newsRepository.save(items, request.outputFormat(), request.outputPath()); + Map<String, Integer> counts = items.stream() + .collect(Collectors.groupingBy(NewsItem::getSiteKey, Collectors.collectingAndThen(Collectors.counting(), Long::intValue))); + return new CrawlSummary(items, counts, request.outputPath()); + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/exception/CommandException.java b/project2/crawl2/src/main/java/edu/homework/crawler/exception/CommandException.java new file mode 100644 index 0000000..b53d572 --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/exception/CommandException.java @@ -0,0 +1,7 @@ +package edu.homework.crawler.exception; + +public class CommandException extends CrawlException { + public CommandException(String message) { + super(message); + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/exception/CrawlException.java b/project2/crawl2/src/main/java/edu/homework/crawler/exception/CrawlException.java new file mode 100644 index 0000000..a3f6e89 --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/exception/CrawlException.java @@ -0,0 +1,11 @@ +package edu.homework.crawler.exception; + +public class CrawlException 
extends Exception { + public CrawlException(String message) { + super(message); + } + + public CrawlException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/exception/NetworkException.java b/project2/crawl2/src/main/java/edu/homework/crawler/exception/NetworkException.java new file mode 100644 index 0000000..0693327 --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/exception/NetworkException.java @@ -0,0 +1,7 @@ +package edu.homework.crawler.exception; + +public class NetworkException extends CrawlException { + public NetworkException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/exception/ParseException.java b/project2/crawl2/src/main/java/edu/homework/crawler/exception/ParseException.java new file mode 100644 index 0000000..b797f9e --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/exception/ParseException.java @@ -0,0 +1,11 @@ +package edu.homework.crawler.exception; + +public class ParseException extends CrawlException { + public ParseException(String message) { + super(message); + } + + public ParseException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/exception/SiteNotFoundException.java b/project2/crawl2/src/main/java/edu/homework/crawler/exception/SiteNotFoundException.java new file mode 100644 index 0000000..9a15e4e --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/exception/SiteNotFoundException.java @@ -0,0 +1,7 @@ +package edu.homework.crawler.exception; + +public class SiteNotFoundException extends CrawlException { + public SiteNotFoundException(String message) { + super(message); + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/exception/StorageException.java 
b/project2/crawl2/src/main/java/edu/homework/crawler/exception/StorageException.java new file mode 100644 index 0000000..3131bc3 --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/exception/StorageException.java @@ -0,0 +1,7 @@ +package edu.homework.crawler.exception; + +public class StorageException extends CrawlException { + public StorageException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/model/CrawlRequest.java b/project2/crawl2/src/main/java/edu/homework/crawler/model/CrawlRequest.java new file mode 100644 index 0000000..5e1db70 --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/model/CrawlRequest.java @@ -0,0 +1,8 @@ +package edu.homework.crawler.model; + +import edu.homework.crawler.repository.OutputFormat; + +import java.nio.file.Path; + +public record CrawlRequest(String siteKey, int limitPerSite, OutputFormat outputFormat, Path outputPath) { +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/model/CrawlSummary.java b/project2/crawl2/src/main/java/edu/homework/crawler/model/CrawlSummary.java new file mode 100644 index 0000000..064b4d7 --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/model/CrawlSummary.java @@ -0,0 +1,19 @@ +package edu.homework.crawler.model; + +import java.nio.file.Path; +import java.util.List; +import java.util.Map; + +/** + * Result of one crawl run. + * + * @param items the crawled news items + * @param siteCounts number of items per site key + * @param outputPath the output path of the crawl request + */ +public record CrawlSummary(List<NewsItem> items, Map<String, Integer> siteCounts, Path outputPath) { + /** Returns the number of crawled items. */ + public int totalCount() { + return items.size(); + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/model/NewsCandidate.java b/project2/crawl2/src/main/java/edu/homework/crawler/model/NewsCandidate.java new file mode 100644 index 0000000..ff8e17f --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/model/NewsCandidate.java @@ -0,0 +1,4 @@ +package edu.homework.crawler.model; + +public record NewsCandidate(String title, String url, String 
publishTime) { +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/model/NewsItem.java b/project2/crawl2/src/main/java/edu/homework/crawler/model/NewsItem.java new file mode 100644 index 0000000..5617c38 --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/model/NewsItem.java @@ -0,0 +1,107 @@ +package edu.homework.crawler.model; + +import java.time.LocalDateTime; + +public class NewsItem { + private String school; + private String siteKey; + private String title; + private String url; + private String publishTime; + private String source; + private String author; + private String summary; + private String contentPreview; + private LocalDateTime crawledAt; + + public NewsItem() { + } + + public NewsItem(String school, String siteKey, String title, String url) { + this.school = school; + this.siteKey = siteKey; + this.title = title; + this.url = url; + this.crawledAt = LocalDateTime.now(); + } + + public String getSchool() { + return school; + } + + public void setSchool(String school) { + this.school = school; + } + + public String getSiteKey() { + return siteKey; + } + + public void setSiteKey(String siteKey) { + this.siteKey = siteKey; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getUrl() { + return url; + } + + public void setUrl(String url) { + this.url = url; + } + + public String getPublishTime() { + return publishTime; + } + + public void setPublishTime(String publishTime) { + this.publishTime = publishTime; + } + + public String getSource() { + return source; + } + + public void setSource(String source) { + this.source = source; + } + + public String getAuthor() { + return author; + } + + public void setAuthor(String author) { + this.author = author; + } + + public String getSummary() { + return summary; + } + + public void setSummary(String summary) { + this.summary = summary; + } + + public String getContentPreview() { + return 
contentPreview; + } + + public void setContentPreview(String contentPreview) { + this.contentPreview = contentPreview; + } + + public LocalDateTime getCrawledAt() { + return crawledAt; + } + + public void setCrawledAt(LocalDateTime crawledAt) { + this.crawledAt = crawledAt; + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/repository/FileNewsRepository.java b/project2/crawl2/src/main/java/edu/homework/crawler/repository/FileNewsRepository.java new file mode 100644 index 0000000..4b0645e --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/repository/FileNewsRepository.java @@ -0,0 +1,67 @@ +package edu.homework.crawler.repository; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializationFeature; +import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule; +import edu.homework.crawler.exception.StorageException; +import edu.homework.crawler.model.NewsItem; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +public class FileNewsRepository { + private final ObjectMapper objectMapper; + + public FileNewsRepository() { + this.objectMapper = new ObjectMapper() + .registerModule(new JavaTimeModule()) + .enable(SerializationFeature.INDENT_OUTPUT) + .disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS); + } + + public void save(List items, OutputFormat format, Path outputPath) throws StorageException { + try { + Path parent = outputPath.toAbsolutePath().getParent(); + if (parent != null) { + Files.createDirectories(parent); + } + if (format == OutputFormat.JSON) { + objectMapper.writeValue(outputPath.toFile(), items); + } else { + writeCsv(items, outputPath); + } + } catch (IOException e) { + throw new StorageException("Failed to save crawler data to " + outputPath, e); + } + } + + private void writeCsv(List items, Path outputPath) throws 
IOException { + try (BufferedWriter writer = Files.newBufferedWriter(outputPath, StandardCharsets.UTF_8)) { + writer.write("school,siteKey,title,url,publishTime,source,author,summary,contentPreview,crawledAt"); + writer.newLine(); + for (NewsItem item : items) { + writer.write(String.join(",", + csv(item.getSchool()), + csv(item.getSiteKey()), + csv(item.getTitle()), + csv(item.getUrl()), + csv(item.getPublishTime()), + csv(item.getSource()), + csv(item.getAuthor()), + csv(item.getSummary()), + csv(item.getContentPreview()), + csv(item.getCrawledAt() == null ? "" : item.getCrawledAt().toString()))); + writer.newLine(); + } + } + } + + private String csv(String value) { + String safeValue = value == null ? "" : value; + return "\"" + safeValue.replace("\"", "\"\"") + "\""; + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/repository/OutputFormat.java b/project2/crawl2/src/main/java/edu/homework/crawler/repository/OutputFormat.java new file mode 100644 index 0000000..7b3f454 --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/repository/OutputFormat.java @@ -0,0 +1,17 @@ +package edu.homework.crawler.repository; + +import edu.homework.crawler.exception.CommandException; + +public enum OutputFormat { + JSON, + CSV; + + public static OutputFormat from(String value) throws CommandException { + for (OutputFormat format : values()) { + if (format.name().equalsIgnoreCase(value)) { + return format; + } + } + throw new CommandException("Unsupported output format: " + value + ". 
Use json or csv."); + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/service/NewsCrawlerService.java b/project2/crawl2/src/main/java/edu/homework/crawler/service/NewsCrawlerService.java new file mode 100644 index 0000000..d58dccc --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/service/NewsCrawlerService.java @@ -0,0 +1,37 @@ +package edu.homework.crawler.service; + +import edu.homework.crawler.exception.CrawlException; +import edu.homework.crawler.exception.SiteNotFoundException; +import edu.homework.crawler.model.NewsItem; +import edu.homework.crawler.strategy.CrawlStrategy; +import edu.homework.crawler.strategy.SiteRegistry; +import edu.homework.crawler.util.HttpFetcher; + +import java.util.ArrayList; +import java.util.List; + +public class NewsCrawlerService { + private final SiteRegistry siteRegistry; + private final HttpFetcher httpFetcher; + + public NewsCrawlerService(SiteRegistry siteRegistry) { + this.siteRegistry = siteRegistry; + this.httpFetcher = new HttpFetcher(); + } + + public List crawl(String siteKey, int limitPerSite) throws CrawlException { + List strategies = resolveStrategies(siteKey); + List items = new ArrayList<>(); + for (CrawlStrategy strategy : strategies) { + items.addAll(strategy.crawl(httpFetcher, limitPerSite)); + } + return items; + } + + private List resolveStrategies(String siteKey) throws SiteNotFoundException { + if ("all".equalsIgnoreCase(siteKey)) { + return siteRegistry.all(); + } + return List.of(siteRegistry.get(siteKey)); + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/strategy/AbstractVisualSiteBuilderStrategy.java b/project2/crawl2/src/main/java/edu/homework/crawler/strategy/AbstractVisualSiteBuilderStrategy.java new file mode 100644 index 0000000..14741a0 --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/strategy/AbstractVisualSiteBuilderStrategy.java @@ -0,0 +1,177 @@ +package edu.homework.crawler.strategy; + +import 
edu.homework.crawler.exception.CrawlException; +import edu.homework.crawler.exception.ParseException; +import edu.homework.crawler.model.NewsCandidate; +import edu.homework.crawler.model.NewsItem; +import edu.homework.crawler.util.HttpFetcher; +import edu.homework.crawler.util.TextExtractors; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.net.URI; +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +public abstract class AbstractVisualSiteBuilderStrategy implements CrawlStrategy { + private static final String NEWS_LINK_SELECTOR = "a[href*=info/][href$=.htm],a[href$=.htm]"; + + @Override + public List crawl(HttpFetcher fetcher, int limit) throws CrawlException { + Map mergedCandidates = new LinkedHashMap<>(); + CrawlException lastFailure = null; + for (String startUrl : startUrls()) { + try { + Document page = fetcher.fetch(startUrl); + for (NewsCandidate candidate : extractCandidates(page, limit * 8)) { + mergedCandidates.putIfAbsent(candidate.url(), candidate); + } + } catch (CrawlException e) { + lastFailure = e; + } + } + List candidates = sortCandidates(new ArrayList<>(mergedCandidates.values()), limit * 8); + if (candidates.isEmpty()) { + if (lastFailure != null) { + throw lastFailure; + } + throw new ParseException("No news links found on " + baseUrl()); + } + + List items = new ArrayList<>(); + for (NewsCandidate candidate : candidates) { + if (items.size() >= limit) { + break; + } + try { + fetcher.politePause(); + Document detail = fetcher.fetch(candidate.url()); + items.add(parseDetail(candidate, detail)); + } catch (CrawlException e) { + NewsItem fallback = new NewsItem(schoolName(), key(), candidate.title(), candidate.url()); + fallback.setPublishTime(candidate.publishTime()); + fallback.setSummary("Detail page failed: " + e.getMessage()); + fallback.setCrawledAt(LocalDateTime.now()); + 
items.add(fallback); + } + } + return items; + } + + protected List startUrls() { + return List.of(baseUrl()); + } + + protected List extractCandidates(Document document, int maxCandidates) { + Map candidates = new LinkedHashMap<>(); + Elements links = document.select(candidateSelector()); + for (Element link : links) { + String url = link.absUrl("href"); + if (!isAcceptableUrl(url)) { + continue; + } + String title = extractCandidateTitle(link); + if (title.isBlank() || title.length() < 4) { + continue; + } + String date = TextExtractors.findDate(neighborText(link)); + candidates.putIfAbsent(url, new NewsCandidate(title, url, date)); + if (candidates.size() >= maxCandidates) { + break; + } + } + return sortCandidates(new ArrayList<>(candidates.values()), maxCandidates); + } + + private List sortCandidates(List sorted, int maxCandidates) { + sorted.sort((left, right) -> { + boolean leftHasDate = !left.publishTime().isBlank(); + boolean rightHasDate = !right.publishTime().isBlank(); + if (leftHasDate != rightHasDate) { + return leftHasDate ? 
-1 : 1; + } + return right.publishTime().compareTo(left.publishTime()); + }); + if (sorted.size() > maxCandidates) { + return sorted.subList(0, maxCandidates); + } + return sorted; + } + + protected String candidateSelector() { + return NEWS_LINK_SELECTOR; + } + + protected NewsItem parseDetail(NewsCandidate candidate, Document detail) { + NewsItem item = new NewsItem(schoolName(), key(), extractTitle(candidate, detail), candidate.url()); + String pageText = TextExtractors.clean(detail.text()); + item.setPublishTime(TextExtractors.firstNonBlank( + TextExtractors.findPublishTime(pageText), + candidate.publishTime())); + item.setSource(TextExtractors.findLabelValue(pageText, "来源")); + item.setAuthor(TextExtractors.findLabelValue(pageText, "作者")); + item.setSummary(TextExtractors.clean(detail.select("meta[name=description]").attr("content"))); + item.setContentPreview(extractContentPreview(detail)); + item.setCrawledAt(LocalDateTime.now()); + return item; + } + + protected String extractTitle(NewsCandidate candidate, Document detail) { + String title = TextExtractors.firstNonBlank( + detail.select("h1").first() == null ? "" : detail.select("h1").first().text(), + detail.select(".ar_tit h3").first() == null ? "" : detail.select(".ar_tit h3").first().text(), + detail.select(".subTitle2 span").first() == null ? "" : detail.select(".subTitle2 span").first().text(), + detail.select("meta[name=pageTitle]").attr("content"), + candidate.title()); + return TextExtractors.limit(TextExtractors.clean(title), 160); + } + + protected String extractContentPreview(Document detail) { + String content = TextExtractors.firstNonBlank( + detail.select(".v_news_content").text(), + detail.select("#vsb_content").text(), + detail.select("#vsb_content_6").text(), + detail.body() == null ? 
"" : detail.body().text()); + return TextExtractors.limit(TextExtractors.clean(content), 320); + } + + private String extractCandidateTitle(Element link) { + String nestedHeading = ""; + Element heading = link.selectFirst("h1,h2,h3,h4,h5,.tit,.title,.pXZCont,.c59665"); + if (heading != null) { + nestedHeading = heading.text(); + } + return TextExtractors.limit(TextExtractors.clean(TextExtractors.firstNonBlank( + link.attr("title"), + nestedHeading, + link.ownText(), + link.text())), 160); + } + + private String neighborText(Element link) { + StringBuilder builder = new StringBuilder(link.text()).append(' '); + Element node = link; + for (int i = 0; i < 4 && node != null; i++) { + builder.append(node.text()).append(' '); + node = node.parent(); + } + return TextExtractors.clean(builder.toString()); + } + + protected boolean isAcceptableUrl(String url) { + if (url == null || url.isBlank()) { + return false; + } + try { + URI base = URI.create(baseUrl()); + URI candidate = URI.create(url); + return base.getHost().equalsIgnoreCase(candidate.getHost()) && candidate.getPath().contains("/info/"); + } catch (IllegalArgumentException e) { + return false; + } + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/strategy/CrawlStrategy.java b/project2/crawl2/src/main/java/edu/homework/crawler/strategy/CrawlStrategy.java new file mode 100644 index 0000000..512ec90 --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/strategy/CrawlStrategy.java @@ -0,0 +1,17 @@ +package edu.homework.crawler.strategy; + +import edu.homework.crawler.exception.CrawlException; +import edu.homework.crawler.model.NewsItem; +import edu.homework.crawler.util.HttpFetcher; + +import java.util.List; + +public interface CrawlStrategy { + String key(); + + String schoolName(); + + String baseUrl(); + + List crawl(HttpFetcher fetcher, int limit) throws CrawlException; +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/strategy/CsuNewsStrategy.java 
b/project2/crawl2/src/main/java/edu/homework/crawler/strategy/CsuNewsStrategy.java new file mode 100644 index 0000000..efc296f --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/strategy/CsuNewsStrategy.java @@ -0,0 +1,25 @@ +package edu.homework.crawler.strategy; + +import java.util.List; + +public class CsuNewsStrategy extends AbstractVisualSiteBuilderStrategy { + @Override + public String key() { + return "csu"; + } + + @Override + public String schoolName() { + return "中南大学"; + } + + @Override + public String baseUrl() { + return "https://news.csu.edu.cn/"; + } + + @Override + protected List startUrls() { + return List.of("https://news.csu.edu.cn/xxyw.htm", baseUrl()); + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/strategy/HnuNewsStrategy.java b/project2/crawl2/src/main/java/edu/homework/crawler/strategy/HnuNewsStrategy.java new file mode 100644 index 0000000..fc2564d --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/strategy/HnuNewsStrategy.java @@ -0,0 +1,25 @@ +package edu.homework.crawler.strategy; + +import java.util.List; + +public class HnuNewsStrategy extends AbstractVisualSiteBuilderStrategy { + @Override + public String key() { + return "hnu"; + } + + @Override + public String schoolName() { + return "湖南大学"; + } + + @Override + public String baseUrl() { + return "https://news.hnu.edu.cn/"; + } + + @Override + protected List startUrls() { + return List.of("https://news.hnu.edu.cn/xw/zhxw.htm", baseUrl()); + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/strategy/HunnuNewsStrategy.java b/project2/crawl2/src/main/java/edu/homework/crawler/strategy/HunnuNewsStrategy.java new file mode 100644 index 0000000..f4f138f --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/strategy/HunnuNewsStrategy.java @@ -0,0 +1,30 @@ +package edu.homework.crawler.strategy; + +import java.util.List; + +public class HunnuNewsStrategy extends 
AbstractVisualSiteBuilderStrategy { + @Override + public String key() { + return "hunnu"; + } + + @Override + public String schoolName() { + return "湖南师范大学"; + } + + @Override + public String baseUrl() { + return "https://news.hunnu.edu.cn/"; + } + + @Override + protected List startUrls() { + return List.of("https://news.hunnu.edu.cn/sdxw.htm", baseUrl()); + } + + @Override + protected boolean isAcceptableUrl(String url) { + return super.isAcceptableUrl(url) && url.contains("/info/1005/"); + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/strategy/SiteRegistry.java b/project2/crawl2/src/main/java/edu/homework/crawler/strategy/SiteRegistry.java new file mode 100644 index 0000000..1b35047 --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/strategy/SiteRegistry.java @@ -0,0 +1,36 @@ +package edu.homework.crawler.strategy; + +import edu.homework.crawler.exception.SiteNotFoundException; + +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +public class SiteRegistry { + private final Map strategies = new LinkedHashMap<>(); + + public static SiteRegistry defaults() { + SiteRegistry registry = new SiteRegistry(); + registry.register(new HnuNewsStrategy()); + registry.register(new CsuNewsStrategy()); + registry.register(new HunnuNewsStrategy()); + return registry; + } + + public void register(CrawlStrategy strategy) { + strategies.put(strategy.key(), strategy); + } + + public CrawlStrategy get(String key) throws SiteNotFoundException { + CrawlStrategy strategy = strategies.get(key.toLowerCase()); + if (strategy == null) { + throw new SiteNotFoundException("Unsupported site: " + key + ". 
Use all, hnu, csu, or hunnu."); + } + return strategy; + } + + public List all() { + return new ArrayList<>(strategies.values()); + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/util/HttpFetcher.java b/project2/crawl2/src/main/java/edu/homework/crawler/util/HttpFetcher.java new file mode 100644 index 0000000..aaf340d --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/util/HttpFetcher.java @@ -0,0 +1,39 @@ +package edu.homework.crawler.util; + +import edu.homework.crawler.exception.NetworkException; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + +import java.io.IOException; + +public class HttpFetcher { + private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"; + + public Document fetch(String url) throws NetworkException { + IOException lastFailure = null; + for (int attempt = 1; attempt <= 3; attempt++) { + try { + return Jsoup.connect(url) + .userAgent(USER_AGENT) + .referrer("https://www.baidu.com/") + .timeout(20_000) + .maxBodySize(5 * 1024 * 1024) + .followRedirects(true) + .get(); + } catch (IOException e) { + lastFailure = e; + politePause(); + } + } + throw new NetworkException("Network request failed: " + url, lastFailure); + } + + public void politePause() { + try { + Thread.sleep(250); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/util/TextExtractors.java b/project2/crawl2/src/main/java/edu/homework/crawler/util/TextExtractors.java new file mode 100644 index 0000000..2e205c9 --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/util/TextExtractors.java @@ -0,0 +1,98 @@ +package edu.homework.crawler.util; + +import java.time.LocalDate; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public final class TextExtractors { + private static final Pattern DATE = 
Pattern.compile("(20\\d{2})[-年./](\\d{1,2})[-月./](\\d{1,2})日?(?:\\s+\\d{1,2}:\\d{2})?"); + private static final Pattern SPLIT_DAY_YEAR_MONTH = Pattern.compile("(?= 0) { + String slice = cleaned.substring(index, Math.min(cleaned.length(), index + 80)); + String date = findDate(slice); + if (!date.isBlank()) { + return date; + } + } + return findDate(cleaned); + } + + public static String findDate(String text) { + Matcher matcher = SPLIT_DAY_YEAR_MONTH.matcher(clean(text)); + if (matcher.find()) { + return matcher.group(2) + "-" + pad(matcher.group(3)) + "-" + pad(matcher.group(1)); + } + matcher = DATE.matcher(clean(text)); + if (matcher.find()) { + return normalizeFullDate(matcher); + } + matcher = MONTH_DAY.matcher(clean(text)); + if (matcher.find()) { + int month = Integer.parseInt(matcher.group(1)); + int day = Integer.parseInt(matcher.group(2)); + LocalDate today = LocalDate.now(); + int year = today.getYear(); + if (LocalDate.of(year, month, day).isAfter(today.plusDays(7))) { + year--; + } + return year + "-" + pad(matcher.group(1)) + "-" + pad(matcher.group(2)); + } + return ""; + } + + public static String findLabelValue(String text, String label) { + String cleaned = clean(text); + Pattern pattern = Pattern.compile(label + "[::]\\s*(.*?)(?=\\s*(?:来源|作者|发布时间|点击)[::]|$)"); + Matcher matcher = pattern.matcher(cleaned); + if (matcher.find()) { + String value = clean(matcher.group(1)); + if (!value.contains("点击") && !value.contains("发布时间")) { + return value; + } + } + return ""; + } + + private static String normalizeFullDate(Matcher matcher) { + return matcher.group(1) + "-" + pad(matcher.group(2)) + "-" + pad(matcher.group(3)); + } + + private static String pad(String value) { + return value.length() == 1 ? 
"0" + value : value; + } +} diff --git a/project2/crawl2/src/main/java/edu/homework/crawler/view/ConsoleView.java b/project2/crawl2/src/main/java/edu/homework/crawler/view/ConsoleView.java new file mode 100644 index 0000000..563a034 --- /dev/null +++ b/project2/crawl2/src/main/java/edu/homework/crawler/view/ConsoleView.java @@ -0,0 +1,37 @@ +package edu.homework.crawler.view; + +import edu.homework.crawler.model.CrawlSummary; + +public class ConsoleView { + public void printWelcome() { + println("University News Crawler"); + println("Type help to see commands."); + } + + public void printPrompt() { + System.out.print("crawler> "); + } + + public void printHelp(String text) { + println(text); + } + + public void printInfo(String text) { + println(text); + } + + public void printError(String text) { + System.err.println("[ERROR] " + text); + } + + public void printSummary(CrawlSummary summary) { + println("Crawl finished."); + println("Total items: " + summary.totalCount()); + summary.siteCounts().forEach((site, count) -> println(" - " + site + ": " + count)); + println("Saved to: " + summary.outputPath().toAbsolutePath()); + } + + private void println(String text) { + System.out.println(text); + } +} diff --git a/project2/crawl2/src/test/java/edu/homework/crawler/command/CommandRegistryTest.java b/project2/crawl2/src/test/java/edu/homework/crawler/command/CommandRegistryTest.java new file mode 100644 index 0000000..ad964d8 --- /dev/null +++ b/project2/crawl2/src/test/java/edu/homework/crawler/command/CommandRegistryTest.java @@ -0,0 +1,38 @@ +package edu.homework.crawler.command; + +import edu.homework.crawler.exception.CommandException; +import org.junit.jupiter.api.Test; + +import java.util.List; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +class CommandRegistryTest { + @Test + void tokenizesQuotedOutputPath() throws Exception { + CommandRegistry registry = 
new CommandRegistry(); + + List tokens = registry.tokenize("crawl --site all --out \"data/my news.json\""); + + assertEquals(List.of("crawl", "--site", "all", "--out", "data/my news.json"), tokens); + } + + @Test + void parsesOptionsAsKeyValuePairs() throws Exception { + CommandRegistry registry = new CommandRegistry(); + + Map options = registry.parseOptions(new String[]{"--site", "hnu", "--limit", "5"}); + + assertEquals("hnu", options.get("site")); + assertEquals("5", options.get("limit")); + } + + @Test + void rejectsMissingOptionValue() { + CommandRegistry registry = new CommandRegistry(); + + assertThrows(CommandException.class, () -> registry.parseOptions(new String[]{"--site"})); + } +} diff --git a/project2/crawl2/src/test/java/edu/homework/crawler/repository/FileNewsRepositoryTest.java b/project2/crawl2/src/test/java/edu/homework/crawler/repository/FileNewsRepositoryTest.java new file mode 100644 index 0000000..71a20c4 --- /dev/null +++ b/project2/crawl2/src/test/java/edu/homework/crawler/repository/FileNewsRepositoryTest.java @@ -0,0 +1,29 @@ +package edu.homework.crawler.repository; + +import edu.homework.crawler.model.NewsItem; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +class FileNewsRepositoryTest { + @TempDir + Path tempDir; + + @Test + void savesJsonFile() throws Exception { + FileNewsRepository repository = new FileNewsRepository(); + NewsItem item = new NewsItem("湖南大学", "hnu", "测试新闻", "https://example.com/news"); + Path output = tempDir.resolve("news.json"); + + repository.save(List.of(item), OutputFormat.JSON, output); + + String json = Files.readString(output); + assertTrue(json.contains("测试新闻")); + assertTrue(json.contains("hnu")); + } +} diff --git a/project2/crawl2/src/test/java/edu/homework/crawler/util/TextExtractorsTest.java 
b/project2/crawl2/src/test/java/edu/homework/crawler/util/TextExtractorsTest.java new file mode 100644 index 0000000..a5f3b08 --- /dev/null +++ b/project2/crawl2/src/test/java/edu/homework/crawler/util/TextExtractorsTest.java @@ -0,0 +1,29 @@ +package edu.homework.crawler.util; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class TextExtractorsTest { + @Test + void normalizesChinesePublishDate() { + String text = "来源:新闻网 作者:张三 发布时间:2026年05月28日 17:14 点击:100次"; + + assertEquals("2026-05-28", TextExtractors.findPublishTime(text)); + } + + @Test + void extractsSimpleLabelValue() { + String text = "来源:新闻网 作者:李四 发布时间:2026-05-28"; + + assertEquals("新闻网", TextExtractors.findLabelValue(text, "来源")); + assertEquals("李四", TextExtractors.findLabelValue(text, "作者")); + } + + @Test + void normalizesSplitDayYearMonthDate() { + String text = "28 2026-05"; + + assertEquals("2026-05-28", TextExtractors.findDate(text)); + } +}