146 changed files with 3903 additions and 1 deletions
@ -0,0 +1,3 @@ |
|||||
|
crawl govnews --count=5 |
||||
|
list storage |
||||
|
exit |
||||
@ -0,0 +1,68 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> |
||||
|
<modelVersion>4.0.0</modelVersion> |
||||
|
<groupId>com.crawler</groupId> |
||||
|
<artifactId>multi-site-crawler</artifactId> |
||||
|
<name>Multi-Site Crawler</name> |
||||
|
<version>1.0-SNAPSHOT</version> |
||||
|
<description>多网站爬虫项目 - 支持B站、抖音、小红书等平台</description> |
||||
|
<build> |
||||
|
<plugins> |
||||
|
<plugin> |
||||
|
<artifactId>maven-jar-plugin</artifactId> |
||||
|
<version>3.3.0</version> |
||||
|
<configuration> |
||||
|
<archive> |
||||
|
<manifest> |
||||
|
<mainClass>com.crawler.Main</mainClass> |
||||
|
</manifest> |
||||
|
</archive> |
||||
|
</configuration> |
||||
|
</plugin> |
||||
|
<plugin> |
||||
|
<artifactId>maven-shade-plugin</artifactId> |
||||
|
<version>3.5.1</version> |
||||
|
<executions> |
||||
|
<execution> |
||||
|
<phase>package</phase> |
||||
|
<goals> |
||||
|
<goal>shade</goal> |
||||
|
</goals> |
||||
|
<configuration> |
||||
|
<filters> |
||||
|
<filter> |
||||
|
<artifact>*:*</artifact> |
||||
|
<excludes> |
||||
|
<exclude>META-INF/*.SF</exclude> |
||||
|
<exclude>META-INF/*.DSA</exclude> |
||||
|
<exclude>META-INF/*.RSA</exclude> |
||||
|
</excludes> |
||||
|
</filter> |
||||
|
</filters> |
||||
|
<transformers> |
||||
|
<transformer> |
||||
|
<mainClass>com.crawler.Main</mainClass> |
||||
|
</transformer> |
||||
|
</transformers> |
||||
|
</configuration> |
||||
|
</execution> |
||||
|
</executions> |
||||
|
</plugin> |
||||
|
</plugins> |
||||
|
</build> |
||||
|
<dependencies> |
||||
|
<dependency> |
||||
|
<groupId>org.projectlombok</groupId> |
||||
|
<artifactId>lombok</artifactId> |
||||
|
<version>1.18.30</version> |
||||
|
<scope>provided</scope> |
||||
|
</dependency> |
||||
|
</dependencies> |
||||
|
<properties> |
||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> |
||||
|
<maven.compiler.target>21</maven.compiler.target> |
||||
|
<jackson.version>2.15.2</jackson.version> |
||||
|
<slf4j.version>2.0.9</slf4j.version> |
||||
|
<maven.compiler.source>21</maven.compiler.source> |
||||
|
</properties> |
||||
|
</project> |
||||
@ -0,0 +1,31 @@ |
|||||
|
package w8; |
||||
|
|
||||
|
/**
 * A mutable holder for two related objects: a key and its value.
 *
 * @param <K> type of the key component
 * @param <V> type of the value component
 */
public class Pair<K, V> {

    private K key;
    private V value;

    /**
     * Creates a pair holding the given components.
     *
     * @param key   the key component
     * @param value the value component
     */
    public Pair(K key, V value) {
        this.value = value;
        this.key = key;
    }

    /**
     * Builds a new pair whose key is this pair's value and whose value is this pair's key.
     * This pair is left unchanged.
     *
     * @return a new pair with the two components exchanged
     */
    public Pair<V, K> swap() {
        Pair<V, K> swapped = new Pair<>(value, key);
        return swapped;
    }

    /** @return the current key component */
    public K getKey() {
        return this.key;
    }

    /** @param key the new key component */
    public void setKey(K key) {
        this.key = key;
    }

    /** @return the current value component */
    public V getValue() {
        return this.value;
    }

    /** @param value the new value component */
    public void setValue(V value) {
        this.value = value;
    }
}
||||
@ -0,0 +1,32 @@ |
|||||
|
package w8; |
||||
|
|
||||
|
import java.util.HashMap; |
||||
|
import java.util.Map; |
||||
|
|
||||
|
/**
 * A minimal key/value cache that delegates every operation to an internal {@link HashMap}.
 *
 * @param <K> type of the cache keys
 * @param <V> type of the cached values
 */
public class Cache<K, V> {

    /** Backing store; created once per cache instance. */
    private final Map<K, V> cacheMap = new HashMap<>();

    /** Creates an empty cache. */
    public Cache() {
    }

    /**
     * Stores {@code value} under {@code key}, replacing any value previously stored there.
     *
     * @param key   the key to store under
     * @param value the value to store
     */
    public void put(K key, V value) {
        this.cacheMap.put(key, value);
    }

    /**
     * Looks up the value stored under {@code key}.
     *
     * @param key the key to look up
     * @return the stored value, or {@code null} when the backing map has no mapping for the key
     */
    public V get(K key) {
        V cached = this.cacheMap.get(key);
        return cached;
    }

    /**
     * Removes the entry stored under {@code key}, if there is one.
     *
     * @param key the key whose entry should be removed
     */
    public void remove(K key) {
        this.cacheMap.remove(key);
    }

    /** Removes every entry from the cache. */
    public void clear() {
        this.cacheMap.clear();
    }

    /** @return the number of entries currently stored */
    public int size() {
        int entryCount = this.cacheMap.size();
        return entryCount;
    }
}
||||
@ -0,0 +1,64 @@ |
|||||
|
爬取时间: 2026-05-14 11:10:24 |
||||
|
数据条数: 10 |
||||
|
================================ |
||||
|
|
||||
|
【政务新闻】国务院办公厅关于进一步优化营商环境更好服务市场主体的实施意见 |
||||
|
来源: 中国政府网 |
||||
|
发布时间: 2024-01-15 |
||||
|
分类: 政策文件 |
||||
|
链接: http://www.gov.cn/zhengce/content/2024-01/15/content_6865015.htm |
||||
|
|
||||
|
【政务新闻】教育部发布2024年义务教育招生入学工作通知 |
||||
|
来源: 教育部官网 |
||||
|
发布时间: 2024-01-14 |
||||
|
分类: 教育动态 |
||||
|
链接: http://www.moe.gov.cn/jyb_xwfb/gzdt_gzdt/s5987/202401/t20240114_1118607.html |
||||
|
|
||||
|
【政务新闻】人社部公布2024年春节假期安排 |
||||
|
来源: 人力资源和社会保障部 |
||||
|
发布时间: 2024-01-13 |
||||
|
分类: 人事信息 |
||||
|
链接: http://www.mohrss.gov.cn/SYrlzyhshbzb/zwgk/szrs/t202401/t20240113_490258.html |
||||
|
|
||||
|
【政务新闻】国家医保局:进一步完善医保支付政策 |
||||
|
来源: 国家医疗保障局 |
||||
|
发布时间: 2024-01-12 |
||||
|
分类: 医疗健康 |
||||
|
链接: http://www.nhsa.gov.cn/art/2024/1/12/art_10_1015.html |
||||
|
|
||||
|
【政务新闻】生态环境部发布2023年全国环境质量状况 |
||||
|
来源: 生态环境部 |
||||
|
发布时间: 2024-01-11 |
||||
|
分类: 环境保护 |
||||
|
链接: http://www.mee.gov.cn/hjzl/sthjzk/202401/t20240111_1062058.shtml |
||||
|
|
||||
|
【政务新闻】财政部发布2024年财政预算报告 |
||||
|
来源: 财政部 |
||||
|
发布时间: 2024-01-10 |
||||
|
分类: 财政金融 |
||||
|
链接: http://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/202401/t20240110_3912858.htm |
||||
|
|
||||
|
【政务新闻】工信部部署2024年工业和信息化工作 |
||||
|
来源: 工业和信息化部 |
||||
|
发布时间: 2024-01-09 |
||||
|
分类: 工业信息 |
||||
|
链接: http://www.miit.gov.cn/jgsj/xwfb/202401/t20240109_428906.html |
||||
|
|
||||
|
【政务新闻】交通运输部推进交通强国建设 |
||||
|
来源: 交通运输部 |
||||
|
发布时间: 2024-01-08 |
||||
|
分类: 交通建设 |
||||
|
链接: http://www.mot.gov.cn/zcwj/202401/t20240108_3793593.html |
||||
|
|
||||
|
【政务新闻】农业农村部部署春季农业生产 |
||||
|
来源: 农业农村部 |
||||
|
发布时间: 2024-01-07 |
||||
|
分类: 农业农村 |
||||
|
链接: http://www.moa.gov.cn/xw/bmdt/202401/t20240107_6408851.htm |
||||
|
|
||||
|
【政务新闻】国家统计局发布2023年国民经济运行数据 |
||||
|
来源: 国家统计局 |
||||
|
发布时间: 2024-01-06 |
||||
|
分类: 统计数据 |
||||
|
链接: http://www.stats.gov.cn/tjsj/zxfb/202401/t20240117_1930858.html |
||||
|
|
||||
@ -0,0 +1,33 @@ |
|||||
|
================================================== |
||||
|
爬虫数据 - govnews |
||||
|
爬取时间: 2026-05-30 20:20:03 |
||||
|
数据条数: 5 |
||||
|
================================================== |
||||
|
|
||||
|
[1] 国务院办公厅关于进一步优化营商环境更好服务市场主体的实施意见 |
||||
|
作者: 中国政府网 |
||||
|
平台: govnews |
||||
|
链接: http://www.gov.cn |
||||
|
|
||||
|
[2] 教育部发布2024年义务教育招生入学工作通知 |
||||
|
作者: 教育部官网 |
||||
|
平台: govnews |
||||
|
链接: http://www.moe.gov.cn |
||||
|
|
||||
|
[3] 人社部公布2024年春节假期安排 |
||||
|
作者: 人力资源和社会保障部 |
||||
|
平台: govnews |
||||
|
链接: http://www.mohrss.gov.cn |
||||
|
|
||||
|
[4] 国家医保局:进一步完善医保支付政策 |
||||
|
作者: 国家医疗保障局 |
||||
|
平台: govnews |
||||
|
链接: http://www.nhsa.gov.cn |
||||
|
|
||||
|
[5] 生态环境部发布2023年全国环境质量状况 |
||||
|
作者: 生态环境部 |
||||
|
平台: govnews |
||||
|
链接: http://www.mee.gov.cn |
||||
|
|
||||
|
================================================== |
||||
|
共计 5 条记录 |
||||
@ -0,0 +1,94 @@ |
|||||
|
爬取时间: 2026-05-14 11:10:24 |
||||
|
数据条数: 10 |
||||
|
================================ |
||||
|
|
||||
|
【图书馆书目】Java编程思想(第4版) |
||||
|
作者: Bruce Eckel |
||||
|
出版社: 机械工业出版社 |
||||
|
ISBN: 978-7-111-21382-6 |
||||
|
出版年份: 2007 |
||||
|
馆藏位置: A区-3排-15架 |
||||
|
状态: 可借阅 |
||||
|
索书号: TP312/EC4 |
||||
|
|
||||
|
【图书馆书目】深入理解计算机系统 |
||||
|
作者: Randal E. Bryant |
||||
|
出版社: 机械工业出版社 |
||||
|
ISBN: 978-7-111-54493-7 |
||||
|
出版年份: 2016 |
||||
|
馆藏位置: A区-2排-8架 |
||||
|
状态: 可借阅 |
||||
|
索书号: TP301/B83 |
||||
|
|
||||
|
【图书馆书目】算法导论(第3版) |
||||
|
作者: Thomas H. Cormen |
||||
|
出版社: 机械工业出版社 |
||||
|
ISBN: 978-7-111-40701-0 |
||||
|
出版年份: 2012 |
||||
|
馆藏位置: A区-4排-22架 |
||||
|
状态: 已借出 |
||||
|
索书号: TP301/C62 |
||||
|
|
||||
|
【图书馆书目】设计模式:可复用面向对象软件的基础 |
||||
|
作者: Erich Gamma |
||||
|
出版社: 机械工业出版社 |
||||
|
ISBN: 978-7-111-07554-7 |
||||
|
出版年份: 2000 |
||||
|
馆藏位置: A区-1排-10架 |
||||
|
状态: 可借阅 |
||||
|
索书号: TP311.5/G16 |
||||
|
|
||||
|
【图书馆书目】代码大全(第2版) |
||||
|
作者: Steve McConnell |
||||
|
出版社: 电子工业出版社 |
||||
|
ISBN: 978-7-121-02298-5 |
||||
|
出版年份: 2006 |
||||
|
馆藏位置: B区-5排-18架 |
||||
|
状态: 可借阅 |
||||
|
索书号: TP311.5/M13 |
||||
|
|
||||
|
【图书馆书目】人月神话 |
||||
|
作者: Frederick P. Brooks |
||||
|
出版社: 清华大学出版社 |
||||
|
ISBN: 978-7-302-22587-5 |
||||
|
出版年份: 2010 |
||||
|
馆藏位置: B区-3排-5架 |
||||
|
状态: 可借阅 |
||||
|
索书号: TP311.5/B88 |
||||
|
|
||||
|
【图书馆书目】重构:改善既有代码的设计 |
||||
|
作者: Martin Fowler |
||||
|
出版社: 人民邮电出版社 |
||||
|
ISBN: 978-7-115-12057-5 |
||||
|
出版年份: 2010 |
||||
|
馆藏位置: B区-2排-12架 |
||||
|
状态: 已借出 |
||||
|
索书号: TP311.5/F68 |
||||
|
|
||||
|
【图书馆书目】Head First设计模式 |
||||
|
作者: Eric Freeman |
||||
|
出版社: 中国电力出版社 |
||||
|
ISBN: 978-7-5083-5393-7 |
||||
|
出版年份: 2007 |
||||
|
馆藏位置: C区-1排-20架 |
||||
|
状态: 可借阅 |
||||
|
索书号: TP311.5/F84 |
||||
|
|
||||
|
【图书馆书目】Effective Java(第3版) |
||||
|
作者: Joshua Bloch |
||||
|
出版社: 机械工业出版社 |
||||
|
ISBN: 978-7-111-61275-6 |
||||
|
出版年份: 2020 |
||||
|
馆藏位置: C区-4排-8架 |
||||
|
状态: 可借阅 |
||||
|
索书号: TP312/B57 |
||||
|
|
||||
|
【图书馆书目】Clean Code |
||||
|
作者: Robert C. Martin |
||||
|
出版社: 人民邮电出版社 |
||||
|
ISBN: 978-7-115-23385-8 |
||||
|
出版年份: 2010 |
||||
|
馆藏位置: C区-5排-15架 |
||||
|
状态: 可借阅 |
||||
|
索书号: TP311.5/M27 |
||||
|
|
||||
@ -0,0 +1,64 @@ |
|||||
|
爬取时间: 2026-05-14 11:10:24 |
||||
|
数据条数: 10 |
||||
|
================================ |
||||
|
|
||||
|
【天气预报】北京 2026-05-14 |
||||
|
天气: 晴 |
||||
|
温度: -5°C ~ 8°C |
||||
|
风向: 北风 3-4级 |
||||
|
湿度: 35% |
||||
|
|
||||
|
【天气预报】上海 2026-05-14 |
||||
|
天气: 多云 |
||||
|
温度: 8°C ~ 15°C |
||||
|
风向: 东风 2-3级 |
||||
|
湿度: 65% |
||||
|
|
||||
|
【天气预报】广州 2026-05-14 |
||||
|
天气: 小雨 |
||||
|
温度: 18°C ~ 23°C |
||||
|
风向: 南风 4-5级 |
||||
|
湿度: 85% |
||||
|
|
||||
|
【天气预报】深圳 2026-05-14 |
||||
|
天气: 阴 |
||||
|
温度: 20°C ~ 25°C |
||||
|
风向: 东南风 3-4级 |
||||
|
湿度: 80% |
||||
|
|
||||
|
【天气预报】杭州 2026-05-14 |
||||
|
天气: 晴转多云 |
||||
|
温度: 10°C ~ 18°C |
||||
|
风向: 西北风 2-3级 |
||||
|
湿度: 55% |
||||
|
|
||||
|
【天气预报】南京 2026-05-14 |
||||
|
天气: 多云转晴 |
||||
|
温度: 7°C ~ 14°C |
||||
|
风向: 东北风 3-4级 |
||||
|
湿度: 50% |
||||
|
|
||||
|
【天气预报】武汉 2026-05-14 |
||||
|
天气: 小雨 |
||||
|
温度: 5°C ~ 12°C |
||||
|
风向: 北风 4-5级 |
||||
|
湿度: 75% |
||||
|
|
||||
|
【天气预报】成都 2026-05-14 |
||||
|
天气: 阴转小雨 |
||||
|
温度: 6°C ~ 13°C |
||||
|
风向: 南风 2-3级 |
||||
|
湿度: 82% |
||||
|
|
||||
|
【天气预报】重庆 2026-05-14 |
||||
|
天气: 小雨 |
||||
|
温度: 10°C ~ 16°C |
||||
|
风向: 西南风 3-4级 |
||||
|
湿度: 88% |
||||
|
|
||||
|
【天气预报】西安 2026-05-14 |
||||
|
天气: 晴 |
||||
|
温度: -2°C ~ 10°C |
||||
|
风向: 西风 2-3级 |
||||
|
湿度: 40% |
||||
|
|
||||
@ -0,0 +1,58 @@ |
|||||
|
================================================== |
||||
|
爬虫数据 - weather |
||||
|
爬取时间: 2026-05-30 20:18:20 |
||||
|
数据条数: 10 |
||||
|
================================================== |
||||
|
|
||||
|
[1] 北京 2026-05-30 晴 |
||||
|
作者: null |
||||
|
平台: weather |
||||
|
链接: null |
||||
|
|
||||
|
[2] 上海 2026-05-30 多云 |
||||
|
作者: null |
||||
|
平台: weather |
||||
|
链接: null |
||||
|
|
||||
|
[3] 广州 2026-05-30 小雨 |
||||
|
作者: null |
||||
|
平台: weather |
||||
|
链接: null |
||||
|
|
||||
|
[4] 深圳 2026-05-30 阴 |
||||
|
作者: null |
||||
|
平台: weather |
||||
|
链接: null |
||||
|
|
||||
|
[5] 杭州 2026-05-30 晴转多云 |
||||
|
作者: null |
||||
|
平台: weather |
||||
|
链接: null |
||||
|
|
||||
|
[6] 南京 2026-05-30 多云转晴 |
||||
|
作者: null |
||||
|
平台: weather |
||||
|
链接: null |
||||
|
|
||||
|
[7] 武汉 2026-05-30 小雨 |
||||
|
作者: null |
||||
|
平台: weather |
||||
|
链接: null |
||||
|
|
||||
|
[8] 成都 2026-05-30 阴转小雨 |
||||
|
作者: null |
||||
|
平台: weather |
||||
|
链接: null |
||||
|
|
||||
|
[9] 重庆 2026-05-30 小雨 |
||||
|
作者: null |
||||
|
平台: weather |
||||
|
链接: null |
||||
|
|
||||
|
[10] 西安 2026-05-30 晴 |
||||
|
作者: null |
||||
|
平台: weather |
||||
|
链接: null |
||||
|
|
||||
|
================================================== |
||||
|
共计 10 条记录 |
||||
@ -0,0 +1,94 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" |
||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> |
||||
|
<modelVersion>4.0.0</modelVersion> |
||||
|
|
||||
|
<groupId>com.crawler</groupId> |
||||
|
<artifactId>multi-site-crawler</artifactId> |
||||
|
<version>1.0-SNAPSHOT</version> |
||||
|
<packaging>jar</packaging> |
||||
|
|
||||
|
<name>Multi-Site Crawler</name> |
||||
|
<description>多网站爬虫项目 - 支持B站、抖音、小红书等平台</description> |
||||
|
|
||||
|
<properties> |
||||
|
<maven.compiler.source>21</maven.compiler.source> |
||||
|
<maven.compiler.target>21</maven.compiler.target> |
||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> |
||||
|
<jackson.version>2.15.2</jackson.version> |
||||
|
<slf4j.version>2.0.9</slf4j.version> |
||||
|
</properties> |
||||
|
|
||||
|
<dependencies> |
||||
|
<dependency> |
||||
|
<groupId>com.fasterxml.jackson.core</groupId> |
||||
|
<artifactId>jackson-databind</artifactId> |
||||
|
<version>${jackson.version}</version> |
||||
|
</dependency> |
||||
|
<dependency> |
||||
|
<groupId>org.slf4j</groupId> |
||||
|
<artifactId>slf4j-api</artifactId> |
||||
|
<version>${slf4j.version}</version> |
||||
|
</dependency> |
||||
|
<dependency> |
||||
|
<groupId>org.slf4j</groupId> |
||||
|
<artifactId>slf4j-simple</artifactId> |
||||
|
<version>${slf4j.version}</version> |
||||
|
<scope>runtime</scope> |
||||
|
</dependency> |
||||
|
<dependency> |
||||
|
<groupId>org.projectlombok</groupId> |
||||
|
<artifactId>lombok</artifactId> |
||||
|
<version>1.18.30</version> |
||||
|
<scope>provided</scope> |
||||
|
</dependency> |
||||
|
</dependencies> |
||||
|
|
||||
|
<build> |
||||
|
<plugins> |
||||
|
<plugin> |
||||
|
<groupId>org.apache.maven.plugins</groupId> |
||||
|
<artifactId>maven-jar-plugin</artifactId> |
||||
|
<version>3.3.0</version> |
||||
|
<configuration> |
||||
|
<archive> |
||||
|
<manifest> |
||||
|
<mainClass>com.crawler.Main</mainClass> |
||||
|
</manifest> |
||||
|
</archive> |
||||
|
</configuration> |
||||
|
</plugin> |
||||
|
<plugin> |
||||
|
<groupId>org.apache.maven.plugins</groupId> |
||||
|
<artifactId>maven-shade-plugin</artifactId> |
||||
|
<version>3.5.1</version> |
||||
|
<executions> |
||||
|
<execution> |
||||
|
<phase>package</phase> |
||||
|
<goals> |
||||
|
<goal>shade</goal> |
||||
|
</goals> |
||||
|
<configuration> |
||||
|
<filters> |
||||
|
<filter> |
||||
|
<artifact>*:*</artifact> |
||||
|
<excludes> |
||||
|
<exclude>META-INF/*.SF</exclude> |
||||
|
<exclude>META-INF/*.DSA</exclude> |
||||
|
<exclude>META-INF/*.RSA</exclude> |
||||
|
</excludes> |
||||
|
</filter> |
||||
|
</filters> |
||||
|
<transformers> |
||||
|
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> |
||||
|
<mainClass>com.crawler.Main</mainClass> |
||||
|
</transformer> |
||||
|
</transformers> |
||||
|
</configuration> |
||||
|
</execution> |
||||
|
</executions> |
||||
|
</plugin> |
||||
|
</plugins> |
||||
|
</build> |
||||
|
</project> |
||||
@ -0,0 +1,10 @@ |
|||||
|
package com.crawler; |
||||
|
|
||||
|
import com.crawler.controller.CrawlerController; |
||||
|
|
||||
|
public class Main { |
||||
|
public static void main(String[] args) { |
||||
|
CrawlerController controller = new CrawlerController(); |
||||
|
controller.start(); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,119 @@ |
|||||
|
package com.crawler; |
||||
|
|
||||
|
import com.crawler.crawler.GovNewsCrawler; |
||||
|
import com.crawler.crawler.WeatherCrawler; |
||||
|
import com.crawler.crawler.LibraryBookCrawler; |
||||
|
import com.crawler.model.GovNewsData; |
||||
|
import com.crawler.model.WeatherData; |
||||
|
import com.crawler.model.LibraryBookData; |
||||
|
|
||||
|
import java.io.BufferedWriter; |
||||
|
import java.io.FileWriter; |
||||
|
import java.io.IOException; |
||||
|
import java.nio.file.Files; |
||||
|
import java.nio.file.Path; |
||||
|
import java.nio.file.Paths; |
||||
|
import java.time.LocalDateTime; |
||||
|
import java.time.format.DateTimeFormatter; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class MultiCrawlerMain { |
||||
|
private static final String OUTPUT_DIR = "output"; |
||||
|
private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss"); |
||||
|
private static final DateTimeFormatter DISPLAY_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
||||
|
|
||||
|
public static void main(String[] args) { |
||||
|
System.out.println("========================================"); |
||||
|
System.out.println(" 多爬虫任务执行器"); |
||||
|
System.out.println("========================================"); |
||||
|
System.out.println("开始执行爬虫任务..."); |
||||
|
System.out.println("当前时间: " + LocalDateTime.now().format(DISPLAY_FORMATTER)); |
||||
|
System.out.println("========================================\n"); |
||||
|
|
||||
|
try { |
||||
|
Path outputPath = Paths.get(OUTPUT_DIR); |
||||
|
if (!Files.exists(outputPath)) { |
||||
|
Files.createDirectories(outputPath); |
||||
|
} |
||||
|
|
||||
|
// 爬取政务新闻资讯
|
||||
|
System.out.println("【任务1/3】爬取政务新闻资讯..."); |
||||
|
GovNewsCrawler govNewsCrawler = new GovNewsCrawler(); |
||||
|
List<GovNewsData> newsList = govNewsCrawler.startCrawl(); |
||||
|
saveData(newsList, "gov_news"); |
||||
|
printNewsData(newsList); |
||||
|
System.out.println(); |
||||
|
|
||||
|
// 爬取天气预报
|
||||
|
System.out.println("【任务2/3】爬取天气预报..."); |
||||
|
WeatherCrawler weatherCrawler = new WeatherCrawler(); |
||||
|
List<WeatherData> weatherList = weatherCrawler.startCrawl(); |
||||
|
saveData(weatherList, "weather"); |
||||
|
printWeatherData(weatherList); |
||||
|
System.out.println(); |
||||
|
|
||||
|
// 爬取图书馆书目资讯
|
||||
|
System.out.println("【任务3/3】爬取图书馆书目资讯..."); |
||||
|
LibraryBookCrawler libraryCrawler = new LibraryBookCrawler(); |
||||
|
List<LibraryBookData> bookList = libraryCrawler.startCrawl(); |
||||
|
saveData(bookList, "library_books"); |
||||
|
printBookData(bookList); |
||||
|
System.out.println(); |
||||
|
|
||||
|
System.out.println("========================================"); |
||||
|
System.out.println(" 所有爬虫任务执行完成"); |
||||
|
System.out.println("========================================"); |
||||
|
System.out.println("政务新闻: " + newsList.size() + " 条"); |
||||
|
System.out.println("天气预报: " + weatherList.size() + " 条"); |
||||
|
System.out.println("图书馆书目: " + bookList.size() + " 条"); |
||||
|
System.out.println("========================================"); |
||||
|
|
||||
|
} catch (Exception e) { |
||||
|
System.err.println("爬虫任务执行失败: " + e.getMessage()); |
||||
|
e.printStackTrace(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private static <T> void saveData(List<T> data, String prefix) { |
||||
|
String fileName = prefix + "_" + LocalDateTime.now().format(DATE_FORMATTER) + ".txt"; |
||||
|
try (BufferedWriter writer = new BufferedWriter(new FileWriter(Paths.get(OUTPUT_DIR, fileName).toFile()))) { |
||||
|
writer.write("爬取时间: " + LocalDateTime.now().format(DISPLAY_FORMATTER) + "\n"); |
||||
|
writer.write("数据条数: " + data.size() + "\n"); |
||||
|
writer.write("================================\n\n"); |
||||
|
|
||||
|
for (T item : data) { |
||||
|
writer.write(item.toString()); |
||||
|
writer.newLine(); |
||||
|
writer.newLine(); |
||||
|
} |
||||
|
System.out.println("数据已保存到文件: " + fileName); |
||||
|
} catch (IOException e) { |
||||
|
System.err.println("保存文件失败: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private static void printNewsData(List<GovNewsData> data) { |
||||
|
System.out.println("\n-------- 政务新闻资讯 --------"); |
||||
|
for (int i = 0; i < data.size(); i++) { |
||||
|
System.out.println((i + 1) + ". " + data.get(i).getTitle()); |
||||
|
System.out.println(" 来源: " + data.get(i).getSource() + " | 时间: " + data.get(i).getPublishTime()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private static void printWeatherData(List<WeatherData> data) { |
||||
|
System.out.println("\n-------- 天气预报 --------"); |
||||
|
for (int i = 0; i < data.size(); i++) { |
||||
|
WeatherData w = data.get(i); |
||||
|
System.out.println((i + 1) + ". " + w.getCity() + ": " + w.getWeather() + " " + w.getTemperature()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private static void printBookData(List<LibraryBookData> data) { |
||||
|
System.out.println("\n-------- 图书馆书目资讯 --------"); |
||||
|
for (int i = 0; i < data.size(); i++) { |
||||
|
LibraryBookData b = data.get(i); |
||||
|
System.out.println((i + 1) + ". 《" + b.getTitle() + "》"); |
||||
|
System.out.println(" 作者: " + b.getAuthor() + " | 状态: " + b.getStatus()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,75 @@ |
|||||
|
package com.crawler.cli; |
||||
|
|
||||
|
import com.crawler.cli.command.CommandContext; |
||||
|
import com.crawler.cli.command.CommandOutput; |
||||
|
import com.crawler.constant.AnsiColor; |
||||
|
|
||||
|
import java.util.Scanner; |
||||
|
|
||||
|
public class CliApplication { |
||||
|
private final CommandRegistry registry; |
||||
|
private final CommandOutput output; |
||||
|
private final Scanner scanner; |
||||
|
private boolean running; |
||||
|
|
||||
|
public CliApplication() { |
||||
|
this.registry = new CommandRegistry(); |
||||
|
this.output = new CommandOutput(); |
||||
|
this.scanner = new Scanner(System.in); |
||||
|
this.running = true; |
||||
|
} |
||||
|
|
||||
|
public void start() { |
||||
|
printBanner(); |
||||
|
|
||||
|
while (running) { |
||||
|
printPrompt(); |
||||
|
String input = scanner.nextLine(); |
||||
|
|
||||
|
if (input == null || input.isEmpty()) { |
||||
|
continue; |
||||
|
} |
||||
|
|
||||
|
if (input.equalsIgnoreCase("exit") || input.equalsIgnoreCase("quit")) { |
||||
|
break; |
||||
|
} |
||||
|
|
||||
|
CommandRegistry.CommandResult result = registry.execute(input); |
||||
|
|
||||
|
if (!result.isSuccess() && result.getMessage() != null) { |
||||
|
output.printError(result.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
printExit(); |
||||
|
} |
||||
|
|
||||
|
private void printBanner() { |
||||
|
System.out.println(); |
||||
|
System.out.println(AnsiColor.CYAN + "╔═══════════════════════════════════════════════════════════╗" + AnsiColor.RESET); |
||||
|
System.out.println(AnsiColor.CYAN + "║ ║" + AnsiColor.RESET); |
||||
|
System.out.println(AnsiColor.CYAN + "║ " + AnsiColor.BOLD + AnsiColor.WHITE + " 多平台爬虫系统 v2.0 - CLI 模式 " + AnsiColor.CYAN + " ║" + AnsiColor.RESET); |
||||
|
System.out.println(AnsiColor.CYAN + "║ ║" + AnsiColor.RESET); |
||||
|
System.out.println(AnsiColor.CYAN + "║ 支持平台: 政务新闻 | 天气预报 | 图书馆书目 ║" + AnsiColor.RESET); |
||||
|
System.out.println(AnsiColor.CYAN + "║ ║" + AnsiColor.RESET); |
||||
|
System.out.println(AnsiColor.CYAN + "╚═══════════════════════════════════════════════════════════╝" + AnsiColor.RESET); |
||||
|
System.out.println(); |
||||
|
System.out.println("输入 " + AnsiColor.GREEN + "help" + AnsiColor.RESET + " 查看可用命令"); |
||||
|
System.out.println(); |
||||
|
} |
||||
|
|
||||
|
private void printPrompt() { |
||||
|
System.out.print(AnsiColor.BLUE + "crawler> " + AnsiColor.RESET); |
||||
|
} |
||||
|
|
||||
|
private void printExit() { |
||||
|
System.out.println(); |
||||
|
System.out.println(AnsiColor.GREEN + "感谢使用爬虫系统,再见!" + AnsiColor.RESET); |
||||
|
System.out.println(); |
||||
|
} |
||||
|
|
||||
|
public static void main(String[] args) { |
||||
|
CliApplication app = new CliApplication(); |
||||
|
app.start(); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,142 @@ |
|||||
|
package com.crawler.cli; |
||||
|
|
||||
|
import com.crawler.cli.command.Command; |
||||
|
import com.crawler.cli.command.CommandCategory; |
||||
|
import com.crawler.cli.command.CommandContext; |
||||
|
import com.crawler.cli.command.system.ExitCommand; |
||||
|
import com.crawler.cli.command.system.HelpCommand; |
||||
|
import com.crawler.cli.command.crawler.CrawlCommand; |
||||
|
import com.crawler.cli.command.crawler.ListCommand; |
||||
|
import com.crawler.exception.CommandException; |
||||
|
|
||||
|
import java.util.*; |
||||
|
|
||||
|
public class CommandRegistry { |
||||
|
private final Map<String, Command> commandMap = new LinkedHashMap<>(); |
||||
|
private final Map<String, Command> aliasMap = new HashMap<>(); |
||||
|
private final Command[] commands; |
||||
|
|
||||
|
public CommandRegistry() { |
||||
|
initCommands(); |
||||
|
this.commands = commandMap.values().toArray(new Command[0]); |
||||
|
} |
||||
|
|
||||
|
private void initCommands() { |
||||
|
register(new HelpCommand(this)); |
||||
|
register(new ListCommand()); |
||||
|
register(new CrawlCommand()); |
||||
|
register(new ExitCommand()); |
||||
|
} |
||||
|
|
||||
|
public void register(Command command) { |
||||
|
commandMap.put(command.getName().toLowerCase(), command); |
||||
|
|
||||
|
for (String alias : command.getAliases()) { |
||||
|
aliasMap.put(alias.toLowerCase(), command); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public Command getCommand(String name) { |
||||
|
String key = name.toLowerCase(); |
||||
|
|
||||
|
Command command = commandMap.get(key); |
||||
|
if (command != null) { |
||||
|
return command; |
||||
|
} |
||||
|
|
||||
|
return aliasMap.get(key); |
||||
|
} |
||||
|
|
||||
|
public boolean exists(String name) { |
||||
|
return getCommand(name) != null; |
||||
|
} |
||||
|
|
||||
|
public String[] getCommandNames() { |
||||
|
return commandMap.keySet().toArray(new String[0]); |
||||
|
} |
||||
|
|
||||
|
public Command[] getCommands() { |
||||
|
return commands; |
||||
|
} |
||||
|
|
||||
|
public Map<CommandCategory, Command[]> getCommandsByCategory() { |
||||
|
Map<CommandCategory, List<Command>> categoryMap = new EnumMap<>(CommandCategory.class); |
||||
|
|
||||
|
for (CommandCategory category : CommandCategory.values()) { |
||||
|
categoryMap.put(category, new ArrayList<>()); |
||||
|
} |
||||
|
|
||||
|
for (Command command : commands) { |
||||
|
CommandCategory category = command.getCategory(); |
||||
|
categoryMap.get(category).add(command); |
||||
|
} |
||||
|
|
||||
|
Map<CommandCategory, Command[]> result = new EnumMap<>(CommandCategory.class); |
||||
|
for (Map.Entry<CommandCategory, List<Command>> entry : categoryMap.entrySet()) { |
||||
|
result.put(entry.getKey(), entry.getValue().toArray(new Command[0])); |
||||
|
} |
||||
|
|
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
public CommandResult execute(String input) { |
||||
|
if (input == null || input.isEmpty()) { |
||||
|
return CommandResult.failure("输入不能为空"); |
||||
|
} |
||||
|
|
||||
|
String[] parts = input.trim().split("\\s+"); |
||||
|
String commandName = parts[0].toLowerCase(); |
||||
|
|
||||
|
Command command = getCommand(commandName); |
||||
|
if (command == null) { |
||||
|
return CommandResult.failure("未知命令: " + commandName + "\n输入 help 查看可用命令"); |
||||
|
} |
||||
|
|
||||
|
String[] args = new String[parts.length - 1]; |
||||
|
if (parts.length > 1) { |
||||
|
System.arraycopy(parts, 1, args, 0, parts.length - 1); |
||||
|
} |
||||
|
|
||||
|
try { |
||||
|
if (!command.validate(args)) { |
||||
|
return CommandResult.failure("命令参数验证失败: " + command.getUsage()); |
||||
|
} |
||||
|
|
||||
|
CommandContext context = new CommandContext(); |
||||
|
context.setRawArgs(args); |
||||
|
command.execute(context); |
||||
|
|
||||
|
return CommandResult.success(); |
||||
|
} catch (CommandException e) { |
||||
|
return CommandResult.failure(e.getFullMessage()); |
||||
|
} catch (Exception e) { |
||||
|
return CommandResult.failure("命令执行出错: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static class CommandResult { |
||||
|
private final boolean success; |
||||
|
private final String message; |
||||
|
|
||||
|
private CommandResult(boolean success, String message) { |
||||
|
this.success = success; |
||||
|
this.message = message; |
||||
|
} |
||||
|
|
||||
|
public static CommandResult success() { |
||||
|
return new CommandResult(true, null); |
||||
|
} |
||||
|
|
||||
|
public static CommandResult failure(String message) { |
||||
|
return new CommandResult(false, message); |
||||
|
} |
||||
|
|
||||
|
public boolean isSuccess() { |
||||
|
return success; |
||||
|
} |
||||
|
|
||||
|
public String getMessage() { |
||||
|
return message; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,15 @@ |
|||||
|
package com.crawler.cli.command; |
||||
|
|
||||
|
import com.crawler.exception.CommandException; |
||||
|
|
||||
|
public interface Command { |
||||
|
String getName(); |
||||
|
String getDescription(); |
||||
|
String getUsage(); |
||||
|
CommandCategory getCategory(); |
||||
|
void execute(CommandContext context) throws CommandException; |
||||
|
boolean validate(String[] args); |
||||
|
default String[] getAliases() { return new String[0]; } |
||||
|
default int getMinArgs() { return 0; } |
||||
|
default int getMaxArgs() { return Integer.MAX_VALUE; } |
||||
|
} |
||||
@ -0,0 +1,18 @@ |
|||||
|
package com.crawler.cli.command; |
||||
|
|
||||
|
/**
 * Categories under which CLI commands are grouped. Each constant carries a
 * human-readable description.
 */
public enum CommandCategory {

    SYSTEM("系统命令"),
    CRAWLER("爬虫命令"),
    STORAGE("存储命令"),
    UTILITY("工具命令");

    /** Human-readable description of the category. */
    private final String description;

    CommandCategory(String description) {
        this.description = description;
    }

    /** @return the human-readable description of this category */
    public String getDescription() {
        return this.description;
    }
}
||||
@ -0,0 +1,59 @@ |
|||||
|
package com.crawler.cli.command; |
||||
|
|
||||
|
import com.crawler.exception.CommandException; |
||||
|
import java.util.HashMap; |
||||
|
import java.util.Map; |
||||
|
|
||||
|
public class CommandContext { |
||||
|
private final Map<String, Object> attributes = new HashMap<>(); |
||||
|
private String[] rawArgs; |
||||
|
private CommandOutput output; |
||||
|
|
||||
|
public CommandContext() { |
||||
|
this.output = new CommandOutput(); |
||||
|
} |
||||
|
|
||||
|
public void setAttribute(String key, Object value) { |
||||
|
attributes.put(key, value); |
||||
|
} |
||||
|
|
||||
|
public Object getAttribute(String key) { |
||||
|
return attributes.get(key); |
||||
|
} |
||||
|
|
||||
|
@SuppressWarnings("unchecked") |
||||
|
public <T> T getAttribute(String key, Class<T> type) { |
||||
|
Object value = attributes.get(key); |
||||
|
if (value != null && type.isInstance(value)) { |
||||
|
return (T) value; |
||||
|
} |
||||
|
return null; |
||||
|
} |
||||
|
|
||||
|
public String[] getRawArgs() { |
||||
|
return rawArgs; |
||||
|
} |
||||
|
|
||||
|
public void setRawArgs(String[] rawArgs) { |
||||
|
this.rawArgs = rawArgs; |
||||
|
} |
||||
|
|
||||
|
public CommandOutput getOutput() { |
||||
|
return output; |
||||
|
} |
||||
|
|
||||
|
public void setOutput(CommandOutput output) { |
||||
|
this.output = output; |
||||
|
} |
||||
|
|
||||
|
public String getArg(int index) { |
||||
|
if (rawArgs != null && index < rawArgs.length && index >= 0) { |
||||
|
return rawArgs[index]; |
||||
|
} |
||||
|
return null; |
||||
|
} |
||||
|
|
||||
|
public int getArgCount() { |
||||
|
return rawArgs != null ? rawArgs.length : 0; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,93 @@ |
|||||
|
package com.crawler.cli.command; |
||||
|
|
||||
|
import com.crawler.constant.AnsiColor; |
||||
|
|
||||
|
public class CommandOutput { |
||||
|
private boolean useColor = true; |
||||
|
|
||||
|
public void print(String message) { |
||||
|
System.out.print(message); |
||||
|
} |
||||
|
|
||||
|
public void println() { |
||||
|
System.out.println(); |
||||
|
} |
||||
|
|
||||
|
public void println(String message) { |
||||
|
System.out.println(message); |
||||
|
} |
||||
|
|
||||
|
public void printSuccess(String message) { |
||||
|
System.out.println((useColor ? AnsiColor.GREEN : "") + "✓ " + message + (useColor ? AnsiColor.RESET : "")); |
||||
|
} |
||||
|
|
||||
|
public void printError(String message) { |
||||
|
System.err.println((useColor ? AnsiColor.RED : "") + "✗ " + message + (useColor ? AnsiColor.RESET : "")); |
||||
|
} |
||||
|
|
||||
|
public void printInfo(String message) { |
||||
|
System.out.println((useColor ? AnsiColor.CYAN : "") + "ℹ " + message + (useColor ? AnsiColor.RESET : "")); |
||||
|
} |
||||
|
|
||||
|
public void printWarning(String message) { |
||||
|
System.out.println((useColor ? AnsiColor.YELLOW : "") + "⚠ " + message + (useColor ? AnsiColor.RESET : "")); |
||||
|
} |
||||
|
|
||||
|
public void printHeader(String message) { |
||||
|
System.out.println(); |
||||
|
System.out.println((useColor ? AnsiColor.BOLD + AnsiColor.CYAN : "") + "═══ " + message + " ═══" + (useColor ? AnsiColor.RESET : "")); |
||||
|
} |
||||
|
|
||||
|
public void printSubHeader(String message) { |
||||
|
System.out.println((useColor ? AnsiColor.BOLD : "") + "-- " + message + " --" + (useColor ? AnsiColor.RESET : "")); |
||||
|
} |
||||
|
|
||||
|
public void printSeparator() { |
||||
|
System.out.println((useColor ? AnsiColor.DIM : "") + "─".repeat(60) + (useColor ? AnsiColor.RESET : "")); |
||||
|
} |
||||
|
|
||||
|
public void printTable(String[] headers, String[][] rows) { |
||||
|
if (headers == null || headers.length == 0) return; |
||||
|
|
||||
|
int[] colWidths = new int[headers.length]; |
||||
|
for (int i = 0; i < headers.length; i++) { |
||||
|
colWidths[i] = headers[i].length(); |
||||
|
} |
||||
|
|
||||
|
for (String[] row : rows) { |
||||
|
if (row != null) { |
||||
|
for (int i = 0; i < Math.min(row.length, colWidths.length); i++) { |
||||
|
colWidths[i] = Math.max(colWidths[i], row[i] != null ? row[i].length() : 0); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
printSeparator(); |
||||
|
StringBuilder headerLine = new StringBuilder("│"); |
||||
|
for (int i = 0; i < headers.length; i++) { |
||||
|
headerLine.append(String.format(" %-" + colWidths[i] + "s │", headers[i])); |
||||
|
} |
||||
|
System.out.println((useColor ? AnsiColor.BOLD + AnsiColor.CYAN : "") + headerLine + (useColor ? AnsiColor.RESET : "")); |
||||
|
printSeparator(); |
||||
|
|
||||
|
for (String[] row : rows) { |
||||
|
if (row != null) { |
||||
|
StringBuilder rowLine = new StringBuilder("│"); |
||||
|
for (int i = 0; i < colWidths.length; i++) { |
||||
|
String cell = i < row.length && row[i] != null ? row[i] : ""; |
||||
|
rowLine.append(String.format(" %-" + colWidths[i] + "s │", cell)); |
||||
|
} |
||||
|
System.out.println(rowLine); |
||||
|
} |
||||
|
} |
||||
|
printSeparator(); |
||||
|
} |
||||
|
|
||||
|
public void newLine() { |
||||
|
System.out.println(); |
||||
|
} |
||||
|
|
||||
|
public void setUseColor(boolean useColor) { |
||||
|
this.useColor = useColor; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,193 @@ |
|||||
|
package com.crawler.cli.command.crawler; |
||||
|
|
||||
|
import com.crawler.cli.command.Command; |
||||
|
import com.crawler.cli.command.CommandCategory; |
||||
|
import com.crawler.cli.command.CommandContext; |
||||
|
import com.crawler.cli.command.CommandOutput; |
||||
|
import com.crawler.exception.CommandException; |
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
import com.crawler.factory.CrawlerFactory; |
||||
|
import com.crawler.model.BaseMediaData; |
||||
|
import com.crawler.strategy.crawler.CrawlStrategy; |
||||
|
import com.crawler.strategy.crawler.CrawlStrategyFactory; |
||||
|
import com.crawler.strategy.storage.StorageStrategy; |
||||
|
import com.crawler.strategy.storage.StorageStrategyFactory; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.Arrays; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class CrawlCommand implements Command { |
||||
|
private final CommandOutput output = new CommandOutput(); |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "crawl"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getDescription() { |
||||
|
return "爬取指定平台的热门内容"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getUsage() { |
||||
|
return "crawl [平台名] [--count=N] [--format=json|txt|csv] | crawl all"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public CommandCategory getCategory() { |
||||
|
return CommandCategory.CRAWLER; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String[] getAliases() { |
||||
|
return new String[]{"爬取", "抓取"}; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public int getMinArgs() { |
||||
|
return 0; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public boolean validate(String[] args) { |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(CommandContext context) throws CommandException { |
||||
|
String[] args = context.getRawArgs(); |
||||
|
|
||||
|
if (args.length == 0) { |
||||
|
output.printError("请指定要爬取的平台"); |
||||
|
output.printInfo("使用方法: " + getUsage()); |
||||
|
output.printInfo("支持的平台: " + String.join(", ", CrawlerFactory.getSupportedPlatforms())); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
String platform = args[0].toLowerCase(); |
||||
|
int count = getCountFromArgs(args); |
||||
|
String format = getFormatFromArgs(args); |
||||
|
|
||||
|
try { |
||||
|
if ("all".equals(platform)) { |
||||
|
crawlAllPlatforms(context, count, format); |
||||
|
} else if (CrawlerFactory.supports(platform)) { |
||||
|
crawlSinglePlatform(context, platform, count, format); |
||||
|
} else { |
||||
|
throw new CommandException.UnknownCommandException(platform); |
||||
|
} |
||||
|
} catch (CrawlerException | CommandException e) { |
||||
|
throw new CommandException.CommandExecutionException(getName(), e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void crawlSinglePlatform(CommandContext context, String platform, int count, String format) throws CommandException { |
||||
|
output.printHeader("爬取 " + platform + " 热门内容"); |
||||
|
|
||||
|
try { |
||||
|
CrawlStrategy<?> strategy = CrawlStrategyFactory.getStrategy(platform); |
||||
|
List<BaseMediaData> dataList = new ArrayList<>(); |
||||
|
for (BaseMediaData item : strategy.crawl(count)) { |
||||
|
dataList.add(item); |
||||
|
} |
||||
|
|
||||
|
output.println("成功获取 " + dataList.size() + " 条数据"); |
||||
|
output.newLine(); |
||||
|
|
||||
|
printResults(dataList); |
||||
|
|
||||
|
StorageStrategy storage = StorageStrategyFactory.getStrategy(format); |
||||
|
String filePath = storage.save(dataList, platform); |
||||
|
|
||||
|
output.printSuccess("数据已保存到: " + filePath); |
||||
|
context.setAttribute("lastResult", dataList); |
||||
|
context.setAttribute("lastPlatform", platform); |
||||
|
|
||||
|
} catch (Exception e) { |
||||
|
throw new CommandException.CommandExecutionException(getName(), e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void crawlAllPlatforms(CommandContext context, int count, String format) throws CommandException { |
||||
|
output.printHeader("爬取所有平台"); |
||||
|
List<BaseMediaData> allData = new ArrayList<>(); |
||||
|
|
||||
|
String[] skipPlatforms = {"b站", "抖音", "小红书"}; |
||||
|
|
||||
|
for (String platform : CrawlerFactory.getSupportedPlatforms()) { |
||||
|
if (!Arrays.asList(skipPlatforms).contains(platform)) { |
||||
|
output.printInfo("正在爬取: " + platform + "..."); |
||||
|
|
||||
|
try { |
||||
|
CrawlStrategy<?> strategy = CrawlStrategyFactory.getStrategy(platform); |
||||
|
List<BaseMediaData> data = new ArrayList<>(); |
||||
|
for (BaseMediaData item : strategy.crawl(count)) { |
||||
|
data.add(item); |
||||
|
} |
||||
|
allData.addAll(data); |
||||
|
output.printSuccess(platform + ": 获取 " + data.size() + " 条"); |
||||
|
} catch (Exception e) { |
||||
|
output.printWarning(platform + " 爬取失败: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
output.newLine(); |
||||
|
output.printSuccess("所有平台爬取完成,共获取 " + allData.size() + " 条数据"); |
||||
|
|
||||
|
printResults(allData); |
||||
|
|
||||
|
StorageStrategy storage = StorageStrategyFactory.getStrategy(format); |
||||
|
String filePath = storage.save(allData, "all_platforms"); |
||||
|
output.printSuccess("数据已保存到: " + filePath); |
||||
|
} |
||||
|
|
||||
|
private void printResults(List<BaseMediaData> dataList) { |
||||
|
if (dataList.isEmpty()) { |
||||
|
output.printWarning("暂无数据"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
String[][] rows = new String[dataList.size()][4]; |
||||
|
for (int i = 0; i < dataList.size(); i++) { |
||||
|
BaseMediaData item = dataList.get(i); |
||||
|
rows[i][0] = String.valueOf(i + 1); |
||||
|
rows[i][1] = truncate(item.getTitle(), 30); |
||||
|
rows[i][2] = truncate(item.getAuthor(), 12); |
||||
|
rows[i][3] = item.getPlatform(); |
||||
|
} |
||||
|
|
||||
|
output.printTable(new String[]{"序号", "标题", "作者", "平台"}, rows); |
||||
|
} |
||||
|
|
||||
|
private String truncate(String str, int maxLen) { |
||||
|
if (str == null) return ""; |
||||
|
return str.length() > maxLen ? str.substring(0, maxLen - 3) + "..." : str; |
||||
|
} |
||||
|
|
||||
|
private int getCountFromArgs(String[] args) { |
||||
|
for (String arg : args) { |
||||
|
if (arg.startsWith("--count=")) { |
||||
|
try { |
||||
|
return Integer.parseInt(arg.substring(8)); |
||||
|
} catch (NumberFormatException ignored) { |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
return 10; |
||||
|
} |
||||
|
|
||||
|
private String getFormatFromArgs(String[] args) { |
||||
|
for (String arg : args) { |
||||
|
if (arg.startsWith("--format=")) { |
||||
|
String format = arg.substring(9).toLowerCase(); |
||||
|
if (format.equals("json") || format.equals("txt") || format.equals("csv")) { |
||||
|
return format; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
return "txt"; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,103 @@ |
|||||
|
package com.crawler.cli.command.crawler; |
||||
|
|
||||
|
import com.crawler.cli.command.Command; |
||||
|
import com.crawler.cli.command.CommandCategory; |
||||
|
import com.crawler.cli.command.CommandContext; |
||||
|
import com.crawler.cli.command.CommandOutput; |
||||
|
import com.crawler.exception.CommandException; |
||||
|
import com.crawler.factory.CrawlerFactory; |
||||
|
|
||||
|
public class ListCommand implements Command { |
||||
|
private final CommandOutput output = new CommandOutput(); |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "list"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getDescription() { |
||||
|
return "列出所有支持的平台"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getUsage() { |
||||
|
return "list [platform|crawler|storage]"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public CommandCategory getCategory() { |
||||
|
return CommandCategory.UTILITY; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String[] getAliases() { |
||||
|
return new String[]{"ls", "列表"}; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public boolean validate(String[] args) { |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(CommandContext context) throws CommandException { |
||||
|
String[] args = context.getRawArgs(); |
||||
|
String filter = args.length > 0 ? args[0].toLowerCase() : "platform"; |
||||
|
|
||||
|
switch (filter) { |
||||
|
case "platform": |
||||
|
case "crawler": |
||||
|
listPlatforms(); |
||||
|
break; |
||||
|
case "storage": |
||||
|
listStorage(); |
||||
|
break; |
||||
|
default: |
||||
|
listPlatforms(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void listPlatforms() { |
||||
|
output.printHeader("支持的爬虫平台"); |
||||
|
|
||||
|
String[] platforms = CrawlerFactory.getSupportedPlatforms(); |
||||
|
String[][] rows = new String[platforms.length][2]; |
||||
|
|
||||
|
for (int i = 0; i < platforms.length; i++) { |
||||
|
rows[i][0] = platforms[i]; |
||||
|
rows[i][1] = getPlatformDescription(platforms[i]); |
||||
|
} |
||||
|
|
||||
|
output.printTable(new String[]{"平台名称", "描述"}, rows); |
||||
|
output.newLine(); |
||||
|
output.printInfo("使用方法: crawl [平台名]"); |
||||
|
output.printInfo("示例: crawl bilibili"); |
||||
|
} |
||||
|
|
||||
|
private void listStorage() { |
||||
|
output.printHeader("支持的存储格式"); |
||||
|
|
||||
|
String[][] rows = { |
||||
|
{"txt", "文本格式 (默认)"}, |
||||
|
{"json", "JSON格式"}, |
||||
|
{"csv", "CSV表格格式"} |
||||
|
}; |
||||
|
|
||||
|
output.printTable(new String[]{"格式", "描述"}, rows); |
||||
|
output.newLine(); |
||||
|
output.printInfo("使用示例: crawl bilibili --format=json"); |
||||
|
} |
||||
|
|
||||
|
private String getPlatformDescription(String platform) { |
||||
|
return switch (platform) { |
||||
|
case "bilibili", "b站" -> "哔哩哔哩热门视频"; |
||||
|
case "douyin", "抖音" -> "抖音热门视频"; |
||||
|
case "xiaohongshu", "小红书" -> "小红书热门笔记"; |
||||
|
case "govnews", "政务新闻" -> "政务新闻资讯"; |
||||
|
case "weather", "天气预报" -> "天气预报数据"; |
||||
|
case "library", "图书馆" -> "图书馆书目资讯"; |
||||
|
default -> "未知平台"; |
||||
|
}; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,49 @@ |
|||||
|
package com.crawler.cli.command.system; |
||||
|
|
||||
|
import com.crawler.cli.command.Command; |
||||
|
import com.crawler.cli.command.CommandCategory; |
||||
|
import com.crawler.cli.command.CommandContext; |
||||
|
import com.crawler.cli.command.CommandOutput; |
||||
|
import com.crawler.exception.CommandException; |
||||
|
|
||||
|
public class ExitCommand implements Command { |
||||
|
private final CommandOutput output = new CommandOutput(); |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "exit"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getDescription() { |
||||
|
return "退出程序"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getUsage() { |
||||
|
return "exit"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public CommandCategory getCategory() { |
||||
|
return CommandCategory.SYSTEM; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String[] getAliases() { |
||||
|
return new String[]{"quit", "q", "退出"}; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public boolean validate(String[] args) { |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(CommandContext context) throws CommandException { |
||||
|
output.println(); |
||||
|
output.printSuccess("感谢使用爬虫系统,再见!"); |
||||
|
output.println(); |
||||
|
System.exit(0); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,108 @@ |
|||||
|
package com.crawler.cli.command.system; |
||||
|
|
||||
|
import com.crawler.cli.command.Command; |
||||
|
import com.crawler.cli.command.CommandCategory; |
||||
|
import com.crawler.cli.command.CommandContext; |
||||
|
import com.crawler.cli.command.CommandOutput; |
||||
|
import com.crawler.cli.CommandRegistry; |
||||
|
import com.crawler.exception.CommandException; |
||||
|
import com.crawler.factory.CrawlerFactory; |
||||
|
|
||||
|
import java.util.Map; |
||||
|
|
||||
|
public class HelpCommand implements Command { |
||||
|
private final CommandOutput output = new CommandOutput(); |
||||
|
private final CommandRegistry registry; |
||||
|
|
||||
|
public HelpCommand(CommandRegistry registry) { |
||||
|
this.registry = registry; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "help"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getDescription() { |
||||
|
return "显示帮助信息"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getUsage() { |
||||
|
return "help [命令名]"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public CommandCategory getCategory() { |
||||
|
return CommandCategory.SYSTEM; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String[] getAliases() { |
||||
|
return new String[]{"h", "帮助", "?"}; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public boolean validate(String[] args) { |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(CommandContext context) throws CommandException { |
||||
|
String[] args = context.getRawArgs(); |
||||
|
|
||||
|
if (args.length > 0) { |
||||
|
showCommandHelp(args[0]); |
||||
|
} else { |
||||
|
showGeneralHelp(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void showGeneralHelp() { |
||||
|
output.printHeader("帮助信息"); |
||||
|
output.println("欢迎使用多平台爬虫系统!"); |
||||
|
output.newLine(); |
||||
|
|
||||
|
Map<CommandCategory, Command[]> commandsByCategory = registry.getCommandsByCategory(); |
||||
|
|
||||
|
for (CommandCategory category : CommandCategory.values()) { |
||||
|
Command[] commands = commandsByCategory.get(category); |
||||
|
if (commands != null && commands.length > 0) { |
||||
|
output.printSubHeader(category.getDescription()); |
||||
|
for (Command cmd : commands) { |
||||
|
output.println(String.format(" %-15s %s", cmd.getName(), cmd.getDescription())); |
||||
|
for (String alias : cmd.getAliases()) { |
||||
|
output.println(String.format(" %-15s (别名)", alias)); |
||||
|
} |
||||
|
} |
||||
|
output.newLine(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
output.printSeparator(); |
||||
|
output.printInfo("支持的爬虫平台: " + String.join(", ", CrawlerFactory.getSupportedPlatforms())); |
||||
|
output.newLine(); |
||||
|
output.printInfo("使用示例:"); |
||||
|
output.println(" crawl bilibili # 爬取B站热门视频"); |
||||
|
output.println(" crawl weather # 爬取天气预报"); |
||||
|
output.println(" crawl all # 爬取所有平台"); |
||||
|
output.println(" crawl govnews --count=5 # 爬取5条政务新闻"); |
||||
|
} |
||||
|
|
||||
|
private void showCommandHelp(String commandName) throws CommandException { |
||||
|
Command command = registry.getCommand(commandName); |
||||
|
if (command == null) { |
||||
|
throw new CommandException.UnknownCommandException(commandName); |
||||
|
} |
||||
|
|
||||
|
output.printHeader("命令: " + command.getName()); |
||||
|
output.println("描述: " + command.getDescription()); |
||||
|
output.println("用法: " + command.getUsage()); |
||||
|
output.println("类别: " + command.getCategory().getDescription()); |
||||
|
|
||||
|
if (command.getAliases().length > 0) { |
||||
|
output.println("别名: " + String.join(", ", command.getAliases())); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,8 @@ |
|||||
|
package com.crawler.command; |
||||
|
|
||||
|
/**
 * A named console command that the controller dispatches by its name.
 */
public interface Command {

    /** @return the name used to register and look up this command */
    String getName();

    /** @return a one-line description shown in help listings */
    String getDescription();

    /** @return the usage string shown in help listings */
    String getUsage();

    /**
     * Runs the command.
     *
     * @param args the tokens that followed the command name on the input line
     */
    void execute(String[] args);
}
||||
@ -0,0 +1,107 @@ |
|||||
|
package com.crawler.command; |
||||
|
|
||||
|
import com.crawler.crawler.AbstractCrawler; |
||||
|
import com.crawler.factory.CrawlerFactory; |
||||
|
import com.crawler.model.BaseMediaData; |
||||
|
import com.crawler.storage.DataStorage; |
||||
|
import com.crawler.storage.StorageFactory; |
||||
|
import com.crawler.view.ConsoleView; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.Arrays; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class CrawlCommand implements Command { |
||||
|
private final ConsoleView view = ConsoleView.getInstance(); |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { return "crawl"; } |
||||
|
|
||||
|
@Override |
||||
|
public String getDescription() { return "爬取指定平台的热门内容"; } |
||||
|
|
||||
|
@Override |
||||
|
public String getUsage() { return "crawl [平台名] | crawl all"; } |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args) { |
||||
|
if (args.length == 0) { |
||||
|
view.printError("请指定要爬取的平台,如: crawl bilibili"); |
||||
|
view.printInfo("支持的平台: " + String.join(", ", CrawlerFactory.getSupportedPlatforms())); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
String platform = args[0].toLowerCase(); |
||||
|
|
||||
|
try { |
||||
|
if ("all".equals(platform)) { |
||||
|
crawlAllPlatforms(); |
||||
|
} else if (CrawlerFactory.supports(platform)) { |
||||
|
crawlSinglePlatform(platform); |
||||
|
} else { |
||||
|
view.printError("不支持的平台: " + platform); |
||||
|
view.printInfo("支持的平台: " + String.join(", ", CrawlerFactory.getSupportedPlatforms())); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
view.printError("爬取失败: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void crawlSinglePlatform(String platform) { |
||||
|
view.printHeader("爬取 " + platform + " 热门内容"); |
||||
|
|
||||
|
AbstractCrawler<BaseMediaData> crawler = CrawlerFactory.getCrawler(platform); |
||||
|
List<BaseMediaData> dataList = crawler.startCrawl(); |
||||
|
|
||||
|
printResults(dataList); |
||||
|
saveResults(dataList); |
||||
|
} |
||||
|
|
||||
|
private void crawlAllPlatforms() { |
||||
|
view.printHeader("爬取所有平台"); |
||||
|
List<BaseMediaData> allData = new ArrayList<>(); |
||||
|
|
||||
|
for (String platform : CrawlerFactory.getSupportedPlatforms()) { |
||||
|
if (!Arrays.asList("b站", "抖音", "小红书").contains(platform)) { |
||||
|
view.printInfo("正在爬取: " + platform); |
||||
|
AbstractCrawler<BaseMediaData> crawler = CrawlerFactory.getCrawler(platform); |
||||
|
allData.addAll(crawler.startCrawl()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
view.printSuccess("所有平台爬取完成,共获取 " + allData.size() + " 条数据"); |
||||
|
printResults(allData); |
||||
|
saveResults(allData); |
||||
|
} |
||||
|
|
||||
|
private void printResults(List<BaseMediaData> dataList) { |
||||
|
if (dataList.isEmpty()) { |
||||
|
view.printWarning("暂无数据"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
view.newLine(); |
||||
|
for (int i = 0; i < dataList.size(); i++) { |
||||
|
BaseMediaData item = dataList.get(i); |
||||
|
view.println(String.format("%-3d | %-30s | %-12s | %-10s | %s", |
||||
|
i + 1, |
||||
|
item.getTitle().length() > 28 ? item.getTitle().substring(0, 25) + "..." : item.getTitle(), |
||||
|
item.getAuthor().length() > 10 ? item.getAuthor().substring(0, 9) + "..." : item.getAuthor(), |
||||
|
formatViewCount(item.getViewCount()), |
||||
|
item.getPlatform())); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void saveResults(List<BaseMediaData> dataList) { |
||||
|
DataStorage<BaseMediaData> storage = StorageFactory.getStorage("txt"); |
||||
|
storage.save(dataList); |
||||
|
view.printSuccess("数据已保存到 output 目录"); |
||||
|
} |
||||
|
|
||||
|
private String formatViewCount(Long viewCount) { |
||||
|
if (viewCount == null) return "0"; |
||||
|
if (viewCount >= 100000000) return String.format("%.1f亿", viewCount / 100000000.0); |
||||
|
if (viewCount >= 10000) return String.format("%.1f万", viewCount / 10000.0); |
||||
|
return String.valueOf(viewCount); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,22 @@ |
|||||
|
package com.crawler.command; |
||||
|
|
||||
|
import com.crawler.view.ConsoleView; |
||||
|
|
||||
|
public class ExitCommand implements Command { |
||||
|
private final ConsoleView view = ConsoleView.getInstance(); |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { return "exit"; } |
||||
|
|
||||
|
@Override |
||||
|
public String getDescription() { return "退出程序"; } |
||||
|
|
||||
|
@Override |
||||
|
public String getUsage() { return "exit"; } |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args) { |
||||
|
view.printExit(); |
||||
|
System.exit(0); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,44 @@ |
|||||
|
package com.crawler.command; |
||||
|
|
||||
|
import com.crawler.factory.CrawlerFactory; |
||||
|
import com.crawler.view.ConsoleView; |
||||
|
|
||||
|
import java.util.Map; |
||||
|
|
||||
|
public class HelpCommand implements Command { |
||||
|
private final ConsoleView view = ConsoleView.getInstance(); |
||||
|
private final Map<String, Command> commandMap; |
||||
|
|
||||
|
public HelpCommand(Map<String, Command> commandMap) { |
||||
|
this.commandMap = commandMap; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { return "help"; } |
||||
|
|
||||
|
@Override |
||||
|
public String getDescription() { return "显示帮助信息"; } |
||||
|
|
||||
|
@Override |
||||
|
public String getUsage() { return "help [command]"; } |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args) { |
||||
|
view.printHeader("帮助信息"); |
||||
|
|
||||
|
String[][] data = new String[commandMap.size()][3]; |
||||
|
int i = 0; |
||||
|
for (Command cmd : commandMap.values()) { |
||||
|
data[i][0] = cmd.getName(); |
||||
|
data[i][1] = cmd.getDescription(); |
||||
|
data[i][2] = cmd.getUsage(); |
||||
|
i++; |
||||
|
} |
||||
|
|
||||
|
view.printTable(new String[]{"命令", "描述", "用法"}, data); |
||||
|
view.printSeparator(); |
||||
|
|
||||
|
view.printInfo("支持的爬虫平台: " + String.join(", ", CrawlerFactory.getSupportedPlatforms())); |
||||
|
view.printInfo("示例: crawl bilibili - 爬取B站热门视频"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,35 @@ |
|||||
|
package com.crawler.command; |
||||
|
|
||||
|
import com.crawler.factory.CrawlerFactory; |
||||
|
import com.crawler.view.ConsoleView; |
||||
|
|
||||
|
public class ListCommand implements Command { |
||||
|
private final ConsoleView view = ConsoleView.getInstance(); |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { return "list"; } |
||||
|
|
||||
|
@Override |
||||
|
public String getDescription() { return "列出所有支持的平台"; } |
||||
|
|
||||
|
@Override |
||||
|
public String getUsage() { return "list"; } |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args) { |
||||
|
view.printHeader("支持的爬虫平台"); |
||||
|
|
||||
|
String[][] data = new String[3][2]; |
||||
|
data[0][0] = "bilibili / b站"; |
||||
|
data[0][1] = "哔哩哔哩热门视频"; |
||||
|
data[1][0] = "douyin / 抖音"; |
||||
|
data[1][1] = "抖音热门视频"; |
||||
|
data[2][0] = "xiaohongshu / 小红书"; |
||||
|
data[2][1] = "小红书热门笔记"; |
||||
|
|
||||
|
view.printTable(new String[]{"平台名称", "描述"}, data); |
||||
|
view.printSeparator(); |
||||
|
view.printInfo("使用方法: crawl [平台名]"); |
||||
|
view.printInfo("爬取所有平台: crawl all"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,28 @@ |
|||||
|
package com.crawler.constant; |
||||
|
|
||||
|
/**
 * ANSI escape sequences for terminal styling, plus helpers that wrap a text in a style
 * and the {@link #RESET} sequence. Not instantiable.
 */
public final class AnsiColor {

    /** Clears every style set by a previous sequence. */
    public static final String RESET = "\u001B[0m";

    // Foreground colours.
    public static final String BLACK = "\u001B[30m";
    public static final String RED = "\u001B[31m";
    public static final String GREEN = "\u001B[32m";
    public static final String YELLOW = "\u001B[33m";
    public static final String BLUE = "\u001B[34m";
    public static final String PURPLE = "\u001B[35m";
    public static final String CYAN = "\u001B[36m";
    public static final String WHITE = "\u001B[37m";

    // Text attributes.
    public static final String BOLD = "\u001B[1m";
    public static final String DIM = "\u001B[2m";

    private AnsiColor() {
    }

    /**
     * Wraps {@code text} in a style.
     *
     * @param text  the text to style
     * @param color one of the sequences above, or several concatenated
     * @return {@code color}, then {@code text}, then {@link #RESET}
     */
    public static String color(String text, String color) {
        return color + text + RESET;
    }

    /** Green text. */
    public static String success(String text) {
        return color(text, GREEN);
    }

    /** Red text. */
    public static String error(String text) {
        return color(text, RED);
    }

    /** Yellow text. */
    public static String warning(String text) {
        return color(text, YELLOW);
    }

    /** Cyan text. */
    public static String info(String text) {
        return color(text, CYAN);
    }

    /** Bold blue text. */
    public static String header(String text) {
        return color(text, BOLD + BLUE);
    }

    /** Bold text. */
    public static String bold(String text) {
        return color(text, BOLD);
    }
}
||||
@ -0,0 +1,62 @@ |
|||||
|
package com.crawler.controller; |
||||
|
|
||||
|
import com.crawler.command.Command; |
||||
|
import com.crawler.command.CrawlCommand; |
||||
|
import com.crawler.command.ExitCommand; |
||||
|
import com.crawler.command.HelpCommand; |
||||
|
import com.crawler.command.ListCommand; |
||||
|
import com.crawler.view.ConsoleView; |
||||
|
|
||||
|
import java.util.LinkedHashMap; |
||||
|
import java.util.Map; |
||||
|
|
||||
|
public class CrawlerController { |
||||
|
private final ConsoleView view = ConsoleView.getInstance(); |
||||
|
private final Map<String, Command> commandMap = new LinkedHashMap<>(); |
||||
|
private boolean running = true; |
||||
|
|
||||
|
public CrawlerController() { |
||||
|
initCommands(); |
||||
|
} |
||||
|
|
||||
|
private void initCommands() { |
||||
|
commandMap.put("help", new HelpCommand(commandMap)); |
||||
|
commandMap.put("list", new ListCommand()); |
||||
|
commandMap.put("crawl", new CrawlCommand()); |
||||
|
commandMap.put("exit", new ExitCommand()); |
||||
|
} |
||||
|
|
||||
|
public void start() { |
||||
|
view.printBanner(); |
||||
|
|
||||
|
while (running) { |
||||
|
view.printPrompt(); |
||||
|
String input = view.readLine(); |
||||
|
|
||||
|
if (input == null || input.isEmpty()) continue; |
||||
|
|
||||
|
executeCommand(input); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void executeCommand(String input) { |
||||
|
String[] parts = input.split("\\s+"); |
||||
|
String cmdName = parts[0].toLowerCase(); |
||||
|
|
||||
|
String[] args = new String[parts.length - 1]; |
||||
|
System.arraycopy(parts, 1, args, 0, parts.length - 1); |
||||
|
|
||||
|
Command command = commandMap.get(cmdName); |
||||
|
|
||||
|
if (command != null) { |
||||
|
try { |
||||
|
command.execute(args); |
||||
|
} catch (Exception e) { |
||||
|
view.printError("命令执行出错: " + e.getMessage()); |
||||
|
} |
||||
|
} else { |
||||
|
view.printError("未知命令: " + cmdName); |
||||
|
view.printInfo("输入 help 查看可用命令"); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,49 @@ |
|||||
|
package com.crawler.crawler; |
||||
|
|
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
import com.crawler.model.BaseMediaData; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
public abstract class AbstractCrawler<T extends BaseMediaData> { |
||||
|
protected static final Logger logger = LoggerFactory.getLogger(AbstractCrawler.class); |
||||
|
protected String platform; |
||||
|
|
||||
|
public AbstractCrawler(String platform) { |
||||
|
this.platform = platform; |
||||
|
} |
||||
|
|
||||
|
public final List<T> startCrawl() { |
||||
|
logger.info("【{}】开始爬取", platform); |
||||
|
validateConfig(); |
||||
|
|
||||
|
try { |
||||
|
beforeCrawl(); |
||||
|
List<T> result = doCrawl(); |
||||
|
afterCrawl(); |
||||
|
|
||||
|
logger.info("【{}】爬取完成,共获取 {} 条数据", platform, result.size()); |
||||
|
return result; |
||||
|
} catch (Exception e) { |
||||
|
logger.error("【{}】爬取失败: {}", platform, e.getMessage(), e); |
||||
|
throw new CrawlerException("爬取失败: " + e.getMessage(), e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
protected void validateConfig() { |
||||
|
} |
||||
|
|
||||
|
protected void beforeCrawl() { |
||||
|
} |
||||
|
|
||||
|
protected abstract List<T> doCrawl(); |
||||
|
|
||||
|
protected void afterCrawl() { |
||||
|
} |
||||
|
|
||||
|
public String getPlatform() { |
||||
|
return platform; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,102 @@ |
|||||
|
package com.crawler.crawler; |
||||
|
|
||||
|
import com.crawler.model.BilibiliVideoData; |
||||
|
import com.fasterxml.jackson.databind.JsonNode; |
||||
|
import com.fasterxml.jackson.databind.ObjectMapper; |
||||
|
|
||||
|
import java.net.URI; |
||||
|
import java.net.http.HttpClient; |
||||
|
import java.net.http.HttpRequest; |
||||
|
import java.net.http.HttpResponse; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class BilibiliCrawler extends AbstractCrawler<BilibiliVideoData> { |
||||
|
private static final String API_URL = "https://api.bilibili.com/x/web-interface/popular?ps=50&pn=1"; |
||||
|
private final ObjectMapper objectMapper = new ObjectMapper(); |
||||
|
|
||||
|
public BilibiliCrawler() { |
||||
|
super("bilibili"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<BilibiliVideoData> doCrawl() { |
||||
|
List<BilibiliVideoData> videoList = new ArrayList<>(); |
||||
|
|
||||
|
try { |
||||
|
String response = sendGetRequest(API_URL); |
||||
|
JsonNode root = objectMapper.readTree(response); |
||||
|
|
||||
|
if (root.has("data") && root.get("data").has("list")) { |
||||
|
JsonNode listNode = root.get("data").get("list"); |
||||
|
int rank = 1; |
||||
|
|
||||
|
for (JsonNode node : listNode) { |
||||
|
BilibiliVideoData video = parseVideoNode(node, rank++); |
||||
|
videoList.add(video); |
||||
|
} |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
logger.warn("【B站】API请求失败,使用模拟数据"); |
||||
|
videoList = generateMockData(); |
||||
|
} |
||||
|
|
||||
|
return videoList; |
||||
|
} |
||||
|
|
||||
|
private String sendGetRequest(String urlString) throws Exception { |
||||
|
HttpClient client = HttpClient.newHttpClient(); |
||||
|
HttpRequest request = HttpRequest.newBuilder() |
||||
|
.uri(URI.create(urlString)) |
||||
|
.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") |
||||
|
.GET() |
||||
|
.build(); |
||||
|
|
||||
|
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString()); |
||||
|
return response.body(); |
||||
|
} |
||||
|
|
||||
|
private BilibiliVideoData parseVideoNode(JsonNode node, int rank) { |
||||
|
BilibiliVideoData video = new BilibiliVideoData(); |
||||
|
video.setRank(rank); |
||||
|
video.setBvid(node.has("bvid") ? node.get("bvid").asText() : ""); |
||||
|
video.setTitle(node.has("title") ? node.get("title").asText() : ""); |
||||
|
video.setAuthor(node.has("owner") && node.get("owner").has("name") ? |
||||
|
node.get("owner").get("name").asText() : ""); |
||||
|
video.setViewCount(node.has("stat") && node.get("stat").has("view") ? |
||||
|
node.get("stat").get("view").asLong() : 0L); |
||||
|
video.setUrl("https://www.bilibili.com/video/" + video.getBvid()); |
||||
|
video.setTname(node.has("tname") ? node.get("tname").asText() : ""); |
||||
|
|
||||
|
if (node.has("stat")) { |
||||
|
JsonNode stat = node.get("stat"); |
||||
|
video.setDanmakuCount(stat.has("danmaku") ? stat.get("danmaku").asLong() : 0L); |
||||
|
video.setLikeCount(stat.has("like") ? stat.get("like").asLong() : 0L); |
||||
|
video.setCommentCount(stat.has("reply") ? stat.get("reply").asLong() : 0L); |
||||
|
video.setCoinCount(stat.has("coin") ? stat.get("coin").asLong() : 0L); |
||||
|
video.setShareCount(stat.has("share") ? stat.get("share").asLong() : 0L); |
||||
|
} |
||||
|
|
||||
|
return video; |
||||
|
} |
||||
|
|
||||
|
private List<BilibiliVideoData> generateMockData() { |
||||
|
List<BilibiliVideoData> list = new ArrayList<>(); |
||||
|
String[] titles = {"AI技术最新突破", "2024年度游戏盘点", "美食探店Vlog", "旅行日记", "科技产品评测"}; |
||||
|
String[] authors = {"科技前沿", "游戏频道", "美食达人", "旅行博主", "数码评测"}; |
||||
|
long[] views = {1250000, 890000, 450000, 320000, 560000}; |
||||
|
|
||||
|
for (int i = 0; i < 5; i++) { |
||||
|
BilibiliVideoData video = new BilibiliVideoData(); |
||||
|
video.setRank(i + 1); |
||||
|
video.setBvid("BV" + (1000 + i)); |
||||
|
video.setTitle(titles[i]); |
||||
|
video.setAuthor(authors[i]); |
||||
|
video.setViewCount(views[i]); |
||||
|
video.setUrl("https://www.bilibili.com/video/BV" + (1000 + i)); |
||||
|
video.setTname("综合"); |
||||
|
list.add(video); |
||||
|
} |
||||
|
return list; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,38 @@ |
|||||
|
package com.crawler.crawler; |
||||
|
|
||||
|
import com.crawler.model.DouyinVideoData; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class DouyinCrawler extends AbstractCrawler<DouyinVideoData> { |
||||
|
public DouyinCrawler() { |
||||
|
super("douyin"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<DouyinVideoData> doCrawl() { |
||||
|
logger.info("【抖音】正在获取热门视频数据..."); |
||||
|
return generateMockData(); |
||||
|
} |
||||
|
|
||||
|
private List<DouyinVideoData> generateMockData() { |
||||
|
List<DouyinVideoData> list = new ArrayList<>(); |
||||
|
String[] titles = {"夏日穿搭分享", "搞笑日常", "运动健身", "美食教程", "萌宠视频"}; |
||||
|
String[] authors = {"穿搭达人", "搞笑博主", "健身教练", "美食教程", "铲屎官"}; |
||||
|
long[] views = {2300000, 1800000, 980000, 1500000, 3200000}; |
||||
|
|
||||
|
for (int i = 0; i < 5; i++) { |
||||
|
DouyinVideoData video = new DouyinVideoData(); |
||||
|
video.setRank(i + 1); |
||||
|
video.setAwemeId("6" + (10000 + i)); |
||||
|
video.setTitle(titles[i]); |
||||
|
video.setAuthor(authors[i]); |
||||
|
video.setViewCount(views[i]); |
||||
|
video.setUrl("https://www.douyin.com/video/" + video.getAwemeId()); |
||||
|
video.setMusicName("热门BGM"); |
||||
|
list.add(video); |
||||
|
} |
||||
|
return list; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,41 @@ |
|||||
|
package com.crawler.crawler; |
||||
|
|
||||
|
import com.crawler.model.GovNewsData; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class GovNewsCrawler extends AbstractCrawler<GovNewsData> { |
||||
|
|
||||
|
public GovNewsCrawler() { |
||||
|
super("政务新闻"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<GovNewsData> doCrawl() { |
||||
|
List<GovNewsData> newsList = new ArrayList<>(); |
||||
|
|
||||
|
newsList.add(new GovNewsData("1", "国务院办公厅关于进一步优化营商环境更好服务市场主体的实施意见", |
||||
|
"中国政府网", "2024-01-15", "政策文件", "http://www.gov.cn/zhengce/content/2024-01/15/content_6865015.htm")); |
||||
|
newsList.add(new GovNewsData("2", "教育部发布2024年义务教育招生入学工作通知", |
||||
|
"教育部官网", "2024-01-14", "教育动态", "http://www.moe.gov.cn/jyb_xwfb/gzdt_gzdt/s5987/202401/t20240114_1118607.html")); |
||||
|
newsList.add(new GovNewsData("3", "人社部公布2024年春节假期安排", |
||||
|
"人力资源和社会保障部", "2024-01-13", "人事信息", "http://www.mohrss.gov.cn/SYrlzyhshbzb/zwgk/szrs/t202401/t20240113_490258.html")); |
||||
|
newsList.add(new GovNewsData("4", "国家医保局:进一步完善医保支付政策", |
||||
|
"国家医疗保障局", "2024-01-12", "医疗健康", "http://www.nhsa.gov.cn/art/2024/1/12/art_10_1015.html")); |
||||
|
newsList.add(new GovNewsData("5", "生态环境部发布2023年全国环境质量状况", |
||||
|
"生态环境部", "2024-01-11", "环境保护", "http://www.mee.gov.cn/hjzl/sthjzk/202401/t20240111_1062058.shtml")); |
||||
|
newsList.add(new GovNewsData("6", "财政部发布2024年财政预算报告", |
||||
|
"财政部", "2024-01-10", "财政金融", "http://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/202401/t20240110_3912858.htm")); |
||||
|
newsList.add(new GovNewsData("7", "工信部部署2024年工业和信息化工作", |
||||
|
"工业和信息化部", "2024-01-09", "工业信息", "http://www.miit.gov.cn/jgsj/xwfb/202401/t20240109_428906.html")); |
||||
|
newsList.add(new GovNewsData("8", "交通运输部推进交通强国建设", |
||||
|
"交通运输部", "2024-01-08", "交通建设", "http://www.mot.gov.cn/zcwj/202401/t20240108_3793593.html")); |
||||
|
newsList.add(new GovNewsData("9", "农业农村部部署春季农业生产", |
||||
|
"农业农村部", "2024-01-07", "农业农村", "http://www.moa.gov.cn/xw/bmdt/202401/t20240107_6408851.htm")); |
||||
|
newsList.add(new GovNewsData("10", "国家统计局发布2023年国民经济运行数据", |
||||
|
"国家统计局", "2024-01-06", "统计数据", "http://www.stats.gov.cn/tjsj/zxfb/202401/t20240117_1930858.html")); |
||||
|
|
||||
|
return newsList; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,41 @@ |
|||||
|
package com.crawler.crawler; |
||||
|
|
||||
|
import com.crawler.model.LibraryBookData; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class LibraryBookCrawler extends AbstractCrawler<LibraryBookData> { |
||||
|
|
||||
|
public LibraryBookCrawler() { |
||||
|
super("图书馆书目"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<LibraryBookData> doCrawl() { |
||||
|
List<LibraryBookData> bookList = new ArrayList<>(); |
||||
|
|
||||
|
bookList.add(new LibraryBookData("Java编程思想(第4版)", "Bruce Eckel", "机械工业出版社", |
||||
|
"978-7-111-21382-6", "2007", "A区-3排-15架", "可借阅", "TP312/EC4")); |
||||
|
bookList.add(new LibraryBookData("深入理解计算机系统", "Randal E. Bryant", "机械工业出版社", |
||||
|
"978-7-111-54493-7", "2016", "A区-2排-8架", "可借阅", "TP301/B83")); |
||||
|
bookList.add(new LibraryBookData("算法导论(第3版)", "Thomas H. Cormen", "机械工业出版社", |
||||
|
"978-7-111-40701-0", "2012", "A区-4排-22架", "已借出", "TP301/C62")); |
||||
|
bookList.add(new LibraryBookData("设计模式:可复用面向对象软件的基础", "Erich Gamma", "机械工业出版社", |
||||
|
"978-7-111-07554-7", "2000", "A区-1排-10架", "可借阅", "TP311.5/G16")); |
||||
|
bookList.add(new LibraryBookData("代码大全(第2版)", "Steve McConnell", "电子工业出版社", |
||||
|
"978-7-121-02298-5", "2006", "B区-5排-18架", "可借阅", "TP311.5/M13")); |
||||
|
bookList.add(new LibraryBookData("人月神话", "Frederick P. Brooks", "清华大学出版社", |
||||
|
"978-7-302-22587-5", "2010", "B区-3排-5架", "可借阅", "TP311.5/B88")); |
||||
|
bookList.add(new LibraryBookData("重构:改善既有代码的设计", "Martin Fowler", "人民邮电出版社", |
||||
|
"978-7-115-12057-5", "2010", "B区-2排-12架", "已借出", "TP311.5/F68")); |
||||
|
bookList.add(new LibraryBookData("Head First设计模式", "Eric Freeman", "中国电力出版社", |
||||
|
"978-7-5083-5393-7", "2007", "C区-1排-20架", "可借阅", "TP311.5/F84")); |
||||
|
bookList.add(new LibraryBookData("Effective Java(第3版)", "Joshua Bloch", "机械工业出版社", |
||||
|
"978-7-111-61275-6", "2020", "C区-4排-8架", "可借阅", "TP312/B57")); |
||||
|
bookList.add(new LibraryBookData("Clean Code", "Robert C. Martin", "人民邮电出版社", |
||||
|
"978-7-115-23385-8", "2010", "C区-5排-15架", "可借阅", "TP311.5/M27")); |
||||
|
|
||||
|
return bookList; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,36 @@ |
|||||
|
package com.crawler.crawler; |
||||
|
|
||||
|
import com.crawler.model.WeatherData; |
||||
|
|
||||
|
import java.time.LocalDate; |
||||
|
import java.time.format.DateTimeFormatter; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class WeatherCrawler extends AbstractCrawler<WeatherData> { |
||||
|
|
||||
|
public WeatherCrawler() { |
||||
|
super("天气预报"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<WeatherData> doCrawl() { |
||||
|
List<WeatherData> weatherList = new ArrayList<>(); |
||||
|
|
||||
|
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd"); |
||||
|
LocalDate today = LocalDate.now(); |
||||
|
|
||||
|
weatherList.add(new WeatherData("北京", today.format(formatter), "晴", "-5°C ~ 8°C", "北风", "3-4级", "35%")); |
||||
|
weatherList.add(new WeatherData("上海", today.format(formatter), "多云", "8°C ~ 15°C", "东风", "2-3级", "65%")); |
||||
|
weatherList.add(new WeatherData("广州", today.format(formatter), "小雨", "18°C ~ 23°C", "南风", "4-5级", "85%")); |
||||
|
weatherList.add(new WeatherData("深圳", today.format(formatter), "阴", "20°C ~ 25°C", "东南风", "3-4级", "80%")); |
||||
|
weatherList.add(new WeatherData("杭州", today.format(formatter), "晴转多云", "10°C ~ 18°C", "西北风", "2-3级", "55%")); |
||||
|
weatherList.add(new WeatherData("南京", today.format(formatter), "多云转晴", "7°C ~ 14°C", "东北风", "3-4级", "50%")); |
||||
|
weatherList.add(new WeatherData("武汉", today.format(formatter), "小雨", "5°C ~ 12°C", "北风", "4-5级", "75%")); |
||||
|
weatherList.add(new WeatherData("成都", today.format(formatter), "阴转小雨", "6°C ~ 13°C", "南风", "2-3级", "82%")); |
||||
|
weatherList.add(new WeatherData("重庆", today.format(formatter), "小雨", "10°C ~ 16°C", "西南风", "3-4级", "88%")); |
||||
|
weatherList.add(new WeatherData("西安", today.format(formatter), "晴", "-2°C ~ 10°C", "西风", "2-3级", "40%")); |
||||
|
|
||||
|
return weatherList; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,37 @@ |
|||||
|
package com.crawler.crawler; |
||||
|
|
||||
|
import com.crawler.model.XiaohongshuData; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class XiaohongshuCrawler extends AbstractCrawler<XiaohongshuData> { |
||||
|
public XiaohongshuCrawler() { |
||||
|
super("xiaohongshu"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<XiaohongshuData> doCrawl() { |
||||
|
logger.info("【小红书】正在获取热门笔记..."); |
||||
|
return generateMockData(); |
||||
|
} |
||||
|
|
||||
|
private List<XiaohongshuData> generateMockData() { |
||||
|
List<XiaohongshuData> list = new ArrayList<>(); |
||||
|
String[] titles = {"护肤品推荐", "旅行攻略", "美食探店", "家居好物", "职场穿搭"}; |
||||
|
String[] authors = {"美妆博主", "旅行达人", "美食探店", "家居设计师", "职场白领"}; |
||||
|
long[] views = {890000, 670000, 540000, 430000, 780000}; |
||||
|
|
||||
|
for (int i = 0; i < 5; i++) { |
||||
|
XiaohongshuData note = new XiaohongshuData(); |
||||
|
note.setNoteId("XHS" + (1000 + i)); |
||||
|
note.setTitle(titles[i]); |
||||
|
note.setAuthor(authors[i]); |
||||
|
note.setViewCount(views[i]); |
||||
|
note.setUrl("https://www.xiaohongshu.com/discovery/item/" + note.getNoteId()); |
||||
|
note.setDesc("这是一篇关于" + titles[i] + "的详细分享..."); |
||||
|
list.add(note); |
||||
|
} |
||||
|
return list; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,30 @@ |
|||||
|
package com.crawler.exception; |
||||
|
|
||||
|
/**
 * Common unchecked base type for application errors.
 *
 * <p>Besides the usual message and optional cause, every instance carries an error code and a
 * category, which {@link #getFullMessage()} combines into a single display string.
 */
public abstract class BaseException extends RuntimeException {
    private final String errorCode;
    private final String category;

    /**
     * @param message   human-readable description
     * @param errorCode code identifying the error
     * @param category  group the error belongs to
     */
    protected BaseException(String message, String errorCode, String category) {
        super(message);
        this.category = category;
        this.errorCode = errorCode;
    }

    /**
     * @param message   human-readable description
     * @param errorCode code identifying the error
     * @param category  group the error belongs to
     * @param cause     underlying exception
     */
    protected BaseException(String message, String errorCode, String category, Throwable cause) {
        super(message, cause);
        this.category = category;
        this.errorCode = errorCode;
    }

    /** @return the error code given at construction */
    public String getErrorCode() {
        return this.errorCode;
    }

    /** @return the category given at construction */
    public String getCategory() {
        return this.category;
    }

    /**
     * @return the message prefixed with category and code, in the form
     *         {@code [category-errorCode] message}
     */
    public String getFullMessage() {
        return "[" + this.category + "-" + this.errorCode + "] " + getMessage();
    }
}
||||
@ -0,0 +1,41 @@ |
|||||
|
package com.crawler.exception; |
||||
|
|
||||
|
public class CommandException extends BaseException { |
||||
|
public static final String CATEGORY = "CMD"; |
||||
|
|
||||
|
public CommandException(String message) { |
||||
|
super(message, "E0001", CATEGORY); |
||||
|
} |
||||
|
|
||||
|
public CommandException(String message, Throwable cause) { |
||||
|
super(message, "E0001", CATEGORY, cause); |
||||
|
} |
||||
|
|
||||
|
public static class UnknownCommandException extends BaseException { |
||||
|
public UnknownCommandException(String commandName) { |
||||
|
super("未知命令: " + commandName, "E0002", CATEGORY); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static class InvalidArgumentException extends BaseException { |
||||
|
public InvalidArgumentException(String command, String argument) { |
||||
|
super("命令 " + command + " 参数无效: " + argument, "E0003", CATEGORY); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static class MissingArgumentException extends BaseException { |
||||
|
public MissingArgumentException(String command, String argument) { |
||||
|
super("命令 " + command + " 缺少必需参数: " + argument, "E0004", CATEGORY); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static class CommandExecutionException extends BaseException { |
||||
|
public CommandExecutionException(String command, String reason) { |
||||
|
super("命令执行失败 [" + command + "]: " + reason, "E0005", CATEGORY); |
||||
|
} |
||||
|
|
||||
|
public CommandExecutionException(String command, Throwable cause) { |
||||
|
super("命令执行失败 [" + command + "]", "E0005", CATEGORY, cause); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,69 @@ |
|||||
|
package com.crawler.exception; |
||||
|
|
||||
|
public class CrawlerException extends BaseException { |
||||
|
public static final String CATEGORY = "CRAWLER"; |
||||
|
|
||||
|
public CrawlerException(String message) { |
||||
|
super(message, "C0001", CATEGORY); |
||||
|
} |
||||
|
|
||||
|
public CrawlerException(String message, Throwable cause) { |
||||
|
super(message, "C0001", CATEGORY, cause); |
||||
|
} |
||||
|
|
||||
|
public CrawlerException(String message, String errorCode, Throwable cause) { |
||||
|
super(message, errorCode, CATEGORY, cause); |
||||
|
} |
||||
|
|
||||
|
public static class PlatformNotFoundException extends BaseException { |
||||
|
public PlatformNotFoundException(String platform) { |
||||
|
super("不支持的爬虫平台: " + platform, "C0002", "CRAWLER"); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static class CrawlExecutionException extends BaseException { |
||||
|
public CrawlExecutionException(String message) { |
||||
|
super("爬取执行失败: " + message, "C0003", "CRAWLER"); |
||||
|
} |
||||
|
|
||||
|
public CrawlExecutionException(String message, Throwable cause) { |
||||
|
super("爬取执行失败: " + message, "C0003", "CRAWLER", cause); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static class ConfigurationException extends BaseException { |
||||
|
public ConfigurationException(String message) { |
||||
|
super("配置错误: " + message, "C0004", "CRAWLER"); |
||||
|
} |
||||
|
|
||||
|
public ConfigurationException(String message, Throwable cause) { |
||||
|
super("配置错误: " + message, "C0004", "CRAWLER", cause); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static class ValidationException extends BaseException { |
||||
|
public ValidationException(String message) { |
||||
|
super("数据验证失败: " + message, "C0005", "CRAWLER"); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static class NetworkException extends BaseException { |
||||
|
public NetworkException(String message) { |
||||
|
super("网络请求失败: " + message, "C0006", "CRAWLER"); |
||||
|
} |
||||
|
|
||||
|
public NetworkException(String message, Throwable cause) { |
||||
|
super("网络请求失败: " + message, "C0006", "CRAWLER", cause); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static class ParseException extends BaseException { |
||||
|
public ParseException(String message) { |
||||
|
super("数据解析失败: " + message, "C0007", "CRAWLER"); |
||||
|
} |
||||
|
|
||||
|
public ParseException(String message, Throwable cause) { |
||||
|
super("数据解析失败: " + message, "C0007", "CRAWLER", cause); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,7 @@ |
|||||
|
package com.crawler.exception; |
||||
|
|
||||
|
/**
 * Strategy for reacting to exceptions raised while the application runs.
 */
public interface ExceptionHandler {

    /**
     * Processes the given exception.
     *
     * @param e the exception to process
     */
    void handle(Exception e);

    /**
     * Produces the text that describes the given exception.
     *
     * @param e the exception to describe
     * @return the error text for {@code e}
     */
    String getErrorMessage(Exception e);

    /**
     * @return the handler's exit decision
     */
    boolean shouldExit();
}
||||
@ -0,0 +1,36 @@ |
|||||
|
package com.crawler.exception; |
||||
|
|
||||
|
public class GlobalExceptionHandler implements ExceptionHandler { |
||||
|
|
||||
|
@Override |
||||
|
public void handle(Exception e) { |
||||
|
if (e instanceof BaseException baseEx) { |
||||
|
System.err.println("错误: " + baseEx.getFullMessage()); |
||||
|
if (e.getCause() != null) { |
||||
|
System.err.println("原因: " + e.getCause().getMessage()); |
||||
|
} |
||||
|
} else if (e instanceof CommandException) { |
||||
|
System.err.println("命令错误: " + e.getMessage()); |
||||
|
} else if (e instanceof CrawlerException) { |
||||
|
System.err.println("爬虫错误: " + e.getMessage()); |
||||
|
} else if (e instanceof StorageException) { |
||||
|
System.err.println("存储错误: " + e.getMessage()); |
||||
|
} else { |
||||
|
System.err.println("未知错误: " + e.getMessage()); |
||||
|
e.printStackTrace(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getErrorMessage(Exception e) { |
||||
|
if (e instanceof BaseException baseEx) { |
||||
|
return baseEx.getFullMessage(); |
||||
|
} |
||||
|
return e.getMessage(); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public boolean shouldExit() { |
||||
|
return false; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,35 @@ |
|||||
|
package com.crawler.exception; |
||||
|
|
||||
|
public class StorageException extends BaseException { |
||||
|
public static final String CATEGORY = "STORAGE"; |
||||
|
|
||||
|
public StorageException(String message) { |
||||
|
super(message, "S0001", CATEGORY); |
||||
|
} |
||||
|
|
||||
|
public StorageException(String message, Throwable cause) { |
||||
|
super(message, "S0001", CATEGORY, cause); |
||||
|
} |
||||
|
|
||||
|
public static class StorageWriteException extends BaseException { |
||||
|
public StorageWriteException(String fileName) { |
||||
|
super("写入文件失败: " + fileName, "S0002", CATEGORY); |
||||
|
} |
||||
|
|
||||
|
public StorageWriteException(String fileName, Throwable cause) { |
||||
|
super("写入文件失败: " + fileName, "S0002", CATEGORY, cause); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static class StorageReadException extends BaseException { |
||||
|
public StorageReadException(String fileName) { |
||||
|
super("读取文件失败: " + fileName, "S0003", CATEGORY); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static class StorageFormatException extends BaseException { |
||||
|
public StorageFormatException(String format) { |
||||
|
super("不支持的存储格式: " + format, "S0004", CATEGORY); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,55 @@ |
|||||
|
package com.crawler.factory; |
||||
|
|
||||
|
import com.crawler.crawler.AbstractCrawler; |
||||
|
import com.crawler.crawler.BilibiliCrawler; |
||||
|
import com.crawler.crawler.DouyinCrawler; |
||||
|
import com.crawler.crawler.XiaohongshuCrawler; |
||||
|
import com.crawler.crawler.GovNewsCrawler; |
||||
|
import com.crawler.crawler.WeatherCrawler; |
||||
|
import com.crawler.crawler.LibraryBookCrawler; |
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
import com.crawler.model.BaseMediaData; |
||||
|
|
||||
|
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.function.Supplier;
||||
|
|
||||
|
public class CrawlerFactory { |
||||
|
private static final Map<String, Supplier<AbstractCrawler<?>>> CRAWLER_REGISTRY = new HashMap<>(); |
||||
|
|
||||
|
static { |
||||
|
register("bilibili", BilibiliCrawler::new); |
||||
|
register("douyin", DouyinCrawler::new); |
||||
|
register("xiaohongshu", XiaohongshuCrawler::new); |
||||
|
register("b站", BilibiliCrawler::new); |
||||
|
register("抖音", DouyinCrawler::new); |
||||
|
register("小红书", XiaohongshuCrawler::new); |
||||
|
register("govnews", GovNewsCrawler::new); |
||||
|
register("weather", WeatherCrawler::new); |
||||
|
register("library", LibraryBookCrawler::new); |
||||
|
register("政务新闻", GovNewsCrawler::new); |
||||
|
register("天气预报", WeatherCrawler::new); |
||||
|
register("图书馆", LibraryBookCrawler::new); |
||||
|
} |
||||
|
|
||||
|
public static void register(String platform, Supplier<AbstractCrawler<?>> constructor) { |
||||
|
CRAWLER_REGISTRY.put(platform.toLowerCase(), constructor); |
||||
|
} |
||||
|
|
||||
|
@SuppressWarnings("unchecked") |
||||
|
public static <T extends BaseMediaData> AbstractCrawler<T> getCrawler(String platform) { |
||||
|
Supplier<AbstractCrawler<?>> constructor = CRAWLER_REGISTRY.get(platform.toLowerCase()); |
||||
|
if (constructor == null) { |
||||
|
throw new CrawlerException("不支持的平台: " + platform + ",支持的平台: " + CRAWLER_REGISTRY.keySet()); |
||||
|
} |
||||
|
return (AbstractCrawler<T>) constructor.get(); |
||||
|
} |
||||
|
|
||||
|
public static boolean supports(String platform) { |
||||
|
return CRAWLER_REGISTRY.containsKey(platform.toLowerCase()); |
||||
|
} |
||||
|
|
||||
|
public static String[] getSupportedPlatforms() { |
||||
|
return CRAWLER_REGISTRY.keySet().toArray(new String[0]); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,75 @@ |
|||||
|
package com.crawler.model; |
||||
|
|
||||
|
import java.time.LocalDateTime; |
||||
|
|
||||
|
/**
 * Base record for every item produced by a crawler.
 *
 * <p>Holds the attributes shared by all item types. The crawl time is initialised to the moment
 * the object is created. Subclasses define how an item is uniquely identified.
 */
public abstract class BaseMediaData {
    protected String id;
    protected String title;
    protected String author;
    protected Long viewCount;
    protected String url;
    protected String platform;
    protected LocalDateTime crawlTime;

    /** Creates an empty record stamped with the current local date-time. */
    public BaseMediaData() {
        crawlTime = LocalDateTime.now();
    }

    /**
     * @return a key identifying this item; the format is defined by each subclass
     */
    public abstract String getUniqueKey();

    /** @return the item id, or {@code null} if not set */
    public String getId() {
        return this.id;
    }

    /** @param id the item id */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the title, or {@code null} if not set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the author, or {@code null} if not set */
    public String getAuthor() {
        return this.author;
    }

    /** @param author the author */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the view count, or {@code null} if not set */
    public Long getViewCount() {
        return this.viewCount;
    }

    /** @param viewCount the view count */
    public void setViewCount(Long viewCount) {
        this.viewCount = viewCount;
    }

    /** @return the item URL, or {@code null} if not set */
    public String getUrl() {
        return this.url;
    }

    /** @param url the item URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the platform name, or {@code null} if not set */
    public String getPlatform() {
        return this.platform;
    }

    /** @param platform the platform name */
    public void setPlatform(String platform) {
        this.platform = platform;
    }

    /** @return the crawl time; the creation time unless replaced through the setter */
    public LocalDateTime getCrawlTime() {
        return this.crawlTime;
    }

    /** @param crawlTime the crawl time */
    public void setCrawlTime(LocalDateTime crawlTime) {
        this.crawlTime = crawlTime;
    }
}
||||
@ -0,0 +1,68 @@ |
|||||
|
package com.crawler.model; |
||||
|
|
||||
|
public class BilibiliVideoData extends VideoData { |
||||
|
private String bvid; |
||||
|
private Long aid; |
||||
|
private Long coinCount; |
||||
|
private Long collectCount; |
||||
|
private Long shareCount; |
||||
|
private String tname; |
||||
|
|
||||
|
public BilibiliVideoData() { |
||||
|
this.setPlatform("bilibili"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getUniqueKey() { |
||||
|
return "bilibili:" + (bvid != null ? bvid : id); |
||||
|
} |
||||
|
|
||||
|
public String getBvid() { |
||||
|
return bvid; |
||||
|
} |
||||
|
|
||||
|
public void setBvid(String bvid) { |
||||
|
this.bvid = bvid; |
||||
|
this.setId(bvid); |
||||
|
} |
||||
|
|
||||
|
public Long getAid() { |
||||
|
return aid; |
||||
|
} |
||||
|
|
||||
|
public void setAid(Long aid) { |
||||
|
this.aid = aid; |
||||
|
} |
||||
|
|
||||
|
public Long getCoinCount() { |
||||
|
return coinCount; |
||||
|
} |
||||
|
|
||||
|
public void setCoinCount(Long coinCount) { |
||||
|
this.coinCount = coinCount; |
||||
|
} |
||||
|
|
||||
|
public Long getCollectCount() { |
||||
|
return collectCount; |
||||
|
} |
||||
|
|
||||
|
public void setCollectCount(Long collectCount) { |
||||
|
this.collectCount = collectCount; |
||||
|
} |
||||
|
|
||||
|
public Long getShareCount() { |
||||
|
return shareCount; |
||||
|
} |
||||
|
|
||||
|
public void setShareCount(Long shareCount) { |
||||
|
this.shareCount = shareCount; |
||||
|
} |
||||
|
|
||||
|
public String getTname() { |
||||
|
return tname; |
||||
|
} |
||||
|
|
||||
|
public void setTname(String tname) { |
||||
|
this.tname = tname; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,59 @@ |
|||||
|
package com.crawler.model; |
||||
|
|
||||
|
public class DouyinVideoData extends VideoData { |
||||
|
private String awemeId; |
||||
|
private String coverUrl; |
||||
|
private String musicName; |
||||
|
private Long shareCount; |
||||
|
private Long favoriteCount; |
||||
|
|
||||
|
public DouyinVideoData() { |
||||
|
this.setPlatform("douyin"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getUniqueKey() { |
||||
|
return "douyin:" + (awemeId != null ? awemeId : id); |
||||
|
} |
||||
|
|
||||
|
public String getAwemeId() { |
||||
|
return awemeId; |
||||
|
} |
||||
|
|
||||
|
public void setAwemeId(String awemeId) { |
||||
|
this.awemeId = awemeId; |
||||
|
this.setId(awemeId); |
||||
|
} |
||||
|
|
||||
|
public String getCoverUrl() { |
||||
|
return coverUrl; |
||||
|
} |
||||
|
|
||||
|
public void setCoverUrl(String coverUrl) { |
||||
|
this.coverUrl = coverUrl; |
||||
|
} |
||||
|
|
||||
|
public String getMusicName() { |
||||
|
return musicName; |
||||
|
} |
||||
|
|
||||
|
public void setMusicName(String musicName) { |
||||
|
this.musicName = musicName; |
||||
|
} |
||||
|
|
||||
|
public Long getShareCount() { |
||||
|
return shareCount; |
||||
|
} |
||||
|
|
||||
|
public void setShareCount(Long shareCount) { |
||||
|
this.shareCount = shareCount; |
||||
|
} |
||||
|
|
||||
|
public Long getFavoriteCount() { |
||||
|
return favoriteCount; |
||||
|
} |
||||
|
|
||||
|
public void setFavoriteCount(Long favoriteCount) { |
||||
|
this.favoriteCount = favoriteCount; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,57 @@ |
|||||
|
package com.crawler.model; |
||||
|
|
||||
|
public class GovNewsData extends BaseMediaData { |
||||
|
private String source; |
||||
|
private String publishTime; |
||||
|
private String category; |
||||
|
|
||||
|
public GovNewsData() { |
||||
|
super(); |
||||
|
} |
||||
|
|
||||
|
public GovNewsData(String id, String title, String source, String publishTime, String category, String url) { |
||||
|
super(); |
||||
|
this.id = id; |
||||
|
this.title = title; |
||||
|
this.source = source; |
||||
|
this.author = source; |
||||
|
this.publishTime = publishTime; |
||||
|
this.category = category; |
||||
|
this.url = url; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getUniqueKey() { |
||||
|
return id != null ? id : title; |
||||
|
} |
||||
|
|
||||
|
public String getSource() { |
||||
|
return source; |
||||
|
} |
||||
|
|
||||
|
public void setSource(String source) { |
||||
|
this.source = source; |
||||
|
} |
||||
|
|
||||
|
public String getPublishTime() { |
||||
|
return publishTime; |
||||
|
} |
||||
|
|
||||
|
public void setPublishTime(String publishTime) { |
||||
|
this.publishTime = publishTime; |
||||
|
} |
||||
|
|
||||
|
public String getCategory() { |
||||
|
return category; |
||||
|
} |
||||
|
|
||||
|
public void setCategory(String category) { |
||||
|
this.category = category; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String toString() { |
||||
|
return String.format("【政务新闻】%s\n\t来源: %s\n\t发布时间: %s\n\t分类: %s\n\t链接: %s", |
||||
|
title, source, publishTime, category, url); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,86 @@ |
|||||
|
package com.crawler.model; |
||||
|
|
||||
|
public class LibraryBookData extends BaseMediaData { |
||||
|
private String isbn; |
||||
|
private String publisher; |
||||
|
private String publishYear; |
||||
|
private String location; |
||||
|
private String status; |
||||
|
private String callNumber; |
||||
|
|
||||
|
public LibraryBookData() { |
||||
|
super(); |
||||
|
} |
||||
|
|
||||
|
public LibraryBookData(String title, String author, String publisher, String isbn, |
||||
|
String publishYear, String location, String status, String callNumber) { |
||||
|
super(); |
||||
|
this.title = title; |
||||
|
this.author = author; |
||||
|
this.publisher = publisher; |
||||
|
this.isbn = isbn; |
||||
|
this.publishYear = publishYear; |
||||
|
this.location = location; |
||||
|
this.status = status; |
||||
|
this.callNumber = callNumber; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getUniqueKey() { |
||||
|
return isbn != null ? isbn : title + "_" + author; |
||||
|
} |
||||
|
|
||||
|
public String getIsbn() { |
||||
|
return isbn; |
||||
|
} |
||||
|
|
||||
|
public void setIsbn(String isbn) { |
||||
|
this.isbn = isbn; |
||||
|
} |
||||
|
|
||||
|
public String getPublisher() { |
||||
|
return publisher; |
||||
|
} |
||||
|
|
||||
|
public void setPublisher(String publisher) { |
||||
|
this.publisher = publisher; |
||||
|
} |
||||
|
|
||||
|
public String getPublishYear() { |
||||
|
return publishYear; |
||||
|
} |
||||
|
|
||||
|
public void setPublishYear(String publishYear) { |
||||
|
this.publishYear = publishYear; |
||||
|
} |
||||
|
|
||||
|
public String getLocation() { |
||||
|
return location; |
||||
|
} |
||||
|
|
||||
|
public void setLocation(String location) { |
||||
|
this.location = location; |
||||
|
} |
||||
|
|
||||
|
public String getStatus() { |
||||
|
return status; |
||||
|
} |
||||
|
|
||||
|
public void setStatus(String status) { |
||||
|
this.status = status; |
||||
|
} |
||||
|
|
||||
|
public String getCallNumber() { |
||||
|
return callNumber; |
||||
|
} |
||||
|
|
||||
|
public void setCallNumber(String callNumber) { |
||||
|
this.callNumber = callNumber; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String toString() { |
||||
|
return String.format("【图书馆书目】%s\n\t作者: %s\n\t出版社: %s\n\tISBN: %s\n\t出版年份: %s\n\t馆藏位置: %s\n\t状态: %s\n\t索书号: %s", |
||||
|
title, author, publisher, isbn, publishYear, location, status, callNumber); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,71 @@ |
|||||
|
package com.crawler.model; |
||||
|
|
||||
|
public class VideoData extends BaseMediaData { |
||||
|
private Integer rank; |
||||
|
private String duration; |
||||
|
private Long likeCount; |
||||
|
private Long commentCount; |
||||
|
private Long danmakuCount; |
||||
|
|
||||
|
@Override |
||||
|
public String getUniqueKey() { |
||||
|
return platform + ":" + id; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String toString() { |
||||
|
return String.format("%-4d | %-35s | %-12s | %-10s | %s", |
||||
|
rank != null ? rank : 0, |
||||
|
title != null && title.length() > 30 ? title.substring(0, 27) + "..." : title, |
||||
|
author != null && author.length() > 10 ? author.substring(0, 9) + "..." : author, |
||||
|
formatViewCount(), |
||||
|
platform != null ? platform : "unknown"); |
||||
|
} |
||||
|
|
||||
|
private String formatViewCount() { |
||||
|
if (viewCount == null) return "0"; |
||||
|
if (viewCount >= 100000000) return String.format("%.1f亿", viewCount / 100000000.0); |
||||
|
if (viewCount >= 10000) return String.format("%.1f万", viewCount / 10000.0); |
||||
|
return String.valueOf(viewCount); |
||||
|
} |
||||
|
|
||||
|
public Integer getRank() { |
||||
|
return rank; |
||||
|
} |
||||
|
|
||||
|
public void setRank(Integer rank) { |
||||
|
this.rank = rank; |
||||
|
} |
||||
|
|
||||
|
public String getDuration() { |
||||
|
return duration; |
||||
|
} |
||||
|
|
||||
|
public void setDuration(String duration) { |
||||
|
this.duration = duration; |
||||
|
} |
||||
|
|
||||
|
public Long getLikeCount() { |
||||
|
return likeCount; |
||||
|
} |
||||
|
|
||||
|
public void setLikeCount(Long likeCount) { |
||||
|
this.likeCount = likeCount; |
||||
|
} |
||||
|
|
||||
|
public Long getCommentCount() { |
||||
|
return commentCount; |
||||
|
} |
||||
|
|
||||
|
public void setCommentCount(Long commentCount) { |
||||
|
this.commentCount = commentCount; |
||||
|
} |
||||
|
|
||||
|
public Long getDanmakuCount() { |
||||
|
return danmakuCount; |
||||
|
} |
||||
|
|
||||
|
public void setDanmakuCount(Long danmakuCount) { |
||||
|
this.danmakuCount = danmakuCount; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,95 @@ |
|||||
|
package com.crawler.model; |
||||
|
|
||||
|
public class WeatherData extends BaseMediaData { |
||||
|
private String city; |
||||
|
private String date; |
||||
|
private String weather; |
||||
|
private String temperature; |
||||
|
private String windDirection; |
||||
|
private String windLevel; |
||||
|
private String humidity; |
||||
|
|
||||
|
public WeatherData() { |
||||
|
super(); |
||||
|
} |
||||
|
|
||||
|
public WeatherData(String city, String date, String weather, String temperature, |
||||
|
String windDirection, String windLevel, String humidity) { |
||||
|
super(); |
||||
|
this.city = city; |
||||
|
this.date = date; |
||||
|
this.weather = weather; |
||||
|
this.temperature = temperature; |
||||
|
this.windDirection = windDirection; |
||||
|
this.windLevel = windLevel; |
||||
|
this.humidity = humidity; |
||||
|
this.title = city + " " + date + " " + weather; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getUniqueKey() { |
||||
|
return city + "_" + date; |
||||
|
} |
||||
|
|
||||
|
public String getCity() { |
||||
|
return city; |
||||
|
} |
||||
|
|
||||
|
public void setCity(String city) { |
||||
|
this.city = city; |
||||
|
} |
||||
|
|
||||
|
public String getDate() { |
||||
|
return date; |
||||
|
} |
||||
|
|
||||
|
public void setDate(String date) { |
||||
|
this.date = date; |
||||
|
} |
||||
|
|
||||
|
public String getWeather() { |
||||
|
return weather; |
||||
|
} |
||||
|
|
||||
|
public void setWeather(String weather) { |
||||
|
this.weather = weather; |
||||
|
} |
||||
|
|
||||
|
public String getTemperature() { |
||||
|
return temperature; |
||||
|
} |
||||
|
|
||||
|
public void setTemperature(String temperature) { |
||||
|
this.temperature = temperature; |
||||
|
} |
||||
|
|
||||
|
public String getWindDirection() { |
||||
|
return windDirection; |
||||
|
} |
||||
|
|
||||
|
public void setWindDirection(String windDirection) { |
||||
|
this.windDirection = windDirection; |
||||
|
} |
||||
|
|
||||
|
public String getWindLevel() { |
||||
|
return windLevel; |
||||
|
} |
||||
|
|
||||
|
public void setWindLevel(String windLevel) { |
||||
|
this.windLevel = windLevel; |
||||
|
} |
||||
|
|
||||
|
public String getHumidity() { |
||||
|
return humidity; |
||||
|
} |
||||
|
|
||||
|
public void setHumidity(String humidity) { |
||||
|
this.humidity = humidity; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String toString() { |
||||
|
return String.format("【天气预报】%s %s\n\t天气: %s\n\t温度: %s\n\t风向: %s %s\n\t湿度: %s", |
||||
|
city, date, weather, temperature, windDirection, windLevel, humidity); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,77 @@ |
|||||
|
package com.crawler.model; |
||||
|
|
||||
|
public class XiaohongshuData extends BaseMediaData { |
||||
|
private String noteId; |
||||
|
private String coverUrl; |
||||
|
private String desc; |
||||
|
private Long likeCount; |
||||
|
private Long commentCount; |
||||
|
private Long shareCount; |
||||
|
private Long collectCount; |
||||
|
|
||||
|
public XiaohongshuData() { |
||||
|
this.setPlatform("xiaohongshu"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getUniqueKey() { |
||||
|
return "xiaohongshu:" + (noteId != null ? noteId : id); |
||||
|
} |
||||
|
|
||||
|
public String getNoteId() { |
||||
|
return noteId; |
||||
|
} |
||||
|
|
||||
|
public void setNoteId(String noteId) { |
||||
|
this.noteId = noteId; |
||||
|
this.setId(noteId); |
||||
|
} |
||||
|
|
||||
|
public String getCoverUrl() { |
||||
|
return coverUrl; |
||||
|
} |
||||
|
|
||||
|
public void setCoverUrl(String coverUrl) { |
||||
|
this.coverUrl = coverUrl; |
||||
|
} |
||||
|
|
||||
|
public String getDesc() { |
||||
|
return desc; |
||||
|
} |
||||
|
|
||||
|
public void setDesc(String desc) { |
||||
|
this.desc = desc; |
||||
|
} |
||||
|
|
||||
|
public Long getLikeCount() { |
||||
|
return likeCount; |
||||
|
} |
||||
|
|
||||
|
public void setLikeCount(Long likeCount) { |
||||
|
this.likeCount = likeCount; |
||||
|
} |
||||
|
|
||||
|
public Long getCommentCount() { |
||||
|
return commentCount; |
||||
|
} |
||||
|
|
||||
|
public void setCommentCount(Long commentCount) { |
||||
|
this.commentCount = commentCount; |
||||
|
} |
||||
|
|
||||
|
public Long getShareCount() { |
||||
|
return shareCount; |
||||
|
} |
||||
|
|
||||
|
public void setShareCount(Long shareCount) { |
||||
|
this.shareCount = shareCount; |
||||
|
} |
||||
|
|
||||
|
public Long getCollectCount() { |
||||
|
return collectCount; |
||||
|
} |
||||
|
|
||||
|
public void setCollectCount(Long collectCount) { |
||||
|
this.collectCount = collectCount; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,9 @@ |
|||||
|
package com.crawler.storage; |
||||
|
|
||||
|
import com.crawler.model.BaseMediaData; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public interface DataStorage<T extends BaseMediaData> { |
||||
|
void save(List<T> data); |
||||
|
String getStorageName(); |
||||
|
} |
||||
@ -0,0 +1,33 @@ |
|||||
|
package com.crawler.storage; |
||||
|
|
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
import com.crawler.model.BaseMediaData; |
||||
|
|
||||
|
import java.util.HashMap; |
||||
|
import java.util.Map; |
||||
|
import java.util.function.Supplier; |
||||
|
|
||||
|
public class StorageFactory { |
||||
|
private static final Map<String, Supplier<DataStorage<?>>> STORAGE_REGISTRY = new HashMap<>(); |
||||
|
|
||||
|
static { |
||||
|
register("txt", TxtStorage::new); |
||||
|
} |
||||
|
|
||||
|
public static void register(String type, Supplier<DataStorage<?>> constructor) { |
||||
|
STORAGE_REGISTRY.put(type.toLowerCase(), constructor); |
||||
|
} |
||||
|
|
||||
|
@SuppressWarnings("unchecked") |
||||
|
public static <T extends BaseMediaData> DataStorage<T> getStorage(String type) { |
||||
|
Supplier<DataStorage<?>> constructor = STORAGE_REGISTRY.get(type.toLowerCase()); |
||||
|
if (constructor == null) { |
||||
|
throw new CrawlerException("不支持的存储类型: " + type + ",支持的类型: " + STORAGE_REGISTRY.keySet()); |
||||
|
} |
||||
|
return (DataStorage<T>) constructor.get(); |
||||
|
} |
||||
|
|
||||
|
public static boolean supports(String type) { |
||||
|
return STORAGE_REGISTRY.containsKey(type.toLowerCase()); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,70 @@ |
|||||
|
package com.crawler.storage; |
||||
|
|
||||
|
import com.crawler.model.BaseMediaData; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.io.FileWriter; |
||||
|
import java.io.IOException; |
||||
|
import java.time.LocalDateTime; |
||||
|
import java.time.format.DateTimeFormatter; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class TxtStorage implements DataStorage<BaseMediaData> { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(TxtStorage.class); |
||||
|
private static final String OUTPUT_DIR = "output"; |
||||
|
private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss"); |
||||
|
|
||||
|
@Override |
||||
|
public void save(List<BaseMediaData> data) { |
||||
|
if (data == null || data.isEmpty()) { |
||||
|
logger.warn("数据为空,跳过保存"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
try { |
||||
|
java.io.File dir = new java.io.File(OUTPUT_DIR); |
||||
|
if (!dir.exists()) { |
||||
|
dir.mkdirs(); |
||||
|
} |
||||
|
|
||||
|
String filename = "crawl_result_" + LocalDateTime.now().format(DATE_FORMATTER) + ".txt"; |
||||
|
String filePath = OUTPUT_DIR + "/" + filename; |
||||
|
|
||||
|
try (FileWriter writer = new FileWriter(filePath)) { |
||||
|
writer.write("========================================================\n"); |
||||
|
writer.write(" 爬虫数据导出结果\n"); |
||||
|
writer.write("========================================================\n"); |
||||
|
writer.write("导出时间: " + LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")) + "\n"); |
||||
|
writer.write("数据条数: " + data.size() + "\n"); |
||||
|
writer.write("========================================================\n\n"); |
||||
|
|
||||
|
for (int i = 0; i < data.size(); i++) { |
||||
|
BaseMediaData item = data.get(i); |
||||
|
writer.write(String.format("%d. %s\n", i + 1, item.getTitle())); |
||||
|
writer.write(" 作者: " + item.getAuthor() + "\n"); |
||||
|
writer.write(" 播放: " + formatViewCount(item.getViewCount()) + "\n"); |
||||
|
writer.write(" 平台: " + item.getPlatform() + "\n"); |
||||
|
writer.write(" 链接: " + item.getUrl() + "\n"); |
||||
|
writer.write("--------------------------------------------------------\n"); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
logger.info("数据已保存到: {}", filePath); |
||||
|
} catch (IOException e) { |
||||
|
logger.error("保存文件失败: {}", e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private String formatViewCount(Long viewCount) { |
||||
|
if (viewCount == null) return "0"; |
||||
|
if (viewCount >= 100000000) return String.format("%.1f亿", viewCount / 100000000.0); |
||||
|
if (viewCount >= 10000) return String.format("%.1f万", viewCount / 10000.0); |
||||
|
return String.valueOf(viewCount); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getStorageName() { |
||||
|
return "TXT文件存储"; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,52 @@ |
|||||
|
package com.crawler.strategy.crawler; |
||||
|
|
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
import com.crawler.model.BaseMediaData; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
public abstract class AbstractCrawlStrategy<T extends BaseMediaData> implements CrawlStrategy<T> { |
||||
|
protected final Logger logger = LoggerFactory.getLogger(getClass()); |
||||
|
protected String platform; |
||||
|
|
||||
|
protected AbstractCrawlStrategy(String platform) { |
||||
|
this.platform = platform; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<T> crawl() { |
||||
|
return crawl(10); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<T> crawl(int count) { |
||||
|
logger.info("开始爬取 {} 平台...", platform); |
||||
|
|
||||
|
try { |
||||
|
validate(); |
||||
|
List<T> result = doCrawl(count); |
||||
|
logger.info("{} 平台爬取完成,获取 {} 条数据", platform, result.size()); |
||||
|
return result; |
||||
|
} catch (Exception e) { |
||||
|
logger.error("{} 平台爬取失败: {}", platform, e.getMessage()); |
||||
|
throw new CrawlerException.CrawlExecutionException(platform + " 爬取失败", e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
protected void validate() { |
||||
|
} |
||||
|
|
||||
|
protected abstract List<T> doCrawl(int count); |
||||
|
|
||||
|
@Override |
||||
|
public String getPlatform() { |
||||
|
return platform; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public boolean isAvailable() { |
||||
|
return true; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,20 @@ |
|||||
|
package com.crawler.strategy.crawler; |
||||
|
|
||||
|
import com.crawler.model.BaseMediaData; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public interface CrawlStrategy<T extends BaseMediaData> { |
||||
|
List<T> crawl(); |
||||
|
List<T> crawl(int count); |
||||
|
String getPlatform(); |
||||
|
StrategyType getType(); |
||||
|
boolean isAvailable(); |
||||
|
|
||||
|
enum StrategyType { |
||||
|
VIDEO, |
||||
|
NEWS, |
||||
|
WEATHER, |
||||
|
BOOK, |
||||
|
SOCIAL |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,37 @@ |
|||||
|
package com.crawler.strategy.crawler; |
||||
|
|
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
|
||||
|
import java.util.HashMap; |
||||
|
import java.util.Map; |
||||
|
import java.util.function.Supplier; |
||||
|
|
||||
|
public class CrawlStrategyFactory { |
||||
|
private static final Map<String, Supplier<? extends CrawlStrategy<?>>> STRATEGY_REGISTRY = new HashMap<>(); |
||||
|
|
||||
|
static { |
||||
|
register("govnews", GovNewsCrawlStrategy::new); |
||||
|
register("weather", WeatherCrawlStrategy::new); |
||||
|
register("library", LibraryBookCrawlStrategy::new); |
||||
|
register("政务新闻", GovNewsCrawlStrategy::new); |
||||
|
register("天气预报", WeatherCrawlStrategy::new); |
||||
|
register("图书馆", LibraryBookCrawlStrategy::new); |
||||
|
} |
||||
|
|
||||
|
public static void register(String platform, Supplier<? extends CrawlStrategy<?>> constructor) { |
||||
|
STRATEGY_REGISTRY.put(platform.toLowerCase(), constructor); |
||||
|
} |
||||
|
|
||||
|
@SuppressWarnings("unchecked") |
||||
|
public static <T extends CrawlStrategy<?>> T getStrategy(String platform) { |
||||
|
Supplier<? extends CrawlStrategy<?>> constructor = STRATEGY_REGISTRY.get(platform.toLowerCase()); |
||||
|
if (constructor == null) { |
||||
|
throw new CrawlerException.PlatformNotFoundException(platform); |
||||
|
} |
||||
|
return (T) constructor.get(); |
||||
|
} |
||||
|
|
||||
|
public static boolean supports(String platform) { |
||||
|
return STRATEGY_REGISTRY.containsKey(platform.toLowerCase()); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,52 @@ |
|||||
|
package com.crawler.strategy.crawler; |
||||
|
|
||||
|
import com.crawler.model.GovNewsData; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class GovNewsCrawlStrategy extends AbstractCrawlStrategy<GovNewsData> { |
||||
|
|
||||
|
public GovNewsCrawlStrategy() { |
||||
|
super("govnews"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public StrategyType getType() { |
||||
|
return StrategyType.NEWS; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<GovNewsData> doCrawl(int count) { |
||||
|
List<GovNewsData> newsList = new ArrayList<>(); |
||||
|
|
||||
|
addNews(newsList, "1", "国务院办公厅关于进一步优化营商环境更好服务市场主体的实施意见", |
||||
|
"中国政府网", "2024-01-15", "政策文件", "http://www.gov.cn"); |
||||
|
addNews(newsList, "2", "教育部发布2024年义务教育招生入学工作通知", |
||||
|
"教育部官网", "2024-01-14", "教育动态", "http://www.moe.gov.cn"); |
||||
|
addNews(newsList, "3", "人社部公布2024年春节假期安排", |
||||
|
"人力资源和社会保障部", "2024-01-13", "人事信息", "http://www.mohrss.gov.cn"); |
||||
|
addNews(newsList, "4", "国家医保局:进一步完善医保支付政策", |
||||
|
"国家医疗保障局", "2024-01-12", "医疗健康", "http://www.nhsa.gov.cn"); |
||||
|
addNews(newsList, "5", "生态环境部发布2023年全国环境质量状况", |
||||
|
"生态环境部", "2024-01-11", "环境保护", "http://www.mee.gov.cn"); |
||||
|
addNews(newsList, "6", "财政部发布2024年财政预算报告", |
||||
|
"财政部", "2024-01-10", "财政金融", "http://www.mof.gov.cn"); |
||||
|
addNews(newsList, "7", "工信部部署2024年工业和信息化工作", |
||||
|
"工业和信息化部", "2024-01-09", "工业信息", "http://www.miit.gov.cn"); |
||||
|
addNews(newsList, "8", "交通运输部推进交通强国建设", |
||||
|
"交通运输部", "2024-01-08", "交通建设", "http://www.mot.gov.cn"); |
||||
|
addNews(newsList, "9", "农业农村部部署春季农业生产", |
||||
|
"农业农村部", "2024-01-07", "农业农村", "http://www.moa.gov.cn"); |
||||
|
addNews(newsList, "10", "国家统计局发布2023年国民经济运行数据", |
||||
|
"国家统计局", "2024-01-06", "统计数据", "http://www.stats.gov.cn"); |
||||
|
|
||||
|
return newsList.subList(0, Math.min(count, newsList.size())); |
||||
|
} |
||||
|
|
||||
|
private void addNews(List<GovNewsData> list, String id, String title, String source, |
||||
|
String publishTime, String category, String url) { |
||||
|
GovNewsData news = new GovNewsData(id, title, source, publishTime, category, url); |
||||
|
news.setPlatform("govnews"); |
||||
|
list.add(news); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,52 @@ |
|||||
|
package com.crawler.strategy.crawler; |
||||
|
|
||||
|
import com.crawler.model.LibraryBookData; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class LibraryBookCrawlStrategy extends AbstractCrawlStrategy<LibraryBookData> { |
||||
|
|
||||
|
public LibraryBookCrawlStrategy() { |
||||
|
super("library"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public StrategyType getType() { |
||||
|
return StrategyType.BOOK; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<LibraryBookData> doCrawl(int count) { |
||||
|
List<LibraryBookData> bookList = new ArrayList<>(); |
||||
|
|
||||
|
addBook(bookList, "Java编程思想(第4版)", "Bruce Eckel", "机械工业出版社", |
||||
|
"978-7-111-21382-6", "2007", "A区-3排-15架", "可借阅", "TP312/EC4"); |
||||
|
addBook(bookList, "深入理解计算机系统", "Randal E. Bryant", "机械工业出版社", |
||||
|
"978-7-111-54493-7", "2016", "A区-2排-8架", "可借阅", "TP301/B83"); |
||||
|
addBook(bookList, "算法导论(第3版)", "Thomas H. Cormen", "机械工业出版社", |
||||
|
"978-7-111-40701-0", "2012", "A区-4排-22架", "已借出", "TP301/C62"); |
||||
|
addBook(bookList, "设计模式:可复用面向对象软件的基础", "Erich Gamma", "机械工业出版社", |
||||
|
"978-7-111-07554-7", "2000", "A区-1排-10架", "可借阅", "TP311.5/G16"); |
||||
|
addBook(bookList, "代码大全(第2版)", "Steve McConnell", "电子工业出版社", |
||||
|
"978-7-121-02298-5", "2006", "B区-5排-18架", "可借阅", "TP311.5/M13"); |
||||
|
addBook(bookList, "人月神话", "Frederick P. Brooks", "清华大学出版社", |
||||
|
"978-7-302-22587-5", "2010", "B区-3排-5架", "可借阅", "TP311.5/B88"); |
||||
|
addBook(bookList, "重构:改善既有代码的设计", "Martin Fowler", "人民邮电出版社", |
||||
|
"978-7-115-12057-5", "2010", "B区-2排-12架", "已借出", "TP311.5/F68"); |
||||
|
addBook(bookList, "Head First设计模式", "Eric Freeman", "中国电力出版社", |
||||
|
"978-7-5083-5393-7", "2007", "C区-1排-20架", "可借阅", "TP311.5/F84"); |
||||
|
addBook(bookList, "Effective Java(第3版)", "Joshua Bloch", "机械工业出版社", |
||||
|
"978-7-115-61275-6", "2020", "C区-4排-8架", "可借阅", "TP312/B57"); |
||||
|
addBook(bookList, "Clean Code", "Robert C. Martin", "人民邮电出版社", |
||||
|
"978-7-115-23385-8", "2010", "C区-5排-15架", "可借阅", "TP311.5/M27"); |
||||
|
|
||||
|
return bookList.subList(0, Math.min(count, bookList.size())); |
||||
|
} |
||||
|
|
||||
|
private void addBook(List<LibraryBookData> list, String title, String author, String publisher, |
||||
|
String isbn, String publishYear, String location, String status, String callNumber) { |
||||
|
LibraryBookData book = new LibraryBookData(title, author, publisher, isbn, publishYear, location, status, callNumber); |
||||
|
book.setPlatform("library"); |
||||
|
list.add(book); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,46 @@ |
|||||
|
package com.crawler.strategy.crawler; |
||||
|
|
||||
|
import com.crawler.model.WeatherData; |
||||
|
import java.time.LocalDate; |
||||
|
import java.time.format.DateTimeFormatter; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class WeatherCrawlStrategy extends AbstractCrawlStrategy<WeatherData> { |
||||
|
|
||||
|
public WeatherCrawlStrategy() { |
||||
|
super("weather"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public StrategyType getType() { |
||||
|
return StrategyType.WEATHER; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<WeatherData> doCrawl(int count) { |
||||
|
List<WeatherData> weatherList = new ArrayList<>(); |
||||
|
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd"); |
||||
|
LocalDate today = LocalDate.now(); |
||||
|
|
||||
|
addWeather(weatherList, "北京", today.format(formatter), "晴", "-5°C ~ 8°C", "北风", "3-4级", "35%"); |
||||
|
addWeather(weatherList, "上海", today.format(formatter), "多云", "8°C ~ 15°C", "东风", "2-3级", "65%"); |
||||
|
addWeather(weatherList, "广州", today.format(formatter), "小雨", "18°C ~ 23°C", "南风", "4-5级", "85%"); |
||||
|
addWeather(weatherList, "深圳", today.format(formatter), "阴", "20°C ~ 25°C", "东南风", "3-4级", "80%"); |
||||
|
addWeather(weatherList, "杭州", today.format(formatter), "晴转多云", "10°C ~ 18°C", "西北风", "2-3级", "55%"); |
||||
|
addWeather(weatherList, "南京", today.format(formatter), "多云转晴", "7°C ~ 14°C", "东北风", "3-4级", "50%"); |
||||
|
addWeather(weatherList, "武汉", today.format(formatter), "小雨", "5°C ~ 12°C", "北风", "4-5级", "75%"); |
||||
|
addWeather(weatherList, "成都", today.format(formatter), "阴转小雨", "6°C ~ 13°C", "南风", "2-3级", "82%"); |
||||
|
addWeather(weatherList, "重庆", today.format(formatter), "小雨", "10°C ~ 16°C", "西南风", "3-4级", "88%"); |
||||
|
addWeather(weatherList, "西安", today.format(formatter), "晴", "-2°C ~ 10°C", "西风", "2-3级", "40%"); |
||||
|
|
||||
|
return weatherList.subList(0, Math.min(count, weatherList.size())); |
||||
|
} |
||||
|
|
||||
|
private void addWeather(List<WeatherData> list, String city, String date, String weather, |
||||
|
String temperature, String windDirection, String windLevel, String humidity) { |
||||
|
WeatherData w = new WeatherData(city, date, weather, temperature, windDirection, windLevel, humidity); |
||||
|
w.setPlatform("weather"); |
||||
|
list.add(w); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,42 @@ |
|||||
|
package com.crawler.strategy.storage; |
||||
|
|
||||
|
import com.crawler.exception.StorageException; |
||||
|
import com.crawler.model.BaseMediaData; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.io.BufferedWriter; |
||||
|
import java.io.FileWriter; |
||||
|
import java.io.IOException; |
||||
|
import java.nio.file.Files; |
||||
|
import java.nio.file.Path; |
||||
|
import java.nio.file.Paths; |
||||
|
import java.time.LocalDateTime; |
||||
|
import java.time.format.DateTimeFormatter; |
||||
|
|
||||
|
public abstract class AbstractStorageStrategy implements StorageStrategy { |
||||
|
protected final Logger logger = LoggerFactory.getLogger(getClass()); |
||||
|
protected static final String OUTPUT_DIR = "output"; |
||||
|
protected static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss"); |
||||
|
|
||||
|
protected String generateFileName(String platform, String extension) { |
||||
|
String timestamp = LocalDateTime.now().format(DATE_FORMATTER); |
||||
|
return platform + "_" + timestamp + "." + extension; |
||||
|
} |
||||
|
|
||||
|
protected void ensureOutputDirectory() { |
||||
|
try { |
||||
|
Path outputPath = Paths.get(OUTPUT_DIR); |
||||
|
if (!Files.exists(outputPath)) { |
||||
|
Files.createDirectories(outputPath); |
||||
|
logger.info("创建输出目录: {}", outputPath.toAbsolutePath()); |
||||
|
} |
||||
|
} catch (IOException e) { |
||||
|
throw new StorageException.StorageWriteException(OUTPUT_DIR, e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
protected String getFilePath(String fileName) { |
||||
|
return Paths.get(OUTPUT_DIR, fileName).toString(); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,65 @@ |
|||||
|
package com.crawler.strategy.storage; |
||||
|
|
||||
|
import com.crawler.exception.StorageException; |
||||
|
import com.crawler.model.BaseMediaData; |
||||
|
|
||||
|
import java.io.BufferedWriter; |
||||
|
import java.io.FileWriter; |
||||
|
import java.io.IOException; |
||||
|
import java.time.LocalDateTime; |
||||
|
import java.time.format.DateTimeFormatter; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class CsvStorageStrategy extends AbstractStorageStrategy { |
||||
|
|
||||
|
@Override |
||||
|
public String save(List<? extends BaseMediaData> data, String platform) { |
||||
|
String fileName = generateFileName(platform, getFileExtension()); |
||||
|
String filePath = getFilePath(fileName); |
||||
|
ensureOutputDirectory(); |
||||
|
|
||||
|
try (BufferedWriter writer = new BufferedWriter(new FileWriter(filePath))) { |
||||
|
writer.write("\uFEFF"); |
||||
|
writer.write("序号,标题,作者,平台,链接,爬取时间"); |
||||
|
writer.newLine(); |
||||
|
|
||||
|
int index = 1; |
||||
|
for (BaseMediaData item : data) { |
||||
|
writer.write(String.format("%d,\"%s\",\"%s\",\"%s\",\"%s\",\"%s\"", |
||||
|
index++, |
||||
|
escapeCsv(item.getTitle()), |
||||
|
escapeCsv(item.getAuthor()), |
||||
|
escapeCsv(item.getPlatform()), |
||||
|
escapeCsv(item.getUrl()), |
||||
|
item.getCrawlTime().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")))); |
||||
|
writer.newLine(); |
||||
|
} |
||||
|
|
||||
|
logger.info("CSV数据已保存到: {}", filePath); |
||||
|
return filePath; |
||||
|
|
||||
|
} catch (IOException e) { |
||||
|
throw new StorageException.StorageWriteException(fileName, e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String save(List<? extends BaseMediaData> data) { |
||||
|
return save(data, "data"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getFormat() { |
||||
|
return "csv"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getFileExtension() { |
||||
|
return "csv"; |
||||
|
} |
||||
|
|
||||
|
private String escapeCsv(String value) { |
||||
|
if (value == null) return ""; |
||||
|
return value.replace("\"", "\"\""); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,61 @@ |
|||||
|
package com.crawler.strategy.storage; |
||||
|
|
||||
|
import com.crawler.exception.StorageException; |
||||
|
import com.crawler.model.BaseMediaData; |
||||
|
import com.fasterxml.jackson.databind.ObjectMapper; |
||||
|
import com.fasterxml.jackson.databind.SerializationFeature; |
||||
|
|
||||
|
import java.io.FileWriter; |
||||
|
import java.io.IOException; |
||||
|
import java.time.LocalDateTime; |
||||
|
import java.time.format.DateTimeFormatter; |
||||
|
import java.util.HashMap; |
||||
|
import java.util.List; |
||||
|
import java.util.Map; |
||||
|
|
||||
|
public class JsonStorageStrategy extends AbstractStorageStrategy { |
||||
|
private final ObjectMapper objectMapper; |
||||
|
|
||||
|
public JsonStorageStrategy() { |
||||
|
this.objectMapper = new ObjectMapper(); |
||||
|
this.objectMapper.enable(SerializationFeature.INDENT_OUTPUT); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String save(List<? extends BaseMediaData> data, String platform) { |
||||
|
String fileName = generateFileName(platform, getFileExtension()); |
||||
|
String filePath = getFilePath(fileName); |
||||
|
ensureOutputDirectory(); |
||||
|
|
||||
|
try (FileWriter writer = new FileWriter(filePath)) { |
||||
|
Map<String, Object> output = new HashMap<>(); |
||||
|
output.put("platform", platform); |
||||
|
output.put("crawlTime", LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"))); |
||||
|
output.put("count", data.size()); |
||||
|
output.put("data", data); |
||||
|
|
||||
|
objectMapper.writeValue(writer, output); |
||||
|
|
||||
|
logger.info("JSON数据已保存到: {}", filePath); |
||||
|
return filePath; |
||||
|
|
||||
|
} catch (IOException e) { |
||||
|
throw new StorageException.StorageWriteException(fileName, e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String save(List<? extends BaseMediaData> data) { |
||||
|
return save(data, "data"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getFormat() { |
||||
|
return "json"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getFileExtension() { |
||||
|
return "json"; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,11 @@ |
|||||
|
package com.crawler.strategy.storage; |
||||
|
|
||||
|
import com.crawler.model.BaseMediaData; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public interface StorageStrategy { |
||||
|
String save(List<? extends BaseMediaData> data, String platform); |
||||
|
String save(List<? extends BaseMediaData> data); |
||||
|
String getFormat(); |
||||
|
String getFileExtension(); |
||||
|
} |
||||
@ -0,0 +1,33 @@ |
|||||
|
package com.crawler.strategy.storage; |
||||
|
|
||||
|
import com.crawler.exception.StorageException; |
||||
|
|
||||
|
import java.util.HashMap; |
||||
|
import java.util.Map; |
||||
|
import java.util.function.Supplier; |
||||
|
|
||||
|
public class StorageStrategyFactory { |
||||
|
private static final Map<String, Supplier<StorageStrategy>> STRATEGY_REGISTRY = new HashMap<>(); |
||||
|
|
||||
|
static { |
||||
|
register("txt", TxtStorageStrategy::new); |
||||
|
register("json", JsonStorageStrategy::new); |
||||
|
register("csv", CsvStorageStrategy::new); |
||||
|
} |
||||
|
|
||||
|
public static void register(String format, Supplier<StorageStrategy> constructor) { |
||||
|
STRATEGY_REGISTRY.put(format.toLowerCase(), constructor); |
||||
|
} |
||||
|
|
||||
|
public static StorageStrategy getStrategy(String format) { |
||||
|
Supplier<StorageStrategy> constructor = STRATEGY_REGISTRY.get(format.toLowerCase()); |
||||
|
if (constructor == null) { |
||||
|
throw new StorageException.StorageFormatException(format); |
||||
|
} |
||||
|
return constructor.get(); |
||||
|
} |
||||
|
|
||||
|
public static boolean supports(String format) { |
||||
|
return STRATEGY_REGISTRY.containsKey(format.toLowerCase()); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,74 @@ |
|||||
|
package com.crawler.strategy.storage; |
||||
|
|
||||
|
import com.crawler.exception.StorageException; |
||||
|
import com.crawler.model.BaseMediaData; |
||||
|
|
||||
|
import java.io.BufferedWriter; |
||||
|
import java.io.FileWriter; |
||||
|
import java.io.IOException; |
||||
|
import java.time.LocalDateTime; |
||||
|
import java.time.format.DateTimeFormatter; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class TxtStorageStrategy extends AbstractStorageStrategy { |
||||
|
|
||||
|
@Override |
||||
|
public String save(List<? extends BaseMediaData> data, String platform) { |
||||
|
String fileName = generateFileName(platform, getFileExtension()); |
||||
|
String filePath = getFilePath(fileName); |
||||
|
ensureOutputDirectory(); |
||||
|
|
||||
|
try (BufferedWriter writer = new BufferedWriter(new FileWriter(filePath))) { |
||||
|
writer.write("=".repeat(50)); |
||||
|
writer.newLine(); |
||||
|
writer.write("爬虫数据 - " + platform); |
||||
|
writer.newLine(); |
||||
|
writer.write("爬取时间: " + LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"))); |
||||
|
writer.newLine(); |
||||
|
writer.write("数据条数: " + data.size()); |
||||
|
writer.newLine(); |
||||
|
writer.write("=".repeat(50)); |
||||
|
writer.newLine(); |
||||
|
writer.newLine(); |
||||
|
|
||||
|
int index = 1; |
||||
|
for (BaseMediaData item : data) { |
||||
|
writer.write(String.format("[%d] %s", index++, item.getTitle())); |
||||
|
writer.newLine(); |
||||
|
writer.write(" 作者: " + item.getAuthor()); |
||||
|
writer.newLine(); |
||||
|
writer.write(" 平台: " + item.getPlatform()); |
||||
|
writer.newLine(); |
||||
|
writer.write(" 链接: " + item.getUrl()); |
||||
|
writer.newLine(); |
||||
|
writer.newLine(); |
||||
|
} |
||||
|
|
||||
|
writer.write("=".repeat(50)); |
||||
|
writer.newLine(); |
||||
|
writer.write("共计 " + data.size() + " 条记录"); |
||||
|
writer.newLine(); |
||||
|
|
||||
|
logger.info("数据已保存到: {}", filePath); |
||||
|
return filePath; |
||||
|
|
||||
|
} catch (IOException e) { |
||||
|
throw new StorageException.StorageWriteException(fileName, e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String save(List<? extends BaseMediaData> data) { |
||||
|
return save(data, "data"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getFormat() { |
||||
|
return "txt"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getFileExtension() { |
||||
|
return "txt"; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,78 @@ |
|||||
|
package com.crawler.view; |
||||
|
|
||||
|
import com.crawler.constant.AnsiColor; |
||||
|
import java.io.PrintStream; |
||||
|
import java.util.Scanner; |
||||
|
|
||||
|
public class ConsoleView { |
||||
|
private static final PrintStream OUT = System.out; |
||||
|
private static final Scanner SCANNER = new Scanner(System.in); |
||||
|
private static ConsoleView instance; |
||||
|
|
||||
|
private ConsoleView() {} |
||||
|
|
||||
|
public static ConsoleView getInstance() { |
||||
|
if (instance == null) instance = new ConsoleView(); |
||||
|
return instance; |
||||
|
} |
||||
|
|
||||
|
public void println(String text) { OUT.println(text); } |
||||
|
public void print(String text) { OUT.print(text); } |
||||
|
public void newLine() { OUT.println(); } |
||||
|
|
||||
|
public void printSuccess(String text) { OUT.println(AnsiColor.success("✓ " + text)); } |
||||
|
public void printError(String text) { OUT.println(AnsiColor.error("✗ " + text)); } |
||||
|
public void printWarning(String text) { OUT.println(AnsiColor.warning("⚠ " + text)); } |
||||
|
public void printInfo(String text) { OUT.println(AnsiColor.info("ℹ " + text)); } |
||||
|
|
||||
|
public void printHeader(String text) { |
||||
|
newLine(); |
||||
|
OUT.println(AnsiColor.header("═══════════════════════════════════════════════")); |
||||
|
OUT.println(AnsiColor.header(" " + text)); |
||||
|
OUT.println(AnsiColor.header("═══════════════════════════════════════════════")); |
||||
|
newLine(); |
||||
|
} |
||||
|
|
||||
|
public void printBanner() { |
||||
|
OUT.println(AnsiColor.CYAN); |
||||
|
OUT.println(" ____ _ _ _ _ "); |
||||
|
OUT.println(" | __ ) __ _| |__ (_)_ __ ___| |_ ___| |_ "); |
||||
|
OUT.println(" | _ \\ / _` | '_ \\| | '_ \\ / _ \\ __/ __| __|"); |
||||
|
OUT.println(" | |_) | (_| | |_) | | | | | __/ || (__| |_ "); |
||||
|
OUT.println(" |____/ \\__,_|_.__/|_|_| |_|\\___|\\__\\___|\\__|"); |
||||
|
OUT.println(AnsiColor.RESET); |
||||
|
OUT.println(AnsiColor.bold(" 多网站爬虫系统") + " v1.0"); |
||||
|
OUT.println(AnsiColor.PURPLE + " 输入 help 查看可用命令" + AnsiColor.RESET); |
||||
|
newLine(); |
||||
|
} |
||||
|
|
||||
|
public void printPrompt() { |
||||
|
OUT.print(AnsiColor.BOLD + AnsiColor.GREEN + "crawler> " + AnsiColor.RESET); |
||||
|
} |
||||
|
|
||||
|
public String readLine() { return SCANNER.nextLine().trim(); } |
||||
|
|
||||
|
public void printTable(String[] headers, String[][] data) { |
||||
|
for (String header : headers) { |
||||
|
OUT.print(AnsiColor.BOLD + String.format("%-18s", header) + AnsiColor.RESET); |
||||
|
} |
||||
|
newLine(); |
||||
|
OUT.println(AnsiColor.YELLOW + "──────────────────────────────────────────────────────────────────────────" + AnsiColor.RESET); |
||||
|
for (String[] row : data) { |
||||
|
for (String cell : row) { |
||||
|
OUT.print(String.format("%-18s", cell != null ? cell : "-")); |
||||
|
} |
||||
|
newLine(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public void printSeparator() { |
||||
|
OUT.println(AnsiColor.CYAN + "──────────────────────────────────────────────────────────────────────────" + AnsiColor.RESET); |
||||
|
} |
||||
|
|
||||
|
public void printExit() { |
||||
|
newLine(); |
||||
|
OUT.println(AnsiColor.YELLOW + " 感谢使用,再见!" + AnsiColor.RESET); |
||||
|
newLine(); |
||||
|
} |
||||
|
} |
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files changed in this diff
Loading…
Reference in new issue