48 changed files with 6161 additions and 0 deletions
Binary file not shown.
@ -0,0 +1,146 @@ |
|||||
|
[ |
||||
|
{ |
||||
|
"id": "6859ecc9-c992-4e93-93e6-87ddc6e1a6be", |
||||
|
"title": "浏览内容", |
||||
|
"articleUrl": "#section_head", |
||||
|
"author": "知乎日报", |
||||
|
"source": "知乎日报", |
||||
|
"crawledAt": "2026-05-30T14:49:20.886753" |
||||
|
}, |
||||
|
{ |
||||
|
"id": "5dbcafb6-8a83-4052-aed1-850e72265f91", |
||||
|
"title": "App 下载", |
||||
|
"articleUrl": "http://www.wandoujia.com/apps/com.zhihu.daily.android", |
||||
|
"author": "知乎日报", |
||||
|
"source": "知乎日报", |
||||
|
"crawledAt": "2026-05-30T14:49:20.886753" |
||||
|
}, |
||||
|
{ |
||||
|
"id": "06b7c5b6-c4ce-4281-9e7d-6e312858307d", |
||||
|
"title": "知乎日报", |
||||
|
"articleUrl": "http://daily.zhihu.com/", |
||||
|
"author": "知乎日报", |
||||
|
"source": "知乎日报", |
||||
|
"crawledAt": "2026-05-30T14:49:20.886753" |
||||
|
}, |
||||
|
{ |
||||
|
"id": "e4880198-329b-47c7-bf7b-78052aa2bf8b", |
||||
|
"title": "iOS 版", |
||||
|
"articleUrl": "https://itunes.apple.com/cn/app/id639087967?mt\u003d8", |
||||
|
"author": "知乎日报", |
||||
|
"source": "知乎日报", |
||||
|
"crawledAt": "2026-05-30T14:49:20.886753" |
||||
|
}, |
||||
|
{ |
||||
|
"id": "e17a296b-5f65-4bfb-b12f-9fd4fe1a07d7", |
||||
|
"title": "文学创作会不会受到 AI 的冲击?", |
||||
|
"articleUrl": "https://daily.zhihu.com/story/9790086", |
||||
|
"author": "知乎日报", |
||||
|
"source": "知乎日报", |
||||
|
"crawledAt": "2026-05-30T14:49:20.886753" |
||||
|
}, |
||||
|
{ |
||||
|
"id": "12676469-494d-473a-89c8-01f140a57188", |
||||
|
"title": "为什么说西西弗斯面对巨石,不断推上山是一种超越和蔑视?", |
||||
|
"articleUrl": "https://daily.zhihu.com/story/9790101", |
||||
|
"author": "知乎日报", |
||||
|
"source": "知乎日报", |
||||
|
"crawledAt": "2026-05-30T14:49:20.886753" |
||||
|
}, |
||||
|
{ |
||||
|
"id": "625a5f08-3ce9-487f-b367-5be0cdb7500e", |
||||
|
"title": "有哪些看起来很高端的技术其实原理很暴力很初级?", |
||||
|
"articleUrl": "https://daily.zhihu.com/story/9790092", |
||||
|
"author": "知乎日报", |
||||
|
"source": "知乎日报", |
||||
|
"crawledAt": "2026-05-30T14:49:20.886753" |
||||
|
}, |
||||
|
{ |
||||
|
"id": "d4aa8adc-e4af-4194-bef1-bdf233742f86", |
||||
|
"title": "中国古代官方不重视理工科吗,如果是,为什么?", |
||||
|
"articleUrl": "https://daily.zhihu.com/story/9790090", |
||||
|
"author": "知乎日报", |
||||
|
"source": "知乎日报", |
||||
|
"crawledAt": "2026-05-30T14:49:20.886753" |
||||
|
}, |
||||
|
{ |
||||
|
"id": "e36a73d4-c0b8-4674-b4d5-d1d9de5cd894", |
||||
|
"title": "为什么人类不能自身合成维生素C?", |
||||
|
"articleUrl": "https://daily.zhihu.com/story/9790062", |
||||
|
"author": "知乎日报", |
||||
|
"source": "知乎日报", |
||||
|
"crawledAt": "2026-05-30T14:49:20.886753" |
||||
|
}, |
||||
|
{ |
||||
|
"id": "e7adc448-4570-45e6-a6e7-46b1397f0677", |
||||
|
"title": "林黛玉被妙玉嫌弃太俗,却不敢反驳,她怼贾宝玉的劲儿哪去了?", |
||||
|
"articleUrl": "https://daily.zhihu.com/story/9790081", |
||||
|
"author": "知乎日报", |
||||
|
"source": "知乎日报", |
||||
|
"crawledAt": "2026-05-30T14:49:20.886753" |
||||
|
}, |
||||
|
{ |
||||
|
"id": "37468176-caca-47f2-b422-b467a644e0ff", |
||||
|
"title": "魏博没有山川险阻,靠什么屹立150年。甚至长期成为最强藩?", |
||||
|
"articleUrl": "https://daily.zhihu.com/story/9790071", |
||||
|
"author": "知乎日报", |
||||
|
"source": "知乎日报", |
||||
|
"crawledAt": "2026-05-30T14:49:20.886753" |
||||
|
}, |
||||
|
{ |
||||
|
"id": "bdf153fc-5ece-4ea3-85bb-de0a2b49f5ee", |
||||
|
"title": "瞎扯 · 如何正确地吐槽", |
||||
|
"articleUrl": "https://daily.zhihu.com/story/9790084", |
||||
|
"author": "知乎日报", |
||||
|
"source": "知乎日报", |
||||
|
"crawledAt": "2026-05-30T14:49:20.886753" |
||||
|
}, |
||||
|
{ |
||||
|
"id": "e9cec0a7-82c2-43e5-a529-92d3979d4b24", |
||||
|
"title": "为什么松鼠的动作总是一顿一顿的?像卡帧一样?", |
||||
|
"articleUrl": "https://daily.zhihu.com/story/9790034", |
||||
|
"author": "知乎日报", |
||||
|
"source": "知乎日报", |
||||
|
"crawledAt": "2026-05-30T14:49:20.886753" |
||||
|
}, |
||||
|
{ |
||||
|
"id": "288cd539-af4b-4bf1-81ee-6fe5a9936fae", |
||||
|
"title": "既有 F\u003dma,又有F\u003dkx,那么物理公式到底要求等号左边是因还是果?", |
||||
|
"articleUrl": "https://daily.zhihu.com/story/9790046", |
||||
|
"author": "知乎日报", |
||||
|
"source": "知乎日报", |
||||
|
"crawledAt": "2026-05-30T14:49:20.886753" |
||||
|
}, |
||||
|
{ |
||||
|
"id": "c595c7e0-e768-4f5e-a0ed-9982466bf761", |
||||
|
"title": "西安唐代城门恢复为何不学洛阳,丹凤门像土黄色纸壳子,明德门像塑料玩具?", |
||||
|
"articleUrl": "https://daily.zhihu.com/story/9790039", |
||||
|
"author": "知乎日报", |
||||
|
"source": "知乎日报", |
||||
|
"crawledAt": "2026-05-30T14:49:20.886753" |
||||
|
}, |
||||
|
{ |
||||
|
"id": "3bc7435b-2ac8-4979-bd8c-20da0b4f7a3d", |
||||
|
"title": "游牧民族几乎没有碳水来源,为什么没有营养不良?", |
||||
|
"articleUrl": "https://daily.zhihu.com/story/9790022", |
||||
|
"author": "知乎日报", |
||||
|
"source": "知乎日报", |
||||
|
"crawledAt": "2026-05-30T14:49:20.886753" |
||||
|
}, |
||||
|
{ |
||||
|
"id": "c5ebc7af-5117-46a7-9107-1f28ed91d0c1", |
||||
|
"title": "为什么压力单位这么混乱?", |
||||
|
"articleUrl": "https://daily.zhihu.com/story/9790028", |
||||
|
"author": "知乎日报", |
||||
|
"source": "知乎日报", |
||||
|
"crawledAt": "2026-05-30T14:49:20.886753" |
||||
|
}, |
||||
|
{ |
||||
|
"id": "867ebe9e-0016-4e29-a407-319ef50aa51c", |
||||
|
"title": "为什么会有好奇害死猫这个说法?", |
||||
|
"articleUrl": "https://daily.zhihu.com/story/9790027", |
||||
|
"author": "知乎日报", |
||||
|
"source": "知乎日报", |
||||
|
"crawledAt": "2026-05-30T14:49:20.886753" |
||||
|
} |
||||
|
] |
||||
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -0,0 +1,73 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" |
||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> |
||||
|
<modelVersion>4.0.0</modelVersion> |
||||
|
|
||||
|
<groupId>com.newscrawler</groupId> |
||||
|
<artifactId>news-crawler</artifactId> |
||||
|
<version>1.0.0</version> |
||||
|
<packaging>jar</packaging> |
||||
|
|
||||
|
<name>News Crawler</name> |
||||
|
<description>增强版Java资讯爬虫</description> |
||||
|
|
||||
|
<properties> |
||||
|
<java.version>17</java.version> |
||||
|
<maven.compiler.source>17</maven.compiler.source> |
||||
|
<maven.compiler.target>17</maven.compiler.target> |
||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> |
||||
|
<jsoup.version>1.17.2</jsoup.version> |
||||
|
<gson.version>2.10.1</gson.version> |
||||
|
<logback.version>1.4.14</logback.version> |
||||
|
<slf4j.version>2.0.11</slf4j.version> |
||||
|
</properties> |
||||
|
|
||||
|
<dependencies> |
||||
|
<dependency> |
||||
|
<groupId>org.jsoup</groupId> |
||||
|
<artifactId>jsoup</artifactId> |
||||
|
<version>${jsoup.version}</version> |
||||
|
</dependency> |
||||
|
|
||||
|
<dependency> |
||||
|
<groupId>com.google.code.gson</groupId> |
||||
|
<artifactId>gson</artifactId> |
||||
|
<version>${gson.version}</version> |
||||
|
</dependency> |
||||
|
|
||||
|
<dependency> |
||||
|
<groupId>ch.qos.logback</groupId> |
||||
|
<artifactId>logback-classic</artifactId> |
||||
|
<version>${logback.version}</version> |
||||
|
</dependency> |
||||
|
|
||||
|
<dependency> |
||||
|
<groupId>org.slf4j</groupId> |
||||
|
<artifactId>slf4j-api</artifactId> |
||||
|
<version>${slf4j.version}</version> |
||||
|
</dependency> |
||||
|
</dependencies> |
||||
|
|
||||
|
<build> |
||||
|
<plugins> |
||||
|
<plugin> |
||||
|
<groupId>org.apache.maven.plugins</groupId> |
||||
|
<artifactId>maven-compiler-plugin</artifactId> |
||||
|
<version>3.11.0</version> |
||||
|
<configuration> |
||||
|
<source>${java.version}</source> |
||||
|
<target>${java.version}</target> |
||||
|
</configuration> |
||||
|
</plugin> |
||||
|
<plugin> |
||||
|
<groupId>org.codehaus.mojo</groupId> |
||||
|
<artifactId>exec-maven-plugin</artifactId> |
||||
|
<version>3.1.0</version> |
||||
|
<configuration> |
||||
|
<mainClass>com.newscrawler.Main</mainClass> |
||||
|
</configuration> |
||||
|
</plugin> |
||||
|
</plugins> |
||||
|
</build> |
||||
|
</project> |
||||
@ -0,0 +1,26 @@ |
|||||
|
package com.newscrawler; |
||||
|
|
||||
|
import com.newscrawler.command.MenuCommand; |
||||
|
import com.newscrawler.service.CrawlerService; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
public class Main { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(Main.class); |
||||
|
|
||||
|
public static void main(String[] args) { |
||||
|
logger.info("资讯爬虫启动"); |
||||
|
|
||||
|
try { |
||||
|
CrawlerService crawlerService = new CrawlerService(); |
||||
|
MenuCommand menuCommand = new MenuCommand(crawlerService); |
||||
|
menuCommand.start(); |
||||
|
} catch (Exception e) { |
||||
|
logger.error("程序执行异常", e); |
||||
|
System.err.println("程序执行失败: " + e.getMessage()); |
||||
|
System.exit(1); |
||||
|
} |
||||
|
|
||||
|
logger.info("资讯爬虫关闭"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,361 @@ |
|||||
|
package com.newscrawler.command; |
||||
|
|
||||
|
import com.newscrawler.entity.Article; |
||||
|
import com.newscrawler.entity.CrawlHistory; |
||||
|
import com.newscrawler.exception.CrawlerException; |
||||
|
import com.newscrawler.service.CrawlerService; |
||||
|
import com.newscrawler.util.JsonUtil; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.List; |
||||
|
import java.util.Map; |
||||
|
import java.util.Objects; |
||||
|
import java.util.Scanner; |
||||
|
import java.util.Set; |
||||
|
import java.util.stream.Collectors; |
||||
|
|
||||
|
public class MenuCommand { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(MenuCommand.class); |
||||
|
private final CrawlerService crawlerService; |
||||
|
private final Scanner scanner; |
||||
|
|
||||
|
/**
 * Creates a menu bound to the given service; user input is read from {@code System.in}.
 *
 * @param crawlerService service that every menu action delegates to
 */
public MenuCommand(CrawlerService crawlerService) {
    this.scanner = new Scanner(System.in);
    this.crawlerService = crawlerService;
}
||||
|
|
||||
|
public void start() { |
||||
|
boolean running = true; |
||||
|
while (running) { |
||||
|
showMainMenu(); |
||||
|
String input = scanner.nextLine().trim(); |
||||
|
running = handleMainMenu(input); |
||||
|
} |
||||
|
System.out.println("感谢使用资讯爬虫,再见!"); |
||||
|
} |
||||
|
|
||||
|
private void showMainMenu() { |
||||
|
System.out.println("\n========== 请选择你要执行的操作 =========="); |
||||
|
System.out.println("1. 爬取菜鸟教程资讯"); |
||||
|
System.out.println("2. 爬取知乎日报"); |
||||
|
System.out.println("3. 爬取搜狐资讯"); |
||||
|
System.out.println("4. 批量爬取全部站点"); |
||||
|
System.out.println("5. 查看全部资讯"); |
||||
|
System.out.println("6. 数据统计"); |
||||
|
System.out.println("7. 爬取历史"); |
||||
|
System.out.println("8. 数据导入/导出"); |
||||
|
System.out.println("9. 数据管理(删除/清空)"); |
||||
|
System.out.println("0. 退出程序"); |
||||
|
System.out.print("请输入选项: "); |
||||
|
} |
||||
|
|
||||
|
private boolean handleMainMenu(String input) { |
||||
|
try { |
||||
|
switch (input) { |
||||
|
case "1": |
||||
|
crawlSite("runoob"); |
||||
|
break; |
||||
|
case "2": |
||||
|
crawlSite("youth"); |
||||
|
break; |
||||
|
case "3": |
||||
|
crawlSite("sohu"); |
||||
|
break; |
||||
|
case "4": |
||||
|
crawlAllSites(); |
||||
|
break; |
||||
|
case "5": |
||||
|
viewAllArticles(); |
||||
|
break; |
||||
|
case "6": |
||||
|
showStatistics(); |
||||
|
break; |
||||
|
case "7": |
||||
|
showCrawlHistory(); |
||||
|
break; |
||||
|
case "8": |
||||
|
handleImportExport(); |
||||
|
break; |
||||
|
case "9": |
||||
|
manageData(); |
||||
|
break; |
||||
|
case "0": |
||||
|
return false; |
||||
|
default: |
||||
|
System.out.println("无效选项,请输入0-9之间的数字"); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
System.out.println("操作出错: " + e.getMessage()); |
||||
|
logger.error("菜单操作异常", e); |
||||
|
} |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
private void crawlSite(String siteKey) throws CrawlerException { |
||||
|
System.out.println("正在爬取,请稍候..."); |
||||
|
List<Article> articles = crawlerService.crawlSingleSite(siteKey); |
||||
|
System.out.println("爬取完成!获取到 " + articles.size() + " 篇文章"); |
||||
|
} |
||||
|
|
||||
|
private void crawlAllSites() throws CrawlerException { |
||||
|
System.out.println("正在批量爬取所有站点,请稍候..."); |
||||
|
List<Article> articles = crawlerService.crawlAllSites(); |
||||
|
System.out.println("批量爬取完成!共获取到 " + articles.size() + " 篇文章"); |
||||
|
} |
||||
|
|
||||
|
private void viewAllArticles() { |
||||
|
List<Article> articles = crawlerService.getAllArticles(); |
||||
|
if (articles.isEmpty()) { |
||||
|
System.out.println("暂无资讯,请先爬取数据"); |
||||
|
return; |
||||
|
} |
||||
|
printArticles(articles); |
||||
|
} |
||||
|
|
||||
|
private void showStatistics() { |
||||
|
Map<String, Long> stats = crawlerService.getStatistics(); |
||||
|
System.out.println("\n=== 数据统计 ==="); |
||||
|
stats.forEach((source, count) -> System.out.println(source + ": " + count + " 条")); |
||||
|
} |
||||
|
|
||||
|
private void showCrawlHistory() { |
||||
|
List<CrawlHistory> histories = crawlerService.getCrawlHistories(); |
||||
|
if (histories.isEmpty()) { |
||||
|
System.out.println("暂无爬取历史"); |
||||
|
return; |
||||
|
} |
||||
|
System.out.println("\n=== 爬取历史 ==="); |
||||
|
histories.forEach(h -> System.out.println(h.toString())); |
||||
|
} |
||||
|
|
||||
|
private void handleImportExport() { |
||||
|
System.out.println("\n=== 数据导入/导出 ==="); |
||||
|
System.out.println("1. 导出数据到JSON"); |
||||
|
System.out.println("2. 从JSON导入数据"); |
||||
|
System.out.print("选择: "); |
||||
|
String choice = scanner.nextLine().trim(); |
||||
|
|
||||
|
switch (choice) { |
||||
|
case "1" -> exportToJson(); |
||||
|
case "2" -> importFromJson(); |
||||
|
default -> System.out.println("无效选择"); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void exportToJson() { |
||||
|
List<Article> articles = crawlerService.getAllArticles(); |
||||
|
if (articles.isEmpty()) { |
||||
|
System.out.println("暂无数据可导出,请先爬取数据"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
Set<String> sources = articles.stream() |
||||
|
.map(Article::getSource) |
||||
|
.filter(Objects::nonNull) |
||||
|
.collect(Collectors.toSet()); |
||||
|
|
||||
|
String prefix; |
||||
|
if (sources.size() > 1) { |
||||
|
prefix = "全"; |
||||
|
} else if (sources.contains("菜鸟教程资讯")) { |
||||
|
prefix = "菜鸟"; |
||||
|
} else if (sources.contains("知乎日报")) { |
||||
|
prefix = "知乎"; |
||||
|
} else if (sources.contains("搜狐资讯")) { |
||||
|
prefix = "搜狐"; |
||||
|
} else { |
||||
|
prefix = "articles"; |
||||
|
} |
||||
|
|
||||
|
String filename = generateNumberedFilename(prefix); |
||||
|
|
||||
|
try { |
||||
|
JsonUtil.exportToJson(articles, filename); |
||||
|
System.out.println("导出成功: " + filename); |
||||
|
} catch (Exception e) { |
||||
|
System.out.println("导出失败: " + e.getMessage()); |
||||
|
logger.error("导出JSON失败", e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private String generateNumberedFilename(String prefix) { |
||||
|
java.io.File dir = new java.io.File("data"); |
||||
|
if (!dir.exists()) { |
||||
|
return prefix + "1.json"; |
||||
|
} |
||||
|
|
||||
|
java.io.File[] files = dir.listFiles((d, name) -> name.startsWith(prefix) && name.endsWith(".json")); |
||||
|
if (files == null || files.length == 0) { |
||||
|
return prefix + "1.json"; |
||||
|
} |
||||
|
|
||||
|
int maxNum = 0; |
||||
|
for (java.io.File file : files) { |
||||
|
String name = file.getName(); |
||||
|
try { |
||||
|
String numStr = name.substring(prefix.length(), name.length() - 5); |
||||
|
int num = Integer.parseInt(numStr); |
||||
|
if (num > maxNum) { |
||||
|
maxNum = num; |
||||
|
} |
||||
|
} catch (NumberFormatException ignored) { |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return prefix + (maxNum + 1) + ".json"; |
||||
|
} |
||||
|
|
||||
|
private void importFromJson() { |
||||
|
java.io.File dir = new java.io.File("data"); |
||||
|
if (!dir.exists()) { |
||||
|
System.out.println("当前目录下没有JSON文件,请先导出数据"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
java.io.File[] files = dir.listFiles((d, name) -> name.endsWith(".json")); |
||||
|
|
||||
|
if (files == null || files.length == 0) { |
||||
|
System.out.println("当前目录下没有JSON文件,请先导出数据"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
System.out.println("\n当前目录下的JSON文件:"); |
||||
|
for (int i = 0; i < files.length; i++) { |
||||
|
String name = files[i].getName(); |
||||
|
System.out.println((i + 1) + ". " + name); |
||||
|
} |
||||
|
|
||||
|
System.out.print("\n请输入要导入的文件编号,或输入0返回: "); |
||||
|
String input = scanner.nextLine().trim(); |
||||
|
|
||||
|
if ("0".equals(input)) { |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
if (input.isEmpty()) { |
||||
|
System.out.println("输入不能为空"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
int index; |
||||
|
try { |
||||
|
index = Integer.parseInt(input) - 1; |
||||
|
} catch (NumberFormatException e) { |
||||
|
System.out.println("请输入有效的数字编号"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
if (index < 0 || index >= files.length) { |
||||
|
System.out.println("编号超出范围"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
String filename = files[index].getName(); |
||||
|
|
||||
|
try { |
||||
|
List<Article> articles = JsonUtil.importFromJson(filename); |
||||
|
crawlerService.getRepository().addAll(articles); |
||||
|
System.out.println("导入成功: " + articles.size() + " 篇文章(已自动去重)"); |
||||
|
} catch (Exception e) { |
||||
|
System.out.println("导入失败: " + e.getMessage()); |
||||
|
logger.error("导入JSON失败", e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void manageData() { |
||||
|
System.out.println("\n=== 数据管理 ==="); |
||||
|
System.out.println("1. 删除单条资讯"); |
||||
|
System.out.println("2. 清空所有资讯"); |
||||
|
System.out.print("选择: "); |
||||
|
String choice = scanner.nextLine().trim(); |
||||
|
|
||||
|
switch (choice) { |
||||
|
case "1" -> deleteSingleArticle(); |
||||
|
case "2" -> clearAllArticles(); |
||||
|
default -> System.out.println("无效选择"); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void deleteSingleArticle() { |
||||
|
List<Article> articles = crawlerService.getAllArticles(); |
||||
|
if (articles.isEmpty()) { |
||||
|
System.out.println("暂无资讯可删除"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
System.out.println("\n=== 当前数据库中的资讯 ==="); |
||||
|
for (int i = 0; i < articles.size(); i++) { |
||||
|
Article article = articles.get(i); |
||||
|
System.out.printf("[%d] %s%n", i + 1, article.getTitle()); |
||||
|
System.out.println(" 来源: " + article.getSource()); |
||||
|
if (article.getPublishDate() != null && !article.getPublishDate().isEmpty()) { |
||||
|
System.out.println(" 时间: " + article.getPublishDate()); |
||||
|
} |
||||
|
System.out.println("-".repeat(60)); |
||||
|
} |
||||
|
|
||||
|
System.out.print("\n请输入要删除的文章编号(输入0取消): "); |
||||
|
String input = scanner.nextLine().trim(); |
||||
|
|
||||
|
if ("0".equals(input)) { |
||||
|
System.out.println("已取消操作"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
int index; |
||||
|
try { |
||||
|
index = Integer.parseInt(input) - 1; |
||||
|
} catch (NumberFormatException e) { |
||||
|
System.out.println("请输入有效的数字"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
if (index < 0 || index >= articles.size()) { |
||||
|
System.out.println("编号超出范围"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
Article articleToDelete = articles.get(index); |
||||
|
if (crawlerService.removeArticle(articleToDelete.getId())) { |
||||
|
System.out.println("删除成功: " + articleToDelete.getTitle()); |
||||
|
} else { |
||||
|
System.out.println("删除失败"); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void clearAllArticles() { |
||||
|
System.out.print("确定要清空所有数据吗?(y/n): "); |
||||
|
String confirm = scanner.nextLine().trim().toLowerCase(); |
||||
|
if ("y".equals(confirm)) { |
||||
|
crawlerService.clearAllArticles(); |
||||
|
System.out.println("已清空所有数据"); |
||||
|
} else { |
||||
|
System.out.println("已取消操作"); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void printArticles(List<Article> articles) { |
||||
|
System.out.println("-".repeat(80)); |
||||
|
for (int i = 0; i < articles.size(); i++) { |
||||
|
Article article = articles.get(i); |
||||
|
System.out.printf("[%d] %s%n", i + 1, article.getTitle()); |
||||
|
System.out.println(" 来源: " + article.getSource()); |
||||
|
if (article.getAuthor() != null && !article.getAuthor().isEmpty()) { |
||||
|
System.out.println(" 作者: " + article.getAuthor()); |
||||
|
} |
||||
|
if (article.getPublishDate() != null && !article.getPublishDate().isEmpty()) { |
||||
|
System.out.println(" 时间: " + article.getPublishDate()); |
||||
|
} |
||||
|
if (article.getSummary() != null && !article.getSummary().isEmpty()) { |
||||
|
String summary = article.getSummary(); |
||||
|
if (summary.length() > 50) { |
||||
|
summary = summary.substring(0, 50) + "..."; |
||||
|
} |
||||
|
System.out.println(" 摘要: " + summary); |
||||
|
} |
||||
|
System.out.println(" ID: " + article.getId()); |
||||
|
System.out.println("-".repeat(80)); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,113 @@ |
|||||
|
package com.newscrawler.entity; |
||||
|
|
||||
|
import java.util.Objects; |
||||
|
import java.util.UUID; |
||||
|
|
||||
|
public class Article { |
||||
|
private String id; |
||||
|
private String title; |
||||
|
private String summary; |
||||
|
private String publishDate; |
||||
|
private String articleUrl; |
||||
|
private String author; |
||||
|
private String source; |
||||
|
private String crawledAt; |
||||
|
|
||||
|
public Article() { |
||||
|
this.id = UUID.randomUUID().toString(); |
||||
|
this.crawledAt = java.time.LocalDateTime.now().toString(); |
||||
|
} |
||||
|
|
||||
|
public String getId() { |
||||
|
return id; |
||||
|
} |
||||
|
|
||||
|
public void setId(String id) { |
||||
|
this.id = id; |
||||
|
} |
||||
|
|
||||
|
public String getTitle() { |
||||
|
return title; |
||||
|
} |
||||
|
|
||||
|
public void setTitle(String title) { |
||||
|
this.title = title; |
||||
|
} |
||||
|
|
||||
|
public String getSummary() { |
||||
|
return summary; |
||||
|
} |
||||
|
|
||||
|
public void setSummary(String summary) { |
||||
|
this.summary = summary; |
||||
|
} |
||||
|
|
||||
|
public String getPublishDate() { |
||||
|
return publishDate; |
||||
|
} |
||||
|
|
||||
|
public void setPublishDate(String publishDate) { |
||||
|
this.publishDate = publishDate; |
||||
|
} |
||||
|
|
||||
|
public String getArticleUrl() { |
||||
|
return articleUrl; |
||||
|
} |
||||
|
|
||||
|
public void setArticleUrl(String articleUrl) { |
||||
|
this.articleUrl = articleUrl; |
||||
|
} |
||||
|
|
||||
|
public String getAuthor() { |
||||
|
return author; |
||||
|
} |
||||
|
|
||||
|
public void setAuthor(String author) { |
||||
|
this.author = author; |
||||
|
} |
||||
|
|
||||
|
public String getSource() { |
||||
|
return source; |
||||
|
} |
||||
|
|
||||
|
public void setSource(String source) { |
||||
|
this.source = source; |
||||
|
} |
||||
|
|
||||
|
public String getCrawledAt() { |
||||
|
return crawledAt; |
||||
|
} |
||||
|
|
||||
|
public void setCrawledAt(String crawledAt) { |
||||
|
this.crawledAt = crawledAt; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public boolean equals(Object o) { |
||||
|
if (this == o) return true; |
||||
|
if (o == null || getClass() != o.getClass()) return false; |
||||
|
Article article = (Article) o; |
||||
|
return Objects.equals(title, article.title) && |
||||
|
Objects.equals(source, article.source) && |
||||
|
Objects.equals(publishDate, article.publishDate); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public int hashCode() { |
||||
|
return Objects.hash(title, source, publishDate); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String toString() { |
||||
|
return "Article{" + |
||||
|
"id='" + id + '\'' + |
||||
|
", title='" + title + '\'' + |
||||
|
", summary='" + summary + '\'' + |
||||
|
", publishDate='" + publishDate + '\'' + |
||||
|
", articleUrl='" + articleUrl + '\'' + |
||||
|
", author='" + author + '\'' + |
||||
|
", source='" + source + '\'' + |
||||
|
", crawledAt='" + crawledAt + '\'' + |
||||
|
'}'; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,98 @@ |
|||||
|
package com.newscrawler.entity; |
||||
|
|
||||
|
import java.time.LocalDateTime; |
||||
|
|
||||
|
public class CrawlHistory { |
||||
|
private String id; |
||||
|
private String siteName; |
||||
|
private String siteUrl; |
||||
|
private int articleCount; |
||||
|
private LocalDateTime crawlTime; |
||||
|
private boolean success; |
||||
|
private String errorMessage; |
||||
|
|
||||
|
public CrawlHistory() { |
||||
|
this.id = java.util.UUID.randomUUID().toString(); |
||||
|
this.crawlTime = LocalDateTime.now(); |
||||
|
this.success = true; |
||||
|
} |
||||
|
|
||||
|
public CrawlHistory(String siteName, String siteUrl) { |
||||
|
this(); |
||||
|
this.siteName = siteName; |
||||
|
this.siteUrl = siteUrl; |
||||
|
} |
||||
|
|
||||
|
public CrawlHistory(String siteName, String siteUrl, int articleCount, boolean success, String errorMessage) { |
||||
|
this(siteName, siteUrl); |
||||
|
this.articleCount = articleCount; |
||||
|
this.success = success; |
||||
|
this.errorMessage = errorMessage; |
||||
|
} |
||||
|
|
||||
|
public String getId() { |
||||
|
return id; |
||||
|
} |
||||
|
|
||||
|
public void setId(String id) { |
||||
|
this.id = id; |
||||
|
} |
||||
|
|
||||
|
public String getSiteName() { |
||||
|
return siteName; |
||||
|
} |
||||
|
|
||||
|
public void setSiteName(String siteName) { |
||||
|
this.siteName = siteName; |
||||
|
} |
||||
|
|
||||
|
public String getSiteUrl() { |
||||
|
return siteUrl; |
||||
|
} |
||||
|
|
||||
|
public void setSiteUrl(String siteUrl) { |
||||
|
this.siteUrl = siteUrl; |
||||
|
} |
||||
|
|
||||
|
public int getArticleCount() { |
||||
|
return articleCount; |
||||
|
} |
||||
|
|
||||
|
public void setArticleCount(int articleCount) { |
||||
|
this.articleCount = articleCount; |
||||
|
} |
||||
|
|
||||
|
public LocalDateTime getCrawlTime() { |
||||
|
return crawlTime; |
||||
|
} |
||||
|
|
||||
|
public void setCrawlTime(LocalDateTime crawlTime) { |
||||
|
this.crawlTime = crawlTime; |
||||
|
} |
||||
|
|
||||
|
public boolean isSuccess() { |
||||
|
return success; |
||||
|
} |
||||
|
|
||||
|
public void setSuccess(boolean success) { |
||||
|
this.success = success; |
||||
|
} |
||||
|
|
||||
|
public String getErrorMessage() { |
||||
|
return errorMessage; |
||||
|
} |
||||
|
|
||||
|
public void setErrorMessage(String errorMessage) { |
||||
|
this.errorMessage = errorMessage; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String toString() { |
||||
|
return String.format("[%s] %s - %s - %d条 - %s", |
||||
|
crawlTime.toString().replace("T", " "), |
||||
|
siteName, |
||||
|
success ? "成功" : "失败", |
||||
|
articleCount, |
||||
|
success ? "" : errorMessage); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,21 @@ |
|||||
|
package com.newscrawler.exception; |
||||
|
|
||||
|
/**
 * Checked base exception for crawler failures. Each constructor forwards its
 * arguments unchanged to the matching {@link Exception} constructor.
 */
public class CrawlerException extends Exception {
    private static final long serialVersionUID = 1L;

    /** Creates an exception with neither message nor cause. */
    public CrawlerException() {
        super();
    }

    /**
     * @param cause the underlying failure
     */
    public CrawlerException(Throwable cause) {
        super(cause);
    }

    /**
     * @param message description of the failure
     */
    public CrawlerException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   the underlying failure
     */
    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }
}
||||
@ -0,0 +1,21 @@ |
|||||
|
package com.newscrawler.exception; |
||||
|
|
||||
|
public class NetworkException extends CrawlerException { |
||||
|
private static final long serialVersionUID = 1L; |
||||
|
|
||||
|
public NetworkException() { |
||||
|
super(); |
||||
|
} |
||||
|
|
||||
|
public NetworkException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
|
||||
|
public NetworkException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
|
||||
|
public NetworkException(Throwable cause) { |
||||
|
super(cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,21 @@ |
|||||
|
package com.newscrawler.exception; |
||||
|
|
||||
|
public class ParseException extends CrawlerException { |
||||
|
private static final long serialVersionUID = 1L; |
||||
|
|
||||
|
public ParseException() { |
||||
|
super(); |
||||
|
} |
||||
|
|
||||
|
public ParseException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
|
||||
|
public ParseException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
|
||||
|
public ParseException(Throwable cause) { |
||||
|
super(cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,126 @@ |
|||||
|
package com.newscrawler.repository; |
||||
|
|
||||
|
import com.newscrawler.entity.Article; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.Collection; |
||||
|
import java.util.List; |
||||
|
import java.util.stream.Collectors; |
||||
|
|
||||
|
public class ArticleRepository { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(ArticleRepository.class); |
||||
|
private final List<Article> articles; |
||||
|
|
||||
|
public ArticleRepository() { |
||||
|
this.articles = new ArrayList<>(); |
||||
|
} |
||||
|
|
||||
|
public void add(Article article) { |
||||
|
if (article == null) { |
||||
|
logger.warn("尝试添加空文章到仓库"); |
||||
|
throw new IllegalArgumentException("文章不能为空"); |
||||
|
} |
||||
|
if (article.getTitle() == null || article.getTitle().trim().isEmpty()) { |
||||
|
logger.warn("尝试添加标题为空的文章"); |
||||
|
throw new IllegalArgumentException("文章标题不能为空"); |
||||
|
} |
||||
|
if (!exists(article)) { |
||||
|
articles.add(article); |
||||
|
logger.debug("添加文章: {}", article.getTitle()); |
||||
|
} else { |
||||
|
logger.debug("文章已存在,跳过: {}", article.getTitle()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public void addAll(Collection<Article> articlesToAdd) { |
||||
|
if (articlesToAdd == null) { |
||||
|
logger.warn("尝试添加空集合到仓库"); |
||||
|
throw new IllegalArgumentException("文章集合不能为空"); |
||||
|
} |
||||
|
int count = 0; |
||||
|
for (Article article : articlesToAdd) { |
||||
|
try { |
||||
|
add(article); |
||||
|
count++; |
||||
|
} catch (IllegalArgumentException e) { |
||||
|
logger.warn("跳过无效文章: {}", e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
logger.info("批量添加完成,成功添加{}篇文章", count); |
||||
|
} |
||||
|
|
||||
|
public boolean remove(Article article) { |
||||
|
if (article == null) { |
||||
|
logger.warn("尝试删除空文章"); |
||||
|
return false; |
||||
|
} |
||||
|
boolean removed = articles.remove(article); |
||||
|
if (removed) { |
||||
|
logger.debug("删除文章: {}", article.getTitle()); |
||||
|
} |
||||
|
return removed; |
||||
|
} |
||||
|
|
||||
|
public boolean removeById(String id) { |
||||
|
if (id == null || id.trim().isEmpty()) { |
||||
|
logger.warn("尝试用空ID删除文章"); |
||||
|
return false; |
||||
|
} |
||||
|
boolean removed = articles.removeIf(a -> a.getId().equals(id)); |
||||
|
if (removed) { |
||||
|
logger.debug("通过ID删除文章: {}", id); |
||||
|
} |
||||
|
return removed; |
||||
|
} |
||||
|
|
||||
|
public void clear() { |
||||
|
int size = articles.size(); |
||||
|
articles.clear(); |
||||
|
logger.info("清空仓库,删除了{}篇文章", size); |
||||
|
} |
||||
|
|
||||
|
public List<Article> getAll() { |
||||
|
return new ArrayList<>(articles); |
||||
|
} |
||||
|
|
||||
|
public List<Article> findBySource(String source) { |
||||
|
if (source == null || source.trim().isEmpty()) { |
||||
|
logger.warn("使用空来源查询"); |
||||
|
return new ArrayList<>(); |
||||
|
} |
||||
|
return articles.stream() |
||||
|
.filter(a -> a.getSource() != null && a.getSource().contains(source)) |
||||
|
.collect(Collectors.toList()); |
||||
|
} |
||||
|
|
||||
|
public boolean exists(Article article) { |
||||
|
if (article == null) { |
||||
|
return false; |
||||
|
} |
||||
|
return articles.contains(article); |
||||
|
} |
||||
|
|
||||
|
public int size() { |
||||
|
return articles.size(); |
||||
|
} |
||||
|
|
||||
|
public boolean isEmpty() { |
||||
|
return articles.isEmpty(); |
||||
|
} |
||||
|
|
||||
|
public long countBySource(String source) { |
||||
|
return findBySource(source).size(); |
||||
|
} |
||||
|
|
||||
|
public Article findById(String id) { |
||||
|
if (id == null || id.trim().isEmpty()) { |
||||
|
return null; |
||||
|
} |
||||
|
return articles.stream() |
||||
|
.filter(a -> a.getId().equals(id)) |
||||
|
.findFirst() |
||||
|
.orElse(null); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,113 @@ |
|||||
|
package com.newscrawler.service; |
||||
|
|
||||
|
import com.newscrawler.entity.Article; |
||||
|
import com.newscrawler.entity.CrawlHistory; |
||||
|
import com.newscrawler.exception.CrawlerException; |
||||
|
import com.newscrawler.exception.ParseException; |
||||
|
import com.newscrawler.repository.ArticleRepository; |
||||
|
import com.newscrawler.strategy.CrawlStrategy; |
||||
|
import com.newscrawler.strategy.StrategyFactory; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
import java.util.Map; |
||||
|
import java.util.stream.Collectors; |
||||
|
|
||||
|
public class CrawlerService { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(CrawlerService.class); |
||||
|
|
||||
|
private final ArticleRepository repository; |
||||
|
private final List<CrawlHistory> crawlHistories; |
||||
|
|
||||
|
public CrawlerService() { |
||||
|
this.repository = new ArticleRepository(); |
||||
|
this.crawlHistories = new ArrayList<>(); |
||||
|
} |
||||
|
|
||||
|
public List<Article> crawlSingleSite(String siteKey) throws CrawlerException { |
||||
|
logger.info("开始爬取单个站点: {}", siteKey); |
||||
|
CrawlStrategy strategy = StrategyFactory.getStrategy(siteKey); |
||||
|
return executeCrawl(strategy); |
||||
|
} |
||||
|
|
||||
|
public List<Article> crawlAllSites() throws CrawlerException { |
||||
|
logger.info("开始批量爬取所有站点"); |
||||
|
List<Article> allArticles = new ArrayList<>(); |
||||
|
Map<String, CrawlStrategy> strategies = StrategyFactory.getAllStrategies(); |
||||
|
|
||||
|
for (Map.Entry<String, CrawlStrategy> entry : strategies.entrySet()) { |
||||
|
try { |
||||
|
List<Article> articles = executeCrawl(entry.getValue()); |
||||
|
allArticles.addAll(articles); |
||||
|
} catch (CrawlerException e) { |
||||
|
logger.error("爬取站点{}失败: {}", entry.getKey(), e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
logger.info("批量爬取完成,共获取{}篇文章", allArticles.size()); |
||||
|
return allArticles; |
||||
|
} |
||||
|
|
||||
|
private List<Article> executeCrawl(CrawlStrategy strategy) throws CrawlerException { |
||||
|
String siteName = strategy.getSiteName(); |
||||
|
String siteUrl = strategy.getSiteUrl(); |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
|
||||
|
try { |
||||
|
logger.info("开始爬取: {} - {}", siteName, siteUrl); |
||||
|
articles = strategy.crawl(); |
||||
|
repository.addAll(articles); |
||||
|
|
||||
|
CrawlHistory history = new CrawlHistory(siteName, siteUrl, articles.size(), true, null); |
||||
|
crawlHistories.add(history); |
||||
|
|
||||
|
logger.info("爬取{}成功,获取{}篇文章", siteName, articles.size()); |
||||
|
} catch (ParseException e) { |
||||
|
CrawlHistory history = new CrawlHistory(siteName, siteUrl, 0, false, e.getMessage()); |
||||
|
crawlHistories.add(history); |
||||
|
logger.error("爬取{}失败: {}", siteName, e.getMessage()); |
||||
|
throw e; |
||||
|
} |
||||
|
|
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
public ArticleRepository getRepository() { |
||||
|
return repository; |
||||
|
} |
||||
|
|
||||
|
public List<CrawlHistory> getCrawlHistories() { |
||||
|
return new ArrayList<>(crawlHistories); |
||||
|
} |
||||
|
|
||||
|
public List<Article> getAllArticles() { |
||||
|
return repository.getAll(); |
||||
|
} |
||||
|
|
||||
|
public List<Article> getArticlesBySource(String source) { |
||||
|
return repository.findBySource(source); |
||||
|
} |
||||
|
|
||||
|
public boolean removeArticle(String id) { |
||||
|
return repository.removeById(id); |
||||
|
} |
||||
|
|
||||
|
public void clearAllArticles() { |
||||
|
repository.clear(); |
||||
|
logger.info("已清空所有文章"); |
||||
|
} |
||||
|
|
||||
|
public Map<String, Long> getStatistics() { |
||||
|
Map<String, Long> stats = repository.getAll().stream() |
||||
|
.collect(Collectors.groupingBy(Article::getSource, Collectors.counting())); |
||||
|
stats.put("总计", (long) repository.size()); |
||||
|
return stats; |
||||
|
} |
||||
|
|
||||
|
public void clearHistory() { |
||||
|
crawlHistories.clear(); |
||||
|
logger.info("已清空爬取历史"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,83 @@ |
|||||
|
package com.newscrawler.strategy; |
||||
|
|
||||
|
import com.newscrawler.entity.Article; |
||||
|
import com.newscrawler.exception.NetworkException; |
||||
|
import com.newscrawler.exception.ParseException; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public abstract class AbstractBaseStrategy implements CrawlStrategy { |
||||
|
protected static final Logger logger = LoggerFactory.getLogger(AbstractBaseStrategy.class); |
||||
|
protected static final int MAX_RETRIES = 3; |
||||
|
protected static final int RETRY_DELAY_MS = 2000; |
||||
|
protected static final int TIMEOUT_MS = 30000; |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> crawl() throws ParseException { |
||||
|
logger.info("开始爬取站点: {}", getSiteName()); |
||||
|
String html = fetchWithRetry(); |
||||
|
if (html == null || html.isEmpty()) { |
||||
|
throw new ParseException("获取HTML内容为空"); |
||||
|
} |
||||
|
return parseHtml(html); |
||||
|
} |
||||
|
|
||||
|
protected String fetchWithRetry() throws ParseException { |
||||
|
int attempts = 0; |
||||
|
Exception lastException = null; |
||||
|
|
||||
|
while (attempts < MAX_RETRIES) { |
||||
|
try { |
||||
|
attempts++; |
||||
|
logger.debug("第{}次尝试获取页面: {}", attempts, getSiteUrl()); |
||||
|
Document doc = Jsoup.connect(getSiteUrl()) |
||||
|
.timeout(TIMEOUT_MS) |
||||
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") |
||||
|
.get(); |
||||
|
return doc.html(); |
||||
|
} catch (IOException e) { |
||||
|
lastException = e; |
||||
|
logger.warn("第{}次尝试失败: {}", attempts, e.getMessage()); |
||||
|
if (attempts < MAX_RETRIES) { |
||||
|
try { |
||||
|
Thread.sleep(RETRY_DELAY_MS * attempts); |
||||
|
} catch (InterruptedException ie) { |
||||
|
Thread.currentThread().interrupt(); |
||||
|
throw new ParseException("爬取被中断", ie); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
NetworkException networkEx = new NetworkException("网络请求失败,已重试" + MAX_RETRIES + "次", lastException); |
||||
|
logger.error("网络请求最终失败: {}", networkEx.getMessage()); |
||||
|
throw new ParseException("获取页面失败", networkEx); |
||||
|
} |
||||
|
|
||||
|
protected abstract List<Article> parseHtml(String html) throws ParseException; |
||||
|
|
||||
|
protected List<Article> createArticles(List<String> titles, List<String> summaries, |
||||
|
List<String> publishDates, List<String> articleUrls, |
||||
|
List<String> authors) { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
int size = titles.size(); |
||||
|
|
||||
|
for (int i = 0; i < size; i++) { |
||||
|
Article article = new Article(); |
||||
|
article.setTitle(i < titles.size() ? titles.get(i) : ""); |
||||
|
article.setSummary(i < summaries.size() ? summaries.get(i) : ""); |
||||
|
article.setPublishDate(i < publishDates.size() ? publishDates.get(i) : ""); |
||||
|
article.setArticleUrl(i < articleUrls.size() ? articleUrls.get(i) : ""); |
||||
|
article.setAuthor(i < authors.size() ? authors.get(i) : ""); |
||||
|
article.setSource(getSiteName()); |
||||
|
articles.add(article); |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,14 @@ |
|||||
|
package com.newscrawler.strategy; |
||||
|
|
||||
|
import com.newscrawler.entity.Article; |
||||
|
import com.newscrawler.exception.ParseException; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
public interface CrawlStrategy { |
||||
|
String getSiteName(); |
||||
|
|
||||
|
String getSiteUrl(); |
||||
|
|
||||
|
List<Article> crawl() throws ParseException; |
||||
|
} |
||||
@ -0,0 +1,93 @@ |
|||||
|
package com.newscrawler.strategy; |
||||
|
|
||||
|
import com.newscrawler.entity.Article; |
||||
|
import com.newscrawler.exception.ParseException; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class RunoobStrategy extends AbstractBaseStrategy { |
||||
|
private static final String SITE_NAME = "菜鸟教程资讯"; |
||||
|
private static final String SITE_URL = "https://www.runoob.com/"; |
||||
|
|
||||
|
@Override |
||||
|
public String getSiteName() { |
||||
|
return SITE_NAME; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getSiteUrl() { |
||||
|
return SITE_URL; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<Article> parseHtml(String html) throws ParseException { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
try { |
||||
|
org.jsoup.nodes.Document doc = Jsoup.parse(html); |
||||
|
|
||||
|
Elements newsItems = doc.select(".article-list .item, .list-group-item, .news-item, .content li, article"); |
||||
|
|
||||
|
for (Element item : newsItems) { |
||||
|
Article article = new Article(); |
||||
|
article.setSource(SITE_NAME); |
||||
|
|
||||
|
Element titleElem = item.selectFirst("h3, h4, h2, .title, .news-title, a[href]"); |
||||
|
article.setTitle(titleElem != null ? titleElem.text().trim() : ""); |
||||
|
|
||||
|
Element summaryElem = item.selectFirst(".desc, .summary, .news-desc, p"); |
||||
|
article.setSummary(summaryElem != null ? summaryElem.text().trim() : ""); |
||||
|
|
||||
|
Element dateElem = item.selectFirst(".date, time, .time, span"); |
||||
|
article.setPublishDate(dateElem != null ? dateElem.text().trim() : ""); |
||||
|
|
||||
|
Element linkElem = item.selectFirst("a[href]"); |
||||
|
article.setArticleUrl(linkElem != null && linkElem.hasAttr("href") |
||||
|
? linkElem.attr("href") : ""); |
||||
|
|
||||
|
if (!article.getTitle().isEmpty()) { |
||||
|
articles.add(article); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (articles.isEmpty()) { |
||||
|
Elements links = doc.select("a[href*='/article/'], a[href*='/tutorial/'], a[href*='/course/']"); |
||||
|
for (Element link : links) { |
||||
|
String title = link.text().trim(); |
||||
|
if (title.length() > 5 && !title.contains("首页") && !title.contains("教程") && !title.contains("学习")) { |
||||
|
Article article = new Article(); |
||||
|
article.setSource(SITE_NAME); |
||||
|
article.setTitle(title); |
||||
|
article.setArticleUrl(link.hasAttr("href") ? link.attr("href") : ""); |
||||
|
articles.add(article); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (articles.isEmpty()) { |
||||
|
Elements allLinks = doc.select("a"); |
||||
|
for (Element link : allLinks) { |
||||
|
String title = link.text().trim(); |
||||
|
if (title.length() > 5 && title.length() < 100) { |
||||
|
String href = link.hasAttr("href") ? link.attr("href") : ""; |
||||
|
if (href.contains("/") && !href.startsWith("#") && !href.contains("javascript")) { |
||||
|
Article article = new Article(); |
||||
|
article.setSource(SITE_NAME); |
||||
|
article.setTitle(title); |
||||
|
article.setArticleUrl(href); |
||||
|
articles.add(article); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
logger.info("菜鸟教程解析到{}条资讯", articles.size()); |
||||
|
} catch (Exception e) { |
||||
|
throw new ParseException("解析菜鸟教程页面失败", e); |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,90 @@ |
|||||
|
package com.newscrawler.strategy; |
||||
|
|
||||
|
import com.newscrawler.entity.Article; |
||||
|
import com.newscrawler.exception.ParseException; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class SohuStrategy extends AbstractBaseStrategy { |
||||
|
private static final String SITE_NAME = "搜狐资讯"; |
||||
|
private static final String SITE_URL = "https://news.sohu.com/"; |
||||
|
|
||||
|
@Override |
||||
|
public String getSiteName() { |
||||
|
return SITE_NAME; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getSiteUrl() { |
||||
|
return SITE_URL; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<Article> parseHtml(String html) throws ParseException { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
try { |
||||
|
org.jsoup.nodes.Document doc = Jsoup.parse(html); |
||||
|
|
||||
|
Elements allLinks = doc.select("a"); |
||||
|
int count = 0; |
||||
|
for (Element link : allLinks) { |
||||
|
String title = link.text().trim(); |
||||
|
String href = link.attr("href"); |
||||
|
|
||||
|
if (title.length() > 8 && title.length() < 80 && |
||||
|
!title.contains("登录") && !title.contains("注册") && |
||||
|
!title.contains("评论") && !title.contains("分享") && |
||||
|
!title.contains("更多") && !title.contains("首页") && |
||||
|
href.contains("/a/")) { |
||||
|
|
||||
|
Article article = new Article(); |
||||
|
article.setSource(SITE_NAME); |
||||
|
article.setTitle(title); |
||||
|
article.setAuthor("搜狐"); |
||||
|
|
||||
|
if (href.startsWith("//")) { |
||||
|
href = "https:" + href; |
||||
|
} else if (href.startsWith("/")) { |
||||
|
href = "https://news.sohu.com" + href; |
||||
|
} |
||||
|
article.setArticleUrl(href); |
||||
|
articles.add(article); |
||||
|
count++; |
||||
|
|
||||
|
if (count >= 20) break; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (articles.isEmpty()) { |
||||
|
Elements mainNews = doc.select(".focus-news-list li, .main-news li, .listCon li"); |
||||
|
for (Element item : mainNews) { |
||||
|
Element titleLink = item.selectFirst("a"); |
||||
|
if (titleLink != null) { |
||||
|
String title = titleLink.text().trim(); |
||||
|
if (title.length() > 5) { |
||||
|
Article article = new Article(); |
||||
|
article.setSource(SITE_NAME); |
||||
|
article.setTitle(title); |
||||
|
article.setAuthor("搜狐"); |
||||
|
String href = titleLink.attr("href"); |
||||
|
if (href.startsWith("/")) { |
||||
|
href = "https://news.sohu.com" + href; |
||||
|
} |
||||
|
article.setArticleUrl(href); |
||||
|
articles.add(article); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
logger.info("搜狐资讯解析到{}条资讯", articles.size()); |
||||
|
} catch (Exception e) { |
||||
|
throw new ParseException("解析搜狐资讯页面失败", e); |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,39 @@ |
|||||
|
package com.newscrawler.strategy; |
||||
|
|
||||
|
import java.util.HashMap; |
||||
|
import java.util.Map; |
||||
|
|
||||
|
public class StrategyFactory { |
||||
|
private static final Map<String, CrawlStrategy> STRATEGY_MAP = new HashMap<>(); |
||||
|
|
||||
|
static { |
||||
|
STRATEGY_MAP.put("runoob", new RunoobStrategy()); |
||||
|
STRATEGY_MAP.put("youth", new YouthStrategy()); |
||||
|
STRATEGY_MAP.put("sohu", new SohuStrategy()); |
||||
|
} |
||||
|
|
||||
|
public static CrawlStrategy getStrategy(String siteKey) { |
||||
|
CrawlStrategy strategy = STRATEGY_MAP.get(siteKey.toLowerCase()); |
||||
|
if (strategy == null) { |
||||
|
throw new IllegalArgumentException("不支持的站点: " + siteKey); |
||||
|
} |
||||
|
return strategy; |
||||
|
} |
||||
|
|
||||
|
public static CrawlStrategy getStrategyBySiteName(String siteName) { |
||||
|
for (Map.Entry<String, CrawlStrategy> entry : STRATEGY_MAP.entrySet()) { |
||||
|
if (entry.getValue().getSiteName().contains(siteName)) { |
||||
|
return entry.getValue(); |
||||
|
} |
||||
|
} |
||||
|
throw new IllegalArgumentException("未找到站点对应的策略: " + siteName); |
||||
|
} |
||||
|
|
||||
|
public static Map<String, CrawlStrategy> getAllStrategies() { |
||||
|
return new HashMap<>(STRATEGY_MAP); |
||||
|
} |
||||
|
|
||||
|
public static String[] getSiteKeys() { |
||||
|
return STRATEGY_MAP.keySet().toArray(new String[0]); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,70 @@ |
|||||
|
package com.newscrawler.strategy; |
||||
|
|
||||
|
import com.newscrawler.entity.Article; |
||||
|
import com.newscrawler.exception.ParseException; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class YouthStrategy extends AbstractBaseStrategy { |
||||
|
private static final String SITE_NAME = "知乎日报"; |
||||
|
private static final String SITE_URL = "https://daily.zhihu.com/"; |
||||
|
|
||||
|
@Override |
||||
|
public String getSiteName() { |
||||
|
return SITE_NAME; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getSiteUrl() { |
||||
|
return SITE_URL; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<Article> parseHtml(String html) throws ParseException { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
try { |
||||
|
logger.info("开始解析知乎日报页面,HTML长度: {}", html.length()); |
||||
|
|
||||
|
org.jsoup.nodes.Document doc = Jsoup.parse(html); |
||||
|
Elements allLinks = doc.select("a"); |
||||
|
|
||||
|
logger.info("页面共有 {} 个链接", allLinks.size()); |
||||
|
|
||||
|
for (Element link : allLinks) { |
||||
|
String title = link.text().trim(); |
||||
|
String href = link.attr("href"); |
||||
|
|
||||
|
if (title.length() >= 4 && title.length() <= 50 && !title.isEmpty()) { |
||||
|
Article article = new Article(); |
||||
|
article.setSource(SITE_NAME); |
||||
|
article.setTitle(title); |
||||
|
article.setAuthor("知乎日报"); |
||||
|
|
||||
|
if (href.startsWith("//")) { |
||||
|
href = "https:" + href; |
||||
|
} else if (href.startsWith("/")) { |
||||
|
href = "https://daily.zhihu.com" + href; |
||||
|
} |
||||
|
article.setArticleUrl(href); |
||||
|
articles.add(article); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
logger.info("初步解析到{}条资讯", articles.size()); |
||||
|
|
||||
|
if (articles.size() > 20) { |
||||
|
articles = articles.subList(0, 20); |
||||
|
} |
||||
|
|
||||
|
logger.info("知乎日报最终解析到{}条资讯", articles.size()); |
||||
|
} catch (Exception e) { |
||||
|
logger.error("解析异常: {}", e.getMessage()); |
||||
|
throw new ParseException("解析知乎日报页面失败", e); |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,134 @@ |
|||||
|
package com.newscrawler.util; |
||||
|
|
||||
|
import com.google.gson.Gson; |
||||
|
import com.google.gson.GsonBuilder; |
||||
|
import com.google.gson.reflect.TypeToken; |
||||
|
import com.newscrawler.entity.Article; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.io.*; |
||||
|
import java.lang.reflect.Type; |
||||
|
import java.nio.charset.StandardCharsets; |
||||
|
import java.nio.file.Files; |
||||
|
import java.nio.file.Path; |
||||
|
import java.nio.file.Paths; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.HashSet; |
||||
|
import java.util.List; |
||||
|
import java.util.Set; |
||||
|
|
||||
|
public class JsonUtil { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(JsonUtil.class); |
||||
|
private static final Gson GSON = new GsonBuilder() |
||||
|
.setPrettyPrinting() |
||||
|
.setDateFormat("yyyy-MM-dd'T'HH:mm:ss") |
||||
|
.create(); |
||||
|
private static final String DATA_DIR = "data"; |
||||
|
|
||||
|
static { |
||||
|
try { |
||||
|
Files.createDirectories(Paths.get(DATA_DIR)); |
||||
|
} catch (IOException e) { |
||||
|
logger.warn("创建数据目录失败: {}", e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static void exportToJson(List<Article> articles, String filename) throws IOException { |
||||
|
if (articles == null) { |
||||
|
throw new IllegalArgumentException("文章列表不能为空"); |
||||
|
} |
||||
|
if (filename == null || filename.trim().isEmpty()) { |
||||
|
throw new IllegalArgumentException("文件名不能为空"); |
||||
|
} |
||||
|
|
||||
|
String fullPath = getFullPath(filename); |
||||
|
try (Writer writer = new BufferedWriter( |
||||
|
new OutputStreamWriter( |
||||
|
new FileOutputStream(fullPath), StandardCharsets.UTF_8))) { |
||||
|
GSON.toJson(articles, writer); |
||||
|
logger.info("成功导出{}篇文章到{}", articles.size(), fullPath); |
||||
|
} catch (IOException e) { |
||||
|
logger.error("导出JSON失败: {}", e.getMessage()); |
||||
|
throw e; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static List<Article> importFromJson(String filename) throws IOException { |
||||
|
if (filename == null || filename.trim().isEmpty()) { |
||||
|
throw new IllegalArgumentException("文件名不能为空"); |
||||
|
} |
||||
|
|
||||
|
String fullPath = getFullPath(filename); |
||||
|
Path path = Paths.get(fullPath); |
||||
|
|
||||
|
if (!Files.exists(path)) { |
||||
|
throw new FileNotFoundException("文件不存在: " + fullPath); |
||||
|
} |
||||
|
|
||||
|
List<Article> importedArticles; |
||||
|
try (Reader reader = new BufferedReader( |
||||
|
new InputStreamReader( |
||||
|
new FileInputStream(fullPath), StandardCharsets.UTF_8))) { |
||||
|
Type listType = new TypeToken<ArrayList<Article>>() {}.getType(); |
||||
|
importedArticles = GSON.fromJson(reader, listType); |
||||
|
} catch (IOException e) { |
||||
|
logger.error("导入JSON失败: {}", e.getMessage()); |
||||
|
throw e; |
||||
|
} |
||||
|
|
||||
|
if (importedArticles == null) { |
||||
|
importedArticles = new ArrayList<>(); |
||||
|
} |
||||
|
|
||||
|
List<Article> deduplicatedArticles = deduplicate(importedArticles); |
||||
|
logger.info("从{}导入{}篇文章,去重后保留{}篇", |
||||
|
fullPath, importedArticles.size(), deduplicatedArticles.size()); |
||||
|
|
||||
|
return deduplicatedArticles; |
||||
|
} |
||||
|
|
||||
|
private static List<Article> deduplicate(List<Article> articles) { |
||||
|
Set<String> seen = new HashSet<>(); |
||||
|
List<Article> deduplicated = new ArrayList<>(); |
||||
|
|
||||
|
for (Article article : articles) { |
||||
|
String key = generateDeduplicateKey(article); |
||||
|
if (!seen.contains(key)) { |
||||
|
seen.add(key); |
||||
|
deduplicated.add(article); |
||||
|
} else { |
||||
|
logger.debug("去重重复文章: {}", article.getTitle()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return deduplicated; |
||||
|
} |
||||
|
|
||||
|
private static String generateDeduplicateKey(Article article) { |
||||
|
return (article.getTitle() != null ? article.getTitle() : "") + "|" + |
||||
|
(article.getSource() != null ? article.getSource() : "") + "|" + |
||||
|
(article.getPublishDate() != null ? article.getPublishDate() : ""); |
||||
|
} |
||||
|
|
||||
|
private static String getFullPath(String filename) { |
||||
|
if (filename.endsWith(".json")) { |
||||
|
return DATA_DIR + File.separator + filename; |
||||
|
} |
||||
|
return DATA_DIR + File.separator + filename + ".json"; |
||||
|
} |
||||
|
|
||||
|
public static void exportHistoriesToJson(List<?> histories, String filename) throws IOException { |
||||
|
if (histories == null) { |
||||
|
throw new IllegalArgumentException("历史记录列表不能为空"); |
||||
|
} |
||||
|
|
||||
|
String fullPath = DATA_DIR + File.separator + filename + "_history.json"; |
||||
|
try (Writer writer = new BufferedWriter( |
||||
|
new OutputStreamWriter( |
||||
|
new FileOutputStream(fullPath), StandardCharsets.UTF_8))) { |
||||
|
GSON.toJson(histories, writer); |
||||
|
logger.info("成功导出{}条历史记录到{}", histories.size(), fullPath); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,37 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<configuration> |
||||
|
<property name="LOG_PATTERN" value="%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{36} - %msg%n"/> |
||||
|
|
||||
|
<appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender"> |
||||
|
<encoder> |
||||
|
<pattern>${LOG_PATTERN}</pattern> |
||||
|
<charset>GBK</charset> |
||||
|
</encoder> |
||||
|
</appender> |
||||
|
|
||||
|
<appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender"> |
||||
|
<file>logs/crawler.log</file> |
||||
|
<rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy"> |
||||
|
<fileNamePattern>logs/crawler-%d{yyyy-MM-dd}.log</fileNamePattern> |
||||
|
<maxHistory>30</maxHistory> |
||||
|
</rollingPolicy> |
||||
|
<encoder> |
||||
|
<pattern>${LOG_PATTERN}</pattern> |
||||
|
<charset>UTF-8</charset> |
||||
|
</encoder> |
||||
|
</appender> |
||||
|
|
||||
|
<logger name="com.newscrawler" level="DEBUG" additivity="false"> |
||||
|
<appender-ref ref="CONSOLE"/> |
||||
|
<appender-ref ref="FILE"/> |
||||
|
</logger> |
||||
|
|
||||
|
<logger name="org.jsoup" level="WARN" additivity="false"> |
||||
|
<appender-ref ref="CONSOLE"/> |
||||
|
</logger> |
||||
|
|
||||
|
<root level="INFO"> |
||||
|
<appender-ref ref="CONSOLE"/> |
||||
|
<appender-ref ref="FILE"/> |
||||
|
</root> |
||||
|
</configuration> |
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,37 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<configuration> |
||||
|
<property name="LOG_PATTERN" value="%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{36} - %msg%n"/> |
||||
|
|
||||
|
<appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender"> |
||||
|
<encoder> |
||||
|
<pattern>${LOG_PATTERN}</pattern> |
||||
|
<charset>GBK</charset> |
||||
|
</encoder> |
||||
|
</appender> |
||||
|
|
||||
|
<appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender"> |
||||
|
<file>logs/crawler.log</file> |
||||
|
<rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy"> |
||||
|
<fileNamePattern>logs/crawler-%d{yyyy-MM-dd}.log</fileNamePattern> |
||||
|
<maxHistory>30</maxHistory> |
||||
|
</rollingPolicy> |
||||
|
<encoder> |
||||
|
<pattern>${LOG_PATTERN}</pattern> |
||||
|
<charset>UTF-8</charset> |
||||
|
</encoder> |
||||
|
</appender> |
||||
|
|
||||
|
<logger name="com.newscrawler" level="DEBUG" additivity="false"> |
||||
|
<appender-ref ref="CONSOLE"/> |
||||
|
<appender-ref ref="FILE"/> |
||||
|
</logger> |
||||
|
|
||||
|
<logger name="org.jsoup" level="WARN" additivity="false"> |
||||
|
<appender-ref ref="CONSOLE"/> |
||||
|
</logger> |
||||
|
|
||||
|
<root level="INFO"> |
||||
|
<appender-ref ref="CONSOLE"/> |
||||
|
<appender-ref ref="FILE"/> |
||||
|
</root> |
||||
|
</configuration> |
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,17 @@ |
|||||
|
com\newscrawler\util\JsonUtil$1.class |
||||
|
com\newscrawler\entity\CrawlHistory.class |
||||
|
com\newscrawler\exception\ParseException.class |
||||
|
com\newscrawler\entity\Article.class |
||||
|
com\newscrawler\repository\ArticleRepository.class |
||||
|
com\newscrawler\strategy\AbstractBaseStrategy.class |
||||
|
com\newscrawler\strategy\StrategyFactory.class |
||||
|
com\newscrawler\command\MenuCommand.class |
||||
|
com\newscrawler\service\CrawlerService.class |
||||
|
com\newscrawler\strategy\RunoobStrategy.class |
||||
|
com\newscrawler\exception\CrawlerException.class |
||||
|
com\newscrawler\strategy\CrawlStrategy.class |
||||
|
com\newscrawler\strategy\SohuStrategy.class |
||||
|
com\newscrawler\exception\NetworkException.class |
||||
|
com\newscrawler\util\JsonUtil.class |
||||
|
com\newscrawler\strategy\YouthStrategy.class |
||||
|
com\newscrawler\Main.class |
||||
@ -0,0 +1,16 @@ |
|||||
|
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\exception\NetworkException.java |
||||
|
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\util\JsonUtil.java |
||||
|
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\exception\ParseException.java |
||||
|
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\strategy\YouthStrategy.java |
||||
|
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\entity\CrawlHistory.java |
||||
|
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\service\CrawlerService.java |
||||
|
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\strategy\RunoobStrategy.java |
||||
|
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\exception\CrawlerException.java |
||||
|
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\repository\ArticleRepository.java |
||||
|
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\command\MenuCommand.java |
||||
|
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\strategy\StrategyFactory.java |
||||
|
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\entity\Article.java |
||||
|
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\Main.java |
||||
|
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\strategy\SohuStrategy.java |
||||
|
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\strategy\CrawlStrategy.java |
||||
|
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\strategy\AbstractBaseStrategy.java |
||||
Loading…
Reference in new issue