Browse Source

宋瑞-202506050301

main
Songrui 3 weeks ago
parent
commit
e927004858
  1. BIN
      project/202506050301-宋瑞-期末实验报告.docx
  2. 146
      project/资讯爬虫/data/知乎1.json
  3. 1255
      project/资讯爬虫/data/菜鸟1.json
  4. 1986
      project/资讯爬虫/logs/crawler-2026-05-29.log
  5. 1171
      project/资讯爬虫/logs/crawler.log
  6. 73
      project/资讯爬虫/pom.xml
  7. 26
      project/资讯爬虫/src/main/java/com/newscrawler/Main.java
  8. 361
      project/资讯爬虫/src/main/java/com/newscrawler/command/MenuCommand.java
  9. 113
      project/资讯爬虫/src/main/java/com/newscrawler/entity/Article.java
  10. 98
      project/资讯爬虫/src/main/java/com/newscrawler/entity/CrawlHistory.java
  11. 21
      project/资讯爬虫/src/main/java/com/newscrawler/exception/CrawlerException.java
  12. 21
      project/资讯爬虫/src/main/java/com/newscrawler/exception/NetworkException.java
  13. 21
      project/资讯爬虫/src/main/java/com/newscrawler/exception/ParseException.java
  14. 126
      project/资讯爬虫/src/main/java/com/newscrawler/repository/ArticleRepository.java
  15. 113
      project/资讯爬虫/src/main/java/com/newscrawler/service/CrawlerService.java
  16. 83
      project/资讯爬虫/src/main/java/com/newscrawler/strategy/AbstractBaseStrategy.java
  17. 14
      project/资讯爬虫/src/main/java/com/newscrawler/strategy/CrawlStrategy.java
  18. 93
      project/资讯爬虫/src/main/java/com/newscrawler/strategy/RunoobStrategy.java
  19. 90
      project/资讯爬虫/src/main/java/com/newscrawler/strategy/SohuStrategy.java
  20. 39
      project/资讯爬虫/src/main/java/com/newscrawler/strategy/StrategyFactory.java
  21. 70
      project/资讯爬虫/src/main/java/com/newscrawler/strategy/YouthStrategy.java
  22. 134
      project/资讯爬虫/src/main/java/com/newscrawler/util/JsonUtil.java
  23. 37
      project/资讯爬虫/src/main/resources/logback.xml
  24. BIN
      project/资讯爬虫/target/classes/com/newscrawler/Main.class
  25. BIN
      project/资讯爬虫/target/classes/com/newscrawler/command/MenuCommand.class
  26. BIN
      project/资讯爬虫/target/classes/com/newscrawler/entity/Article.class
  27. BIN
      project/资讯爬虫/target/classes/com/newscrawler/entity/CrawlHistory.class
  28. BIN
      project/资讯爬虫/target/classes/com/newscrawler/exception/CrawlerException.class
  29. BIN
      project/资讯爬虫/target/classes/com/newscrawler/exception/NetworkException.class
  30. BIN
      project/资讯爬虫/target/classes/com/newscrawler/exception/ParseException.class
  31. BIN
      project/资讯爬虫/target/classes/com/newscrawler/repository/ArticleRepository.class
  32. BIN
      project/资讯爬虫/target/classes/com/newscrawler/service/CrawlerService.class
  33. BIN
      project/资讯爬虫/target/classes/com/newscrawler/strategy/AbstractBaseStrategy.class
  34. BIN
      project/资讯爬虫/target/classes/com/newscrawler/strategy/CrawlStrategy.class
  35. BIN
      project/资讯爬虫/target/classes/com/newscrawler/strategy/RunoobStrategy.class
  36. BIN
      project/资讯爬虫/target/classes/com/newscrawler/strategy/SohuStrategy.class
  37. BIN
      project/资讯爬虫/target/classes/com/newscrawler/strategy/StrategyFactory.class
  38. BIN
      project/资讯爬虫/target/classes/com/newscrawler/strategy/YouthStrategy.class
  39. BIN
      project/资讯爬虫/target/classes/com/newscrawler/util/JsonUtil$1.class
  40. BIN
      project/资讯爬虫/target/classes/com/newscrawler/util/JsonUtil.class
  41. 37
      project/资讯爬虫/target/classes/logback.xml
  42. BIN
      project/资讯爬虫/target/dependency/gson-2.10.1.jar
  43. BIN
      project/资讯爬虫/target/dependency/jsoup-1.17.2.jar
  44. BIN
      project/资讯爬虫/target/dependency/logback-classic-1.4.14.jar
  45. BIN
      project/资讯爬虫/target/dependency/logback-core-1.4.14.jar
  46. BIN
      project/资讯爬虫/target/dependency/slf4j-api-2.0.11.jar
  47. 17
      project/资讯爬虫/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst
  48. 16
      project/资讯爬虫/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst

BIN
project/202506050301-宋瑞-期末实验报告.docx

Binary file not shown.

146
project/资讯爬虫/data/知乎1.json

@ -0,0 +1,146 @@
[
{
"id": "6859ecc9-c992-4e93-93e6-87ddc6e1a6be",
"title": "浏览内容",
"articleUrl": "#section_head",
"author": "知乎日报",
"source": "知乎日报",
"crawledAt": "2026-05-30T14:49:20.886753"
},
{
"id": "5dbcafb6-8a83-4052-aed1-850e72265f91",
"title": "App 下载",
"articleUrl": "http://www.wandoujia.com/apps/com.zhihu.daily.android",
"author": "知乎日报",
"source": "知乎日报",
"crawledAt": "2026-05-30T14:49:20.886753"
},
{
"id": "06b7c5b6-c4ce-4281-9e7d-6e312858307d",
"title": "知乎日报",
"articleUrl": "http://daily.zhihu.com/",
"author": "知乎日报",
"source": "知乎日报",
"crawledAt": "2026-05-30T14:49:20.886753"
},
{
"id": "e4880198-329b-47c7-bf7b-78052aa2bf8b",
"title": "iOS 版",
"articleUrl": "https://itunes.apple.com/cn/app/id639087967?mt\u003d8",
"author": "知乎日报",
"source": "知乎日报",
"crawledAt": "2026-05-30T14:49:20.886753"
},
{
"id": "e17a296b-5f65-4bfb-b12f-9fd4fe1a07d7",
"title": "文学创作会不会受到 AI 的冲击?",
"articleUrl": "https://daily.zhihu.com/story/9790086",
"author": "知乎日报",
"source": "知乎日报",
"crawledAt": "2026-05-30T14:49:20.886753"
},
{
"id": "12676469-494d-473a-89c8-01f140a57188",
"title": "为什么说西西弗斯面对巨石,不断推上山是一种超越和蔑视?",
"articleUrl": "https://daily.zhihu.com/story/9790101",
"author": "知乎日报",
"source": "知乎日报",
"crawledAt": "2026-05-30T14:49:20.886753"
},
{
"id": "625a5f08-3ce9-487f-b367-5be0cdb7500e",
"title": "有哪些看起来很高端的技术其实原理很暴力很初级?",
"articleUrl": "https://daily.zhihu.com/story/9790092",
"author": "知乎日报",
"source": "知乎日报",
"crawledAt": "2026-05-30T14:49:20.886753"
},
{
"id": "d4aa8adc-e4af-4194-bef1-bdf233742f86",
"title": "中国古代官方不重视理工科吗,如果是,为什么?",
"articleUrl": "https://daily.zhihu.com/story/9790090",
"author": "知乎日报",
"source": "知乎日报",
"crawledAt": "2026-05-30T14:49:20.886753"
},
{
"id": "e36a73d4-c0b8-4674-b4d5-d1d9de5cd894",
"title": "为什么人类不能自身合成维生素C?",
"articleUrl": "https://daily.zhihu.com/story/9790062",
"author": "知乎日报",
"source": "知乎日报",
"crawledAt": "2026-05-30T14:49:20.886753"
},
{
"id": "e7adc448-4570-45e6-a6e7-46b1397f0677",
"title": "林黛玉被妙玉嫌弃太俗,却不敢反驳,她怼贾宝玉的劲儿哪去了?",
"articleUrl": "https://daily.zhihu.com/story/9790081",
"author": "知乎日报",
"source": "知乎日报",
"crawledAt": "2026-05-30T14:49:20.886753"
},
{
"id": "37468176-caca-47f2-b422-b467a644e0ff",
"title": "魏博没有山川险阻,靠什么屹立150年。甚至长期成为最强藩?",
"articleUrl": "https://daily.zhihu.com/story/9790071",
"author": "知乎日报",
"source": "知乎日报",
"crawledAt": "2026-05-30T14:49:20.886753"
},
{
"id": "bdf153fc-5ece-4ea3-85bb-de0a2b49f5ee",
"title": "瞎扯 · 如何正确地吐槽",
"articleUrl": "https://daily.zhihu.com/story/9790084",
"author": "知乎日报",
"source": "知乎日报",
"crawledAt": "2026-05-30T14:49:20.886753"
},
{
"id": "e9cec0a7-82c2-43e5-a529-92d3979d4b24",
"title": "为什么松鼠的动作总是一顿一顿的?像卡帧一样?",
"articleUrl": "https://daily.zhihu.com/story/9790034",
"author": "知乎日报",
"source": "知乎日报",
"crawledAt": "2026-05-30T14:49:20.886753"
},
{
"id": "288cd539-af4b-4bf1-81ee-6fe5a9936fae",
"title": "既有 F\u003dma,又有F\u003dkx,那么物理公式到底要求等号左边是因还是果?",
"articleUrl": "https://daily.zhihu.com/story/9790046",
"author": "知乎日报",
"source": "知乎日报",
"crawledAt": "2026-05-30T14:49:20.886753"
},
{
"id": "c595c7e0-e768-4f5e-a0ed-9982466bf761",
"title": "西安唐代城门恢复为何不学洛阳,丹凤门像土黄色纸壳子,明德门像塑料玩具?",
"articleUrl": "https://daily.zhihu.com/story/9790039",
"author": "知乎日报",
"source": "知乎日报",
"crawledAt": "2026-05-30T14:49:20.886753"
},
{
"id": "3bc7435b-2ac8-4979-bd8c-20da0b4f7a3d",
"title": "游牧民族几乎没有碳水来源,为什么没有营养不良?",
"articleUrl": "https://daily.zhihu.com/story/9790022",
"author": "知乎日报",
"source": "知乎日报",
"crawledAt": "2026-05-30T14:49:20.886753"
},
{
"id": "c5ebc7af-5117-46a7-9107-1f28ed91d0c1",
"title": "为什么压力单位这么混乱?",
"articleUrl": "https://daily.zhihu.com/story/9790028",
"author": "知乎日报",
"source": "知乎日报",
"crawledAt": "2026-05-30T14:49:20.886753"
},
{
"id": "867ebe9e-0016-4e29-a407-319ef50aa51c",
"title": "为什么会有好奇害死猫这个说法?",
"articleUrl": "https://daily.zhihu.com/story/9790027",
"author": "知乎日报",
"source": "知乎日报",
"crawledAt": "2026-05-30T14:49:20.886753"
}
]

1255
project/资讯爬虫/data/菜鸟1.json

File diff suppressed because it is too large

1986
project/资讯爬虫/logs/crawler-2026-05-29.log

File diff suppressed because it is too large

1171
project/资讯爬虫/logs/crawler.log

File diff suppressed because it is too large

73
project/资讯爬虫/pom.xml

@ -0,0 +1,73 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the news-crawler console application. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Project coordinates; the build produces a plain jar. -->
<groupId>com.newscrawler</groupId>
<artifactId>news-crawler</artifactId>
<version>1.0.0</version>
<packaging>jar</packaging>
<name>News Crawler</name>
<description>增强版Java资讯爬虫</description>
<!-- Java level, source encoding and the dependency versions referenced below. -->
<properties>
<java.version>17</java.version>
<maven.compiler.source>17</maven.compiler.source>
<maven.compiler.target>17</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<jsoup.version>1.17.2</jsoup.version>
<gson.version>2.10.1</gson.version>
<logback.version>1.4.14</logback.version>
<slf4j.version>2.0.11</slf4j.version>
</properties>
<dependencies>
<!-- HTML fetching and parsing. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>${jsoup.version}</version>
</dependency>
<!-- JSON serialization (presumably used by JsonUtil for import/export; confirm). -->
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>${gson.version}</version>
</dependency>
<!-- Logging backend. -->
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>${logback.version}</version>
</dependency>
<!-- Logging API used by the application classes. -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>${slf4j.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Compiles with the Java level declared in the java.version property. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.11.0</version>
<configuration>
<source>${java.version}</source>
<target>${java.version}</target>
</configuration>
</plugin>
<!-- Sets com.newscrawler.Main as the main class for the exec plugin. -->
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>3.1.0</version>
<configuration>
<mainClass>com.newscrawler.Main</mainClass>
</configuration>
</plugin>
</plugins>
</build>
</project>

26
project/资讯爬虫/src/main/java/com/newscrawler/Main.java

@ -0,0 +1,26 @@
package com.newscrawler;
import com.newscrawler.command.MenuCommand;
import com.newscrawler.service.CrawlerService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Application entry point: wires a {@link CrawlerService} into the console
 * {@link MenuCommand} and runs the interactive menu until the user quits.
 */
public class Main {

    private static final Logger logger = LoggerFactory.getLogger(Main.class);

    /**
     * Starts the interactive crawler.
     *
     * <p>Any exception escaping the menu is logged, reported on stderr and
     * turned into process exit status 1.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        logger.info("资讯爬虫启动");
        try {
            // The service is created first, then handed to the menu that drives it.
            new MenuCommand(new CrawlerService()).start();
        } catch (Exception failure) {
            logger.error("程序执行异常", failure);
            System.err.println("程序执行失败: " + failure.getMessage());
            System.exit(1);
        }
        logger.info("资讯爬虫关闭");
    }
}

361
project/资讯爬虫/src/main/java/com/newscrawler/command/MenuCommand.java

@ -0,0 +1,361 @@
package com.newscrawler.command;
import com.newscrawler.entity.Article;
import com.newscrawler.entity.CrawlHistory;
import com.newscrawler.exception.CrawlerException;
import com.newscrawler.service.CrawlerService;
import com.newscrawler.util.JsonUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Scanner;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * Interactive console menu for the crawler.
 *
 * <p>Reads menu choices from standard input and delegates the actual work
 * (crawling, listing, statistics, JSON import/export, deletion) to a
 * {@link CrawlerService}.
 */
public class MenuCommand {

    private static final Logger logger = LoggerFactory.getLogger(MenuCommand.class);

    /** Directory that is scanned for exported JSON files. */
    private static final String DATA_DIR = "data";

    /** File-name suffix of exported files. */
    private static final String JSON_SUFFIX = ".json";

    private final CrawlerService crawlerService;
    private final Scanner scanner;

    /**
     * Creates a menu bound to the given service and to {@code System.in}.
     *
     * @param crawlerService service that performs every menu action
     */
    public MenuCommand(CrawlerService crawlerService) {
        this.crawlerService = crawlerService;
        this.scanner = new Scanner(System.in);
    }

    /**
     * Runs the main menu loop until the user selects "0" or standard input
     * is exhausted.
     */
    public void start() {
        boolean running = true;
        while (running) {
            showMainMenu();
            // Closed or fully consumed stdin (e.g. piped input, Ctrl-D) would make
            // nextLine() throw NoSuchElementException; treat it as a request to quit.
            if (!scanner.hasNextLine()) {
                System.out.println();
                break;
            }
            String input = scanner.nextLine().trim();
            running = handleMainMenu(input);
        }
        System.out.println("感谢使用资讯爬虫,再见!");
    }

    /**
     * Reads one trimmed line for a sub-menu prompt.
     *
     * @return the trimmed line, or an empty string when no more input is available
     */
    private String readLine() {
        return scanner.hasNextLine() ? scanner.nextLine().trim() : "";
    }

    /** Prints the main menu and the input prompt. */
    private void showMainMenu() {
        System.out.println("\n========== 请选择你要执行的操作 ==========");
        System.out.println("1. 爬取菜鸟教程资讯");
        System.out.println("2. 爬取知乎日报");
        System.out.println("3. 爬取搜狐资讯");
        System.out.println("4. 批量爬取全部站点");
        System.out.println("5. 查看全部资讯");
        System.out.println("6. 数据统计");
        System.out.println("7. 爬取历史");
        System.out.println("8. 数据导入/导出");
        System.out.println("9. 数据管理(删除/清空)");
        System.out.println("0. 退出程序");
        System.out.print("请输入选项: ");
    }

    /**
     * Dispatches one main-menu choice.
     *
     * <p>Any exception raised by the selected action is reported to the user
     * and logged, and the menu keeps running.
     *
     * @param input trimmed user input
     * @return {@code false} when the user chose to exit, {@code true} otherwise
     */
    private boolean handleMainMenu(String input) {
        try {
            switch (input) {
                case "1" -> crawlSite("runoob");
                // NOTE(review): the menu label for option 2 is "知乎日报" but the
                // strategy key is "youth" - confirm against StrategyFactory.
                case "2" -> crawlSite("youth");
                case "3" -> crawlSite("sohu");
                case "4" -> crawlAllSites();
                case "5" -> viewAllArticles();
                case "6" -> showStatistics();
                case "7" -> showCrawlHistory();
                case "8" -> handleImportExport();
                case "9" -> manageData();
                case "0" -> {
                    return false;
                }
                default -> System.out.println("无效选项,请输入0-9之间的数字");
            }
        } catch (Exception e) {
            // Menu boundary: one failed action must not terminate the program.
            System.out.println("操作出错: " + e.getMessage());
            logger.error("菜单操作异常", e);
        }
        return true;
    }

    /**
     * Crawls a single site and prints how many articles were returned.
     *
     * @param siteKey strategy key passed to the service
     * @throws CrawlerException if the crawl fails
     */
    private void crawlSite(String siteKey) throws CrawlerException {
        System.out.println("正在爬取,请稍候...");
        List<Article> articles = crawlerService.crawlSingleSite(siteKey);
        System.out.println("爬取完成!获取到 " + articles.size() + " 篇文章");
    }

    /**
     * Crawls every registered site and prints the total article count.
     *
     * @throws CrawlerException if the service reports a failure
     */
    private void crawlAllSites() throws CrawlerException {
        System.out.println("正在批量爬取所有站点,请稍候...");
        List<Article> articles = crawlerService.crawlAllSites();
        System.out.println("批量爬取完成!共获取到 " + articles.size() + " 篇文章");
    }

    /** Lists every stored article, or a hint when nothing is stored yet. */
    private void viewAllArticles() {
        List<Article> articles = crawlerService.getAllArticles();
        if (articles.isEmpty()) {
            System.out.println("暂无资讯,请先爬取数据");
            return;
        }
        printArticles(articles);
    }

    /** Prints the per-source article counts supplied by the service. */
    private void showStatistics() {
        Map<String, Long> stats = crawlerService.getStatistics();
        System.out.println("\n=== 数据统计 ===");
        stats.forEach((source, count) -> System.out.println(source + ": " + count + " 条"));
    }

    /** Prints the recorded crawl attempts, one per line. */
    private void showCrawlHistory() {
        List<CrawlHistory> histories = crawlerService.getCrawlHistories();
        if (histories.isEmpty()) {
            System.out.println("暂无爬取历史");
            return;
        }
        System.out.println("\n=== 爬取历史 ===");
        histories.forEach(h -> System.out.println(h.toString()));
    }

    /** Shows the import/export sub-menu and runs the chosen action. */
    private void handleImportExport() {
        System.out.println("\n=== 数据导入/导出 ===");
        System.out.println("1. 导出数据到JSON");
        System.out.println("2. 从JSON导入数据");
        System.out.print("选择: ");
        String choice = readLine();
        switch (choice) {
            case "1" -> exportToJson();
            case "2" -> importFromJson();
            default -> System.out.println("无效选择");
        }
    }

    /**
     * Exports all stored articles to a numbered JSON file whose name prefix is
     * derived from the set of sources present in the data.
     */
    private void exportToJson() {
        List<Article> articles = crawlerService.getAllArticles();
        if (articles.isEmpty()) {
            System.out.println("暂无数据可导出,请先爬取数据");
            return;
        }
        Set<String> sources = articles.stream()
                .map(Article::getSource)
                .filter(Objects::nonNull)
                .collect(Collectors.toSet());
        String prefix;
        if (sources.size() > 1) {
            prefix = "全";
        } else if (sources.contains("菜鸟教程资讯")) {
            prefix = "菜鸟";
        } else if (sources.contains("知乎日报")) {
            prefix = "知乎";
        } else if (sources.contains("搜狐资讯")) {
            prefix = "搜狐";
        } else {
            prefix = "articles";
        }
        String filename = generateNumberedFilename(prefix);
        try {
            JsonUtil.exportToJson(articles, filename);
            System.out.println("导出成功: " + filename);
        } catch (Exception e) {
            System.out.println("导出失败: " + e.getMessage());
            logger.error("导出JSON失败", e);
        }
    }

    /**
     * Builds the next free file name of the form {@code <prefix><n>.json} by
     * looking at the files already present in the data directory.
     *
     * @param prefix file-name prefix
     * @return {@code prefix + (highest existing number + 1) + ".json"}, or
     *         {@code prefix + "1.json"} when no numbered file exists yet
     */
    private String generateNumberedFilename(String prefix) {
        java.io.File dir = new java.io.File(DATA_DIR);
        if (!dir.exists()) {
            return prefix + "1" + JSON_SUFFIX;
        }
        java.io.File[] files =
                dir.listFiles((d, name) -> name.startsWith(prefix) && name.endsWith(JSON_SUFFIX));
        if (files == null || files.length == 0) {
            return prefix + "1" + JSON_SUFFIX;
        }
        int maxNum = 0;
        for (java.io.File file : files) {
            String name = file.getName();
            String numStr = name.substring(prefix.length(), name.length() - JSON_SUFFIX.length());
            try {
                maxNum = Math.max(maxNum, Integer.parseInt(numStr));
            } catch (NumberFormatException ignored) {
                // Files that share the prefix but carry no plain number do not
                // take part in the numbering.
            }
        }
        return prefix + (maxNum + 1) + JSON_SUFFIX;
    }

    /**
     * Lets the user pick one JSON file from the data directory and imports its
     * articles into the repository. The reported count is the number of
     * articles actually stored, i.e. after duplicates have been skipped.
     */
    private void importFromJson() {
        java.io.File dir = new java.io.File(DATA_DIR);
        if (!dir.exists()) {
            System.out.println("当前目录下没有JSON文件,请先导出数据");
            return;
        }
        java.io.File[] files = dir.listFiles((d, name) -> name.endsWith(JSON_SUFFIX));
        if (files == null || files.length == 0) {
            System.out.println("当前目录下没有JSON文件,请先导出数据");
            return;
        }
        System.out.println("\n当前目录下的JSON文件:");
        for (int i = 0; i < files.length; i++) {
            System.out.println((i + 1) + ". " + files[i].getName());
        }
        System.out.print("\n请输入要导入的文件编号,或输入0返回: ");
        String input = readLine();
        if ("0".equals(input)) {
            return;
        }
        if (input.isEmpty()) {
            System.out.println("输入不能为空");
            return;
        }
        int index;
        try {
            index = Integer.parseInt(input) - 1;
        } catch (NumberFormatException e) {
            System.out.println("请输入有效的数字编号");
            return;
        }
        if (index < 0 || index >= files.length) {
            System.out.println("编号超出范围");
            return;
        }
        String filename = files[index].getName();
        try {
            List<Article> articles = JsonUtil.importFromJson(filename);
            // The repository skips duplicates silently, so the size difference is
            // the only reliable measure of how many articles were really imported.
            int before = crawlerService.getRepository().size();
            crawlerService.getRepository().addAll(articles);
            int added = crawlerService.getRepository().size() - before;
            System.out.println("导入成功: " + added + " 篇文章(已自动去重)");
        } catch (Exception e) {
            System.out.println("导入失败: " + e.getMessage());
            logger.error("导入JSON失败", e);
        }
    }

    /** Shows the data-management sub-menu and runs the chosen action. */
    private void manageData() {
        System.out.println("\n=== 数据管理 ===");
        System.out.println("1. 删除单条资讯");
        System.out.println("2. 清空所有资讯");
        System.out.print("选择: ");
        String choice = readLine();
        switch (choice) {
            case "1" -> deleteSingleArticle();
            case "2" -> clearAllArticles();
            default -> System.out.println("无效选择");
        }
    }

    /** Lists the stored articles and deletes the one whose number the user enters. */
    private void deleteSingleArticle() {
        List<Article> articles = crawlerService.getAllArticles();
        if (articles.isEmpty()) {
            System.out.println("暂无资讯可删除");
            return;
        }
        System.out.println("\n=== 当前数据库中的资讯 ===");
        for (int i = 0; i < articles.size(); i++) {
            Article article = articles.get(i);
            System.out.printf("[%d] %s%n", i + 1, article.getTitle());
            System.out.println(" 来源: " + article.getSource());
            if (article.getPublishDate() != null && !article.getPublishDate().isEmpty()) {
                System.out.println(" 时间: " + article.getPublishDate());
            }
            System.out.println("-".repeat(60));
        }
        System.out.print("\n请输入要删除的文章编号(输入0取消): ");
        String input = readLine();
        if ("0".equals(input)) {
            System.out.println("已取消操作");
            return;
        }
        int index;
        try {
            index = Integer.parseInt(input) - 1;
        } catch (NumberFormatException e) {
            System.out.println("请输入有效的数字");
            return;
        }
        if (index < 0 || index >= articles.size()) {
            System.out.println("编号超出范围");
            return;
        }
        Article articleToDelete = articles.get(index);
        if (crawlerService.removeArticle(articleToDelete.getId())) {
            System.out.println("删除成功: " + articleToDelete.getTitle());
        } else {
            System.out.println("删除失败");
        }
    }

    /** Clears every stored article after an explicit "y" confirmation. */
    private void clearAllArticles() {
        System.out.print("确定要清空所有数据吗?(y/n): ");
        String confirm = readLine();
        if ("y".equalsIgnoreCase(confirm)) {
            crawlerService.clearAllArticles();
            System.out.println("已清空所有数据");
        } else {
            System.out.println("已取消操作");
        }
    }

    /**
     * Prints a numbered, human-readable listing of the given articles.
     * Optional fields are printed only when non-empty; summaries longer than
     * 50 characters are truncated.
     *
     * @param articles articles to print
     */
    private void printArticles(List<Article> articles) {
        System.out.println("-".repeat(80));
        for (int i = 0; i < articles.size(); i++) {
            Article article = articles.get(i);
            System.out.printf("[%d] %s%n", i + 1, article.getTitle());
            System.out.println(" 来源: " + article.getSource());
            if (article.getAuthor() != null && !article.getAuthor().isEmpty()) {
                System.out.println(" 作者: " + article.getAuthor());
            }
            if (article.getPublishDate() != null && !article.getPublishDate().isEmpty()) {
                System.out.println(" 时间: " + article.getPublishDate());
            }
            if (article.getSummary() != null && !article.getSummary().isEmpty()) {
                String summary = article.getSummary();
                if (summary.length() > 50) {
                    summary = summary.substring(0, 50) + "...";
                }
                System.out.println(" 摘要: " + summary);
            }
            System.out.println(" ID: " + article.getId());
            System.out.println("-".repeat(80));
        }
    }
}

113
project/资讯爬虫/src/main/java/com/newscrawler/entity/Article.java

@ -0,0 +1,113 @@
package com.newscrawler.entity;
import java.util.Objects;
import java.util.UUID;
public class Article {
private String id;
private String title;
private String summary;
private String publishDate;
private String articleUrl;
private String author;
private String source;
private String crawledAt;
public Article() {
this.id = UUID.randomUUID().toString();
this.crawledAt = java.time.LocalDateTime.now().toString();
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getSummary() {
return summary;
}
public void setSummary(String summary) {
this.summary = summary;
}
public String getPublishDate() {
return publishDate;
}
public void setPublishDate(String publishDate) {
this.publishDate = publishDate;
}
public String getArticleUrl() {
return articleUrl;
}
public void setArticleUrl(String articleUrl) {
this.articleUrl = articleUrl;
}
public String getAuthor() {
return author;
}
public void setAuthor(String author) {
this.author = author;
}
public String getSource() {
return source;
}
public void setSource(String source) {
this.source = source;
}
public String getCrawledAt() {
return crawledAt;
}
public void setCrawledAt(String crawledAt) {
this.crawledAt = crawledAt;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Article article = (Article) o;
return Objects.equals(title, article.title) &&
Objects.equals(source, article.source) &&
Objects.equals(publishDate, article.publishDate);
}
@Override
public int hashCode() {
return Objects.hash(title, source, publishDate);
}
@Override
public String toString() {
return "Article{" +
"id='" + id + '\'' +
", title='" + title + '\'' +
", summary='" + summary + '\'' +
", publishDate='" + publishDate + '\'' +
", articleUrl='" + articleUrl + '\'' +
", author='" + author + '\'' +
", source='" + source + '\'' +
", crawledAt='" + crawledAt + '\'' +
'}';
}
}

98
project/资讯爬虫/src/main/java/com/newscrawler/entity/CrawlHistory.java

@ -0,0 +1,98 @@
package com.newscrawler.entity;
import java.time.LocalDateTime;
/**
 * Record of one crawl attempt against a site: when it ran, how many articles
 * it produced and whether it succeeded.
 */
public class CrawlHistory {

    private String id;
    private String siteName;
    private String siteUrl;
    private int articleCount;
    private LocalDateTime crawlTime;
    private boolean success;
    private String errorMessage;

    /** Creates a successful, empty record with a random id, stamped with the current local time. */
    public CrawlHistory() {
        this.success = true;
        this.crawlTime = LocalDateTime.now();
        this.id = java.util.UUID.randomUUID().toString();
    }

    /**
     * Creates a record for the given site with the defaults of {@link #CrawlHistory()}.
     *
     * @param siteName display name of the site
     * @param siteUrl  URL that was crawled
     */
    public CrawlHistory(String siteName, String siteUrl) {
        this();
        this.siteUrl = siteUrl;
        this.siteName = siteName;
    }

    /**
     * Creates a fully populated record.
     *
     * @param siteName     display name of the site
     * @param siteUrl      URL that was crawled
     * @param articleCount number of articles obtained
     * @param success      whether the crawl succeeded
     * @param errorMessage failure description
     */
    public CrawlHistory(String siteName, String siteUrl, int articleCount, boolean success, String errorMessage) {
        this();
        this.siteName = siteName;
        this.siteUrl = siteUrl;
        this.articleCount = articleCount;
        this.success = success;
        this.errorMessage = errorMessage;
    }

    /** @return the record id */
    public String getId() {
        return this.id;
    }

    /** @param id the record id */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the site name */
    public String getSiteName() {
        return this.siteName;
    }

    /** @param siteName the site name */
    public void setSiteName(String siteName) {
        this.siteName = siteName;
    }

    /** @return the site URL */
    public String getSiteUrl() {
        return this.siteUrl;
    }

    /** @param siteUrl the site URL */
    public void setSiteUrl(String siteUrl) {
        this.siteUrl = siteUrl;
    }

    /** @return the number of articles obtained */
    public int getArticleCount() {
        return this.articleCount;
    }

    /** @param articleCount the number of articles obtained */
    public void setArticleCount(int articleCount) {
        this.articleCount = articleCount;
    }

    /** @return the time of the crawl */
    public LocalDateTime getCrawlTime() {
        return this.crawlTime;
    }

    /** @param crawlTime the time of the crawl */
    public void setCrawlTime(LocalDateTime crawlTime) {
        this.crawlTime = crawlTime;
    }

    /** @return whether the crawl succeeded */
    public boolean isSuccess() {
        return this.success;
    }

    /** @param success whether the crawl succeeded */
    public void setSuccess(boolean success) {
        this.success = success;
    }

    /** @return the failure description */
    public String getErrorMessage() {
        return this.errorMessage;
    }

    /** @param errorMessage the failure description */
    public void setErrorMessage(String errorMessage) {
        this.errorMessage = errorMessage;
    }

    /**
     * Renders the record as a single line: timestamp (with the ISO "T"
     * replaced by a space), site name, outcome, article count and, for
     * failed crawls only, the error message.
     */
    @Override
    public String toString() {
        String timestamp = crawlTime.toString().replace("T", " ");
        String outcome;
        String detail;
        if (success) {
            outcome = "成功";
            detail = "";
        } else {
            outcome = "失败";
            detail = errorMessage;
        }
        return String.format("[%s] %s - %s - %d条 - %s", timestamp, siteName, outcome, articleCount, detail);
    }
}

21
project/资讯爬虫/src/main/java/com/newscrawler/exception/CrawlerException.java

@ -0,0 +1,21 @@
package com.newscrawler.exception;
public class CrawlerException extends Exception {
private static final long serialVersionUID = 1L;
public CrawlerException() {
super();
}
public CrawlerException(String message) {
super(message);
}
public CrawlerException(String message, Throwable cause) {
super(message, cause);
}
public CrawlerException(Throwable cause) {
super(cause);
}
}

21
project/资讯爬虫/src/main/java/com/newscrawler/exception/NetworkException.java

@ -0,0 +1,21 @@
package com.newscrawler.exception;
/**
 * {@link CrawlerException} subtype for network-level failures.
 */
public class NetworkException extends CrawlerException {

    private static final long serialVersionUID = 1L;

    /** Creates an exception with neither message nor cause. */
    public NetworkException() {
    }

    /**
     * Creates an exception that wraps a cause.
     *
     * @param cause underlying failure
     */
    public NetworkException(Throwable cause) {
        super(cause);
    }

    /**
     * Creates an exception with a message.
     *
     * @param message failure description
     */
    public NetworkException(String message) {
        super(message);
    }

    /**
     * Creates an exception with a message and a cause.
     *
     * @param message failure description
     * @param cause   underlying failure
     */
    public NetworkException(String message, Throwable cause) {
        super(message, cause);
    }
}

21
project/资讯爬虫/src/main/java/com/newscrawler/exception/ParseException.java

@ -0,0 +1,21 @@
package com.newscrawler.exception;
/**
 * {@link CrawlerException} subtype declared by the crawl strategies for
 * failures while obtaining or parsing a page.
 */
public class ParseException extends CrawlerException {

    private static final long serialVersionUID = 1L;

    /** Creates an exception with neither message nor cause. */
    public ParseException() {
    }

    /**
     * Creates an exception that wraps a cause.
     *
     * @param cause underlying failure
     */
    public ParseException(Throwable cause) {
        super(cause);
    }

    /**
     * Creates an exception with a message.
     *
     * @param message failure description
     */
    public ParseException(String message) {
        super(message);
    }

    /**
     * Creates an exception with a message and a cause.
     *
     * @param message failure description
     * @param cause   underlying failure
     */
    public ParseException(String message, Throwable cause) {
        super(message, cause);
    }
}

126
project/资讯爬虫/src/main/java/com/newscrawler/repository/ArticleRepository.java

@ -0,0 +1,126 @@
package com.newscrawler.repository;
import com.newscrawler.entity.Article;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.stream.Collectors;
/**
 * In-memory store of {@link Article}s backed by a list.
 *
 * <p>Duplicates, as defined by {@link Article#equals(Object)}, are never
 * stored twice. The class performs no synchronization.
 */
public class ArticleRepository {

    private static final Logger logger = LoggerFactory.getLogger(ArticleRepository.class);

    private final List<Article> articles;

    /** Creates an empty repository. */
    public ArticleRepository() {
        this.articles = new ArrayList<>();
    }

    /**
     * Stores an article unless an equal one is already present.
     *
     * @param article article to store
     * @throws IllegalArgumentException if the article is {@code null} or its
     *         title is {@code null} or blank
     */
    public void add(Article article) {
        if (article == null) {
            logger.warn("尝试添加空文章到仓库");
            throw new IllegalArgumentException("文章不能为空");
        }
        if (article.getTitle() == null || article.getTitle().trim().isEmpty()) {
            logger.warn("尝试添加标题为空的文章");
            throw new IllegalArgumentException("文章标题不能为空");
        }
        if (!exists(article)) {
            articles.add(article);
            logger.debug("添加文章: {}", article.getTitle());
        } else {
            logger.debug("文章已存在,跳过: {}", article.getTitle());
        }
    }

    /**
     * Stores every valid, not yet present article of the collection. Invalid
     * entries are logged and skipped.
     *
     * @param articlesToAdd articles to store
     * @throws IllegalArgumentException if the collection itself is {@code null}
     */
    public void addAll(Collection<Article> articlesToAdd) {
        if (articlesToAdd == null) {
            logger.warn("尝试添加空集合到仓库");
            throw new IllegalArgumentException("文章集合不能为空");
        }
        // add() skips duplicates without signalling it, so the number of articles
        // really stored is measured as the growth of the backing list.
        int sizeBefore = articles.size();
        for (Article article : articlesToAdd) {
            try {
                add(article);
            } catch (IllegalArgumentException e) {
                logger.warn("跳过无效文章: {}", e.getMessage());
            }
        }
        logger.info("批量添加完成,成功添加{}篇文章", articles.size() - sizeBefore);
    }

    /**
     * Removes the first stored article equal to the given one.
     *
     * @param article article to remove
     * @return {@code true} if an article was removed
     */
    public boolean remove(Article article) {
        if (article == null) {
            logger.warn("尝试删除空文章");
            return false;
        }
        boolean removed = articles.remove(article);
        if (removed) {
            logger.debug("删除文章: {}", article.getTitle());
        }
        return removed;
    }

    /**
     * Removes every article with the given id.
     *
     * @param id id to look for
     * @return {@code true} if at least one article was removed; {@code false}
     *         also when the id is {@code null} or blank
     */
    public boolean removeById(String id) {
        if (id == null || id.trim().isEmpty()) {
            logger.warn("尝试用空ID删除文章");
            return false;
        }
        // Compare from the non-null argument so stored articles whose id was
        // set to null cannot cause a NullPointerException.
        boolean removed = articles.removeIf(a -> id.equals(a.getId()));
        if (removed) {
            logger.debug("通过ID删除文章: {}", id);
        }
        return removed;
    }

    /** Removes all articles. */
    public void clear() {
        int size = articles.size();
        articles.clear();
        logger.info("清空仓库,删除了{}篇文章", size);
    }

    /**
     * @return a new list containing all stored articles; changes to it do not
     *         affect the repository
     */
    public List<Article> getAll() {
        return new ArrayList<>(articles);
    }

    /**
     * Finds articles whose source contains the given text.
     *
     * @param source text to search for in the source name
     * @return matching articles; empty when the argument is {@code null} or blank
     */
    public List<Article> findBySource(String source) {
        if (source == null || source.trim().isEmpty()) {
            logger.warn("使用空来源查询");
            return new ArrayList<>();
        }
        return articles.stream()
                .filter(a -> a.getSource() != null && a.getSource().contains(source))
                .collect(Collectors.toList());
    }

    /**
     * @param article article to look for
     * @return {@code true} if an equal article is stored; {@code false} for {@code null}
     */
    public boolean exists(Article article) {
        if (article == null) {
            return false;
        }
        return articles.contains(article);
    }

    /** @return the number of stored articles */
    public int size() {
        return articles.size();
    }

    /** @return {@code true} if nothing is stored */
    public boolean isEmpty() {
        return articles.isEmpty();
    }

    /**
     * @param source text to search for in the source name
     * @return number of articles matched by {@link #findBySource(String)}
     */
    public long countBySource(String source) {
        return findBySource(source).size();
    }

    /**
     * Finds the first article with the given id.
     *
     * @param id id to look for
     * @return the article, or {@code null} when none matches or the id is
     *         {@code null} or blank
     */
    public Article findById(String id) {
        if (id == null || id.trim().isEmpty()) {
            return null;
        }
        return articles.stream()
                .filter(a -> id.equals(a.getId()))
                .findFirst()
                .orElse(null);
    }
}

113
project/资讯爬虫/src/main/java/com/newscrawler/service/CrawlerService.java

@ -0,0 +1,113 @@
package com.newscrawler.service;
import com.newscrawler.entity.Article;
import com.newscrawler.entity.CrawlHistory;
import com.newscrawler.exception.CrawlerException;
import com.newscrawler.exception.ParseException;
import com.newscrawler.repository.ArticleRepository;
import com.newscrawler.strategy.CrawlStrategy;
import com.newscrawler.strategy.StrategyFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * Coordinates crawling: runs {@link CrawlStrategy} instances, stores the
 * resulting articles in an {@link ArticleRepository} and keeps an in-memory
 * history of every crawl attempt.
 */
public class CrawlerService {

    private static final Logger logger = LoggerFactory.getLogger(CrawlerService.class);

    /** Statistics label used for articles that carry no source. */
    private static final String UNKNOWN_SOURCE = "未知来源";

    private final ArticleRepository repository;
    private final List<CrawlHistory> crawlHistories;

    /** Creates a service with an empty repository and an empty history. */
    public CrawlerService() {
        this.repository = new ArticleRepository();
        this.crawlHistories = new ArrayList<>();
    }

    /**
     * Crawls the site registered under the given key.
     *
     * @param siteKey key understood by {@link StrategyFactory#getStrategy}
     * @return the articles returned by the strategy
     * @throws CrawlerException if the crawl fails
     */
    public List<Article> crawlSingleSite(String siteKey) throws CrawlerException {
        logger.info("开始爬取单个站点: {}", siteKey);
        CrawlStrategy strategy = StrategyFactory.getStrategy(siteKey);
        return executeCrawl(strategy);
    }

    /**
     * Crawls every strategy known to the factory. A failing site is logged
     * and skipped so that the remaining sites are still crawled.
     *
     * @return all articles returned by the sites that succeeded
     * @throws CrawlerException declared for callers; per-site failures are
     *         handled inside the loop
     */
    public List<Article> crawlAllSites() throws CrawlerException {
        logger.info("开始批量爬取所有站点");
        List<Article> allArticles = new ArrayList<>();
        Map<String, CrawlStrategy> strategies = StrategyFactory.getAllStrategies();
        for (Map.Entry<String, CrawlStrategy> entry : strategies.entrySet()) {
            try {
                List<Article> articles = executeCrawl(entry.getValue());
                allArticles.addAll(articles);
            } catch (CrawlerException e) {
                logger.error("爬取站点{}失败: {}", entry.getKey(), e.getMessage());
            }
        }
        logger.info("批量爬取完成,共获取{}篇文章", allArticles.size());
        return allArticles;
    }

    /**
     * Runs one strategy, stores its articles and records the outcome in the
     * crawl history.
     *
     * @param strategy strategy to run
     * @return the articles returned by the strategy
     * @throws CrawlerException the {@link ParseException} raised by the
     *         strategy, rethrown after a failure entry has been recorded
     */
    private List<Article> executeCrawl(CrawlStrategy strategy) throws CrawlerException {
        String siteName = strategy.getSiteName();
        String siteUrl = strategy.getSiteUrl();
        List<Article> articles = new ArrayList<>();
        try {
            logger.info("开始爬取: {} - {}", siteName, siteUrl);
            articles = strategy.crawl();
            repository.addAll(articles);
            CrawlHistory history = new CrawlHistory(siteName, siteUrl, articles.size(), true, null);
            crawlHistories.add(history);
            logger.info("爬取{}成功,获取{}篇文章", siteName, articles.size());
        } catch (ParseException e) {
            CrawlHistory history = new CrawlHistory(siteName, siteUrl, 0, false, e.getMessage());
            crawlHistories.add(history);
            logger.error("爬取{}失败: {}", siteName, e.getMessage());
            throw e;
        }
        return articles;
    }

    /** @return the repository that holds the crawled articles */
    public ArticleRepository getRepository() {
        return repository;
    }

    /** @return a copy of the crawl history */
    public List<CrawlHistory> getCrawlHistories() {
        return new ArrayList<>(crawlHistories);
    }

    /** @return a copy of all stored articles */
    public List<Article> getAllArticles() {
        return repository.getAll();
    }

    /**
     * @param source text to search for in the source name
     * @return articles whose source contains the text
     */
    public List<Article> getArticlesBySource(String source) {
        return repository.findBySource(source);
    }

    /**
     * @param id id of the article to remove
     * @return {@code true} if an article was removed
     */
    public boolean removeArticle(String id) {
        return repository.removeById(id);
    }

    /** Removes every stored article. */
    public void clearAllArticles() {
        repository.clear();
        logger.info("已清空所有文章");
    }

    /**
     * Counts the stored articles per source.
     *
     * <p>Articles without a source are counted under a placeholder label
     * instead of failing the grouping, and the overall total is appended as
     * the last entry under the key "总计".
     *
     * @return a mutable map from source name to article count, in encounter
     *         order, followed by the total
     */
    public Map<String, Long> getStatistics() {
        // groupingBy rejects null keys and gives no guarantee that its default
        // result map is mutable, so the key is made null-safe and the map type
        // is chosen explicitly before the total is added.
        Map<String, Long> stats = repository.getAll().stream()
                .collect(Collectors.groupingBy(
                        CrawlerService::sourceLabel,
                        java.util.LinkedHashMap::new,
                        Collectors.counting()));
        stats.put("总计", (long) repository.size());
        return stats;
    }

    /** Returns the article's source, or the placeholder label when it has none. */
    private static String sourceLabel(Article article) {
        String source = article.getSource();
        return source == null ? UNKNOWN_SOURCE : source;
    }

    /** Removes every crawl-history entry. */
    public void clearHistory() {
        crawlHistories.clear();
        logger.info("已清空爬取历史");
    }
}

83
project/资讯爬虫/src/main/java/com/newscrawler/strategy/AbstractBaseStrategy.java

@ -0,0 +1,83 @@
package com.newscrawler.strategy;
import com.newscrawler.entity.Article;
import com.newscrawler.exception.NetworkException;
import com.newscrawler.exception.ParseException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Skeleton for site crawlers: fetches the site page with retries and hands
 * the HTML to the subclass for parsing.
 */
public abstract class AbstractBaseStrategy implements CrawlStrategy {

    protected static final Logger logger = LoggerFactory.getLogger(AbstractBaseStrategy.class);

    /** Maximum number of fetch attempts. */
    protected static final int MAX_RETRIES = 3;

    /** Base delay between attempts in milliseconds; multiplied by the attempt number. */
    protected static final int RETRY_DELAY_MS = 2000;

    /** Connection timeout in milliseconds. */
    protected static final int TIMEOUT_MS = 30000;

    /**
     * Fetches the site page and parses it into articles.
     *
     * @return the articles produced by {@link #parseHtml(String)}
     * @throws ParseException if the page cannot be fetched, is empty, or
     *         cannot be parsed
     */
    @Override
    public List<Article> crawl() throws ParseException {
        logger.info("开始爬取站点: {}", getSiteName());
        String page = fetchWithRetry();
        boolean hasContent = page != null && !page.isEmpty();
        if (!hasContent) {
            throw new ParseException("获取HTML内容为空");
        }
        return parseHtml(page);
    }

    /**
     * Downloads the page at {@link #getSiteUrl()}, retrying on I/O errors up
     * to {@link #MAX_RETRIES} times with a delay that grows with each attempt.
     *
     * @return the HTML of the fetched document
     * @throws ParseException if every attempt fails (the cause is a
     *         {@link NetworkException}) or the thread is interrupted while
     *         waiting between attempts
     */
    protected String fetchWithRetry() throws ParseException {
        Exception lastFailure = null;
        for (int attempt = 1; attempt <= MAX_RETRIES; attempt++) {
            try {
                logger.debug("第{}次尝试获取页面: {}", attempt, getSiteUrl());
                Document document = Jsoup.connect(getSiteUrl())
                        .timeout(TIMEOUT_MS)
                        .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
                        .get();
                return document.html();
            } catch (IOException ioFailure) {
                lastFailure = ioFailure;
                logger.warn("第{}次尝试失败: {}", attempt, ioFailure.getMessage());
                if (attempt == MAX_RETRIES) {
                    // No pause after the final attempt.
                    continue;
                }
                try {
                    Thread.sleep(RETRY_DELAY_MS * attempt);
                } catch (InterruptedException interrupted) {
                    // Restore the interrupt flag before giving up.
                    Thread.currentThread().interrupt();
                    throw new ParseException("爬取被中断", interrupted);
                }
            }
        }
        NetworkException exhausted =
                new NetworkException("网络请求失败,已重试" + MAX_RETRIES + "次", lastFailure);
        logger.error("网络请求最终失败: {}", exhausted.getMessage());
        throw new ParseException("获取页面失败", exhausted);
    }

    /**
     * Turns the fetched HTML into articles.
     *
     * @param html HTML of the site page
     * @return the parsed articles
     * @throws ParseException if the HTML cannot be parsed
     */
    protected abstract List<Article> parseHtml(String html) throws ParseException;

    /**
     * Builds one article per title, pairing it with the values found at the
     * same index of the other lists. A list that is shorter than the title
     * list contributes an empty string for the missing positions. Every
     * article gets {@link #getSiteName()} as its source.
     *
     * @param titles       article titles; determines the number of articles
     * @param summaries    summaries by index
     * @param publishDates publish dates by index
     * @param articleUrls  article URLs by index
     * @param authors      authors by index
     * @return the assembled articles
     */
    protected List<Article> createArticles(List<String> titles, List<String> summaries,
                                           List<String> publishDates, List<String> articleUrls,
                                           List<String> authors) {
        List<Article> result = new ArrayList<>();
        int total = titles.size();
        for (int index = 0; index < total; index++) {
            Article built = new Article();
            built.setTitle(valueAt(titles, index));
            built.setSummary(valueAt(summaries, index));
            built.setPublishDate(valueAt(publishDates, index));
            built.setArticleUrl(valueAt(articleUrls, index));
            built.setAuthor(valueAt(authors, index));
            built.setSource(getSiteName());
            result.add(built);
        }
        return result;
    }

    /** Returns the element at {@code index}, or an empty string when the list is too short. */
    private static String valueAt(List<String> values, int index) {
        if (index < values.size()) {
            return values.get(index);
        }
        return "";
    }
}

14
project/资讯爬虫/src/main/java/com/newscrawler/strategy/CrawlStrategy.java

@ -0,0 +1,14 @@
package com.newscrawler.strategy;
import com.newscrawler.entity.Article;
import com.newscrawler.exception.ParseException;
import java.util.List;
/**
 * Contract for a crawler that targets a single site.
 */
public interface CrawlStrategy {
/** Returns the display name of the site this strategy crawls. */
String getSiteName();
/** Returns the URL of the page this strategy fetches. */
String getSiteUrl();
/**
 * Crawls the site and returns the articles found.
 *
 * @return the crawled articles
 * @throws ParseException if the crawl cannot produce a result
 */
List<Article> crawl() throws ParseException;
}

93
project/资讯爬虫/src/main/java/com/newscrawler/strategy/RunoobStrategy.java

@ -0,0 +1,93 @@
package com.newscrawler.strategy;
import com.newscrawler.entity.Article;
import com.newscrawler.exception.ParseException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawl strategy for the Runoob home page.
 *
 * <p>Parsing tries three passes in order and stops at the first one that yields
 * anything: structured list items, links whose path looks like an article, and
 * finally any link with a plausible title. Article URLs are resolved against
 * {@link #SITE_URL} so relative links are stored in absolute form.
 */
public class RunoobStrategy extends AbstractBaseStrategy {
    private static final String SITE_NAME = "菜鸟教程资讯";
    private static final String SITE_URL = "https://www.runoob.com/";

    @Override
    public String getSiteName() {
        return SITE_NAME;
    }

    @Override
    public String getSiteUrl() {
        return SITE_URL;
    }

    /**
     * Extracts articles from the Runoob page HTML.
     *
     * @param html HTML of the page
     * @return the articles found by the first pass that produced results (possibly empty)
     * @throws ParseException if any unexpected error occurs while parsing
     */
    @Override
    protected List<Article> parseHtml(String html) throws ParseException {
        List<Article> articles;
        try {
            // The base URI lets jsoup turn relative hrefs into absolute URLs.
            org.jsoup.nodes.Document doc = Jsoup.parse(html, SITE_URL);
            articles = parseListItems(doc);
            if (articles.isEmpty()) {
                articles = parseArticleLinks(doc);
            }
            if (articles.isEmpty()) {
                articles = parseGenericLinks(doc);
            }
            logger.info("菜鸟教程解析到{}条资讯", articles.size());
        } catch (Exception e) {
            throw new ParseException("解析菜鸟教程页面失败", e);
        }
        return articles;
    }

    /** First pass: list-like containers carrying title, summary, date and link. */
    private static List<Article> parseListItems(org.jsoup.nodes.Document doc) {
        List<Article> articles = new ArrayList<>();
        Elements newsItems = doc.select(".article-list .item, .list-group-item, .news-item, .content li, article");
        for (Element item : newsItems) {
            String title = textOf(item.selectFirst("h3, h4, h2, .title, .news-title, a[href]"));
            if (title.isEmpty()) {
                continue;
            }
            Article article = new Article();
            article.setSource(SITE_NAME);
            article.setTitle(title);
            article.setSummary(textOf(item.selectFirst(".desc, .summary, .news-desc, p")));
            article.setPublishDate(textOf(item.selectFirst(".date, time, .time, span")));
            article.setArticleUrl(resolveUrl(item.selectFirst("a[href]")));
            articles.add(article);
        }
        return articles;
    }

    /** Second pass: links whose href contains an article-like path segment. */
    private static List<Article> parseArticleLinks(org.jsoup.nodes.Document doc) {
        List<Article> articles = new ArrayList<>();
        Elements links = doc.select("a[href*='/article/'], a[href*='/tutorial/'], a[href*='/course/']");
        for (Element link : links) {
            String title = link.text().trim();
            if (title.length() > 5 && !title.contains("首页") && !title.contains("教程") && !title.contains("学习")) {
                articles.add(linkArticle(title, link));
            }
        }
        return articles;
    }

    /** Last resort: any anchor with a title of plausible length and a real path. */
    private static List<Article> parseGenericLinks(org.jsoup.nodes.Document doc) {
        List<Article> articles = new ArrayList<>();
        for (Element link : doc.select("a")) {
            String title = link.text().trim();
            if (title.length() <= 5 || title.length() >= 100) {
                continue;
            }
            // The filter looks at the raw attribute; only the stored URL is resolved.
            String href = link.hasAttr("href") ? link.attr("href") : "";
            if (href.contains("/") && !href.startsWith("#") && !href.contains("javascript")) {
                articles.add(linkArticle(title, link));
            }
        }
        return articles;
    }

    /** Builds an article that only carries a title and the link target. */
    private static Article linkArticle(String title, Element link) {
        Article article = new Article();
        article.setSource(SITE_NAME);
        article.setTitle(title);
        article.setArticleUrl(resolveUrl(link));
        return article;
    }

    /** Returns the trimmed text of {@code element}, or {@code ""} when it is absent. */
    private static String textOf(Element element) {
        return element != null ? element.text().trim() : "";
    }

    /**
     * Returns the absolute form of the link's href, falling back to the raw attribute
     * when jsoup cannot resolve it, and {@code ""} when there is no href at all.
     */
    private static String resolveUrl(Element link) {
        if (link == null || !link.hasAttr("href")) {
            return "";
        }
        String absolute = link.absUrl("href");
        return absolute.isEmpty() ? link.attr("href") : absolute;
    }
}

90
project/资讯爬虫/src/main/java/com/newscrawler/strategy/SohuStrategy.java

@ -0,0 +1,90 @@
package com.newscrawler.strategy;
import com.newscrawler.entity.Article;
import com.newscrawler.exception.ParseException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawl strategy for the Sohu news home page.
 *
 * <p>The primary pass keeps anchors that point at an article path ({@code /a/}) and
 * have a headline-like title, capped at {@link #MAX_ARTICLES}. If that finds nothing,
 * a fallback pass reads the first anchor of known list containers.
 */
public class SohuStrategy extends AbstractBaseStrategy {
    private static final String SITE_NAME = "搜狐资讯";
    private static final String SITE_URL = "https://news.sohu.com/";
    /** Prefix used to make root-relative links absolute. */
    private static final String BASE_URL = "https://news.sohu.com";
    /** Upper bound on articles collected by the primary pass. */
    private static final int MAX_ARTICLES = 20;
    /** Titles containing any of these words are treated as navigation, not news. */
    private static final String[] NAV_KEYWORDS = {"登录", "注册", "评论", "分享", "更多", "首页"};

    @Override
    public String getSiteName() {
        return SITE_NAME;
    }

    @Override
    public String getSiteUrl() {
        return SITE_URL;
    }

    /**
     * Extracts articles from the Sohu page HTML.
     *
     * @param html HTML of the page
     * @return the articles found (possibly empty)
     * @throws ParseException if any unexpected error occurs while parsing
     */
    @Override
    protected List<Article> parseHtml(String html) throws ParseException {
        List<Article> articles = new ArrayList<>();
        try {
            org.jsoup.nodes.Document doc = Jsoup.parse(html);
            for (Element link : doc.select("a")) {
                String title = link.text().trim();
                String href = link.attr("href");
                if (looksLikeHeadline(title) && href.contains("/a/")) {
                    articles.add(buildArticle(title, href));
                    if (articles.size() >= MAX_ARTICLES) {
                        break;
                    }
                }
            }
            if (articles.isEmpty()) {
                Elements mainNews = doc.select(".focus-news-list li, .main-news li, .listCon li");
                for (Element item : mainNews) {
                    Element titleLink = item.selectFirst("a");
                    if (titleLink == null) {
                        continue;
                    }
                    String title = titleLink.text().trim();
                    if (title.length() > 5) {
                        articles.add(buildArticle(title, titleLink.attr("href")));
                    }
                }
            }
            logger.info("搜狐资讯解析到{}条资讯", articles.size());
        } catch (Exception e) {
            throw new ParseException("解析搜狐资讯页面失败", e);
        }
        return articles;
    }

    /** True when the title has headline length and contains no navigation keyword. */
    private static boolean looksLikeHeadline(String title) {
        if (title.length() <= 8 || title.length() >= 80) {
            return false;
        }
        for (String keyword : NAV_KEYWORDS) {
            if (title.contains(keyword)) {
                return false;
            }
        }
        return true;
    }

    /** Creates a Sohu article with the given title and a normalised URL. */
    private static Article buildArticle(String title, String href) {
        Article article = new Article();
        article.setSource(SITE_NAME);
        article.setTitle(title);
        article.setAuthor("搜狐");
        article.setArticleUrl(toAbsoluteUrl(href));
        return article;
    }

    /**
     * Makes protocol-relative ({@code //host/path}) and root-relative ({@code /path})
     * links absolute; anything else is returned unchanged.
     */
    private static String toAbsoluteUrl(String href) {
        // "//" must be tested first: it also starts with "/".
        if (href.startsWith("//")) {
            return "https:" + href;
        }
        if (href.startsWith("/")) {
            return BASE_URL + href;
        }
        return href;
    }
}

39
project/资讯爬虫/src/main/java/com/newscrawler/strategy/StrategyFactory.java

@ -0,0 +1,39 @@
package com.newscrawler.strategy;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
/**
 * Registry of the available {@link CrawlStrategy} instances, keyed by a short site key.
 *
 * <p>Strategies are kept in registration order, so {@link #getSiteKeys()} and
 * {@link #getAllStrategies()} list them in the same order on every run.
 */
public class StrategyFactory {
    private static final Map<String, CrawlStrategy> STRATEGY_MAP = new LinkedHashMap<>();

    static {
        STRATEGY_MAP.put("runoob", new RunoobStrategy());
        STRATEGY_MAP.put("youth", new YouthStrategy());
        STRATEGY_MAP.put("sohu", new SohuStrategy());
    }

    /**
     * Looks up a strategy by its site key, ignoring case and surrounding whitespace.
     *
     * @param siteKey key such as {@code "runoob"}
     * @return the registered strategy
     * @throws IllegalArgumentException if the key is {@code null} or not registered
     */
    public static CrawlStrategy getStrategy(String siteKey) {
        CrawlStrategy strategy = null;
        if (siteKey != null) {
            // Locale.ROOT keeps the lookup independent of the JVM's default locale.
            strategy = STRATEGY_MAP.get(siteKey.trim().toLowerCase(Locale.ROOT));
        }
        if (strategy == null) {
            throw new IllegalArgumentException("不支持的站点: " + siteKey);
        }
        return strategy;
    }

    /**
     * Finds the first registered strategy whose site name contains the given text.
     *
     * @param siteName full or partial site name
     * @return the first matching strategy in registration order
     * @throws IllegalArgumentException if the name is {@code null}, blank, or matches nothing
     */
    public static CrawlStrategy getStrategyBySiteName(String siteName) {
        // A blank fragment would match every site name, so it is rejected up front.
        if (siteName != null && !siteName.trim().isEmpty()) {
            for (CrawlStrategy candidate : STRATEGY_MAP.values()) {
                if (candidate.getSiteName().contains(siteName)) {
                    return candidate;
                }
            }
        }
        throw new IllegalArgumentException("未找到站点对应的策略: " + siteName);
    }

    /**
     * Returns a copy of the registry.
     *
     * @return a new map from site key to strategy, in registration order
     */
    public static Map<String, CrawlStrategy> getAllStrategies() {
        return new LinkedHashMap<>(STRATEGY_MAP);
    }

    /**
     * Returns the registered site keys.
     *
     * @return a new array of keys, in registration order
     */
    public static String[] getSiteKeys() {
        return STRATEGY_MAP.keySet().toArray(new String[0]);
    }
}

70
project/资讯爬虫/src/main/java/com/newscrawler/strategy/YouthStrategy.java

@ -0,0 +1,70 @@
package com.newscrawler.strategy;
import com.newscrawler.entity.Article;
import com.newscrawler.exception.ParseException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawl strategy for the Zhihu Daily home page (the class name is historical;
 * the configured site is {@code daily.zhihu.com}).
 *
 * <p>Every anchor with a title of plausible length and a navigable target becomes
 * an article; the result is capped at {@link #MAX_ARTICLES}.
 */
public class YouthStrategy extends AbstractBaseStrategy {
    private static final String SITE_NAME = "知乎日报";
    private static final String SITE_URL = "https://daily.zhihu.com/";
    /** Prefix used to make root-relative links absolute. */
    private static final String BASE_URL = "https://daily.zhihu.com";
    private static final int MIN_TITLE_LENGTH = 4;
    private static final int MAX_TITLE_LENGTH = 50;
    /** Upper bound on the number of articles returned. */
    private static final int MAX_ARTICLES = 20;
    private static final String SCRIPT_SCHEME = "javascript:";

    @Override
    public String getSiteName() {
        return SITE_NAME;
    }

    @Override
    public String getSiteUrl() {
        return SITE_URL;
    }

    /**
     * Extracts articles from the Zhihu Daily page HTML.
     *
     * @param html HTML of the page
     * @return at most {@link #MAX_ARTICLES} articles, as an independent mutable list
     * @throws ParseException if any unexpected error occurs while parsing
     */
    @Override
    protected List<Article> parseHtml(String html) throws ParseException {
        List<Article> articles = new ArrayList<>();
        try {
            logger.info("开始解析知乎日报页面,HTML长度: {}", html.length());
            org.jsoup.nodes.Document doc = Jsoup.parse(html);
            Elements allLinks = doc.select("a");
            logger.info("页面共有 {} 个链接", allLinks.size());
            for (Element link : allLinks) {
                String title = link.text().trim();
                String href = link.attr("href");
                if (title.length() < MIN_TITLE_LENGTH || title.length() > MAX_TITLE_LENGTH) {
                    continue;
                }
                // Anchors without a real target cannot be opened later, so skip them.
                if (!isNavigable(href)) {
                    continue;
                }
                Article article = new Article();
                article.setSource(SITE_NAME);
                article.setTitle(title);
                article.setAuthor("知乎日报");
                article.setArticleUrl(toAbsoluteUrl(href));
                articles.add(article);
            }
            logger.info("初步解析到{}条资讯", articles.size());
            if (articles.size() > MAX_ARTICLES) {
                // Copy instead of returning the subList view, which would keep the
                // whole backing list alive and stay tied to it.
                articles = new ArrayList<>(articles.subList(0, MAX_ARTICLES));
            }
            logger.info("知乎日报最终解析到{}条资讯", articles.size());
        } catch (Exception e) {
            logger.error("解析异常: {}", e.getMessage());
            throw new ParseException("解析知乎日报页面失败", e);
        }
        return articles;
    }

    /** True unless the href is empty, a bare fragment, or a {@code javascript:} pseudo-link. */
    private static boolean isNavigable(String href) {
        if (href.isEmpty() || href.startsWith("#")) {
            return false;
        }
        return !href.regionMatches(true, 0, SCRIPT_SCHEME, 0, SCRIPT_SCHEME.length());
    }

    /**
     * Makes protocol-relative ({@code //host/path}) and root-relative ({@code /path})
     * links absolute; anything else is returned unchanged.
     */
    private static String toAbsoluteUrl(String href) {
        if (href.startsWith("//")) {
            return "https:" + href;
        }
        if (href.startsWith("/")) {
            return BASE_URL + href;
        }
        return href;
    }
}

134
project/资讯爬虫/src/main/java/com/newscrawler/util/JsonUtil.java

@ -0,0 +1,134 @@
package com.newscrawler.util;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.reflect.TypeToken;
import com.newscrawler.entity.Article;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.lang.reflect.Type;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
public class JsonUtil {
private static final Logger logger = LoggerFactory.getLogger(JsonUtil.class);
private static final Gson GSON = new GsonBuilder()
.setPrettyPrinting()
.setDateFormat("yyyy-MM-dd'T'HH:mm:ss")
.create();
private static final String DATA_DIR = "data";
static {
try {
Files.createDirectories(Paths.get(DATA_DIR));
} catch (IOException e) {
logger.warn("创建数据目录失败: {}", e.getMessage());
}
}
public static void exportToJson(List<Article> articles, String filename) throws IOException {
if (articles == null) {
throw new IllegalArgumentException("文章列表不能为空");
}
if (filename == null || filename.trim().isEmpty()) {
throw new IllegalArgumentException("文件名不能为空");
}
String fullPath = getFullPath(filename);
try (Writer writer = new BufferedWriter(
new OutputStreamWriter(
new FileOutputStream(fullPath), StandardCharsets.UTF_8))) {
GSON.toJson(articles, writer);
logger.info("成功导出{}篇文章到{}", articles.size(), fullPath);
} catch (IOException e) {
logger.error("导出JSON失败: {}", e.getMessage());
throw e;
}
}
public static List<Article> importFromJson(String filename) throws IOException {
if (filename == null || filename.trim().isEmpty()) {
throw new IllegalArgumentException("文件名不能为空");
}
String fullPath = getFullPath(filename);
Path path = Paths.get(fullPath);
if (!Files.exists(path)) {
throw new FileNotFoundException("文件不存在: " + fullPath);
}
List<Article> importedArticles;
try (Reader reader = new BufferedReader(
new InputStreamReader(
new FileInputStream(fullPath), StandardCharsets.UTF_8))) {
Type listType = new TypeToken<ArrayList<Article>>() {}.getType();
importedArticles = GSON.fromJson(reader, listType);
} catch (IOException e) {
logger.error("导入JSON失败: {}", e.getMessage());
throw e;
}
if (importedArticles == null) {
importedArticles = new ArrayList<>();
}
List<Article> deduplicatedArticles = deduplicate(importedArticles);
logger.info("从{}导入{}篇文章,去重后保留{}篇",
fullPath, importedArticles.size(), deduplicatedArticles.size());
return deduplicatedArticles;
}
private static List<Article> deduplicate(List<Article> articles) {
Set<String> seen = new HashSet<>();
List<Article> deduplicated = new ArrayList<>();
for (Article article : articles) {
String key = generateDeduplicateKey(article);
if (!seen.contains(key)) {
seen.add(key);
deduplicated.add(article);
} else {
logger.debug("去重重复文章: {}", article.getTitle());
}
}
return deduplicated;
}
private static String generateDeduplicateKey(Article article) {
return (article.getTitle() != null ? article.getTitle() : "") + "|" +
(article.getSource() != null ? article.getSource() : "") + "|" +
(article.getPublishDate() != null ? article.getPublishDate() : "");
}
private static String getFullPath(String filename) {
if (filename.endsWith(".json")) {
return DATA_DIR + File.separator + filename;
}
return DATA_DIR + File.separator + filename + ".json";
}
public static void exportHistoriesToJson(List<?> histories, String filename) throws IOException {
if (histories == null) {
throw new IllegalArgumentException("历史记录列表不能为空");
}
String fullPath = DATA_DIR + File.separator + filename + "_history.json";
try (Writer writer = new BufferedWriter(
new OutputStreamWriter(
new FileOutputStream(fullPath), StandardCharsets.UTF_8))) {
GSON.toJson(histories, writer);
logger.info("成功导出{}条历史记录到{}", histories.size(), fullPath);
}
}
}

37
project/资讯爬虫/src/main/resources/logback.xml

@ -0,0 +1,37 @@
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
<!-- Shared layout: timestamp, thread, level, logger name (shortened to 36 chars), message. -->
<property name="LOG_PATTERN" value="%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{36} - %msg%n"/>
<!-- Console output. NOTE(review): GBK presumably targets a Chinese-locale Windows console; confirm before running on a UTF-8 terminal. -->
<appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<pattern>${LOG_PATTERN}</pattern>
<charset>GBK</charset>
</encoder>
</appender>
<!-- File output: active log is logs/crawler.log; archives are named per day by the pattern below. -->
<appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>logs/crawler.log</file>
<rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
<fileNamePattern>logs/crawler-%d{yyyy-MM-dd}.log</fileNamePattern>
<!-- Number of archived periods to keep. -->
<maxHistory>30</maxHistory>
</rollingPolicy>
<encoder>
<pattern>${LOG_PATTERN}</pattern>
<charset>UTF-8</charset>
</encoder>
</appender>
<!-- Application classes log at DEBUG to both appenders; additivity is off so events are not passed on to root as well. -->
<logger name="com.newscrawler" level="DEBUG" additivity="false">
<appender-ref ref="CONSOLE"/>
<appender-ref ref="FILE"/>
</logger>
<!-- jsoup is limited to WARN and above, console only. -->
<logger name="org.jsoup" level="WARN" additivity="false">
<appender-ref ref="CONSOLE"/>
</logger>
<!-- Everything else: INFO and above to both appenders. -->
<root level="INFO">
<appender-ref ref="CONSOLE"/>
<appender-ref ref="FILE"/>
</root>
</configuration>

BIN
project/资讯爬虫/target/classes/com/newscrawler/Main.class

Binary file not shown.

BIN
project/资讯爬虫/target/classes/com/newscrawler/command/MenuCommand.class

Binary file not shown.

BIN
project/资讯爬虫/target/classes/com/newscrawler/entity/Article.class

Binary file not shown.

BIN
project/资讯爬虫/target/classes/com/newscrawler/entity/CrawlHistory.class

Binary file not shown.

BIN
project/资讯爬虫/target/classes/com/newscrawler/exception/CrawlerException.class

Binary file not shown.

BIN
project/资讯爬虫/target/classes/com/newscrawler/exception/NetworkException.class

Binary file not shown.

BIN
project/资讯爬虫/target/classes/com/newscrawler/exception/ParseException.class

Binary file not shown.

BIN
project/资讯爬虫/target/classes/com/newscrawler/repository/ArticleRepository.class

Binary file not shown.

BIN
project/资讯爬虫/target/classes/com/newscrawler/service/CrawlerService.class

Binary file not shown.

BIN
project/资讯爬虫/target/classes/com/newscrawler/strategy/AbstractBaseStrategy.class

Binary file not shown.

BIN
project/资讯爬虫/target/classes/com/newscrawler/strategy/CrawlStrategy.class

Binary file not shown.

BIN
project/资讯爬虫/target/classes/com/newscrawler/strategy/RunoobStrategy.class

Binary file not shown.

BIN
project/资讯爬虫/target/classes/com/newscrawler/strategy/SohuStrategy.class

Binary file not shown.

BIN
project/资讯爬虫/target/classes/com/newscrawler/strategy/StrategyFactory.class

Binary file not shown.

BIN
project/资讯爬虫/target/classes/com/newscrawler/strategy/YouthStrategy.class

Binary file not shown.

BIN
project/资讯爬虫/target/classes/com/newscrawler/util/JsonUtil$1.class

Binary file not shown.

BIN
project/资讯爬虫/target/classes/com/newscrawler/util/JsonUtil.class

Binary file not shown.

37
project/资讯爬虫/target/classes/logback.xml

@ -0,0 +1,37 @@
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
<!-- Shared layout: timestamp, thread, level, logger name (shortened to 36 chars), message. -->
<property name="LOG_PATTERN" value="%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{36} - %msg%n"/>
<!-- Console output. NOTE(review): GBK presumably targets a Chinese-locale Windows console; confirm before running on a UTF-8 terminal. -->
<appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<pattern>${LOG_PATTERN}</pattern>
<charset>GBK</charset>
</encoder>
</appender>
<!-- File output: active log is logs/crawler.log; archives are named per day by the pattern below. -->
<appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>logs/crawler.log</file>
<rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
<fileNamePattern>logs/crawler-%d{yyyy-MM-dd}.log</fileNamePattern>
<!-- Number of archived periods to keep. -->
<maxHistory>30</maxHistory>
</rollingPolicy>
<encoder>
<pattern>${LOG_PATTERN}</pattern>
<charset>UTF-8</charset>
</encoder>
</appender>
<!-- Application classes log at DEBUG to both appenders; additivity is off so events are not passed on to root as well. -->
<logger name="com.newscrawler" level="DEBUG" additivity="false">
<appender-ref ref="CONSOLE"/>
<appender-ref ref="FILE"/>
</logger>
<!-- jsoup is limited to WARN and above, console only. -->
<logger name="org.jsoup" level="WARN" additivity="false">
<appender-ref ref="CONSOLE"/>
</logger>
<!-- Everything else: INFO and above to both appenders. -->
<root level="INFO">
<appender-ref ref="CONSOLE"/>
<appender-ref ref="FILE"/>
</root>
</configuration>

BIN
project/资讯爬虫/target/dependency/gson-2.10.1.jar

Binary file not shown.

BIN
project/资讯爬虫/target/dependency/jsoup-1.17.2.jar

Binary file not shown.

BIN
project/资讯爬虫/target/dependency/logback-classic-1.4.14.jar

Binary file not shown.

BIN
project/资讯爬虫/target/dependency/logback-core-1.4.14.jar

Binary file not shown.

BIN
project/资讯爬虫/target/dependency/slf4j-api-2.0.11.jar

Binary file not shown.

17
project/资讯爬虫/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst

@ -0,0 +1,17 @@
com\newscrawler\util\JsonUtil$1.class
com\newscrawler\entity\CrawlHistory.class
com\newscrawler\exception\ParseException.class
com\newscrawler\entity\Article.class
com\newscrawler\repository\ArticleRepository.class
com\newscrawler\strategy\AbstractBaseStrategy.class
com\newscrawler\strategy\StrategyFactory.class
com\newscrawler\command\MenuCommand.class
com\newscrawler\service\CrawlerService.class
com\newscrawler\strategy\RunoobStrategy.class
com\newscrawler\exception\CrawlerException.class
com\newscrawler\strategy\CrawlStrategy.class
com\newscrawler\strategy\SohuStrategy.class
com\newscrawler\exception\NetworkException.class
com\newscrawler\util\JsonUtil.class
com\newscrawler\strategy\YouthStrategy.class
com\newscrawler\Main.class

16
project/资讯爬虫/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst

@ -0,0 +1,16 @@
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\exception\NetworkException.java
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\util\JsonUtil.java
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\exception\ParseException.java
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\strategy\YouthStrategy.java
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\entity\CrawlHistory.java
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\service\CrawlerService.java
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\strategy\RunoobStrategy.java
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\exception\CrawlerException.java
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\repository\ArticleRepository.java
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\command\MenuCommand.java
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\strategy\StrategyFactory.java
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\entity\Article.java
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\Main.java
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\strategy\SohuStrategy.java
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\strategy\CrawlStrategy.java
D:\桌面\资讯爬虫\src\main\java\com\newscrawler\strategy\AbstractBaseStrategy.java
Loading…
Cancel
Save