宋瑞-202506050301

3 weeks ago · e927004858
48 changed files with 6161 additions and 0 deletions
--- a/project/202506050301-宋瑞-期末实验报告.docx
+++ b/project/202506050301-宋瑞-期末实验报告.docx
--- a/project/资讯爬虫/data/知乎1.json
+++ b/project/资讯爬虫/data/知乎1.json
@ -0,0 +1,146 @@
 [
  {
    "id": "6859ecc9-c992-4e93-93e6-87ddc6e1a6be",
    "title": "浏览内容",
    "articleUrl": "#section_head",
    "author": "知乎日报",
    "source": "知乎日报",
    "crawledAt": "2026-05-30T14:49:20.886753"
  },
  {
    "id": "5dbcafb6-8a83-4052-aed1-850e72265f91",
    "title": "App 下载",
    "articleUrl": "http://www.wandoujia.com/apps/com.zhihu.daily.android",
    "author": "知乎日报",
    "source": "知乎日报",
    "crawledAt": "2026-05-30T14:49:20.886753"
  },
  {
    "id": "06b7c5b6-c4ce-4281-9e7d-6e312858307d",
    "title": "知乎日报",
    "articleUrl": "http://daily.zhihu.com/",
    "author": "知乎日报",
    "source": "知乎日报",
    "crawledAt": "2026-05-30T14:49:20.886753"
  },
  {
    "id": "e4880198-329b-47c7-bf7b-78052aa2bf8b",
    "title": "iOS 版",
    "articleUrl": "https://itunes.apple.com/cn/app/id639087967?mt\u003d8",
    "author": "知乎日报",
    "source": "知乎日报",
    "crawledAt": "2026-05-30T14:49:20.886753"
  },
  {
    "id": "e17a296b-5f65-4bfb-b12f-9fd4fe1a07d7",
    "title": "文学创作会不会受到 AI 的冲击？",
    "articleUrl": "https://daily.zhihu.com/story/9790086",
    "author": "知乎日报",
    "source": "知乎日报",
    "crawledAt": "2026-05-30T14:49:20.886753"
  },
  {
    "id": "12676469-494d-473a-89c8-01f140a57188",
    "title": "为什么说西西弗斯面对巨石，不断推上山是一种超越和蔑视？",
    "articleUrl": "https://daily.zhihu.com/story/9790101",
    "author": "知乎日报",
    "source": "知乎日报",
    "crawledAt": "2026-05-30T14:49:20.886753"
  },
  {
    "id": "625a5f08-3ce9-487f-b367-5be0cdb7500e",
    "title": "有哪些看起来很高端的技术其实原理很暴力很初级？",
    "articleUrl": "https://daily.zhihu.com/story/9790092",
    "author": "知乎日报",
    "source": "知乎日报",
    "crawledAt": "2026-05-30T14:49:20.886753"
  },
  {
    "id": "d4aa8adc-e4af-4194-bef1-bdf233742f86",
    "title": "中国古代官方不重视理工科吗,如果是,为什么?",
    "articleUrl": "https://daily.zhihu.com/story/9790090",
    "author": "知乎日报",
    "source": "知乎日报",
    "crawledAt": "2026-05-30T14:49:20.886753"
  },
  {
    "id": "e36a73d4-c0b8-4674-b4d5-d1d9de5cd894",
    "title": "为什么人类不能自身合成维生素C?",
    "articleUrl": "https://daily.zhihu.com/story/9790062",
    "author": "知乎日报",
    "source": "知乎日报",
    "crawledAt": "2026-05-30T14:49:20.886753"
  },
  {
    "id": "e7adc448-4570-45e6-a6e7-46b1397f0677",
    "title": "林黛玉被妙玉嫌弃太俗，却不敢反驳，她怼贾宝玉的劲儿哪去了？",
    "articleUrl": "https://daily.zhihu.com/story/9790081",
    "author": "知乎日报",
    "source": "知乎日报",
    "crawledAt": "2026-05-30T14:49:20.886753"
  },
  {
    "id": "37468176-caca-47f2-b422-b467a644e0ff",
    "title": "魏博没有山川险阻，靠什么屹立150年。甚至长期成为最强藩？",
    "articleUrl": "https://daily.zhihu.com/story/9790071",
    "author": "知乎日报",
    "source": "知乎日报",
    "crawledAt": "2026-05-30T14:49:20.886753"
  },
  {
    "id": "bdf153fc-5ece-4ea3-85bb-de0a2b49f5ee",
    "title": "瞎扯 · 如何正确地吐槽",
    "articleUrl": "https://daily.zhihu.com/story/9790084",
    "author": "知乎日报",
    "source": "知乎日报",
    "crawledAt": "2026-05-30T14:49:20.886753"
  },
  {
    "id": "e9cec0a7-82c2-43e5-a529-92d3979d4b24",
    "title": "为什么松鼠的动作总是一顿一顿的？像卡帧一样？",
    "articleUrl": "https://daily.zhihu.com/story/9790034",
    "author": "知乎日报",
    "source": "知乎日报",
    "crawledAt": "2026-05-30T14:49:20.886753"
  },
  {
    "id": "288cd539-af4b-4bf1-81ee-6fe5a9936fae",
    "title": "既有 F\u003dma，又有F\u003dkx，那么物理公式到底要求等号左边是因还是果？",
    "articleUrl": "https://daily.zhihu.com/story/9790046",
    "author": "知乎日报",
    "source": "知乎日报",
    "crawledAt": "2026-05-30T14:49:20.886753"
  },
  {
    "id": "c595c7e0-e768-4f5e-a0ed-9982466bf761",
    "title": "西安唐代城门恢复为何不学洛阳，丹凤门像土黄色纸壳子，明德门像塑料玩具？",
    "articleUrl": "https://daily.zhihu.com/story/9790039",
    "author": "知乎日报",
    "source": "知乎日报",
    "crawledAt": "2026-05-30T14:49:20.886753"
  },
  {
    "id": "3bc7435b-2ac8-4979-bd8c-20da0b4f7a3d",
    "title": "游牧民族几乎没有碳水来源，为什么没有营养不良?",
    "articleUrl": "https://daily.zhihu.com/story/9790022",
    "author": "知乎日报",
    "source": "知乎日报",
    "crawledAt": "2026-05-30T14:49:20.886753"
  },
  {
    "id": "c5ebc7af-5117-46a7-9107-1f28ed91d0c1",
    "title": "为什么压力单位这么混乱？",
    "articleUrl": "https://daily.zhihu.com/story/9790028",
    "author": "知乎日报",
    "source": "知乎日报",
    "crawledAt": "2026-05-30T14:49:20.886753"
  },
  {
    "id": "867ebe9e-0016-4e29-a407-319ef50aa51c",
    "title": "为什么会有好奇害死猫这个说法？",
    "articleUrl": "https://daily.zhihu.com/story/9790027",
    "author": "知乎日报",
    "source": "知乎日报",
    "crawledAt": "2026-05-30T14:49:20.886753"
  }
 ]
--- a/project/资讯爬虫/data/菜鸟1.json
+++ b/project/资讯爬虫/data/菜鸟1.json
--- a/project/资讯爬虫/logs/crawler-2026-05-29.log
+++ b/project/资讯爬虫/logs/crawler-2026-05-29.log
--- a/project/资讯爬虫/logs/crawler.log
+++ b/project/资讯爬虫/logs/crawler.log
--- a/project/资讯爬虫/pom.xml
+++ b/project/资讯爬虫/pom.xml
@ -0,0 +1,73 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <!-- Maven build descriptor for the news-crawler console application. -->
    <modelVersion>4.0.0</modelVersion>
    <!-- Project coordinates; the build produces a plain jar. -->
    <groupId>com.newscrawler</groupId>
    <artifactId>news-crawler</artifactId>
    <version>1.0.0</version>
    <packaging>jar</packaging>
    <name>News Crawler</name>
    <description>增强版Java资讯爬虫</description>
    <properties>
        <!-- Java language level; referenced by the compiler plugin configuration below. -->
        <java.version>17</java.version>
        <maven.compiler.source>17</maven.compiler.source>
        <maven.compiler.target>17</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <!-- Dependency versions are centralised here and referenced as ${...} in <dependencies>. -->
        <jsoup.version>1.17.2</jsoup.version>
        <gson.version>2.10.1</gson.version>
        <logback.version>1.4.14</logback.version>
        <slf4j.version>2.0.11</slf4j.version>
    </properties>
    <dependencies>
        <!-- jsoup: HTML fetching/parsing library. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>${jsoup.version}</version>
        </dependency>
        <!-- Gson: JSON serialisation library. -->
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>${gson.version}</version>
        </dependency>
        <!-- Logback: logging backend bound to the SLF4J API declared below. -->
        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-classic</artifactId>
            <version>${logback.version}</version>
        </dependency>
        <!-- SLF4J API: the logging facade imported by the application classes. -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <!-- Compiles sources at the Java level given by ${java.version}. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.11.0</version>
                <configuration>
                    <source>${java.version}</source>
                    <target>${java.version}</target>
                </configuration>
            </plugin>
            <!-- Configures exec-maven-plugin with com.newscrawler.Main as the main class. -->
            <plugin>
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>exec-maven-plugin</artifactId>
                <version>3.1.0</version>
                <configuration>
                    <mainClass>com.newscrawler.Main</mainClass>
                </configuration>
            </plugin>
        </plugins>
    </build>
 </project>
--- a/project/资讯爬虫/src/main/java/com/newscrawler/Main.java
+++ b/project/资讯爬虫/src/main/java/com/newscrawler/Main.java
@ -0,0 +1,26 @@
 package com.newscrawler;
 import com.newscrawler.command.MenuCommand;
 import com.newscrawler.service.CrawlerService;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 /**
  * Program entry point: wires a {@link CrawlerService} into the interactive
  * {@link MenuCommand} and runs the menu until the user quits.
  */
 public class Main {
    private static final Logger logger = LoggerFactory.getLogger(Main.class);

    /**
     * Starts the console application.
     * <p>
     * Any exception escaping the menu is logged, echoed to stderr, and turned
     * into process exit status 1.
     *
     * @param args command-line arguments (not read)
     */
    public static void main(String[] args) {
        logger.info("资讯爬虫启动");
        try {
            // The service is created first, then handed to the menu that drives it.
            new MenuCommand(new CrawlerService()).start();
        } catch (Exception failure) {
            logger.error("程序执行异常", failure);
            System.err.println("程序执行失败: " + failure.getMessage());
            System.exit(1);
        }
        logger.info("资讯爬虫关闭");
    }
 }
--- a/project/资讯爬虫/src/main/java/com/newscrawler/command/MenuCommand.java
+++ b/project/资讯爬虫/src/main/java/com/newscrawler/command/MenuCommand.java
@ -0,0 +1,361 @@
 package com.newscrawler.command;
 import com.newscrawler.entity.Article;
 import com.newscrawler.entity.CrawlHistory;
 import com.newscrawler.exception.CrawlerException;
 import com.newscrawler.service.CrawlerService;
 import com.newscrawler.util.JsonUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
 import java.util.Scanner;
 import java.util.Set;
 import java.util.stream.Collectors;
 /**
  * Interactive console menu for the news crawler.
  * <p>
  * Reads choices from standard input and delegates the actual work
  * (crawling, listing, statistics, import/export, deletion) to the
  * {@link CrawlerService} supplied at construction time.
  */
 public class MenuCommand {
    private static final Logger logger = LoggerFactory.getLogger(MenuCommand.class);
    private final CrawlerService crawlerService;
    private final Scanner scanner;

    /**
     * Creates a menu bound to the given service, reading input from {@code System.in}.
     *
     * @param crawlerService service that performs crawling and data management
     */
    public MenuCommand(CrawlerService crawlerService) {
        this.crawlerService = crawlerService;
        this.scanner = new Scanner(System.in);
    }

    /**
     * Runs the main menu loop until the user chooses "0" or standard input ends.
     */
    public void start() {
        boolean running = true;
        while (running) {
            showMainMenu();
            if (!scanner.hasNextLine()) {
                // stdin was closed (EOF / piped input exhausted): leave the loop
                // instead of letting nextLine() throw NoSuchElementException.
                System.out.println();
                break;
            }
            running = handleMainMenu(scanner.nextLine().trim());
        }
        System.out.println("感谢使用资讯爬虫，再见！");
    }

    /**
     * Reads one trimmed line for a sub-menu prompt.
     *
     * @return the trimmed line, or an empty string when standard input has ended
     */
    private String readLine() {
        return scanner.hasNextLine() ? scanner.nextLine().trim() : "";
    }

    /** Prints the main menu and the input prompt. */
    private void showMainMenu() {
        System.out.println("\n========== 请选择你要执行的操作 ==========");
        System.out.println("1. 爬取菜鸟教程资讯");
        System.out.println("2. 爬取知乎日报");
        System.out.println("3. 爬取搜狐资讯");
        System.out.println("4. 批量爬取全部站点");
        System.out.println("5. 查看全部资讯");
        System.out.println("6. 数据统计");
        System.out.println("7. 爬取历史");
        System.out.println("8. 数据导入/导出");
        System.out.println("9. 数据管理(删除/清空)");
        System.out.println("0. 退出程序");
        System.out.print("请输入选项: ");
    }

    /**
     * Dispatches one main-menu choice.
     * <p>
     * Any exception raised by the chosen action is reported to the user and
     * logged; the menu keeps running afterwards.
     *
     * @param input trimmed user input
     * @return {@code false} when the user asked to quit, {@code true} otherwise
     */
    private boolean handleMainMenu(String input) {
        try {
            switch (input) {
                case "1" -> crawlSite("runoob");
                // NOTE(review): the menu labels this option as Zhihu Daily but the
                // site key is "youth" - confirm this matches the key registered in
                // StrategyFactory.
                case "2" -> crawlSite("youth");
                case "3" -> crawlSite("sohu");
                case "4" -> crawlAllSites();
                case "5" -> viewAllArticles();
                case "6" -> showStatistics();
                case "7" -> showCrawlHistory();
                case "8" -> handleImportExport();
                case "9" -> manageData();
                case "0" -> {
                    return false;
                }
                default -> System.out.println("无效选项，请输入0-9之间的数字");
            }
        } catch (Exception e) {
            System.out.println("操作出错: " + e.getMessage());
            logger.error("菜单操作异常", e);
        }
        return true;
    }

    /**
     * Crawls a single site and prints how many articles were returned.
     *
     * @param siteKey key identifying the site, passed through to the service
     * @throws CrawlerException if the service reports a crawl failure
     */
    private void crawlSite(String siteKey) throws CrawlerException {
        System.out.println("正在爬取，请稍候...");
        List<Article> articles = crawlerService.crawlSingleSite(siteKey);
        System.out.println("爬取完成！获取到 " + articles.size() + " 篇文章");
    }

    /**
     * Crawls every site known to the service and prints the total article count.
     *
     * @throws CrawlerException if the service reports a crawl failure
     */
    private void crawlAllSites() throws CrawlerException {
        System.out.println("正在批量爬取所有站点，请稍候...");
        List<Article> articles = crawlerService.crawlAllSites();
        System.out.println("批量爬取完成！共获取到 " + articles.size() + " 篇文章");
    }

    /** Prints every article held by the service, or a hint when there are none. */
    private void viewAllArticles() {
        List<Article> articles = crawlerService.getAllArticles();
        if (articles.isEmpty()) {
            System.out.println("暂无资讯，请先爬取数据");
            return;
        }
        printArticles(articles);
    }

    /** Prints the per-source counts returned by the service. */
    private void showStatistics() {
        Map<String, Long> stats = crawlerService.getStatistics();
        System.out.println("\n=== 数据统计 ===");
        stats.forEach((source, count) -> System.out.println(source + ": " + count + " 条"));
    }

    /** Prints the crawl history entries, or a hint when there are none. */
    private void showCrawlHistory() {
        List<CrawlHistory> histories = crawlerService.getCrawlHistories();
        if (histories.isEmpty()) {
            System.out.println("暂无爬取历史");
            return;
        }
        System.out.println("\n=== 爬取历史 ===");
        histories.forEach(h -> System.out.println(h.toString()));
    }

    /** Shows the import/export sub-menu and runs the chosen action. */
    private void handleImportExport() {
        System.out.println("\n=== 数据导入/导出 ===");
        System.out.println("1. 导出数据到JSON");
        System.out.println("2. 从JSON导入数据");
        System.out.print("选择: ");
        String choice = readLine();
        switch (choice) {
            case "1" -> exportToJson();
            case "2" -> importFromJson();
            default -> System.out.println("无效选择");
        }
    }

    /**
     * Exports all articles to a numbered JSON file.
     * <p>
     * The file name prefix is derived from the set of non-null article sources:
     * a mixed set uses "全", a single known source uses its short name, and
     * anything else falls back to "articles".
     */
    private void exportToJson() {
        List<Article> articles = crawlerService.getAllArticles();
        if (articles.isEmpty()) {
            System.out.println("暂无数据可导出，请先爬取数据");
            return;
        }
        Set<String> sources = articles.stream()
                .map(Article::getSource)
                .filter(Objects::nonNull)
                .collect(Collectors.toSet());
        String prefix;
        if (sources.size() > 1) {
            prefix = "全";
        } else if (sources.contains("菜鸟教程资讯")) {
            prefix = "菜鸟";
        } else if (sources.contains("知乎日报")) {
            prefix = "知乎";
        } else if (sources.contains("搜狐资讯")) {
            prefix = "搜狐";
        } else {
            prefix = "articles";
        }
        String filename = generateNumberedFilename(prefix);
        try {
            JsonUtil.exportToJson(articles, filename);
            System.out.println("导出成功: " + filename);
        } catch (Exception e) {
            System.out.println("导出失败: " + e.getMessage());
            logger.error("导出JSON失败", e);
        }
    }

    /**
     * Builds the next free file name of the form {@code <prefix><n>.json}.
     * <p>
     * Scans the {@code data} directory for files matching that pattern and
     * returns the name with the highest existing number plus one (starting at 1).
     * Only the bare file name is returned.
     * NOTE(review): presumably JsonUtil resolves it against the data directory - confirm.
     *
     * @param prefix file name prefix
     * @return the next numbered file name
     */
    private String generateNumberedFilename(String prefix) {
        final String suffix = ".json";
        java.io.File dir = new java.io.File("data");
        // listFiles yields null when the directory is missing or unreadable.
        java.io.File[] files = dir.listFiles((d, name) -> name.startsWith(prefix) && name.endsWith(suffix));
        int maxNum = 0;
        if (files != null) {
            for (java.io.File file : files) {
                String name = file.getName();
                if (name.length() < prefix.length() + suffix.length()) {
                    // Prefix and suffix overlap; there is no number part to read.
                    continue;
                }
                try {
                    String numStr = name.substring(prefix.length(), name.length() - suffix.length());
                    maxNum = Math.max(maxNum, Integer.parseInt(numStr));
                } catch (NumberFormatException ignored) {
                    // Not one of our numbered files (e.g. "<prefix>-backup.json"); skip it.
                }
            }
        }
        return prefix + (maxNum + 1) + suffix;
    }

    /**
     * Lets the user pick a JSON file from the {@code data} directory and imports
     * its articles into the repository.
     * <p>
     * The success message reports how many articles the repository actually
     * gained, which can be fewer than the file contains because the repository
     * skips duplicates and invalid entries.
     */
    private void importFromJson() {
        java.io.File dir = new java.io.File("data");
        // listFiles yields null when the directory is missing or unreadable.
        java.io.File[] files = dir.listFiles((d, name) -> name.endsWith(".json"));
        if (files == null || files.length == 0) {
            System.out.println("当前目录下没有JSON文件，请先导出数据");
            return;
        }
        // File.listFiles guarantees no particular order; sort for a stable menu.
        java.util.Arrays.sort(files);
        System.out.println("\n当前目录下的JSON文件:");
        for (int i = 0; i < files.length; i++) {
            System.out.println((i + 1) + ". " + files[i].getName());
        }
        System.out.print("\n请输入要导入的文件编号,或输入0返回: ");
        String input = readLine();
        if ("0".equals(input)) {
            return;
        }
        if (input.isEmpty()) {
            System.out.println("输入不能为空");
            return;
        }
        int index;
        try {
            index = Integer.parseInt(input) - 1;
        } catch (NumberFormatException e) {
            System.out.println("请输入有效的数字编号");
            return;
        }
        if (index < 0 || index >= files.length) {
            System.out.println("编号超出范围");
            return;
        }
        String filename = files[index].getName();
        try {
            List<Article> articles = JsonUtil.importFromJson(filename);
            int before = crawlerService.getRepository().size();
            crawlerService.getRepository().addAll(articles);
            // Report the real growth of the repository, not the size of the file.
            int added = crawlerService.getRepository().size() - before;
            System.out.println("导入成功: " + added + " 篇文章(已自动去重)");
        } catch (Exception e) {
            System.out.println("导入失败: " + e.getMessage());
            logger.error("导入JSON失败", e);
        }
    }

    /** Shows the data-management sub-menu and runs the chosen action. */
    private void manageData() {
        System.out.println("\n=== 数据管理 ===");
        System.out.println("1. 删除单条资讯");
        System.out.println("2. 清空所有资讯");
        System.out.print("选择: ");
        String choice = readLine();
        switch (choice) {
            case "1" -> deleteSingleArticle();
            case "2" -> clearAllArticles();
            default -> System.out.println("无效选择");
        }
    }

    /**
     * Lists all articles with 1-based numbers and deletes the one the user picks.
     * Entering 0 cancels.
     */
    private void deleteSingleArticle() {
        List<Article> articles = crawlerService.getAllArticles();
        if (articles.isEmpty()) {
            System.out.println("暂无资讯可删除");
            return;
        }
        System.out.println("\n=== 当前数据库中的资讯 ===");
        for (int i = 0; i < articles.size(); i++) {
            Article article = articles.get(i);
            System.out.printf("[%d] %s%n", i + 1, article.getTitle());
            System.out.println("    来源: " + article.getSource());
            if (article.getPublishDate() != null && !article.getPublishDate().isEmpty()) {
                System.out.println("    时间: " + article.getPublishDate());
            }
            System.out.println("-".repeat(60));
        }
        System.out.print("\n请输入要删除的文章编号(输入0取消): ");
        String input = readLine();
        if ("0".equals(input)) {
            System.out.println("已取消操作");
            return;
        }
        int index;
        try {
            index = Integer.parseInt(input) - 1;
        } catch (NumberFormatException e) {
            System.out.println("请输入有效的数字");
            return;
        }
        if (index < 0 || index >= articles.size()) {
            System.out.println("编号超出范围");
            return;
        }
        Article articleToDelete = articles.get(index);
        if (crawlerService.removeArticle(articleToDelete.getId())) {
            System.out.println("删除成功: " + articleToDelete.getTitle());
        } else {
            System.out.println("删除失败");
        }
    }

    /** Clears all articles after the user confirms with "y" (case-insensitive). */
    private void clearAllArticles() {
        System.out.print("确定要清空所有数据吗?(y/n): ");
        String confirm = readLine();
        if ("y".equalsIgnoreCase(confirm)) {
            crawlerService.clearAllArticles();
            System.out.println("已清空所有数据");
        } else {
            System.out.println("已取消操作");
        }
    }

    /**
     * Prints the given articles as a numbered list. Author, publish date and
     * summary are printed only when non-empty; summaries longer than 50
     * characters are truncated with a trailing "...".
     *
     * @param articles articles to print
     */
    private void printArticles(List<Article> articles) {
        System.out.println("-".repeat(80));
        for (int i = 0; i < articles.size(); i++) {
            Article article = articles.get(i);
            System.out.printf("[%d] %s%n", i + 1, article.getTitle());
            System.out.println("    来源: " + article.getSource());
            if (article.getAuthor() != null && !article.getAuthor().isEmpty()) {
                System.out.println("    作者: " + article.getAuthor());
            }
            if (article.getPublishDate() != null && !article.getPublishDate().isEmpty()) {
                System.out.println("    时间: " + article.getPublishDate());
            }
            if (article.getSummary() != null && !article.getSummary().isEmpty()) {
                String summary = article.getSummary();
                if (summary.length() > 50) {
                    summary = summary.substring(0, 50) + "...";
                }
                System.out.println("    摘要: " + summary);
            }
            System.out.println("    ID: " + article.getId());
            System.out.println("-".repeat(80));
        }
    }
 }
--- a/project/资讯爬虫/src/main/java/com/newscrawler/entity/Article.java
+++ b/project/资讯爬虫/src/main/java/com/newscrawler/entity/Article.java
@ -0,0 +1,113 @@
 package com.newscrawler.entity;
 import java.util.Objects;
 import java.util.UUID;
 public class Article {
    private String id;
    private String title;
    private String summary;
    private String publishDate;
    private String articleUrl;
    private String author;
    private String source;
    private String crawledAt;
    public Article() {
        this.id = UUID.randomUUID().toString();
        this.crawledAt = java.time.LocalDateTime.now().toString();
    }
    public String getId() {
        return id;
    }
    public void setId(String id) {
        this.id = id;
    }
    public String getTitle() {
        return title;
    }
    public void setTitle(String title) {
        this.title = title;
    }
    public String getSummary() {
        return summary;
    }
    public void setSummary(String summary) {
        this.summary = summary;
    }
    public String getPublishDate() {
        return publishDate;
    }
    public void setPublishDate(String publishDate) {
        this.publishDate = publishDate;
    }
    public String getArticleUrl() {
        return articleUrl;
    }
    public void setArticleUrl(String articleUrl) {
        this.articleUrl = articleUrl;
    }
    public String getAuthor() {
        return author;
    }
    public void setAuthor(String author) {
        this.author = author;
    }
    public String getSource() {
        return source;
    }
    public void setSource(String source) {
        this.source = source;
    }
    public String getCrawledAt() {
        return crawledAt;
    }
    public void setCrawledAt(String crawledAt) {
        this.crawledAt = crawledAt;
    }
    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        Article article = (Article) o;
        return Objects.equals(title, article.title) &&
                Objects.equals(source, article.source) &&
                Objects.equals(publishDate, article.publishDate);
    }
    @Override
    public int hashCode() {
        return Objects.hash(title, source, publishDate);
    }
    @Override
    public String toString() {
        return "Article{" +
                "id='" + id + '\'' +
                ", title='" + title + '\'' +
                ", summary='" + summary + '\'' +
                ", publishDate='" + publishDate + '\'' +
                ", articleUrl='" + articleUrl + '\'' +
                ", author='" + author + '\'' +
                ", source='" + source + '\'' +
                ", crawledAt='" + crawledAt + '\'' +
                '}';
    }
 }
--- a/project/资讯爬虫/src/main/java/com/newscrawler/entity/CrawlHistory.java
+++ b/project/资讯爬虫/src/main/java/com/newscrawler/entity/CrawlHistory.java
@ -0,0 +1,98 @@
 package com.newscrawler.entity;
 import java.time.LocalDateTime;
 public class CrawlHistory {
    private String id;
    private String siteName;
    private String siteUrl;
    private int articleCount;
    private LocalDateTime crawlTime;
    private boolean success;
    private String errorMessage;
    public CrawlHistory() {
        this.id = java.util.UUID.randomUUID().toString();
        this.crawlTime = LocalDateTime.now();
        this.success = true;
    }
    public CrawlHistory(String siteName, String siteUrl) {
        this();
        this.siteName = siteName;
        this.siteUrl = siteUrl;
    }
    public CrawlHistory(String siteName, String siteUrl, int articleCount, boolean success, String errorMessage) {
        this(siteName, siteUrl);
        this.articleCount = articleCount;
        this.success = success;
        this.errorMessage = errorMessage;
    }
    public String getId() {
        return id;
    }
    public void setId(String id) {
        this.id = id;
    }
    public String getSiteName() {
        return siteName;
    }
    public void setSiteName(String siteName) {
        this.siteName = siteName;
    }
    public String getSiteUrl() {
        return siteUrl;
    }
    public void setSiteUrl(String siteUrl) {
        this.siteUrl = siteUrl;
    }
    public int getArticleCount() {
        return articleCount;
    }
    public void setArticleCount(int articleCount) {
        this.articleCount = articleCount;
    }
    public LocalDateTime getCrawlTime() {
        return crawlTime;
    }
    public void setCrawlTime(LocalDateTime crawlTime) {
        this.crawlTime = crawlTime;
    }
    public boolean isSuccess() {
        return success;
    }
    public void setSuccess(boolean success) {
        this.success = success;
    }
    public String getErrorMessage() {
        return errorMessage;
    }
    public void setErrorMessage(String errorMessage) {
        this.errorMessage = errorMessage;
    }
    @Override
    public String toString() {
        return String.format("[%s] %s - %s - %d条 - %s",
                crawlTime.toString().replace("T", " "),
                siteName,
                success ? "成功" : "失败",
                articleCount,
                success ? "" : errorMessage);
    }
 }
--- a/project/资讯爬虫/src/main/java/com/newscrawler/exception/CrawlerException.java
+++ b/project/资讯爬虫/src/main/java/com/newscrawler/exception/CrawlerException.java
@ -0,0 +1,21 @@
 package com.newscrawler.exception;
 /**
  * Checked base exception for crawler failures. It mirrors the four standard
  * {@link Exception} constructors.
  */
 public class CrawlerException extends Exception {
    private static final long serialVersionUID = 1L;

    /** Creates an exception with neither message nor cause. */
    public CrawlerException() {
    }

    /**
     * @param message description of the failure
     */
    public CrawlerException(String message) {
        super(message);
    }

    /**
     * @param cause underlying exception
     */
    public CrawlerException(Throwable cause) {
        super(cause);
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception
     */
    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }
 }
--- a/project/资讯爬虫/src/main/java/com/newscrawler/exception/NetworkException.java
+++ b/project/资讯爬虫/src/main/java/com/newscrawler/exception/NetworkException.java
@ -0,0 +1,21 @@
 package com.newscrawler.exception;
 /**
  * {@link CrawlerException} subtype for network failures. It mirrors the four
  * constructors of its parent.
  */
 public class NetworkException extends CrawlerException {
    private static final long serialVersionUID = 1L;

    /** Creates an exception with neither message nor cause. */
    public NetworkException() {
    }

    /**
     * @param message description of the failure
     */
    public NetworkException(String message) {
        super(message);
    }

    /**
     * @param cause underlying exception
     */
    public NetworkException(Throwable cause) {
        super(cause);
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception
     */
    public NetworkException(String message, Throwable cause) {
        super(message, cause);
    }
 }
--- a/project/资讯爬虫/src/main/java/com/newscrawler/exception/ParseException.java
+++ b/project/资讯爬虫/src/main/java/com/newscrawler/exception/ParseException.java
@ -0,0 +1,21 @@
 package com.newscrawler.exception;
 /**
  * {@link CrawlerException} subtype for parsing failures. It mirrors the four
  * constructors of its parent.
  */
 public class ParseException extends CrawlerException {
    private static final long serialVersionUID = 1L;

    /** Creates an exception with neither message nor cause. */
    public ParseException() {
    }

    /**
     * @param message description of the failure
     */
    public ParseException(String message) {
        super(message);
    }

    /**
     * @param cause underlying exception
     */
    public ParseException(Throwable cause) {
        super(cause);
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception
     */
    public ParseException(String message, Throwable cause) {
        super(message, cause);
    }
 }
--- a/project/资讯爬虫/src/main/java/com/newscrawler/repository/ArticleRepository.java
+++ b/project/资讯爬虫/src/main/java/com/newscrawler/repository/ArticleRepository.java
@ -0,0 +1,126 @@
 package com.newscrawler.repository;
 import com.newscrawler.entity.Article;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
 import java.util.stream.Collectors;
 /**
  * In-memory store of {@link Article} objects backed by an {@link ArrayList}.
  * <p>
  * Duplicates, as defined by {@link Article#equals(Object)}, are silently
  * skipped on insertion.
  */
 public class ArticleRepository {
    private static final Logger logger = LoggerFactory.getLogger(ArticleRepository.class);
    private final List<Article> articles;

    /** Creates an empty repository. */
    public ArticleRepository() {
        this.articles = new ArrayList<>();
    }

    /**
     * Adds an article unless an equal one is already stored.
     *
     * @param article article to add
     * @throws IllegalArgumentException if the article is null or its title is null/blank
     */
    public void add(Article article) {
        addIfAbsent(article);
    }

    /**
     * Adds every valid, not-yet-stored article from the collection.
     * Invalid entries (null, or with a null/blank title) are logged and skipped.
     * The completion log reports only articles that were actually inserted;
     * duplicates that were skipped are not counted.
     *
     * @param articlesToAdd articles to add
     * @throws IllegalArgumentException if the collection itself is null
     */
    public void addAll(Collection<Article> articlesToAdd) {
        if (articlesToAdd == null) {
            logger.warn("尝试添加空集合到仓库");
            throw new IllegalArgumentException("文章集合不能为空");
        }
        int count = 0;
        for (Article article : articlesToAdd) {
            try {
                if (addIfAbsent(article)) {
                    count++;
                }
            } catch (IllegalArgumentException e) {
                logger.warn("跳过无效文章: {}", e.getMessage());
            }
        }
        logger.info("批量添加完成，成功添加{}篇文章", count);
    }

    /**
     * Validates the article and inserts it when no equal article is stored.
     *
     * @return {@code true} if the article was inserted, {@code false} if it was a duplicate
     * @throws IllegalArgumentException if the article is null or its title is null/blank
     */
    private boolean addIfAbsent(Article article) {
        if (article == null) {
            logger.warn("尝试添加空文章到仓库");
            throw new IllegalArgumentException("文章不能为空");
        }
        if (article.getTitle() == null || article.getTitle().trim().isEmpty()) {
            logger.warn("尝试添加标题为空的文章");
            throw new IllegalArgumentException("文章标题不能为空");
        }
        if (exists(article)) {
            logger.debug("文章已存在，跳过: {}", article.getTitle());
            return false;
        }
        articles.add(article);
        logger.debug("添加文章: {}", article.getTitle());
        return true;
    }

    /**
     * Removes the first stored article equal to the given one.
     *
     * @param article article to remove
     * @return {@code true} if an article was removed; {@code false} for null or no match
     */
    public boolean remove(Article article) {
        if (article == null) {
            logger.warn("尝试删除空文章");
            return false;
        }
        boolean removed = articles.remove(article);
        if (removed) {
            logger.debug("删除文章: {}", article.getTitle());
        }
        return removed;
    }

    /**
     * Removes every stored article whose id equals the given id.
     *
     * @param id id to match
     * @return {@code true} if at least one article was removed; {@code false} for a null/blank id
     */
    public boolean removeById(String id) {
        if (id == null || id.trim().isEmpty()) {
            logger.warn("尝试用空ID删除文章");
            return false;
        }
        // id is known non-null here; comparing from its side tolerates stored
        // articles whose id was set to null.
        boolean removed = articles.removeIf(a -> id.equals(a.getId()));
        if (removed) {
            logger.debug("通过ID删除文章: {}", id);
        }
        return removed;
    }

    /** Removes all articles and logs how many were dropped. */
    public void clear() {
        int size = articles.size();
        articles.clear();
        logger.info("清空仓库，删除了{}篇文章", size);
    }

    /**
     * @return a new list containing all stored articles; modifying it does not
     *         affect the repository
     */
    public List<Article> getAll() {
        return new ArrayList<>(articles);
    }

    /**
     * Finds articles whose source contains the given text (substring match).
     *
     * @param source text to look for in the source name
     * @return matching articles; an empty list for a null/blank argument
     */
    public List<Article> findBySource(String source) {
        if (source == null || source.trim().isEmpty()) {
            logger.warn("使用空来源查询");
            return new ArrayList<>();
        }
        return articles.stream()
                .filter(a -> a.getSource() != null && a.getSource().contains(source))
                .collect(Collectors.toList());
    }

    /**
     * @param article article to look for
     * @return {@code true} if an equal article is stored; {@code false} for null
     */
    public boolean exists(Article article) {
        if (article == null) {
            return false;
        }
        return articles.contains(article);
    }

    /** @return the number of stored articles */
    public int size() {
        return articles.size();
    }

    /** @return {@code true} if no articles are stored */
    public boolean isEmpty() {
        return articles.isEmpty();
    }

    /**
     * @param source text to look for in the source name
     * @return number of articles matched by {@link #findBySource(String)}
     */
    public long countBySource(String source) {
        return findBySource(source).size();
    }

    /**
     * @param id id to match
     * @return the first stored article with that id, or {@code null} if none
     *         matches or the id is null/blank
     */
    public Article findById(String id) {
        if (id == null || id.trim().isEmpty()) {
            return null;
        }
        return articles.stream()
                .filter(a -> id.equals(a.getId()))
                .findFirst()
                .orElse(null);
    }
 }
--- a/project/资讯爬虫/src/main/java/com/newscrawler/service/CrawlerService.java
+++ b/project/资讯爬虫/src/main/java/com/newscrawler/service/CrawlerService.java
@ -0,0 +1,113 @@
 package com.newscrawler.service;
 import com.newscrawler.entity.Article;
 import com.newscrawler.entity.CrawlHistory;
 import com.newscrawler.exception.CrawlerException;
 import com.newscrawler.exception.ParseException;
 import com.newscrawler.repository.ArticleRepository;
 import com.newscrawler.strategy.CrawlStrategy;
 import com.newscrawler.strategy.StrategyFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
 import java.util.stream.Collectors;
 /**
  * Runs crawl strategies, hands the resulting articles to an
  * {@link ArticleRepository} and records a {@link CrawlHistory} entry for each
  * run that succeeds or fails with a {@link ParseException}.
  */
 public class CrawlerService {
    private static final Logger logger = LoggerFactory.getLogger(CrawlerService.class);
    /** Statistics key used for articles that carry no source. */
    private static final String UNKNOWN_SOURCE = "未知来源";
    private final ArticleRepository repository;
    private final List<CrawlHistory> crawlHistories;
    /**
     * Creates a service with an empty repository and an empty crawl history.
     */
    public CrawlerService() {
        this.repository = new ArticleRepository();
        this.crawlHistories = new ArrayList<>();
    }
    /**
     * Crawls the single site registered under the given key.
     *
     * @param siteKey key passed to {@link StrategyFactory#getStrategy(String)}
     * @return the articles returned by the site's strategy
     * @throws CrawlerException if the crawl fails
     */
    public List<Article> crawlSingleSite(String siteKey) throws CrawlerException {
        logger.info("开始爬取单个站点: {}", siteKey);
        CrawlStrategy strategy = StrategyFactory.getStrategy(siteKey);
        return executeCrawl(strategy);
    }
    /**
     * Crawls every registered site. A site that fails with a
     * {@link CrawlerException} is logged and skipped so the remaining sites
     * are still crawled.
     *
     * @return the articles collected from all sites that succeeded
     * @throws CrawlerException declared for callers; per-site failures are caught here
     */
    public List<Article> crawlAllSites() throws CrawlerException {
        logger.info("开始批量爬取所有站点");
        List<Article> allArticles = new ArrayList<>();
        Map<String, CrawlStrategy> strategies = StrategyFactory.getAllStrategies();
        for (Map.Entry<String, CrawlStrategy> entry : strategies.entrySet()) {
            try {
                allArticles.addAll(executeCrawl(entry.getValue()));
            } catch (CrawlerException e) {
                logger.error("爬取站点{}失败: {}", entry.getKey(), e.getMessage());
            }
        }
        logger.info("批量爬取完成，共获取{}篇文章", allArticles.size());
        return allArticles;
    }
    /**
     * Runs one strategy, passes its articles to the repository and appends a
     * history entry describing the outcome.
     *
     * @param strategy the strategy to run
     * @return the articles returned by the strategy
     * @throws CrawlerException the {@link ParseException} raised by the strategy,
     *                          rethrown after the failure has been recorded
     */
    private List<Article> executeCrawl(CrawlStrategy strategy) throws CrawlerException {
        String siteName = strategy.getSiteName();
        String siteUrl = strategy.getSiteUrl();
        try {
            logger.info("开始爬取: {} - {}", siteName, siteUrl);
            List<Article> articles = strategy.crawl();
            repository.addAll(articles);
            crawlHistories.add(new CrawlHistory(siteName, siteUrl, articles.size(), true, null));
            logger.info("爬取{}成功，获取{}篇文章", siteName, articles.size());
            return articles;
        } catch (ParseException e) {
            crawlHistories.add(new CrawlHistory(siteName, siteUrl, 0, false, e.getMessage()));
            logger.error("爬取{}失败: {}", siteName, e.getMessage());
            throw e;
        }
    }
    /**
     * @return the repository instance used by this service (not a copy)
     */
    public ArticleRepository getRepository() {
        return repository;
    }
    /**
     * @return a new list holding the recorded crawl history entries
     */
    public List<CrawlHistory> getCrawlHistories() {
        return new ArrayList<>(crawlHistories);
    }
    /**
     * @return all articles as returned by the repository's {@code getAll()}
     */
    public List<Article> getAllArticles() {
        return repository.getAll();
    }
    /**
     * @param source text to look for in the article source
     * @return the result of the repository's {@code findBySource(source)}
     */
    public List<Article> getArticlesBySource(String source) {
        return repository.findBySource(source);
    }
    /**
     * @param id the ID of the article to remove
     * @return the result of the repository's {@code removeById(id)}
     */
    public boolean removeArticle(String id) {
        return repository.removeById(id);
    }
    /**
     * Clears the repository.
     */
    public void clearAllArticles() {
        repository.clear();
        logger.info("已清空所有文章");
    }
    /**
     * Counts the stored articles per source and adds an overall total under
     * the key {@code "总计"}.
     *
     * @return a mutable map from source name to article count; articles
     *         without a source are counted under {@code "未知来源"}
     */
    public Map<String, Long> getStatistics() {
        // groupingBy rejects null keys and does not promise a mutable result,
        // so map null sources to a label and request a concrete map type.
        Map<String, Long> stats = repository.getAll().stream()
                .collect(Collectors.groupingBy(
                        CrawlerService::sourceLabel,
                        java.util.LinkedHashMap::new,
                        Collectors.counting()));
        stats.put("总计", (long) repository.size());
        return stats;
    }
    /** Returns the article's source, or the unknown-source label when it has none. */
    private static String sourceLabel(Article article) {
        String source = article.getSource();
        return source != null ? source : UNKNOWN_SOURCE;
    }
    /**
     * Discards all recorded crawl history entries.
     */
    public void clearHistory() {
        crawlHistories.clear();
        logger.info("已清空爬取历史");
    }
 }
--- a/project/资讯爬虫/src/main/java/com/newscrawler/strategy/AbstractBaseStrategy.java
+++ b/project/资讯爬虫/src/main/java/com/newscrawler/strategy/AbstractBaseStrategy.java
@ -0,0 +1,83 @@
 package com.newscrawler.strategy;
 import com.newscrawler.entity.Article;
 import com.newscrawler.exception.NetworkException;
 import com.newscrawler.exception.ParseException;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 /**
  * Base class for crawl strategies: downloads the site page with retries and
  * delegates the HTML-to-article conversion to {@link #parseHtml(String)}.
  */
 public abstract class AbstractBaseStrategy implements CrawlStrategy {
    protected static final Logger logger = LoggerFactory.getLogger(AbstractBaseStrategy.class);
    protected static final int MAX_RETRIES = 3;
    protected static final int RETRY_DELAY_MS = 2000;
    protected static final int TIMEOUT_MS = 30000;
    /**
     * Fetches the site page and parses it into articles.
     *
     * @return the articles produced by {@link #parseHtml(String)}
     * @throws ParseException if the page cannot be fetched, is empty, or cannot be parsed
     */
    @Override
    public List<Article> crawl() throws ParseException {
        logger.info("开始爬取站点: {}", getSiteName());
        final String pageHtml = fetchWithRetry();
        final boolean hasContent = pageHtml != null && !pageHtml.isEmpty();
        if (!hasContent) {
            throw new ParseException("获取HTML内容为空");
        }
        return parseHtml(pageHtml);
    }
    /**
     * Downloads the page at {@link #getSiteUrl()}, trying up to
     * {@link #MAX_RETRIES} times. After each failed attempt except the last,
     * the thread sleeps for {@link #RETRY_DELAY_MS} multiplied by the attempt number.
     *
     * @return the HTML of the fetched document
     * @throws ParseException if every attempt fails or the wait between attempts is interrupted
     */
    protected String fetchWithRetry() throws ParseException {
        Exception lastFailure = null;
        for (int attempt = 1; attempt <= MAX_RETRIES; attempt++) {
            try {
                logger.debug("第{}次尝试获取页面: {}", attempt, getSiteUrl());
                Document page = Jsoup.connect(getSiteUrl())
                        .timeout(TIMEOUT_MS)
                        .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
                        .get();
                return page.html();
            } catch (IOException e) {
                lastFailure = e;
                logger.warn("第{}次尝试失败: {}", attempt, e.getMessage());
                if (attempt >= MAX_RETRIES) {
                    // Final attempt: nothing left to wait for.
                    break;
                }
                try {
                    Thread.sleep(RETRY_DELAY_MS * attempt);
                } catch (InterruptedException ie) {
                    // Restore the interrupt flag before giving up.
                    Thread.currentThread().interrupt();
                    throw new ParseException("爬取被中断", ie);
                }
            }
        }
        NetworkException failure =
                new NetworkException("网络请求失败，已重试" + MAX_RETRIES + "次", lastFailure);
        logger.error("网络请求最终失败: {}", failure.getMessage());
        throw new ParseException("获取页面失败", failure);
    }
    /**
     * Converts the fetched HTML into articles.
     *
     * @param html the page HTML, never null or empty when called from {@link #crawl()}
     * @return the parsed articles
     * @throws ParseException if the HTML cannot be parsed
     */
    protected abstract List<Article> parseHtml(String html) throws ParseException;
    /**
     * Builds one article per title from parallel lists. A field whose list is
     * shorter than the title list is filled with an empty string.
     *
     * @param titles       article titles; its size decides how many articles are built
     * @param summaries    summaries, matched to titles by index
     * @param publishDates publish dates, matched to titles by index
     * @param articleUrls  article URLs, matched to titles by index
     * @param authors      authors, matched to titles by index
     * @return the built articles, each with its source set to {@link #getSiteName()}
     */
    protected List<Article> createArticles(List<String> titles, List<String> summaries,
                                           List<String> publishDates, List<String> articleUrls,
                                           List<String> authors) {
        final int total = titles.size();
        List<Article> built = new ArrayList<>();
        for (int idx = 0; idx < total; idx++) {
            Article entry = new Article();
            entry.setTitle(valueAt(titles, idx));
            entry.setSummary(valueAt(summaries, idx));
            entry.setPublishDate(valueAt(publishDates, idx));
            entry.setArticleUrl(valueAt(articleUrls, idx));
            entry.setAuthor(valueAt(authors, idx));
            entry.setSource(getSiteName());
            built.add(entry);
        }
        return built;
    }
    /** Returns the element at the index, or an empty string when the list is too short. */
    private static String valueAt(List<String> values, int index) {
        return index < values.size() ? values.get(index) : "";
    }
 }
--- a/project/资讯爬虫/src/main/java/com/newscrawler/strategy/CrawlStrategy.java
+++ b/project/资讯爬虫/src/main/java/com/newscrawler/strategy/CrawlStrategy.java
@ -0,0 +1,14 @@
 package com.newscrawler.strategy;
 import com.newscrawler.entity.Article;
 import com.newscrawler.exception.ParseException;
 import java.util.List;
 /**
  * Contract for a site-specific crawler.
  */
 public interface CrawlStrategy {
    /**
     * @return the display name of the site this strategy crawls
     */
    String getSiteName();
    /**
     * @return the URL of the site this strategy crawls
     */
    String getSiteUrl();
    /**
     * Crawls the site and returns the articles found.
     *
     * @return the crawled articles
     * @throws ParseException if the crawl fails
     */
    List<Article> crawl() throws ParseException;
 }
--- a/project/资讯爬虫/src/main/java/com/newscrawler/strategy/RunoobStrategy.java
+++ b/project/资讯爬虫/src/main/java/com/newscrawler/strategy/RunoobStrategy.java
@ -0,0 +1,93 @@
 package com.newscrawler.strategy;
 import com.newscrawler.entity.Article;
 import com.newscrawler.exception.ParseException;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
 import java.util.ArrayList;
 import java.util.List;
 /**
  * Crawl strategy for the Runoob home page. Parsing runs up to three passes,
  * each used only when the previous one found nothing: structured list items,
  * then article/tutorial/course links, then any plausible link.
  */
 public class RunoobStrategy extends AbstractBaseStrategy {
    private static final String SITE_NAME = "菜鸟教程资讯";
    private static final String SITE_URL = "https://www.runoob.com/";
    @Override
    public String getSiteName() {
        return SITE_NAME;
    }
    @Override
    public String getSiteUrl() {
        return SITE_URL;
    }
    /**
     * Parses the page HTML into articles.
     *
     * @param html the page HTML
     * @return the articles found; may be empty
     * @throws ParseException if any exception occurs while parsing
     */
    @Override
    protected List<Article> parseHtml(String html) throws ParseException {
        try {
            // Parse with a base URI so relative hrefs can be resolved to
            // absolute URLs, matching what the other strategies store.
            org.jsoup.nodes.Document doc = Jsoup.parse(html, SITE_URL);
            List<Article> articles = parseListItems(doc);
            if (articles.isEmpty()) {
                articles = parseContentLinks(doc);
            }
            if (articles.isEmpty()) {
                articles = parseAnyLinks(doc);
            }
            logger.info("菜鸟教程解析到{}条资讯", articles.size());
            return articles;
        } catch (Exception e) {
            throw new ParseException("解析菜鸟教程页面失败", e);
        }
    }
    /** First pass: reads title, summary, date and link from list-style containers. */
    private List<Article> parseListItems(org.jsoup.nodes.Document doc) {
        List<Article> articles = new ArrayList<>();
        Elements newsItems = doc.select(".article-list .item, .list-group-item, .news-item, .content li, article");
        for (Element item : newsItems) {
            Element titleElem = item.selectFirst("h3, h4, h2, .title, .news-title, a[href]");
            String title = titleElem != null ? titleElem.text().trim() : "";
            if (title.isEmpty()) {
                // Items without a title are not kept.
                continue;
            }
            Article article = new Article();
            article.setSource(SITE_NAME);
            article.setTitle(title);
            Element summaryElem = item.selectFirst(".desc, .summary, .news-desc, p");
            article.setSummary(summaryElem != null ? summaryElem.text().trim() : "");
            Element dateElem = item.selectFirst(".date, time, .time, span");
            article.setPublishDate(dateElem != null ? dateElem.text().trim() : "");
            article.setArticleUrl(resolveHref(item.selectFirst("a[href]")));
            articles.add(article);
        }
        return articles;
    }
    /** Second pass: collects links whose href points at article, tutorial or course paths. */
    private List<Article> parseContentLinks(org.jsoup.nodes.Document doc) {
        List<Article> articles = new ArrayList<>();
        Elements links = doc.select("a[href*='/article/'], a[href*='/tutorial/'], a[href*='/course/']");
        for (Element link : links) {
            String title = link.text().trim();
            if (title.length() > 5 && !title.contains("首页") && !title.contains("教程") && !title.contains("学习")) {
                articles.add(linkArticle(title, link));
            }
        }
        return articles;
    }
    /** Third pass: collects any link with a 6-99 character text and a path-like href. */
    private List<Article> parseAnyLinks(org.jsoup.nodes.Document doc) {
        List<Article> articles = new ArrayList<>();
        for (Element link : doc.select("a")) {
            String title = link.text().trim();
            if (title.length() <= 5 || title.length() >= 100) {
                continue;
            }
            // Filter on the raw attribute: anchors and script links are skipped.
            String rawHref = link.attr("href");
            if (rawHref.contains("/") && !rawHref.startsWith("#") && !rawHref.contains("javascript")) {
                articles.add(linkArticle(title, link));
            }
        }
        return articles;
    }
    /** Builds an article carrying only source, title and the link's resolved URL. */
    private static Article linkArticle(String title, Element link) {
        Article article = new Article();
        article.setSource(SITE_NAME);
        article.setTitle(title);
        article.setArticleUrl(resolveHref(link));
        return article;
    }
    /**
     * Returns the link's href as an absolute URL. Falls back to the raw
     * attribute when it cannot be resolved, and to an empty string when the
     * element is null or has no href.
     */
    private static String resolveHref(Element link) {
        if (link == null || !link.hasAttr("href")) {
            return "";
        }
        String absolute = link.absUrl("href");
        return absolute.isEmpty() ? link.attr("href") : absolute;
    }
 }
--- a/project/资讯爬虫/src/main/java/com/newscrawler/strategy/SohuStrategy.java
+++ b/project/资讯爬虫/src/main/java/com/newscrawler/strategy/SohuStrategy.java
@ -0,0 +1,90 @@
 package com.newscrawler.strategy;
 import com.newscrawler.entity.Article;
 import com.newscrawler.exception.ParseException;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
 import java.util.ArrayList;
 import java.util.List;
 /**
  * Crawl strategy for the Sohu news home page. The primary pass collects up to
  * 20 headline links whose href contains "/a/"; if it finds nothing, a fallback
  * pass reads the first link of known news-list items.
  */
 public class SohuStrategy extends AbstractBaseStrategy {
    private static final String SITE_NAME = "搜狐资讯";
    private static final String SITE_URL = "https://news.sohu.com/";
    /** Origin prepended to root-relative links. */
    private static final String SITE_ORIGIN = "https://news.sohu.com";
    /** Upper bound on articles collected by the primary pass. */
    private static final int MAX_ARTICLES = 20;
    @Override
    public String getSiteName() {
        return SITE_NAME;
    }
    @Override
    public String getSiteUrl() {
        return SITE_URL;
    }
    /**
     * Parses the page HTML into articles.
     *
     * @param html the page HTML
     * @return the articles found; may be empty
     * @throws ParseException if any exception occurs while parsing
     */
    @Override
    protected List<Article> parseHtml(String html) throws ParseException {
        List<Article> articles = new ArrayList<>();
        try {
            org.jsoup.nodes.Document doc = Jsoup.parse(html);
            for (Element link : doc.select("a")) {
                String title = link.text().trim();
                String href = link.attr("href");
                if (isHeadline(title) && href.contains("/a/")) {
                    articles.add(buildArticle(title, href));
                    if (articles.size() >= MAX_ARTICLES) {
                        break;
                    }
                }
            }
            if (articles.isEmpty()) {
                Elements mainNews = doc.select(".focus-news-list li, .main-news li, .listCon li");
                for (Element item : mainNews) {
                    Element titleLink = item.selectFirst("a");
                    if (titleLink == null) {
                        continue;
                    }
                    String title = titleLink.text().trim();
                    if (title.length() > 5) {
                        articles.add(buildArticle(title, titleLink.attr("href")));
                    }
                }
            }
            logger.info("搜狐资讯解析到{}条资讯", articles.size());
        } catch (Exception e) {
            throw new ParseException("解析搜狐资讯页面失败", e);
        }
        return articles;
    }
    /** Accepts link texts of 9-79 characters that contain none of the navigation words. */
    private static boolean isHeadline(String title) {
        return title.length() > 8 && title.length() < 80 &&
                !title.contains("登录") && !title.contains("注册") &&
                !title.contains("评论") && !title.contains("分享") &&
                !title.contains("更多") && !title.contains("首页");
    }
    /** Builds an article with source, title, the fixed author and a normalized URL. */
    private static Article buildArticle(String title, String href) {
        Article article = new Article();
        article.setSource(SITE_NAME);
        article.setTitle(title);
        article.setAuthor("搜狐");
        article.setArticleUrl(toAbsoluteUrl(href));
        return article;
    }
    /**
     * Turns protocol-relative and root-relative hrefs into absolute URLs.
     * The "//" case must be tested first: such a link also starts with "/",
     * and prefixing it with the site origin would produce a broken URL.
     */
    private static String toAbsoluteUrl(String href) {
        if (href.startsWith("//")) {
            return "https:" + href;
        }
        if (href.startsWith("/")) {
            return SITE_ORIGIN + href;
        }
        return href;
    }
 }
--- a/project/资讯爬虫/src/main/java/com/newscrawler/strategy/StrategyFactory.java
+++ b/project/资讯爬虫/src/main/java/com/newscrawler/strategy/StrategyFactory.java
@ -0,0 +1,39 @@
 package com.newscrawler.strategy;
 import java.util.HashMap;
 import java.util.Map;
 /**
  * Registry of the available crawl strategies, keyed by a lowercase site key.
  */
 public class StrategyFactory {
    private static final Map<String, CrawlStrategy> STRATEGY_MAP = new HashMap<>();
    static {
        STRATEGY_MAP.put("runoob", new RunoobStrategy());
        STRATEGY_MAP.put("youth", new YouthStrategy());
        STRATEGY_MAP.put("sohu", new SohuStrategy());
    }
    /**
     * Looks up a strategy by site key, ignoring case and surrounding whitespace.
     *
     * @param siteKey the site key, e.g. "sohu"
     * @return the registered strategy
     * @throws IllegalArgumentException if the key is null or not registered
     */
    public static CrawlStrategy getStrategy(String siteKey) {
        CrawlStrategy strategy = null;
        if (siteKey != null) {
            // Locale.ROOT keeps the lowercasing independent of the default
            // locale (e.g. Turkish dotless i), so keys always match.
            strategy = STRATEGY_MAP.get(siteKey.trim().toLowerCase(java.util.Locale.ROOT));
        }
        if (strategy == null) {
            throw new IllegalArgumentException("不支持的站点: " + siteKey);
        }
        return strategy;
    }
    /**
     * Finds a strategy whose site name contains the given text.
     *
     * @param siteName text to look for in the strategies' site names
     * @return a strategy whose site name contains the text
     * @throws IllegalArgumentException if the text is null, blank, or matches no strategy
     */
    public static CrawlStrategy getStrategyBySiteName(String siteName) {
        // A blank query would match every site name, because every string
        // contains "", so it is rejected like an unknown name.
        if (siteName != null && !siteName.trim().isEmpty()) {
            for (CrawlStrategy strategy : STRATEGY_MAP.values()) {
                if (strategy.getSiteName().contains(siteName)) {
                    return strategy;
                }
            }
        }
        throw new IllegalArgumentException("未找到站点对应的策略: " + siteName);
    }
    /**
     * @return a new map holding all registered strategies by site key
     */
    public static Map<String, CrawlStrategy> getAllStrategies() {
        return new HashMap<>(STRATEGY_MAP);
    }
    /**
     * @return the registered site keys, in no particular order
     */
    public static String[] getSiteKeys() {
        return STRATEGY_MAP.keySet().toArray(new String[0]);
    }
 }
--- a/project/资讯爬虫/src/main/java/com/newscrawler/strategy/YouthStrategy.java
+++ b/project/资讯爬虫/src/main/java/com/newscrawler/strategy/YouthStrategy.java
@ -0,0 +1,70 @@
 package com.newscrawler.strategy;
 import com.newscrawler.entity.Article;
 import com.newscrawler.exception.ParseException;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
 import java.util.ArrayList;
 import java.util.List;
 /**
  * Crawl strategy for the Zhihu Daily home page. Collects story links and
  * keeps at most the first 20.
  */
 public class YouthStrategy extends AbstractBaseStrategy {
    private static final String SITE_NAME = "知乎日报";
    private static final String SITE_URL = "https://daily.zhihu.com/";
    /** Origin prepended to root-relative links. */
    private static final String SITE_ORIGIN = "https://daily.zhihu.com";
    /** Path segment that marks a link as a story page. */
    private static final String STORY_PATH = "/story/";
    /** Upper bound on the number of articles returned. */
    private static final int MAX_ARTICLES = 20;
    @Override
    public String getSiteName() {
        return SITE_NAME;
    }
    @Override
    public String getSiteUrl() {
        return SITE_URL;
    }
    /**
     * Parses the page HTML into articles.
     *
     * @param html the page HTML
     * @return at most 20 articles, in page order
     * @throws ParseException if any exception occurs while parsing
     */
    @Override
    protected List<Article> parseHtml(String html) throws ParseException {
        List<Article> articles = new ArrayList<>();
        try {
            logger.info("开始解析知乎日报页面，HTML长度: {}", html.length());
            org.jsoup.nodes.Document doc = Jsoup.parse(html);
            Elements allLinks = doc.select("a");
            logger.info("页面共有 {} 个链接", allLinks.size());
            for (Element link : allLinks) {
                String title = link.text().trim();
                String href = link.attr("href");
                // Only story links are articles. Without this check page
                // anchors and app-download links are stored as articles too.
                boolean plausibleTitle = title.length() >= 4 && title.length() <= 50;
                if (plausibleTitle && href.contains(STORY_PATH)) {
                    Article article = new Article();
                    article.setSource(SITE_NAME);
                    article.setTitle(title);
                    article.setAuthor("知乎日报");
                    article.setArticleUrl(toAbsoluteUrl(href));
                    articles.add(article);
                }
            }
            logger.info("初步解析到{}条资讯", articles.size());
            if (articles.size() > MAX_ARTICLES) {
                // Copy the slice: subList alone is a view backed by the full list.
                articles = new ArrayList<>(articles.subList(0, MAX_ARTICLES));
            }
            logger.info("知乎日报最终解析到{}条资讯", articles.size());
        } catch (Exception e) {
            logger.error("解析异常: {}", e.getMessage());
            throw new ParseException("解析知乎日报页面失败", e);
        }
        return articles;
    }
    /** Turns protocol-relative and root-relative hrefs into absolute URLs. */
    private static String toAbsoluteUrl(String href) {
        if (href.startsWith("//")) {
            return "https:" + href;
        }
        if (href.startsWith("/")) {
            return SITE_ORIGIN + href;
        }
        return href;
    }
 }
--- a/project/资讯爬虫/src/main/java/com/newscrawler/util/JsonUtil.java
+++ b/project/资讯爬虫/src/main/java/com/newscrawler/util/JsonUtil.java
@ -0,0 +1,134 @@
 package com.newscrawler.util;
 import com.google.gson.Gson;
 import com.google.gson.GsonBuilder;
 import com.google.gson.reflect.TypeToken;
 import com.newscrawler.entity.Article;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import java.io.*;
 import java.lang.reflect.Type;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 public class JsonUtil {
    private static final Logger logger = LoggerFactory.getLogger(JsonUtil.class);
    private static final Gson GSON = new GsonBuilder()
            .setPrettyPrinting()
            .setDateFormat("yyyy-MM-dd'T'HH:mm:ss")
            .create();
    private static final String DATA_DIR = "data";
    static {
        try {
            Files.createDirectories(Paths.get(DATA_DIR));
        } catch (IOException e) {
            logger.warn("创建数据目录失败: {}", e.getMessage());
        }
    }
    public static void exportToJson(List<Article> articles, String filename) throws IOException {
        if (articles == null) {
            throw new IllegalArgumentException("文章列表不能为空");
        }
        if (filename == null || filename.trim().isEmpty()) {
            throw new IllegalArgumentException("文件名不能为空");
        }
        String fullPath = getFullPath(filename);
        try (Writer writer = new BufferedWriter(
                new OutputStreamWriter(
                        new FileOutputStream(fullPath), StandardCharsets.UTF_8))) {
            GSON.toJson(articles, writer);
            logger.info("成功导出{}篇文章到{}", articles.size(), fullPath);
        } catch (IOException e) {
            logger.error("导出JSON失败: {}", e.getMessage());
            throw e;
        }
    }
    public static List<Article> importFromJson(String filename) throws IOException {
        if (filename == null || filename.trim().isEmpty()) {
            throw new IllegalArgumentException("文件名不能为空");
        }
        String fullPath = getFullPath(filename);
        Path path = Paths.get(fullPath);
        if (!Files.exists(path)) {
            throw new FileNotFoundException("文件不存在: " + fullPath);
        }
        List<Article> importedArticles;
        try (Reader reader = new BufferedReader(
                new InputStreamReader(
                        new FileInputStream(fullPath), StandardCharsets.UTF_8))) {
            Type listType = new TypeToken<ArrayList<Article>>() {}.getType();
            importedArticles = GSON.fromJson(reader, listType);
        } catch (IOException e) {
            logger.error("导入JSON失败: {}", e.getMessage());
            throw e;
        }
        if (importedArticles == null) {
            importedArticles = new ArrayList<>();
        }
        List<Article> deduplicatedArticles = deduplicate(importedArticles);
        logger.info("从{}导入{}篇文章，去重后保留{}篇",
                fullPath, importedArticles.size(), deduplicatedArticles.size());
        return deduplicatedArticles;
    }
    private static List<Article> deduplicate(List<Article> articles) {
        Set<String> seen = new HashSet<>();
        List<Article> deduplicated = new ArrayList<>();
        for (Article article : articles) {
            String key = generateDeduplicateKey(article);
            if (!seen.contains(key)) {
                seen.add(key);
                deduplicated.add(article);
            } else {
                logger.debug("去重重复文章: {}", article.getTitle());
            }
        }
        return deduplicated;
    }
    private static String generateDeduplicateKey(Article article) {
        return (article.getTitle() != null ? article.getTitle() : "") + "|" +
                (article.getSource() != null ? article.getSource() : "") + "|" +
                (article.getPublishDate() != null ? article.getPublishDate() : "");
    }
    private static String getFullPath(String filename) {
        if (filename.endsWith(".json")) {
            return DATA_DIR + File.separator + filename;
        }
        return DATA_DIR + File.separator + filename + ".json";
    }
    public static void exportHistoriesToJson(List<?> histories, String filename) throws IOException {
        if (histories == null) {
            throw new IllegalArgumentException("历史记录列表不能为空");
        }
        String fullPath = DATA_DIR + File.separator + filename + "_history.json";
        try (Writer writer = new BufferedWriter(
                new OutputStreamWriter(
                        new FileOutputStream(fullPath), StandardCharsets.UTF_8))) {
            GSON.toJson(histories, writer);
            logger.info("成功导出{}条历史记录到{}", histories.size(), fullPath);
        }
    }
 }
--- a/project/资讯爬虫/src/main/resources/logback.xml
+++ b/project/资讯爬虫/src/main/resources/logback.xml
@ -0,0 +1,37 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <configuration>
    <!-- Shared layout: timestamp, thread, level, logger name (abbreviated to 36 chars), message. -->
    <property name="LOG_PATTERN" value="%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{36} - %msg%n"/>
    <!-- Console output, encoded as GBK. -->
    <!-- NOTE(review): GBK presumably targets a Chinese-locale Windows console; on a UTF-8 terminal non-ASCII text would be garbled. Confirm the intended environment. -->
    <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
        <encoder>
            <pattern>${LOG_PATTERN}</pattern>
            <charset>GBK</charset>
        </encoder>
    </appender>
    <!-- File output in UTF-8 to logs/crawler.log, rolled over by date with maxHistory set to 30. -->
    <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
        <file>logs/crawler.log</file>
        <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
            <fileNamePattern>logs/crawler-%d{yyyy-MM-dd}.log</fileNamePattern>
            <maxHistory>30</maxHistory>
        </rollingPolicy>
        <encoder>
            <pattern>${LOG_PATTERN}</pattern>
            <charset>UTF-8</charset>
        </encoder>
    </appender>
    <!-- Application loggers: DEBUG and above to console and file. additivity="false" keeps events from also reaching the root appenders. -->
    <logger name="com.newscrawler" level="DEBUG" additivity="false">
        <appender-ref ref="CONSOLE"/>
        <appender-ref ref="FILE"/>
    </logger>
    <!-- jsoup loggers: WARN and above, console only. -->
    <logger name="org.jsoup" level="WARN" additivity="false">
        <appender-ref ref="CONSOLE"/>
    </logger>
    <!-- Everything else: INFO and above to console and file. -->
    <root level="INFO">
        <appender-ref ref="CONSOLE"/>
        <appender-ref ref="FILE"/>
    </root>
 </configuration>
--- a/project/资讯爬虫/target/classes/com/newscrawler/Main.class
+++ b/project/资讯爬虫/target/classes/com/newscrawler/Main.class
--- a/project/资讯爬虫/target/classes/com/newscrawler/command/MenuCommand.class
+++ b/project/资讯爬虫/target/classes/com/newscrawler/command/MenuCommand.class
--- a/project/资讯爬虫/target/classes/com/newscrawler/entity/Article.class
+++ b/project/资讯爬虫/target/classes/com/newscrawler/entity/Article.class
--- a/project/资讯爬虫/target/classes/com/newscrawler/entity/CrawlHistory.class
+++ b/project/资讯爬虫/target/classes/com/newscrawler/entity/CrawlHistory.class
--- a/project/资讯爬虫/target/classes/com/newscrawler/exception/CrawlerException.class
+++ b/project/资讯爬虫/target/classes/com/newscrawler/exception/CrawlerException.class
--- a/project/资讯爬虫/target/classes/com/newscrawler/exception/NetworkException.class
+++ b/project/资讯爬虫/target/classes/com/newscrawler/exception/NetworkException.class
--- a/project/资讯爬虫/target/classes/com/newscrawler/exception/ParseException.class
+++ b/project/资讯爬虫/target/classes/com/newscrawler/exception/ParseException.class
--- a/project/资讯爬虫/target/classes/com/newscrawler/repository/ArticleRepository.class
+++ b/project/资讯爬虫/target/classes/com/newscrawler/repository/ArticleRepository.class
--- a/project/资讯爬虫/target/classes/com/newscrawler/service/CrawlerService.class
+++ b/project/资讯爬虫/target/classes/com/newscrawler/service/CrawlerService.class
--- a/project/资讯爬虫/target/classes/com/newscrawler/strategy/AbstractBaseStrategy.class
+++ b/project/资讯爬虫/target/classes/com/newscrawler/strategy/AbstractBaseStrategy.class
--- a/project/资讯爬虫/target/classes/com/newscrawler/strategy/CrawlStrategy.class
+++ b/project/资讯爬虫/target/classes/com/newscrawler/strategy/CrawlStrategy.class
--- a/project/资讯爬虫/target/classes/com/newscrawler/strategy/RunoobStrategy.class
+++ b/project/资讯爬虫/target/classes/com/newscrawler/strategy/RunoobStrategy.class
--- a/project/资讯爬虫/target/classes/com/newscrawler/strategy/SohuStrategy.class
+++ b/project/资讯爬虫/target/classes/com/newscrawler/strategy/SohuStrategy.class
--- a/project/资讯爬虫/target/classes/com/newscrawler/strategy/StrategyFactory.class
+++ b/project/资讯爬虫/target/classes/com/newscrawler/strategy/StrategyFactory.class
--- a/project/资讯爬虫/target/classes/com/newscrawler/strategy/YouthStrategy.class
+++ b/project/资讯爬虫/target/classes/com/newscrawler/strategy/YouthStrategy.class
--- a/project/资讯爬虫/target/classes/com/newscrawler/util/JsonUtil$1.class
+++ b/project/资讯爬虫/target/classes/com/newscrawler/util/JsonUtil$1.class
--- a/project/资讯爬虫/target/classes/com/newscrawler/util/JsonUtil.class
+++ b/project/资讯爬虫/target/classes/com/newscrawler/util/JsonUtil.class
--- a/project/资讯爬虫/target/classes/logback.xml
+++ b/project/资讯爬虫/target/classes/logback.xml
@ -0,0 +1,37 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <configuration>
    <property name="LOG_PATTERN" value="%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{36} - %msg%n"/>
    <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
        <encoder>
            <pattern>${LOG_PATTERN}</pattern>
            <charset>GBK</charset>
        </encoder>
    </appender>
    <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
        <file>logs/crawler.log</file>
        <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
            <fileNamePattern>logs/crawler-%d{yyyy-MM-dd}.log</fileNamePattern>
            <maxHistory>30</maxHistory>
        </rollingPolicy>
        <encoder>
            <pattern>${LOG_PATTERN}</pattern>
            <charset>UTF-8</charset>
        </encoder>
    </appender>
    <logger name="com.newscrawler" level="DEBUG" additivity="false">
        <appender-ref ref="CONSOLE"/>
        <appender-ref ref="FILE"/>
    </logger>
    <logger name="org.jsoup" level="WARN" additivity="false">
        <appender-ref ref="CONSOLE"/>
    </logger>
    <root level="INFO">
        <appender-ref ref="CONSOLE"/>
        <appender-ref ref="FILE"/>
    </root>
 </configuration>
--- a/project/资讯爬虫/target/dependency/gson-2.10.1.jar
+++ b/project/资讯爬虫/target/dependency/gson-2.10.1.jar
--- a/project/资讯爬虫/target/dependency/jsoup-1.17.2.jar
+++ b/project/资讯爬虫/target/dependency/jsoup-1.17.2.jar
--- a/project/资讯爬虫/target/dependency/logback-classic-1.4.14.jar
+++ b/project/资讯爬虫/target/dependency/logback-classic-1.4.14.jar
--- a/project/资讯爬虫/target/dependency/logback-core-1.4.14.jar
+++ b/project/资讯爬虫/target/dependency/logback-core-1.4.14.jar
--- a/project/资讯爬虫/target/dependency/slf4j-api-2.0.11.jar
+++ b/project/资讯爬虫/target/dependency/slf4j-api-2.0.11.jar
--- a/project/资讯爬虫/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst
+++ b/project/资讯爬虫/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst
@ -0,0 +1,17 @@
 com\newscrawler\util\JsonUtil$1.class
 com\newscrawler\entity\CrawlHistory.class
 com\newscrawler\exception\ParseException.class
 com\newscrawler\entity\Article.class
 com\newscrawler\repository\ArticleRepository.class
 com\newscrawler\strategy\AbstractBaseStrategy.class
 com\newscrawler\strategy\StrategyFactory.class
 com\newscrawler\command\MenuCommand.class
 com\newscrawler\service\CrawlerService.class
 com\newscrawler\strategy\RunoobStrategy.class
 com\newscrawler\exception\CrawlerException.class
 com\newscrawler\strategy\CrawlStrategy.class
 com\newscrawler\strategy\SohuStrategy.class
 com\newscrawler\exception\NetworkException.class
 com\newscrawler\util\JsonUtil.class
 com\newscrawler\strategy\YouthStrategy.class
 com\newscrawler\Main.class
--- a/project/资讯爬虫/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst
+++ b/project/资讯爬虫/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst
@ -0,0 +1,16 @@
 D:\桌面\资讯爬虫\src\main\java\com\newscrawler\exception\NetworkException.java
 D:\桌面\资讯爬虫\src\main\java\com\newscrawler\util\JsonUtil.java
 D:\桌面\资讯爬虫\src\main\java\com\newscrawler\exception\ParseException.java
 D:\桌面\资讯爬虫\src\main\java\com\newscrawler\strategy\YouthStrategy.java
 D:\桌面\资讯爬虫\src\main\java\com\newscrawler\entity\CrawlHistory.java
 D:\桌面\资讯爬虫\src\main\java\com\newscrawler\service\CrawlerService.java
 D:\桌面\资讯爬虫\src\main\java\com\newscrawler\strategy\RunoobStrategy.java
 D:\桌面\资讯爬虫\src\main\java\com\newscrawler\exception\CrawlerException.java
 D:\桌面\资讯爬虫\src\main\java\com\newscrawler\repository\ArticleRepository.java
 D:\桌面\资讯爬虫\src\main\java\com\newscrawler\command\MenuCommand.java
 D:\桌面\资讯爬虫\src\main\java\com\newscrawler\strategy\StrategyFactory.java
 D:\桌面\资讯爬虫\src\main\java\com\newscrawler\entity\Article.java
 D:\桌面\资讯爬虫\src\main\java\com\newscrawler\Main.java
 D:\桌面\资讯爬虫\src\main\java\com\newscrawler\strategy\SohuStrategy.java
 D:\桌面\资讯爬虫\src\main\java\com\newscrawler\strategy\CrawlStrategy.java
 D:\桌面\资讯爬虫\src\main\java\com\newscrawler\strategy\AbstractBaseStrategy.java