上传文件至 'project'

3 weeks ago · c7855dd75f
5 changed files with 242 additions and 0 deletions
--- a/project/Article.java
+++ b/project/Article.java
@ -0,0 +1,52 @@
+package com.example.datacollect.model;
+
+import java.io.Serializable;
+
+/**
+ * Serializable value holder for a single article: its title, the URL it was
+ * taken from, and its text content. All three fields are mutable and may be
+ * {@code null}.
+ */
+public class Article implements Serializable {
+    private static final long serialVersionUID = 1L;
+
+    private String title;
+    private String url;
+    private String content;
+
+    /** Creates an article whose title, URL and content are all {@code null}. */
+    public Article() {
+        this(null, null, null);
+    }
+
+    /**
+     * Creates a fully populated article.
+     *
+     * @param title   the article title
+     * @param url     the URL associated with the article
+     * @param content the article text
+     */
+    public Article(String title, String url, String content) {
+        this.title = title;
+        this.url = url;
+        this.content = content;
+    }
+
+    /** @return the article title, possibly {@code null} */
+    public String getTitle() {
+        return this.title;
+    }
+
+    /** @param title the new article title */
+    public void setTitle(String title) {
+        this.title = title;
+    }
+
+    /** @return the URL associated with the article, possibly {@code null} */
+    public String getUrl() {
+        return this.url;
+    }
+
+    /** @param url the new URL */
+    public void setUrl(String url) {
+        this.url = url;
+    }
+
+    /** @return the article text, possibly {@code null} */
+    public String getContent() {
+        return this.content;
+    }
+
+    /** @param content the new article text */
+    public void setContent(String content) {
+        this.content = content;
+    }
+
+    /**
+     * Returns a short description containing only the title and the URL;
+     * the content is left out of the output.
+     */
+    @Override
+    public String toString() {
+        return String.format("Article{title='%s', url='%s'}", title, url);
+    }
+}
--- a/project/ArticleRepository.java
+++ b/project/ArticleRepository.java
@ -0,0 +1,111 @@
+package com.example.datacollect.repository;
+
+import com.example.datacollect.exception.CrawlerException;
+import com.example.datacollect.exception.ErrorCode;
+import com.example.datacollect.model.Article;
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * In-memory list of {@link Article}s that is mirrored to a serialized file
+ * ({@value #STORAGE_FILE}, a relative path) after every mutation and reloaded
+ * from that file on construction.
+ */
+public class ArticleRepository {
+    private static final String STORAGE_FILE = "articles.dat";
+
+    private final List<Article> articles = new ArrayList<>();
+
+    /**
+     * Creates the repository and loads any previously saved articles.
+     *
+     * @throws CrawlerException with {@code FILE_IO_ERROR} if the storage file
+     *                          exists but cannot be read or decoded
+     */
+    public ArticleRepository() {
+        loadFromFile();
+    }
+
+    /**
+     * Appends one article and persists the whole list.
+     *
+     * @param article the article to store; must not be {@code null}
+     * @throws IllegalArgumentException if {@code article} is {@code null}
+     * @throws CrawlerException         if the storage file cannot be written
+     */
+    public void add(Article article) {
+        if (article == null) {
+            throw new IllegalArgumentException("Article cannot be null");
+        }
+        articles.add(article);
+        saveToFile();
+    }
+
+    /**
+     * Appends every article of the given list and persists the result once.
+     * The batch is validated up front, so a {@code null} element rejects the
+     * whole call without storing any of the other elements.
+     *
+     * @param articleList the articles to store; neither the list nor any
+     *                    element may be {@code null}
+     * @throws IllegalArgumentException if the list or one of its elements is {@code null}
+     * @throws CrawlerException         if the storage file cannot be written
+     */
+    public void addAll(List<Article> articleList) {
+        if (articleList == null) {
+            throw new IllegalArgumentException("列表不能为 null");
+        }
+        // Snapshot first: the argument may be a live view of this repository's own list.
+        List<Article> batch = new ArrayList<>(articleList);
+        for (Article article : batch) {
+            if (article == null) {
+                throw new IllegalArgumentException("Article cannot be null");
+            }
+        }
+        if (batch.isEmpty()) {
+            return;
+        }
+        articles.addAll(batch);
+        // One write for the whole batch instead of rewriting the file per element.
+        saveToFile();
+    }
+
+    /**
+     * @return a read-only view of the stored articles; the view reflects
+     *         later changes made through this repository
+     */
+    public List<Article> getAll() {
+        return Collections.unmodifiableList(articles);
+    }
+
+    /** @return the number of stored articles */
+    public int size() {
+        return articles.size();
+    }
+
+    /**
+     * Removes all articles and persists the now-empty list.
+     *
+     * @throws CrawlerException if the storage file cannot be written
+     */
+    public void clear() {
+        articles.clear();
+        saveToFile();
+    }
+
+    /** Serializes a copy of the current list to the storage file, replacing its content. */
+    private void saveToFile() {
+        try (ObjectOutputStream oos = new ObjectOutputStream(
+                new FileOutputStream(STORAGE_FILE))) {
+            oos.writeObject(new ArrayList<>(articles));
+        } catch (IOException e) {
+            throw new CrawlerException(ErrorCode.FILE_IO_ERROR, "保存数据到文件失败", e);
+        }
+    }
+
+    /**
+     * Reads the storage file, if it exists, and appends its articles to the
+     * in-memory list. The decoded object is type-checked element by element,
+     * so a file that does not hold a list of articles is reported as a
+     * {@link CrawlerException} instead of a late {@code ClassCastException}.
+     */
+    private void loadFromFile() {
+        Path path = Paths.get(STORAGE_FILE);
+        if (!Files.exists(path)) {
+            return;
+        }
+        // NOTE(review): this is native Java deserialization. It is only safe while the
+        // storage file is written exclusively by this class; if the file can come from
+        // an untrusted source, add an ObjectInputFilter or switch to a data-only format.
+        try (ObjectInputStream ois = new ObjectInputStream(Files.newInputStream(path))) {
+            Object stored = ois.readObject();
+            if (!(stored instanceof List)) {
+                throw new InvalidObjectException("stored object is not a List");
+            }
+            List<Article> loaded = new ArrayList<>();
+            for (Object item : (List<?>) stored) {
+                if (!(item instanceof Article)) {
+                    throw new InvalidObjectException("stored list contains a non-Article element");
+                }
+                loaded.add((Article) item);
+            }
+            articles.addAll(loaded);
+        } catch (IOException | ClassNotFoundException e) {
+            throw new CrawlerException(ErrorCode.FILE_IO_ERROR, "从文件加载数据失败", e);
+        }
+    }
+
+    /**
+     * Writes all stored articles to {@code filename} as a UTF-8 JSON array of
+     * objects with the keys {@code title}, {@code url} and {@code content}.
+     * A {@code null} field is exported as an empty string.
+     *
+     * @param filename path of the file to create or overwrite
+     * @throws CrawlerException with {@code FILE_IO_ERROR} if the file cannot be written
+     */
+    public void exportToJson(String filename) {
+        StringBuilder json = new StringBuilder();
+        json.append("[\n");
+        int last = articles.size() - 1;
+        for (int i = 0; i <= last; i++) {
+            Article a = articles.get(i);
+            json.append("  {\n");
+            json.append("    \"title\": \"").append(escapeJson(a.getTitle())).append("\",\n");
+            json.append("    \"url\": \"").append(escapeJson(a.getUrl())).append("\",\n");
+            json.append("    \"content\": \"").append(escapeJson(a.getContent())).append("\"\n");
+            json.append("  }");
+            if (i < last) {
+                json.append(",");
+            }
+            json.append("\n");
+        }
+        json.append("]");
+
+        try {
+            Files.writeString(Paths.get(filename), json.toString(), StandardCharsets.UTF_8);
+        } catch (IOException e) {
+            throw new CrawlerException(ErrorCode.FILE_IO_ERROR, "导出JSON失败", e);
+        }
+    }
+
+    /**
+     * Escapes a string for use inside a JSON string literal. Backslash, double
+     * quote and every control character below U+0020 are escaped, because JSON
+     * does not allow them unescaped inside a string.
+     *
+     * @param str the raw text, may be {@code null}
+     * @return the escaped text, or an empty string for {@code null}
+     */
+    private String escapeJson(String str) {
+        if (str == null) {
+            return "";
+        }
+        StringBuilder out = new StringBuilder(str.length() + 16);
+        for (int i = 0; i < str.length(); i++) {
+            char c = str.charAt(i);
+            switch (c) {
+                case '\\':
+                    out.append("\\\\");
+                    break;
+                case '"':
+                    out.append("\\\"");
+                    break;
+                case '\n':
+                    out.append("\\n");
+                    break;
+                case '\r':
+                    out.append("\\r");
+                    break;
+                case '\t':
+                    out.append("\\t");
+                    break;
+                case '\b':
+                    out.append("\\b");
+                    break;
+                case '\f':
+                    out.append("\\f");
+                    break;
+                default:
+                    if (c < 0x20) {
+                        // Remaining control characters have no short form.
+                        out.append(String.format("\\u%04x", (int) c));
+                    } else {
+                        out.append(c);
+                    }
+                    break;
+            }
+        }
+        return out.toString();
+    }
+}
--- a/project/BaiduBaikeStrategy.java
+++ b/project/BaiduBaikeStrategy.java
@ -0,0 +1,47 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * {@link CrawlStrategy} for pages whose URL contains {@code baike.baidu.com}.
+ * It extracts the page's own entry (title plus summary text) and one
+ * content-less article per link whose {@code href} contains {@code /item/}.
+ */
+public class BaiduBaikeStrategy implements CrawlStrategy {
+    /**
+     * Reports whether this strategy handles the given URL.
+     *
+     * @param url the URL to test, may be {@code null}
+     * @return {@code true} if {@code url} contains {@code baike.baidu.com};
+     *         {@code false} otherwise, including for {@code null}
+     */
+    @Override
+    public boolean supports(String url) {
+        // NOTE(review): this is a substring match on the whole URL, not a host check.
+        return url != null && url.contains("baike.baidu.com");
+    }
+
+    /**
+     * Extracts articles from a parsed page.
+     *
+     * @param url the URL the document was fetched from; stored on the main article
+     * @param doc the parsed page
+     * @return the main article (only when a non-empty title was found) followed
+     *         by one article per {@code /item/} link with non-empty text that
+     *         does not contain "编辑" or "分享"
+     */
+    @Override
+    public List<Article> parse(String url, Document doc) {
+        List<Article> articles = new ArrayList<>();
+
+        // Query the title once instead of running the selector twice.
+        Element titleEl = doc.selectFirst("h1.lemma-title, h1.title-text");
+        String title = titleEl != null ? titleEl.text().trim() : "";
+
+        String content = "";
+        Element contentEl = doc.selectFirst("div.lemma-summary, div.summary-content, div.j-summary");
+        if (contentEl != null) {
+            content = contentEl.text().trim();
+        }
+
+        if (!title.isEmpty()) {
+            articles.add(new Article(title, url, content));
+        }
+
+        Elements relatedLinks = doc.select("a[href*='/item/']");
+        for (Element link : relatedLinks) {
+            String linkUrl = link.attr("abs:href");
+            String linkTitle = link.text().trim();
+
+            // Skip links without text and the page's edit/share controls.
+            if (!linkTitle.isEmpty() && !linkTitle.contains("编辑") && !linkTitle.contains("分享")) {
+                articles.add(new Article(linkTitle, linkUrl, ""));
+            }
+        }
+
+        return articles;
+    }
+}
--- a/project/BlogStrategy.java
+++ b/project/BlogStrategy.java
@ -0,0 +1,25 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * {@link CrawlStrategy} for pages whose URL contains {@code blog.example.com}.
+ * Every element matching {@code .post-title} becomes one article that carries
+ * the page URL and an empty content.
+ */
+public class BlogStrategy implements CrawlStrategy {
+    /**
+     * Reports whether this strategy handles the given URL.
+     *
+     * @param url the URL to test, may be {@code null}
+     * @return {@code true} if {@code url} contains {@code blog.example.com};
+     *         {@code false} otherwise, including for {@code null}
+     */
+    @Override
+    public boolean supports(String url) {
+        return url != null && url.contains("blog.example.com");
+    }
+
+    /**
+     * Extracts one article per non-empty post title found in the document.
+     *
+     * @param url the URL the document was fetched from; stored on every article
+     * @param doc the parsed page
+     * @return the extracted articles, empty if no usable title was found
+     */
+    @Override
+    public List<Article> parse(String url, Document doc) {
+        List<Article> articles = new ArrayList<>();
+        Elements titles = doc.select(".post-title");
+        for (Element e : titles) {
+            String title = e.text().trim();
+            // An element without text would only yield an article with nothing in it.
+            if (!title.isEmpty()) {
+                articles.add(new Article(title, url, ""));
+            }
+        }
+        return articles;
+    }
+}
--- a/project/StrategyNotFoundException.java
+++ b/project/StrategyNotFoundException.java
@ -0,0 +1,7 @@
+package com.example.datacollect.exception;
+
+/**
+ * {@link CrawlerException} raised with {@code ErrorCode.STRATEGY_NOT_FOUND}
+ * when no parsing strategy matches a URL.
+ */
+public class StrategyNotFoundException extends CrawlerException {
+    /**
+     * @param url the URL for which no strategy was found; it is embedded in
+     *            the exception message
+     */
+    public StrategyNotFoundException(String url) {
+        super(ErrorCode.STRATEGY_NOT_FOUND, describe(url));
+    }
+
+    /** Builds the exception message for the given URL. */
+    private static String describe(String url) {
+        StringBuilder message = new StringBuilder("未找到匹配 ");
+        message.append(url);
+        message.append(" 的解析策略");
+        return message.toString();
+    }
+}