From c7855dd75fcc84274f179170a3a97fdd1fc42602 Mon Sep 17 00:00:00 2001
From: wuqiuyu <2135752554@qq.com>
Date: Sun, 31 May 2026 14:21:44 +0800
Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=96=87=E4=BB=B6=E8=87=B3?=
 =?UTF-8?q?=20'project'?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review note: the file bodies below were revised during review (restored
generics, batch save, checked load, JSON escaping, null-safe supports,
Javadoc). The abbreviated blob ids on the "index" lines are carried over
from the original commit and no longer match the revised contents.

 project/Article.java                   |  67 ++++++++
 project/ArticleRepository.java         | 217 +++++++++++++++++++++++++
 project/BaiduBaikeStrategy.java        |  63 +++++++
 project/BlogStrategy.java              |  43 +++++
 project/StrategyNotFoundException.java |  13 ++
 5 files changed, 403 insertions(+)
 create mode 100644 project/Article.java
 create mode 100644 project/ArticleRepository.java
 create mode 100644 project/BaiduBaikeStrategy.java
 create mode 100644 project/BlogStrategy.java
 create mode 100644 project/StrategyNotFoundException.java

diff --git a/project/Article.java b/project/Article.java
new file mode 100644
index 0000000..4e64807
--- /dev/null
+++ b/project/Article.java
@@ -0,0 +1,67 @@
+package com.example.datacollect.model;
+
+import java.io.Serializable;
+
+/**
+ * An article record made of a title, a URL and body text.
+ *
+ * <p>Plain mutable bean. It is {@link Serializable} because {@code ArticleRepository}
+ * persists its contents with Java object serialization.
+ */
+public class Article implements Serializable {
+    private static final long serialVersionUID = 1L;
+
+    private String title;
+    private String url;
+    private String content;
+
+    /** Creates an article with every field left {@code null}. */
+    public Article() {
+    }
+
+    /**
+     * Creates an article from the given values, stored as passed without validation.
+     *
+     * @param title   article title
+     * @param url     URL associated with the article
+     * @param content body text
+     */
+    public Article(String title, String url, String content) {
+        this.title = title;
+        this.url = url;
+        this.content = content;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public void setTitle(String title) {
+        this.title = title;
+    }
+
+    public String getUrl() {
+        return url;
+    }
+
+    public void setUrl(String url) {
+        this.url = url;
+    }
+
+    public String getContent() {
+        return content;
+    }
+
+    public void setContent(String content) {
+        this.content = content;
+    }
+
+    /** Returns a short description with the title and URL; the content is left out. */
+    @Override
+    public String toString() {
+        return "Article{" +
+                "title='" + title + '\'' +
+                ", url='" + url + '\'' +
+                '}';
+    }
+}
diff --git a/project/ArticleRepository.java b/project/ArticleRepository.java
new file mode 100644
index 0000000..2db610e
--- /dev/null
+++ b/project/ArticleRepository.java
@@ -0,0 +1,217 @@
+package com.example.datacollect.repository;
+
+import com.example.datacollect.exception.CrawlerException;
+import com.example.datacollect.exception.ErrorCode;
+import com.example.datacollect.model.Article;
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * In-memory list of {@link Article}s mirrored to the file {@code articles.dat}.
+ *
+ * <p>The file is read once by the constructor and rewritten in full by every mutating
+ * call. The file name is relative, so it resolves against the working directory.
+ */
+public class ArticleRepository {
+    private final List<Article> articles = new ArrayList<>();
+    private static final String STORAGE_FILE = "articles.dat";
+
+    /**
+     * Creates the repository and loads whatever the storage file holds, if it exists.
+     *
+     * @throws CrawlerException if the file exists but cannot be read or decoded
+     */
+    public ArticleRepository() {
+        loadFromFile();
+    }
+
+    /**
+     * Appends one article and rewrites the storage file.
+     *
+     * @param article the article to store
+     * @throws IllegalArgumentException if {@code article} is {@code null}
+     * @throws CrawlerException if the storage file cannot be written
+     */
+    public void add(Article article) {
+        if (article == null) {
+            throw new IllegalArgumentException("Article cannot be null");
+        }
+        articles.add(article);
+        saveToFile();
+    }
+
+    /**
+     * Appends every article of the list and rewrites the storage file once.
+     *
+     * <p>All elements are checked before any is stored, so a {@code null} element
+     * leaves the repository untouched.
+     *
+     * @param articleList the articles to store
+     * @throws IllegalArgumentException if the list or one of its elements is {@code null}
+     * @throws CrawlerException if the storage file cannot be written
+     */
+    public void addAll(List<Article> articleList) {
+        if (articleList == null) {
+            throw new IllegalArgumentException("列表不能为 null");
+        }
+        for (Article article : articleList) {
+            if (article == null) {
+                throw new IllegalArgumentException("Article cannot be null");
+            }
+        }
+        if (articleList.isEmpty()) {
+            return;
+        }
+        articles.addAll(articleList);
+        // One write for the whole batch instead of one full rewrite per element.
+        saveToFile();
+    }
+
+    /**
+     * Returns a read-only view of the stored articles.
+     *
+     * @return an unmodifiable list backed by the repository, so later changes show through
+     */
+    public List<Article> getAll() {
+        return Collections.unmodifiableList(articles);
+    }
+
+    /** Returns the number of stored articles. */
+    public int size() {
+        return articles.size();
+    }
+
+    /**
+     * Removes every article and rewrites the storage file.
+     *
+     * @throws CrawlerException if the storage file cannot be written
+     */
+    public void clear() {
+        articles.clear();
+        saveToFile();
+    }
+
+    /** Serializes a copy of the current list to the storage file, replacing its contents. */
+    private void saveToFile() {
+        try (ObjectOutputStream oos = new ObjectOutputStream(
+                new FileOutputStream(STORAGE_FILE))) {
+            oos.writeObject(new ArrayList<>(articles));
+        } catch (IOException e) {
+            throw new CrawlerException(ErrorCode.FILE_IO_ERROR, "保存数据到文件失败", e);
+        }
+    }
+
+    /**
+     * Reads the storage file, if present, and appends its articles to the list.
+     *
+     * <p>The stream must hold a {@link List} whose elements are all {@link Article}s;
+     * anything else is reported as a {@link CrawlerException} rather than surfacing
+     * later as a {@link ClassCastException} in a caller.
+     */
+    private void loadFromFile() {
+        Path path = Paths.get(STORAGE_FILE);
+        if (!Files.exists(path)) {
+            return;
+        }
+        // NOTE(review): Java deserialization instantiates whatever classes the stream names.
+        // That is only safe while this class is the sole writer of the file; consider an
+        // ObjectInputFilter or a text format if the file can come from anywhere else.
+        try (ObjectInputStream ois = new ObjectInputStream(
+                new FileInputStream(STORAGE_FILE))) {
+            List<?> stored = (List<?>) ois.readObject();
+            List<Article> loaded = new ArrayList<>(stored.size());
+            for (Object item : stored) {
+                loaded.add(Article.class.cast(item));
+            }
+            articles.addAll(loaded);
+        } catch (IOException | ClassNotFoundException | ClassCastException e) {
+            throw new CrawlerException(ErrorCode.FILE_IO_ERROR, "从文件加载数据失败", e);
+        }
+    }
+
+    /**
+     * Writes all articles to a UTF-8 file as a JSON array of objects with the keys
+     * {@code title}, {@code url} and {@code content}.
+     *
+     * @param filename path of the file to write
+     * @throws CrawlerException if the file cannot be written
+     */
+    public void exportToJson(String filename) {
+        StringBuilder json = new StringBuilder();
+        json.append("[\n");
+        for (int i = 0; i < articles.size(); i++) {
+            Article a = articles.get(i);
+            json.append(" {\n");
+            json.append(" \"title\": \"").append(escapeJson(a.getTitle())).append("\",\n");
+            json.append(" \"url\": \"").append(escapeJson(a.getUrl())).append("\",\n");
+            json.append(" \"content\": \"").append(escapeJson(a.getContent())).append("\"\n");
+            json.append(" }");
+            if (i < articles.size() - 1) {
+                json.append(",");
+            }
+            json.append("\n");
+        }
+        json.append("]");
+
+        try {
+            Files.writeString(Paths.get(filename), json.toString(), StandardCharsets.UTF_8);
+        } catch (IOException e) {
+            throw new CrawlerException(ErrorCode.FILE_IO_ERROR, "导出JSON失败", e);
+        }
+    }
+
+    /**
+     * Escapes a string for use inside a JSON string literal.
+     *
+     * <p>Besides the backslash and the double quote, every character below U+0020 is
+     * escaped, because JSON forbids raw control characters inside strings.
+     *
+     * @param str the text to escape; {@code null} yields the empty string
+     * @return the escaped text, without surrounding quotes
+     */
+    private String escapeJson(String str) {
+        if (str == null) {
+            return "";
+        }
+        StringBuilder out = new StringBuilder(str.length() + 16);
+        for (int i = 0; i < str.length(); i++) {
+            char c = str.charAt(i);
+            switch (c) {
+                case '\\':
+                    out.append("\\\\");
+                    break;
+                case '"':
+                    out.append("\\\"");
+                    break;
+                case '\n':
+                    out.append("\\n");
+                    break;
+                case '\r':
+                    out.append("\\r");
+                    break;
+                case '\t':
+                    out.append("\\t");
+                    break;
+                case '\b':
+                    out.append("\\b");
+                    break;
+                case '\f':
+                    out.append("\\f");
+                    break;
+                default:
+                    if (c < 0x20) {
+                        out.append(String.format("\\u%04x", (int) c));
+                    } else {
+                        out.append(c);
+                    }
+            }
+        }
+        return out.toString();
+    }
+}
diff --git a/project/BaiduBaikeStrategy.java b/project/BaiduBaikeStrategy.java
new file mode 100644
index 0000000..e752bd3
--- /dev/null
+++ b/project/BaiduBaikeStrategy.java
@@ -0,0 +1,63 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/** {@link CrawlStrategy} for URLs that contain {@code baike.baidu.com}. */
+public class BaiduBaikeStrategy implements CrawlStrategy {
+    /**
+     * Reports whether the URL contains {@code baike.baidu.com}.
+     *
+     * @param url the URL to test; {@code null} is treated as not supported
+     * @return {@code true} if the substring occurs anywhere in the URL
+     */
+    @Override
+    public boolean supports(String url) {
+        return url != null && url.contains("baike.baidu.com");
+    }
+
+    /**
+     * Builds one article for the page itself, when it has a non-empty title, followed by
+     * one article per {@code /item/} link that passes the label filter.
+     *
+     * @param url the page URL, stored on the page's own article
+     * @param doc the parsed page
+     * @return the extracted articles; link articles carry empty content
+     */
+    @Override
+    public List<Article> parse(String url, Document doc) {
+        List<Article> articles = new ArrayList<>();
+
+        // Query the heading once instead of once for the null check and again for the text.
+        Element titleEl = doc.selectFirst("h1.lemma-title, h1.title-text");
+        String title = titleEl != null ? titleEl.text().trim() : "";
+
+        String content = "";
+        Element contentEl = doc.selectFirst("div.lemma-summary, div.summary-content, div.j-summary");
+        if (contentEl != null) {
+            content = contentEl.text().trim();
+        }
+
+        if (!title.isEmpty()) {
+            articles.add(new Article(title, url, content));
+        }
+
+        Elements relatedLinks = doc.select("a[href*='/item/']");
+        for (Element link : relatedLinks) {
+            String linkUrl = link.attr("abs:href");
+            String linkTitle = link.text().trim();
+
+            // Drop links without text and links whose text contains the edit or share label.
+            if (!linkTitle.isEmpty() && !linkTitle.contains("编辑") && !linkTitle.contains("分享")) {
+                articles.add(new Article(linkTitle, linkUrl, ""));
+            }
+        }
+
+        return articles;
+    }
+}
diff --git a/project/BlogStrategy.java b/project/BlogStrategy.java
new file mode 100644
index 0000000..e1c9a27
--- /dev/null
+++ b/project/BlogStrategy.java
@@ -0,0 +1,43 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import java.util.ArrayList;
+import java.util.List;
+
+/** {@link CrawlStrategy} for URLs that contain {@code blog.example.com}. */
+public class BlogStrategy implements CrawlStrategy {
+    /**
+     * Reports whether the URL contains {@code blog.example.com}.
+     *
+     * @param url the URL to test; {@code null} is treated as not supported
+     * @return {@code true} if the substring occurs anywhere in the URL
+     */
+    @Override
+    public boolean supports(String url) {
+        return url != null && url.contains("blog.example.com");
+    }
+
+    /**
+     * Builds one article per {@code .post-title} element that has text.
+     *
+     * @param url the page URL, stored on every returned article
+     * @param doc the parsed page
+     * @return the extracted articles, all with empty content
+     */
+    @Override
+    public List<Article> parse(String url, Document doc) {
+        List<Article> articles = new ArrayList<>();
+        Elements titles = doc.select(".post-title");
+        for (Element e : titles) {
+            String title = e.text().trim();
+            // Same rule as BaiduBaikeStrategy: an element without text yields no article.
+            if (!title.isEmpty()) {
+                articles.add(new Article(title, url, ""));
+            }
+        }
+        return articles;
+    }
+}
diff --git a/project/StrategyNotFoundException.java b/project/StrategyNotFoundException.java
new file mode 100644
index 0000000..3beaa90
--- /dev/null
+++ b/project/StrategyNotFoundException.java
@@ -0,0 +1,13 @@
+package com.example.datacollect.exception;
+
+/** Reports that no parsing strategy matched a URL. */
+public class StrategyNotFoundException extends CrawlerException {
+    /**
+     * Creates the exception with {@link ErrorCode#STRATEGY_NOT_FOUND} and a message naming the URL.
+     *
+     * @param url the URL that had no matching strategy
+     */
+    public StrategyNotFoundException(String url) {
+        super(ErrorCode.STRATEGY_NOT_FOUND, "未找到匹配 " + url + " 的解析策略");
+    }
+}