Browse Source

上传文件至 'project'

main
wuqiuyu 3 weeks ago
parent
commit
c7855dd75f
  1. 52
      project/Article.java
  2. 111
      project/ArticleRepository.java
  3. 47
      project/BaiduBaikeStrategy.java
  4. 25
      project/BlogStrategy.java
  5. 7
      project/StrategyNotFoundException.java

52
project/Article.java

@ -0,0 +1,52 @@
package com.example.datacollect.model;
import java.io.Serializable;
public class Article implements Serializable {
private static final long serialVersionUID = 1L;
private String title;
private String url;
private String content;
public Article() {
}
public Article(String title, String url, String content) {
this.title = title;
this.url = url;
this.content = content;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}
@Override
public String toString() {
return "Article{"
+ "title='" + title + '\''
+ ", url='" + url + '\''
+ '}';
}
}

111
project/ArticleRepository.java

@ -0,0 +1,111 @@
package com.example.datacollect.repository;
import com.example.datacollect.exception.CrawlerException;
import com.example.datacollect.exception.ErrorCode;
import com.example.datacollect.model.Article;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * In-memory list of {@link Article}s that is mirrored to the file
 * {@code articles.dat} (a relative path) using Java object serialization.
 *
 * <p>The stored list is loaded when the repository is constructed, and the
 * whole list is rewritten to the file after every mutating call. File
 * failures are reported as {@link CrawlerException} with
 * {@link ErrorCode#FILE_IO_ERROR}.
 *
 * <p>This class performs no synchronization of its own.
 */
public class ArticleRepository {

    private static final String STORAGE_FILE = "articles.dat";

    private final List<Article> articles = new ArrayList<>();

    /**
     * Creates the repository and loads any articles previously saved to the
     * storage file.
     *
     * @throws CrawlerException if the storage file exists but cannot be read
     */
    public ArticleRepository() {
        loadFromFile();
    }

    /**
     * Appends one article and persists the updated list.
     *
     * @param article the article to store; must not be {@code null}
     * @throws IllegalArgumentException if {@code article} is {@code null}
     * @throws CrawlerException if the list cannot be written to the file
     */
    public void add(Article article) {
        if (article == null) {
            throw new IllegalArgumentException("Article cannot be null");
        }
        articles.add(article);
        saveToFile();
    }

    /**
     * Appends every article in the given list and persists the result once.
     *
     * <p>The list is validated before anything is added, so a {@code null}
     * element leaves the repository unchanged instead of half-updated.
     *
     * @param articleList the articles to store; neither the list nor any of
     *                    its elements may be {@code null}
     * @throws IllegalArgumentException if the list or one of its elements is
     *                                  {@code null}
     * @throws CrawlerException if the list cannot be written to the file
     */
    public void addAll(List<Article> articleList) {
        if (articleList == null) {
            throw new IllegalArgumentException("列表不能为 null");
        }
        for (Article article : articleList) {
            if (article == null) {
                throw new IllegalArgumentException("Article cannot be null");
            }
        }
        if (articleList.isEmpty()) {
            // Nothing changed, so there is nothing to persist.
            return;
        }
        articles.addAll(articleList);
        // One write for the whole batch instead of one full rewrite per element.
        saveToFile();
    }

    /**
     * @return a read-only view of the stored articles; the view reflects
     *         later changes made through this repository
     */
    public List<Article> getAll() {
        return Collections.unmodifiableList(articles);
    }

    /** @return the number of stored articles */
    public int size() {
        return articles.size();
    }

    /**
     * Removes all articles and persists the now-empty list.
     *
     * @throws CrawlerException if the list cannot be written to the file
     */
    public void clear() {
        articles.clear();
        saveToFile();
    }

    /** Serializes a copy of the current list to the storage file, replacing its contents. */
    private void saveToFile() {
        try (ObjectOutputStream oos = new ObjectOutputStream(
                new FileOutputStream(STORAGE_FILE))) {
            oos.writeObject(new ArrayList<>(articles));
        } catch (IOException e) {
            throw new CrawlerException(ErrorCode.FILE_IO_ERROR, "保存数据到文件失败", e);
        }
    }

    /**
     * Loads the article list from the storage file, if that file exists.
     *
     * <p>NOTE(review): this uses Java native deserialization. That is only
     * safe while the storage file is written exclusively by this class;
     * consider an {@code ObjectInputFilter} or a text format if the file can
     * come from elsewhere.
     */
    private void loadFromFile() {
        Path path = Paths.get(STORAGE_FILE);
        if (!Files.exists(path)) {
            return;
        }
        try (ObjectInputStream ois = new ObjectInputStream(Files.newInputStream(path))) {
            Object stored = ois.readObject();
            if (!(stored instanceof List)) {
                throw new InvalidObjectException("Stored object is not a List");
            }
            // Check every element so corrupt data fails here, as a load error,
            // rather than later as a ClassCastException in unrelated code.
            List<Article> loaded = new ArrayList<>();
            for (Object element : (List<?>) stored) {
                if (!(element instanceof Article)) {
                    throw new InvalidObjectException("Stored list contains a non-Article element");
                }
                loaded.add((Article) element);
            }
            articles.addAll(loaded);
        } catch (IOException | ClassNotFoundException e) {
            throw new CrawlerException(ErrorCode.FILE_IO_ERROR, "从文件加载数据失败", e);
        }
    }

    /**
     * Writes all stored articles to the given file as a UTF-8 JSON array of
     * objects with {@code title}, {@code url} and {@code content} string
     * members. {@code null} field values are written as empty strings.
     *
     * @param filename path of the file to write
     * @throws CrawlerException if the file cannot be written
     */
    public void exportToJson(String filename) {
        StringBuilder json = new StringBuilder();
        json.append("[\n");
        int last = articles.size() - 1;
        for (int i = 0; i <= last; i++) {
            Article a = articles.get(i);
            json.append(" {\n");
            json.append(" \"title\": \"").append(escapeJson(a.getTitle())).append("\",\n");
            json.append(" \"url\": \"").append(escapeJson(a.getUrl())).append("\",\n");
            json.append(" \"content\": \"").append(escapeJson(a.getContent())).append("\"\n");
            json.append(" }");
            if (i < last) {
                json.append(",");
            }
            json.append("\n");
        }
        json.append("]");
        try {
            Files.writeString(Paths.get(filename), json.toString(), StandardCharsets.UTF_8);
        } catch (IOException e) {
            throw new CrawlerException(ErrorCode.FILE_IO_ERROR, "导出JSON失败", e);
        }
    }

    /**
     * Escapes a value for use inside a JSON string literal.
     *
     * <p>Besides backslash, double quote, newline, carriage return and tab,
     * this also escapes backspace, form feed and every other character below
     * U+0020, which JSON does not allow unescaped inside a string.
     *
     * @param str the raw text, possibly {@code null}
     * @return the escaped text, or an empty string when {@code str} is {@code null}
     */
    private String escapeJson(String str) {
        if (str == null) {
            return "";
        }
        StringBuilder out = new StringBuilder(str.length() + 16);
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            switch (c) {
                case '\\':
                    out.append("\\\\");
                    break;
                case '"':
                    out.append("\\\"");
                    break;
                case '\n':
                    out.append("\\n");
                    break;
                case '\r':
                    out.append("\\r");
                    break;
                case '\t':
                    out.append("\\t");
                    break;
                case '\b':
                    out.append("\\b");
                    break;
                case '\f':
                    out.append("\\f");
                    break;
                default:
                    if (c < 0x20) {
                        out.append(String.format("\\u%04x", (int) c));
                    } else {
                        out.append(c);
                    }
                    break;
            }
        }
        return out.toString();
    }
}

47
project/BaiduBaikeStrategy.java

@ -0,0 +1,47 @@
package com.example.datacollect.strategy;
import com.example.datacollect.model.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * {@link CrawlStrategy} for pages whose URL contains {@code baike.baidu.com}.
 *
 * <p>From a parsed page it extracts the entry itself (title plus summary
 * text) and one content-less {@link Article} for every link whose
 * {@code href} contains {@code /item/}.
 */
public class BaiduBaikeStrategy implements CrawlStrategy {

    /**
     * @param url the page URL, possibly {@code null}
     * @return {@code true} if {@code url} is non-null and contains
     *         {@code baike.baidu.com}
     */
    @Override
    public boolean supports(String url) {
        // A null URL matches no strategy rather than failing with an NPE.
        return url != null && url.contains("baike.baidu.com");
    }

    /**
     * Extracts the entry article and its related-entry links from a page.
     *
     * @param url the URL the document was loaded from; stored on the entry article
     * @param doc the parsed page
     * @return the entry article first (only when a non-empty title was
     *         found), followed by one article per accepted related link
     */
    @Override
    public List<Article> parse(String url, Document doc) {
        List<Article> articles = new ArrayList<>();

        // Evaluate the title selector once and reuse the element.
        Element titleEl = doc.selectFirst("h1.lemma-title, h1.title-text");
        String title = titleEl != null ? titleEl.text().trim() : "";

        Element contentEl = doc.selectFirst("div.lemma-summary, div.summary-content, div.j-summary");
        String content = contentEl != null ? contentEl.text().trim() : "";

        if (!title.isEmpty()) {
            articles.add(new Article(title, url, content));
        }

        Elements relatedLinks = doc.select("a[href*='/item/']");
        for (Element link : relatedLinks) {
            String linkUrl = link.attr("abs:href");
            if (linkUrl.isEmpty()) {
                // "abs:href" is empty when the href cannot be resolved to an
                // absolute URL; a link without a URL is not worth recording.
                continue;
            }
            String linkTitle = link.text().trim();
            // Skip untitled links and links whose text contains "编辑" or "分享".
            if (!linkTitle.isEmpty() && !linkTitle.contains("编辑") && !linkTitle.contains("分享")) {
                articles.add(new Article(linkTitle, linkUrl, ""));
            }
        }
        return articles;
    }
}

25
project/BlogStrategy.java

@ -0,0 +1,25 @@
package com.example.datacollect.strategy;
import com.example.datacollect.model.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * {@link CrawlStrategy} for pages whose URL contains {@code blog.example.com}.
 *
 * <p>Each element matching {@code .post-title} becomes one {@link Article}
 * carrying the page URL and an empty content string.
 */
public class BlogStrategy implements CrawlStrategy {

    /**
     * @param url the page URL, possibly {@code null}
     * @return {@code true} if {@code url} is non-null and contains
     *         {@code blog.example.com}
     */
    @Override
    public boolean supports(String url) {
        // A null URL matches no strategy rather than failing with an NPE.
        return url != null && url.contains("blog.example.com");
    }

    /**
     * Collects one article per non-empty post title found in the page.
     *
     * @param url the URL the document was loaded from; stored on every article
     * @param doc the parsed page
     * @return the articles in document order; empty if no titles were found
     */
    @Override
    public List<Article> parse(String url, Document doc) {
        List<Article> articles = new ArrayList<>();
        Elements titles = doc.select(".post-title");
        for (Element e : titles) {
            String title = e.text().trim();
            if (title.isEmpty()) {
                // An element with no text would yield an article with no title.
                continue;
            }
            articles.add(new Article(title, url, ""));
        }
        return articles;
    }
}

7
project/StrategyNotFoundException.java

@ -0,0 +1,7 @@
package com.example.datacollect.exception;
/**
 * {@link CrawlerException} with error code
 * {@link ErrorCode#STRATEGY_NOT_FOUND}, raised for a URL that no parsing
 * strategy matches.
 */
public class StrategyNotFoundException extends CrawlerException {

    /**
     * @param url the URL for which no strategy was found; embedded in the message
     */
    public StrategyNotFoundException(String url) {
        super(ErrorCode.STRATEGY_NOT_FOUND, describe(url));
    }

    /** Builds the exception message for the given URL. */
    private static String describe(String url) {
        return "未找到匹配 " + url + " 的解析策略";
    }
}
Loading…
Cancel
Save