5 changed files with 242 additions and 0 deletions
@ -0,0 +1,52 @@ |
|||||
|
package com.example.datacollect.model; |
||||
|
|
||||
|
import java.io.Serializable; |
||||
|
|
||||
|
/**
 * Serializable bean holding three strings: a title, a URL and a content body.
 *
 * <p>Every field is mutable through its setter and may be {@code null}; no
 * validation is performed anywhere in this class.
 */
public class Article implements Serializable {

    private static final long serialVersionUID = 1L;

    private String title;
    private String url;
    private String content;

    /** Creates an article whose title, URL and content are all {@code null}. */
    public Article() {
        this(null, null, null);
    }

    /**
     * Creates an article with the given field values, stored as-is.
     *
     * @param title the article title
     * @param url the article URL
     * @param content the article content
     */
    public Article(String title, String url, String content) {
        this.title = title;
        this.url = url;
        this.content = content;
    }

    /** @return the current title, possibly {@code null} */
    public String getTitle() {
        return this.title;
    }

    /** @param title the new title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the current URL, possibly {@code null} */
    public String getUrl() {
        return this.url;
    }

    /** @param url the new URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the current content, possibly {@code null} */
    public String getContent() {
        return this.content;
    }

    /** @param content the new content */
    public void setContent(String content) {
        this.content = content;
    }

    /**
     * Renders the title and URL only; the content field is not part of the output.
     *
     * @return a string of the form {@code Article{title='...', url='...'}}
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Article{");
        text.append("title='").append(title).append('\'');
        text.append(", url='").append(url).append('\'');
        text.append('}');
        return text.toString();
    }
}
||||
@ -0,0 +1,111 @@ |
|||||
|
package com.example.datacollect.repository; |
||||
|
|
||||
|
import com.example.datacollect.exception.CrawlerException; |
||||
|
import com.example.datacollect.exception.ErrorCode; |
||||
|
import com.example.datacollect.model.Article; |
||||
|
import java.io.*; |
||||
|
import java.nio.charset.StandardCharsets; |
||||
|
import java.nio.file.Files; |
||||
|
import java.nio.file.Path; |
||||
|
import java.nio.file.Paths; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.Collections; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class ArticleRepository { |
||||
|
private final List<Article> articles = new ArrayList<>(); |
||||
|
private static final String STORAGE_FILE = "articles.dat"; |
||||
|
|
||||
|
public ArticleRepository() { |
||||
|
loadFromFile(); |
||||
|
} |
||||
|
|
||||
|
public void add(Article article) { |
||||
|
if (article == null) { |
||||
|
throw new IllegalArgumentException("Article cannot be null"); |
||||
|
} |
||||
|
articles.add(article); |
||||
|
saveToFile(); |
||||
|
} |
||||
|
|
||||
|
public void addAll(List<Article> articleList) { |
||||
|
if (articleList == null) { |
||||
|
throw new IllegalArgumentException("列表不能为 null"); |
||||
|
} |
||||
|
for (Article article : articleList) { |
||||
|
add(article); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public List<Article> getAll() { |
||||
|
return Collections.unmodifiableList(articles); |
||||
|
} |
||||
|
|
||||
|
public int size() { |
||||
|
return articles.size(); |
||||
|
} |
||||
|
|
||||
|
public void clear() { |
||||
|
articles.clear(); |
||||
|
saveToFile(); |
||||
|
} |
||||
|
|
||||
|
private void saveToFile() { |
||||
|
try (ObjectOutputStream oos = new ObjectOutputStream( |
||||
|
new FileOutputStream(STORAGE_FILE))) { |
||||
|
oos.writeObject(new ArrayList<>(articles)); |
||||
|
} catch (IOException e) { |
||||
|
throw new CrawlerException(ErrorCode.FILE_IO_ERROR, "保存数据到文件失败", e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@SuppressWarnings("unchecked") |
||||
|
private void loadFromFile() { |
||||
|
Path path = Paths.get(STORAGE_FILE); |
||||
|
if (!Files.exists(path)) { |
||||
|
return; |
||||
|
} |
||||
|
try (ObjectInputStream ois = new ObjectInputStream( |
||||
|
new FileInputStream(STORAGE_FILE))) { |
||||
|
List<Article> loaded = (List<Article>) ois.readObject(); |
||||
|
articles.addAll(loaded); |
||||
|
} catch (IOException | ClassNotFoundException e) { |
||||
|
throw new CrawlerException(ErrorCode.FILE_IO_ERROR, "从文件加载数据失败", e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public void exportToJson(String filename) { |
||||
|
StringBuilder json = new StringBuilder(); |
||||
|
json.append("[\n"); |
||||
|
for (int i = 0; i < articles.size(); i++) { |
||||
|
Article a = articles.get(i); |
||||
|
json.append(" {\n"); |
||||
|
json.append(" \"title\": \"").append(escapeJson(a.getTitle())).append("\",\n"); |
||||
|
json.append(" \"url\": \"").append(escapeJson(a.getUrl())).append("\",\n"); |
||||
|
json.append(" \"content\": \"").append(escapeJson(a.getContent())).append("\"\n"); |
||||
|
json.append(" }"); |
||||
|
if (i < articles.size() - 1) { |
||||
|
json.append(","); |
||||
|
} |
||||
|
json.append("\n"); |
||||
|
} |
||||
|
json.append("]"); |
||||
|
|
||||
|
try { |
||||
|
Files.writeString(Paths.get(filename), json.toString(), StandardCharsets.UTF_8); |
||||
|
} catch (IOException e) { |
||||
|
throw new CrawlerException(ErrorCode.FILE_IO_ERROR, "导出JSON失败", e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private String escapeJson(String str) { |
||||
|
if (str == null) { |
||||
|
return ""; |
||||
|
} |
||||
|
return str.replace("\\", "\\\\") |
||||
|
.replace("\"", "\\\"") |
||||
|
.replace("\n", "\\n") |
||||
|
.replace("\r", "\\r") |
||||
|
.replace("\t", "\\t"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,47 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class BaiduBaikeStrategy implements CrawlStrategy { |
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
return url.contains("baike.baidu.com"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(String url, Document doc) { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
|
||||
|
String title = doc.selectFirst("h1.lemma-title, h1.title-text") != null |
||||
|
? doc.selectFirst("h1.lemma-title, h1.title-text").text().trim() |
||||
|
: ""; |
||||
|
|
||||
|
String content = ""; |
||||
|
Element contentEl = doc.selectFirst("div.lemma-summary, div.summary-content, div.j-summary"); |
||||
|
if (contentEl != null) { |
||||
|
content = contentEl.text().trim(); |
||||
|
} |
||||
|
|
||||
|
if (!title.isEmpty()) { |
||||
|
articles.add(new Article(title, url, content)); |
||||
|
} |
||||
|
|
||||
|
Elements relatedLinks = doc.select("a[href*='/item/']"); |
||||
|
for (Element link : relatedLinks) { |
||||
|
String linkUrl = link.attr("abs:href"); |
||||
|
String linkTitle = link.text().trim(); |
||||
|
|
||||
|
if (!linkTitle.isEmpty() && !linkTitle.contains("编辑") && !linkTitle.contains("分享")) { |
||||
|
articles.add(new Article(linkTitle, linkUrl, "")); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return articles; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,25 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class BlogStrategy implements CrawlStrategy { |
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
return url.contains("blog.example.com"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(String url, Document doc) { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
Elements titles = doc.select(".post-title"); |
||||
|
for (Element e : titles) { |
||||
|
articles.add(new Article(e.text(), url, "")); |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,7 @@ |
|||||
|
package com.example.datacollect.exception; |
||||
|
|
||||
|
public class StrategyNotFoundException extends CrawlerException { |
||||
|
public StrategyNotFoundException(String url) { |
||||
|
super(ErrorCode.STRATEGY_NOT_FOUND, "未找到匹配 " + url + " 的解析策略"); |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue