5 changed files with 242 additions and 0 deletions
@ -0,0 +1,52 @@ |
|||
package com.example.datacollect.model; |
|||
|
|||
import java.io.Serializable; |
|||
|
|||
public class Article implements Serializable { |
|||
private static final long serialVersionUID = 1L; |
|||
|
|||
private String title; |
|||
private String url; |
|||
private String content; |
|||
|
|||
public Article() { |
|||
} |
|||
|
|||
public Article(String title, String url, String content) { |
|||
this.title = title; |
|||
this.url = url; |
|||
this.content = content; |
|||
} |
|||
|
|||
public String getTitle() { |
|||
return title; |
|||
} |
|||
|
|||
public void setTitle(String title) { |
|||
this.title = title; |
|||
} |
|||
|
|||
public String getUrl() { |
|||
return url; |
|||
} |
|||
|
|||
public void setUrl(String url) { |
|||
this.url = url; |
|||
} |
|||
|
|||
public String getContent() { |
|||
return content; |
|||
} |
|||
|
|||
public void setContent(String content) { |
|||
this.content = content; |
|||
} |
|||
|
|||
@Override |
|||
public String toString() { |
|||
return "Article{" |
|||
+ "title='" + title + '\'' |
|||
+ ", url='" + url + '\'' |
|||
+ '}'; |
|||
} |
|||
} |
|||
@ -0,0 +1,111 @@ |
|||
package com.example.datacollect.repository; |
|||
|
|||
import com.example.datacollect.exception.CrawlerException; |
|||
import com.example.datacollect.exception.ErrorCode; |
|||
import com.example.datacollect.model.Article; |
|||
import java.io.*; |
|||
import java.nio.charset.StandardCharsets; |
|||
import java.nio.file.Files; |
|||
import java.nio.file.Path; |
|||
import java.nio.file.Paths; |
|||
import java.util.ArrayList; |
|||
import java.util.Collections; |
|||
import java.util.List; |
|||
|
|||
public class ArticleRepository { |
|||
private final List<Article> articles = new ArrayList<>(); |
|||
private static final String STORAGE_FILE = "articles.dat"; |
|||
|
|||
public ArticleRepository() { |
|||
loadFromFile(); |
|||
} |
|||
|
|||
public void add(Article article) { |
|||
if (article == null) { |
|||
throw new IllegalArgumentException("Article cannot be null"); |
|||
} |
|||
articles.add(article); |
|||
saveToFile(); |
|||
} |
|||
|
|||
public void addAll(List<Article> articleList) { |
|||
if (articleList == null) { |
|||
throw new IllegalArgumentException("列表不能为 null"); |
|||
} |
|||
for (Article article : articleList) { |
|||
add(article); |
|||
} |
|||
} |
|||
|
|||
public List<Article> getAll() { |
|||
return Collections.unmodifiableList(articles); |
|||
} |
|||
|
|||
public int size() { |
|||
return articles.size(); |
|||
} |
|||
|
|||
public void clear() { |
|||
articles.clear(); |
|||
saveToFile(); |
|||
} |
|||
|
|||
private void saveToFile() { |
|||
try (ObjectOutputStream oos = new ObjectOutputStream( |
|||
new FileOutputStream(STORAGE_FILE))) { |
|||
oos.writeObject(new ArrayList<>(articles)); |
|||
} catch (IOException e) { |
|||
throw new CrawlerException(ErrorCode.FILE_IO_ERROR, "保存数据到文件失败", e); |
|||
} |
|||
} |
|||
|
|||
@SuppressWarnings("unchecked") |
|||
private void loadFromFile() { |
|||
Path path = Paths.get(STORAGE_FILE); |
|||
if (!Files.exists(path)) { |
|||
return; |
|||
} |
|||
try (ObjectInputStream ois = new ObjectInputStream( |
|||
new FileInputStream(STORAGE_FILE))) { |
|||
List<Article> loaded = (List<Article>) ois.readObject(); |
|||
articles.addAll(loaded); |
|||
} catch (IOException | ClassNotFoundException e) { |
|||
throw new CrawlerException(ErrorCode.FILE_IO_ERROR, "从文件加载数据失败", e); |
|||
} |
|||
} |
|||
|
|||
public void exportToJson(String filename) { |
|||
StringBuilder json = new StringBuilder(); |
|||
json.append("[\n"); |
|||
for (int i = 0; i < articles.size(); i++) { |
|||
Article a = articles.get(i); |
|||
json.append(" {\n"); |
|||
json.append(" \"title\": \"").append(escapeJson(a.getTitle())).append("\",\n"); |
|||
json.append(" \"url\": \"").append(escapeJson(a.getUrl())).append("\",\n"); |
|||
json.append(" \"content\": \"").append(escapeJson(a.getContent())).append("\"\n"); |
|||
json.append(" }"); |
|||
if (i < articles.size() - 1) { |
|||
json.append(","); |
|||
} |
|||
json.append("\n"); |
|||
} |
|||
json.append("]"); |
|||
|
|||
try { |
|||
Files.writeString(Paths.get(filename), json.toString(), StandardCharsets.UTF_8); |
|||
} catch (IOException e) { |
|||
throw new CrawlerException(ErrorCode.FILE_IO_ERROR, "导出JSON失败", e); |
|||
} |
|||
} |
|||
|
|||
private String escapeJson(String str) { |
|||
if (str == null) { |
|||
return ""; |
|||
} |
|||
return str.replace("\\", "\\\\") |
|||
.replace("\"", "\\\"") |
|||
.replace("\n", "\\n") |
|||
.replace("\r", "\\r") |
|||
.replace("\t", "\\t"); |
|||
} |
|||
} |
|||
@ -0,0 +1,47 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class BaiduBaikeStrategy implements CrawlStrategy { |
|||
@Override |
|||
public boolean supports(String url) { |
|||
return url.contains("baike.baidu.com"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> parse(String url, Document doc) { |
|||
List<Article> articles = new ArrayList<>(); |
|||
|
|||
String title = doc.selectFirst("h1.lemma-title, h1.title-text") != null |
|||
? doc.selectFirst("h1.lemma-title, h1.title-text").text().trim() |
|||
: ""; |
|||
|
|||
String content = ""; |
|||
Element contentEl = doc.selectFirst("div.lemma-summary, div.summary-content, div.j-summary"); |
|||
if (contentEl != null) { |
|||
content = contentEl.text().trim(); |
|||
} |
|||
|
|||
if (!title.isEmpty()) { |
|||
articles.add(new Article(title, url, content)); |
|||
} |
|||
|
|||
Elements relatedLinks = doc.select("a[href*='/item/']"); |
|||
for (Element link : relatedLinks) { |
|||
String linkUrl = link.attr("abs:href"); |
|||
String linkTitle = link.text().trim(); |
|||
|
|||
if (!linkTitle.isEmpty() && !linkTitle.contains("编辑") && !linkTitle.contains("分享")) { |
|||
articles.add(new Article(linkTitle, linkUrl, "")); |
|||
} |
|||
} |
|||
|
|||
return articles; |
|||
} |
|||
} |
|||
@ -0,0 +1,25 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class BlogStrategy implements CrawlStrategy { |
|||
@Override |
|||
public boolean supports(String url) { |
|||
return url.contains("blog.example.com"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> parse(String url, Document doc) { |
|||
List<Article> articles = new ArrayList<>(); |
|||
Elements titles = doc.select(".post-title"); |
|||
for (Element e : titles) { |
|||
articles.add(new Article(e.text(), url, "")); |
|||
} |
|||
return articles; |
|||
} |
|||
} |
|||
@ -0,0 +1,7 @@ |
|||
package com.example.datacollect.exception; |
|||
|
|||
public class StrategyNotFoundException extends CrawlerException { |
|||
public StrategyNotFoundException(String url) { |
|||
super(ErrorCode.STRATEGY_NOT_FOUND, "未找到匹配 " + url + " 的解析策略"); |
|||
} |
|||
} |
|||
Loading…
Reference in new issue