You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

152 lines
6.4 KiB

package com.crawler.util;
import com.crawler.model.Article;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
/**
 * Saves crawled {@link Article} objects as UTF-8 text files and loads them back.
 *
 * <p>Each article is written to {@code data/article_<id>.txt}; a human-readable
 * {@code data/index.txt} lists every article of the most recent save.
 *
 * <p>NOTE(review): article ids are embedded in file names verbatim; this assumes
 * ids never contain path separators — confirm against the crawler.
 */
public class DataPersistence {
    private static final String DATA_FOLDER = "data";
    private static final String INDEX_FILE = DATA_FOLDER + File.separator + "index.txt";
    private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    /** Horizontal rule used around section headers in the generated files. */
    private static final String SEPARATOR = "========================================";
    /** Header text that introduces the article body inside an article file. */
    private static final String CONTENT_HEADER = "文章内容";

    private static final String ID_PREFIX = "ID: ";
    private static final String TITLE_PREFIX = "标题: ";
    private static final String URL_PREFIX = "URL: ";
    private static final String AUTHOR_PREFIX = "作者: ";
    private static final String SOURCE_PREFIX = "来源: ";

    static {
        // Best effort at class-load time; saveArticles() re-checks before writing.
        ensureDataFolder();
    }

    /**
     * Creates the data folder if it does not exist yet.
     *
     * @return {@code true} if the folder exists as a directory after the call
     */
    private static boolean ensureDataFolder() {
        File folder = new File(DATA_FOLDER);
        if (!folder.exists()) {
            folder.mkdirs();
        }
        return folder.isDirectory();
    }

    /**
     * Writes one file per article plus the index file, then prints a summary line.
     * Any failure is reported on {@code System.err} instead of being thrown.
     *
     * @param articles the articles to persist; a {@code null} list is reported as a failure
     */
    public static void saveArticles(List<Article> articles) {
        if (articles == null) {
            System.err.println(ColorUtil.red("✗ Failed to save articles: article list is null"));
            return;
        }
        try {
            // The folder may have been removed (or never created) since class load.
            if (!ensureDataFolder()) {
                throw new IOException("cannot create data folder '" + DATA_FOLDER + "'");
            }
            for (Article article : articles) {
                saveSingleArticle(article);
            }
            saveIndex(articles);
            System.out.println(ColorUtil.green("✓ Saved " + articles.size() + " articles to '" + DATA_FOLDER + "' folder"));
        } catch (Exception e) {
            System.err.println(ColorUtil.red("✗ Failed to save articles: " + e.getMessage()));
        }
    }

    /**
     * Writes a single article to {@code data/article_<id>.txt}.
     *
     * <p>Optional fields (author, source, publish date, crawl date) are omitted when
     * {@code null}. The body is followed by two newlines of padding, which
     * {@link #loadSingleArticle(File)} removes again.
     *
     * @param article the article to write
     * @throws IOException if the file cannot be written
     */
    private static void saveSingleArticle(Article article) throws IOException {
        String filename = DATA_FOLDER + File.separator + "article_" + article.getId() + ".txt";
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), StandardCharsets.UTF_8))) {
            writer.write(SEPARATOR + "\n");
            writer.write(" 文章详细信息\n");
            writer.write(SEPARATOR + "\n\n");
            writer.write(ID_PREFIX + article.getId() + "\n");
            writer.write(TITLE_PREFIX + article.getTitle() + "\n");
            writer.write(URL_PREFIX + article.getUrl() + "\n");
            if (article.getAuthor() != null) {
                writer.write(AUTHOR_PREFIX + article.getAuthor() + "\n");
            }
            if (article.getSource() != null) {
                writer.write(SOURCE_PREFIX + article.getSource() + "\n");
            }
            if (article.getPublishDate() != null) {
                writer.write("发布时间: " + article.getPublishDate().format(DATE_FORMATTER) + "\n");
            }
            // Guarded like in saveIndex(): a missing crawl date must not abort the whole save.
            if (article.getCrawlDate() != null) {
                writer.write("爬取时间: " + article.getCrawlDate().format(DATE_FORMATTER) + "\n");
            }
            writer.write("\n" + SEPARATOR + "\n");
            writer.write(" " + CONTENT_HEADER + "\n");
            writer.write(SEPARATOR + "\n");
            if (article.getContent() != null) {
                writer.write(article.getContent());
            }
            writer.write("\n\n");
        }
    }

    /**
     * Overwrites {@code data/index.txt} with one entry per article
     * (id, title, URL, file name and, when present, crawl date).
     *
     * @param articles the articles to list
     * @throws IOException if the index file cannot be written
     */
    private static void saveIndex(List<Article> articles) throws IOException {
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(INDEX_FILE), StandardCharsets.UTF_8))) {
            writer.write(SEPARATOR + "\n");
            writer.write(" 文章索引\n");
            writer.write(SEPARATOR + "\n\n");
            writer.write("共有 " + articles.size() + " 篇文章\n\n");
            for (Article article : articles) {
                writer.write("[" + article.getId() + "] " + article.getTitle() + "\n");
                writer.write(" URL: " + article.getUrl() + "\n");
                writer.write(" 文件名: article_" + article.getId() + ".txt\n");
                if (article.getCrawlDate() != null) {
                    writer.write(" 爬取时间: " + article.getCrawlDate().format(DATE_FORMATTER) + "\n");
                }
                writer.write("\n");
            }
        }
    }

    /**
     * Loads every {@code article_*.txt} file found in the data folder.
     *
     * <p>Returns an empty list when the index file does not exist. Files that cannot
     * be read are skipped with a warning on {@code System.err}. Files are processed
     * in sorted path order so the result order is deterministic.
     *
     * @return the loaded articles, never {@code null}
     */
    public static List<Article> loadArticles() {
        List<Article> articles = new ArrayList<>();
        File indexFile = new File(INDEX_FILE);
        if (!indexFile.exists()) {
            return articles;
        }
        File folder = new File(DATA_FOLDER);
        File[] files = folder.listFiles((dir, name) -> name.startsWith("article_") && name.endsWith(".txt"));
        if (files != null) {
            // listFiles() gives no ordering guarantee; sort for a stable load order.
            java.util.Arrays.sort(files);
            for (File file : files) {
                try {
                    Article article = loadSingleArticle(file);
                    if (article != null) {
                        articles.add(article);
                    }
                } catch (Exception e) {
                    System.err.println(ColorUtil.yellow("⚠ 无法加载文件: " + file.getName() + " (" + e.getMessage() + ")"));
                }
            }
        }
        System.out.println(ColorUtil.green("✓ Loaded " + articles.size() + " articles from '" + DATA_FOLDER + "' folder"));
        return articles;
    }

    /**
     * Parses one article file written by {@link #saveSingleArticle(Article)}.
     *
     * <p>Lines before the content header are scanned for the id, title, URL, author
     * and source prefixes; everything after the header (and the separator line that
     * follows it) is the article body.
     *
     * <p>NOTE(review): publish and crawl dates are written on save but not restored
     * here; the corresponding setters on {@code Article} are not visible in this file.
     *
     * @param file the article file to read
     * @return a new article populated from the file; content is left unset when the body is empty
     * @throws IOException if the file cannot be read
     */
    private static Article loadSingleArticle(File file) throws IOException {
        Article article = new Article();
        StringBuilder content = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            boolean inContent = false;
            boolean firstContentLine = true;
            String line;
            while ((line = reader.readLine()) != null) {
                if (inContent) {
                    // Track "first line" explicitly so leading blank lines of the body survive.
                    if (!firstContentLine) {
                        content.append('\n');
                    }
                    content.append(line);
                    firstContentLine = false;
                } else if (line.trim().equals(CONTENT_HEADER)) {
                    // Match the header line exactly: a title that merely contains the
                    // phrase must not switch the parser into content mode.
                    inContent = true;
                    // Skip the separator line that follows the header.
                    reader.readLine();
                } else {
                    parseMetadataLine(article, line);
                }
            }
        }
        // The writer pads the body with "\n\n", which reads back as exactly one
        // trailing newline after joining; drop it so save/load round-trips the content.
        int length = content.length();
        if (length > 0 && content.charAt(length - 1) == '\n') {
            content.setLength(length - 1);
        }
        if (content.length() > 0) {
            article.setContent(content.toString());
        }
        return article;
    }

    /** Copies the value of a recognised "prefix: value" metadata line into the article. */
    private static void parseMetadataLine(Article article, String line) {
        if (line.startsWith(ID_PREFIX)) {
            article.setId(line.substring(ID_PREFIX.length()));
        } else if (line.startsWith(TITLE_PREFIX)) {
            article.setTitle(line.substring(TITLE_PREFIX.length()));
        } else if (line.startsWith(URL_PREFIX)) {
            article.setUrl(line.substring(URL_PREFIX.length()));
        } else if (line.startsWith(AUTHOR_PREFIX)) {
            article.setAuthor(line.substring(AUTHOR_PREFIX.length()));
        } else if (line.startsWith(SOURCE_PREFIX)) {
            article.setSource(line.substring(SOURCE_PREFIX.length()));
        }
    }
}