You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
152 lines
6.4 KiB
152 lines
6.4 KiB
package com.crawler.util;
|
|
|
|
import com.crawler.model.Article;
|
|
|
|
import java.io.*;
|
|
import java.nio.charset.StandardCharsets;
|
|
import java.time.LocalDateTime;
|
|
import java.time.format.DateTimeFormatter;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
|
|
public class DataPersistence {
|
|
private static final String DATA_FOLDER = "data";
|
|
private static final String INDEX_FILE = DATA_FOLDER + File.separator + "index.txt";
|
|
private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
|
|
|
|
static {
|
|
File folder = new File(DATA_FOLDER);
|
|
if (!folder.exists()) {
|
|
folder.mkdirs();
|
|
}
|
|
}
|
|
|
|
public static void saveArticles(List<Article> articles) {
|
|
try {
|
|
for (Article article : articles) {
|
|
saveSingleArticle(article);
|
|
}
|
|
saveIndex(articles);
|
|
System.out.println(ColorUtil.green("✓ Saved " + articles.size() + " articles to '" + DATA_FOLDER + "' folder"));
|
|
} catch (Exception e) {
|
|
System.err.println(ColorUtil.red("✗ Failed to save articles: " + e.getMessage()));
|
|
}
|
|
}
|
|
|
|
private static void saveSingleArticle(Article article) throws IOException {
|
|
String filename = DATA_FOLDER + File.separator + "article_" + article.getId() + ".txt";
|
|
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), StandardCharsets.UTF_8))) {
|
|
writer.write("========================================\n");
|
|
writer.write(" 文章详细信息\n");
|
|
writer.write("========================================\n\n");
|
|
writer.write("ID: " + article.getId() + "\n");
|
|
writer.write("标题: " + article.getTitle() + "\n");
|
|
writer.write("URL: " + article.getUrl() + "\n");
|
|
if (article.getAuthor() != null) {
|
|
writer.write("作者: " + article.getAuthor() + "\n");
|
|
}
|
|
if (article.getSource() != null) {
|
|
writer.write("来源: " + article.getSource() + "\n");
|
|
}
|
|
if (article.getPublishDate() != null) {
|
|
writer.write("发布时间: " + article.getPublishDate().format(DATE_FORMATTER) + "\n");
|
|
}
|
|
writer.write("爬取时间: " + article.getCrawlDate().format(DATE_FORMATTER) + "\n");
|
|
writer.write("\n========================================\n");
|
|
writer.write(" 文章内容\n");
|
|
writer.write("========================================\n");
|
|
if (article.getContent() != null) {
|
|
writer.write(article.getContent());
|
|
}
|
|
writer.write("\n\n");
|
|
}
|
|
}
|
|
|
|
private static void saveIndex(List<Article> articles) throws IOException {
|
|
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(INDEX_FILE), StandardCharsets.UTF_8))) {
|
|
writer.write("========================================\n");
|
|
writer.write(" 文章索引\n");
|
|
writer.write("========================================\n\n");
|
|
writer.write("共有 " + articles.size() + " 篇文章\n\n");
|
|
|
|
for (Article article : articles) {
|
|
writer.write("[" + article.getId() + "] " + article.getTitle() + "\n");
|
|
writer.write(" URL: " + article.getUrl() + "\n");
|
|
writer.write(" 文件名: article_" + article.getId() + ".txt\n");
|
|
if (article.getCrawlDate() != null) {
|
|
writer.write(" 爬取时间: " + article.getCrawlDate().format(DATE_FORMATTER) + "\n");
|
|
}
|
|
writer.write("\n");
|
|
}
|
|
}
|
|
}
|
|
|
|
public static List<Article> loadArticles() {
|
|
List<Article> articles = new ArrayList<>();
|
|
File indexFile = new File(INDEX_FILE);
|
|
if (!indexFile.exists()) {
|
|
return articles;
|
|
}
|
|
|
|
File folder = new File(DATA_FOLDER);
|
|
File[] files = folder.listFiles((dir, name) -> name.startsWith("article_") && name.endsWith(".txt"));
|
|
|
|
if (files != null) {
|
|
for (File file : files) {
|
|
try {
|
|
Article article = loadSingleArticle(file);
|
|
if (article != null) {
|
|
articles.add(article);
|
|
}
|
|
} catch (Exception e) {
|
|
System.err.println(ColorUtil.yellow("⚠ 无法加载文件: " + file.getName()));
|
|
}
|
|
}
|
|
}
|
|
|
|
System.out.println(ColorUtil.green("✓ Loaded " + articles.size() + " articles from '" + DATA_FOLDER + "' folder"));
|
|
return articles;
|
|
}
|
|
|
|
private static Article loadSingleArticle(File file) throws IOException {
|
|
Article article = new Article();
|
|
try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
|
|
String line;
|
|
StringBuilder content = new StringBuilder();
|
|
boolean inContent = false;
|
|
|
|
while ((line = reader.readLine()) != null) {
|
|
if (line.contains("文章内容")) {
|
|
inContent = true;
|
|
// Skip the next separator line
|
|
reader.readLine();
|
|
continue;
|
|
}
|
|
|
|
if (!inContent) {
|
|
if (line.startsWith("ID: ")) {
|
|
article.setId(line.substring(4));
|
|
} else if (line.startsWith("标题: ")) {
|
|
article.setTitle(line.substring(4));
|
|
} else if (line.startsWith("URL: ")) {
|
|
article.setUrl(line.substring(5));
|
|
} else if (line.startsWith("作者: ")) {
|
|
article.setAuthor(line.substring(4));
|
|
} else if (line.startsWith("来源: ")) {
|
|
article.setSource(line.substring(4));
|
|
}
|
|
} else {
|
|
if (content.length() > 0) {
|
|
content.append("\n");
|
|
}
|
|
content.append(line);
|
|
}
|
|
}
|
|
|
|
if (content.length() > 0) {
|
|
article.setContent(content.toString());
|
|
}
|
|
}
|
|
return article;
|
|
}
|
|
}
|
|
|