6 changed files with 250 additions and 0 deletions
@ -0,0 +1,11 @@ |
|||||
|
package com.crawler.exception; |
||||
|
|
||||
|
/**
 * Base unchecked exception of the {@code com.crawler.exception} package.
 *
 * <p>It extends {@link RuntimeException}, so callers are not forced to declare or catch it.
 */
public class CrawlerException extends RuntimeException {

    /** Explicit serial version; exceptions are serializable through {@link Throwable}. */
    private static final long serialVersionUID = 1L;

    /**
     * Creates an exception carrying only a detail message.
     *
     * @param message the detail message, later available through {@link #getMessage()}
     */
    public CrawlerException(String message) {
        super(message);
    }

    /**
     * Creates an exception carrying a detail message and the underlying cause.
     *
     * @param message the detail message, later available through {@link #getMessage()}
     * @param cause   the underlying failure, later available through {@link #getCause()}
     */
    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }
}
||||
@ -0,0 +1,11 @@ |
|||||
|
package com.crawler.exception; |
||||
|
|
||||
|
public class NetworkException extends CrawlerException { |
||||
|
public NetworkException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
|
||||
|
public NetworkException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,11 @@ |
|||||
|
package com.crawler.exception; |
||||
|
|
||||
|
public class ParseException extends CrawlerException { |
||||
|
public ParseException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
|
||||
|
public ParseException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,11 @@ |
|||||
|
package com.crawler.exception; |
||||
|
|
||||
|
public class UrlFormatException extends CrawlerException { |
||||
|
public UrlFormatException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
|
||||
|
public UrlFormatException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,54 @@ |
|||||
|
package com.crawler.util; |
||||
|
|
||||
|
/**
 * ANSI escape-sequence constants and helpers for decorating console text.
 *
 * <p>Every helper wraps the given text as {@code <sequence> + text + RESET}. The class only
 * holds static members and cannot be instantiated.
 */
public final class ColorUtil {

    /** Sequence appended by every helper after the decorated text. */
    public static final String RESET = "\u001B[0m";

    // Foreground colours.
    public static final String BLACK = "\u001B[30m";
    public static final String RED = "\u001B[31m";
    public static final String GREEN = "\u001B[32m";
    public static final String YELLOW = "\u001B[33m";
    public static final String BLUE = "\u001B[34m";
    public static final String PURPLE = "\u001B[35m";
    public static final String CYAN = "\u001B[36m";
    public static final String WHITE = "\u001B[37m";

    // Background colours.
    public static final String BLACK_BG = "\u001B[40m";
    public static final String RED_BG = "\u001B[41m";
    public static final String GREEN_BG = "\u001B[42m";
    public static final String YELLOW_BG = "\u001B[43m";
    public static final String BLUE_BG = "\u001B[44m";
    public static final String PURPLE_BG = "\u001B[45m";
    public static final String CYAN_BG = "\u001B[46m";
    public static final String WHITE_BG = "\u001B[47m";

    /** Sequence used by {@link #bold(String)}. */
    public static final String BOLD = "\u001B[1m";

    /** Static utility holder; not meant to be instantiated. */
    private ColorUtil() {
    }

    /**
     * Wraps {@code text} between the given escape sequence and {@link #RESET}.
     *
     * @param text  the text to decorate
     * @param color the escape sequence to prepend, e.g. {@link #GREEN}
     * @return {@code color + text + RESET}
     */
    public static String colorize(String text, String color) {
        return color + text + RESET;
    }

    /**
     * @param text the text to decorate
     * @return {@code text} wrapped with {@link #GREEN} and {@link #RESET}
     */
    public static String green(String text) {
        return colorize(text, GREEN);
    }

    /**
     * @param text the text to decorate
     * @return {@code text} wrapped with {@link #RED} and {@link #RESET}
     */
    public static String red(String text) {
        return colorize(text, RED);
    }

    /**
     * @param text the text to decorate
     * @return {@code text} wrapped with {@link #YELLOW} and {@link #RESET}
     */
    public static String yellow(String text) {
        return colorize(text, YELLOW);
    }

    /**
     * @param text the text to decorate
     * @return {@code text} wrapped with {@link #BLUE} and {@link #RESET}
     */
    public static String blue(String text) {
        return colorize(text, BLUE);
    }

    /**
     * @param text the text to decorate
     * @return {@code text} wrapped with {@link #CYAN} and {@link #RESET}
     */
    public static String cyan(String text) {
        return colorize(text, CYAN);
    }

    /**
     * @param text the text to decorate
     * @return {@code text} wrapped with {@link #PURPLE} and {@link #RESET}
     */
    public static String purple(String text) {
        return colorize(text, PURPLE);
    }

    /**
     * @param text the text to decorate
     * @return {@code text} wrapped with {@link #BOLD} and {@link #RESET}
     */
    public static String bold(String text) {
        return colorize(text, BOLD);
    }
}
||||
@ -0,0 +1,152 @@ |
|||||
|
package com.crawler.util; |
||||
|
|
||||
|
import com.crawler.model.Article; |
||||
|
|
||||
|
import java.io.*; |
||||
|
import java.nio.charset.StandardCharsets; |
||||
|
import java.time.LocalDateTime; |
||||
|
import java.time.format.DateTimeFormatter; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class DataPersistence { |
||||
|
private static final String DATA_FOLDER = "data"; |
||||
|
private static final String INDEX_FILE = DATA_FOLDER + File.separator + "index.txt"; |
||||
|
private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
||||
|
|
||||
|
static { |
||||
|
File folder = new File(DATA_FOLDER); |
||||
|
if (!folder.exists()) { |
||||
|
folder.mkdirs(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static void saveArticles(List<Article> articles) { |
||||
|
try { |
||||
|
for (Article article : articles) { |
||||
|
saveSingleArticle(article); |
||||
|
} |
||||
|
saveIndex(articles); |
||||
|
System.out.println(ColorUtil.green("✓ Saved " + articles.size() + " articles to '" + DATA_FOLDER + "' folder")); |
||||
|
} catch (Exception e) { |
||||
|
System.err.println(ColorUtil.red("✗ Failed to save articles: " + e.getMessage())); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private static void saveSingleArticle(Article article) throws IOException { |
||||
|
String filename = DATA_FOLDER + File.separator + "article_" + article.getId() + ".txt"; |
||||
|
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), StandardCharsets.UTF_8))) { |
||||
|
writer.write("========================================\n"); |
||||
|
writer.write(" 文章详细信息\n"); |
||||
|
writer.write("========================================\n\n"); |
||||
|
writer.write("ID: " + article.getId() + "\n"); |
||||
|
writer.write("标题: " + article.getTitle() + "\n"); |
||||
|
writer.write("URL: " + article.getUrl() + "\n"); |
||||
|
if (article.getAuthor() != null) { |
||||
|
writer.write("作者: " + article.getAuthor() + "\n"); |
||||
|
} |
||||
|
if (article.getSource() != null) { |
||||
|
writer.write("来源: " + article.getSource() + "\n"); |
||||
|
} |
||||
|
if (article.getPublishDate() != null) { |
||||
|
writer.write("发布时间: " + article.getPublishDate().format(DATE_FORMATTER) + "\n"); |
||||
|
} |
||||
|
writer.write("爬取时间: " + article.getCrawlDate().format(DATE_FORMATTER) + "\n"); |
||||
|
writer.write("\n========================================\n"); |
||||
|
writer.write(" 文章内容\n"); |
||||
|
writer.write("========================================\n"); |
||||
|
if (article.getContent() != null) { |
||||
|
writer.write(article.getContent()); |
||||
|
} |
||||
|
writer.write("\n\n"); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private static void saveIndex(List<Article> articles) throws IOException { |
||||
|
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(INDEX_FILE), StandardCharsets.UTF_8))) { |
||||
|
writer.write("========================================\n"); |
||||
|
writer.write(" 文章索引\n"); |
||||
|
writer.write("========================================\n\n"); |
||||
|
writer.write("共有 " + articles.size() + " 篇文章\n\n"); |
||||
|
|
||||
|
for (Article article : articles) { |
||||
|
writer.write("[" + article.getId() + "] " + article.getTitle() + "\n"); |
||||
|
writer.write(" URL: " + article.getUrl() + "\n"); |
||||
|
writer.write(" 文件名: article_" + article.getId() + ".txt\n"); |
||||
|
if (article.getCrawlDate() != null) { |
||||
|
writer.write(" 爬取时间: " + article.getCrawlDate().format(DATE_FORMATTER) + "\n"); |
||||
|
} |
||||
|
writer.write("\n"); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static List<Article> loadArticles() { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
File indexFile = new File(INDEX_FILE); |
||||
|
if (!indexFile.exists()) { |
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
File folder = new File(DATA_FOLDER); |
||||
|
File[] files = folder.listFiles((dir, name) -> name.startsWith("article_") && name.endsWith(".txt")); |
||||
|
|
||||
|
if (files != null) { |
||||
|
for (File file : files) { |
||||
|
try { |
||||
|
Article article = loadSingleArticle(file); |
||||
|
if (article != null) { |
||||
|
articles.add(article); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
System.err.println(ColorUtil.yellow("⚠ 无法加载文件: " + file.getName())); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
System.out.println(ColorUtil.green("✓ Loaded " + articles.size() + " articles from '" + DATA_FOLDER + "' folder")); |
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
private static Article loadSingleArticle(File file) throws IOException { |
||||
|
Article article = new Article(); |
||||
|
try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) { |
||||
|
String line; |
||||
|
StringBuilder content = new StringBuilder(); |
||||
|
boolean inContent = false; |
||||
|
|
||||
|
while ((line = reader.readLine()) != null) { |
||||
|
if (line.contains("文章内容")) { |
||||
|
inContent = true; |
||||
|
// Skip the next separator line
|
||||
|
reader.readLine(); |
||||
|
continue; |
||||
|
} |
||||
|
|
||||
|
if (!inContent) { |
||||
|
if (line.startsWith("ID: ")) { |
||||
|
article.setId(line.substring(4)); |
||||
|
} else if (line.startsWith("标题: ")) { |
||||
|
article.setTitle(line.substring(4)); |
||||
|
} else if (line.startsWith("URL: ")) { |
||||
|
article.setUrl(line.substring(5)); |
||||
|
} else if (line.startsWith("作者: ")) { |
||||
|
article.setAuthor(line.substring(4)); |
||||
|
} else if (line.startsWith("来源: ")) { |
||||
|
article.setSource(line.substring(4)); |
||||
|
} |
||||
|
} else { |
||||
|
if (content.length() > 0) { |
||||
|
content.append("\n"); |
||||
|
} |
||||
|
content.append(line); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (content.length() > 0) { |
||||
|
article.setContent(content.toString()); |
||||
|
} |
||||
|
} |
||||
|
return article; |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue