6 changed files with 250 additions and 0 deletions
@ -0,0 +1,11 @@ |
|||
package com.crawler.exception; |
|||
|
|||
/**
 * Base unchecked exception of the {@code com.crawler.exception} package.
 *
 * <p>It adds no state of its own; both constructors simply forward to
 * {@link RuntimeException}.
 */
public class CrawlerException extends RuntimeException {

    /** Explicit serialization version so the serialized form stays stable across recompiles. */
    private static final long serialVersionUID = 1L;

    /**
     * Creates an exception with a detail message and no cause.
     *
     * @param message detail message, later available through {@link #getMessage()}
     */
    public CrawlerException(String message) {
        super(message);
    }

    /**
     * Creates an exception with a detail message and the underlying cause.
     *
     * @param message detail message, later available through {@link #getMessage()}
     * @param cause   the throwable that triggered this exception, later available
     *                through {@link #getCause()}
     */
    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }
}
|||
@ -0,0 +1,11 @@ |
|||
package com.crawler.exception; |
|||
|
|||
public class NetworkException extends CrawlerException { |
|||
public NetworkException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public NetworkException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,11 @@ |
|||
package com.crawler.exception; |
|||
|
|||
public class ParseException extends CrawlerException { |
|||
public ParseException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public ParseException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,11 @@ |
|||
package com.crawler.exception; |
|||
|
|||
public class UrlFormatException extends CrawlerException { |
|||
public UrlFormatException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public UrlFormatException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,54 @@ |
|||
package com.crawler.util; |
|||
|
|||
/**
 * ANSI escape sequences for console colouring, plus helpers that wrap a piece
 * of text in a sequence and terminate it with {@link #RESET}.
 */
public class ColorUtil {

    /** Shared prefix (ESC followed by '[') of every sequence declared below. */
    private static final String CSI = "\u001B[";

    /** Sequence appended by every helper after the wrapped text. */
    public static final String RESET = CSI + "0m";

    // Foreground colours (codes 30-37).
    public static final String BLACK = CSI + "30m";
    public static final String RED = CSI + "31m";
    public static final String GREEN = CSI + "32m";
    public static final String YELLOW = CSI + "33m";
    public static final String BLUE = CSI + "34m";
    public static final String PURPLE = CSI + "35m";
    public static final String CYAN = CSI + "36m";
    public static final String WHITE = CSI + "37m";

    // Background colours (codes 40-47).
    public static final String BLACK_BG = CSI + "40m";
    public static final String RED_BG = CSI + "41m";
    public static final String GREEN_BG = CSI + "42m";
    public static final String YELLOW_BG = CSI + "43m";
    public static final String BLUE_BG = CSI + "44m";
    public static final String PURPLE_BG = CSI + "45m";
    public static final String CYAN_BG = CSI + "46m";
    public static final String WHITE_BG = CSI + "47m";

    /** Sequence used by {@link #bold(String)}; kept private because it was never a public constant. */
    private static final String BOLD = CSI + "1m";

    /**
     * Surrounds {@code text} with {@code color} in front and {@link #RESET} behind.
     *
     * @param text  the text to wrap; a {@code null} is rendered as the string "null"
     * @param color the escape sequence to emit before the text
     * @return {@code color}, {@code text} and {@link #RESET} concatenated in that order
     */
    public static String colorize(String text, String color) {
        StringBuilder wrapped = new StringBuilder();
        wrapped.append(color);
        wrapped.append(text);
        wrapped.append(RESET);
        return wrapped.toString();
    }

    /** Wraps {@code text} in {@link #GREEN}. */
    public static String green(String text) {
        return colorize(text, GREEN);
    }

    /** Wraps {@code text} in {@link #RED}. */
    public static String red(String text) {
        return colorize(text, RED);
    }

    /** Wraps {@code text} in {@link #YELLOW}. */
    public static String yellow(String text) {
        return colorize(text, YELLOW);
    }

    /** Wraps {@code text} in {@link #BLUE}. */
    public static String blue(String text) {
        return colorize(text, BLUE);
    }

    /** Wraps {@code text} in {@link #CYAN}. */
    public static String cyan(String text) {
        return colorize(text, CYAN);
    }

    /** Wraps {@code text} in {@link #PURPLE}. */
    public static String purple(String text) {
        return colorize(text, PURPLE);
    }

    /** Wraps {@code text} in the bold sequence (code 1) followed by {@link #RESET}. */
    public static String bold(String text) {
        return colorize(text, BOLD);
    }
}
|||
@ -0,0 +1,152 @@ |
|||
package com.crawler.util; |
|||
|
|||
import com.crawler.model.Article; |
|||
|
|||
import java.io.*; |
|||
import java.nio.charset.StandardCharsets; |
|||
import java.time.LocalDateTime; |
|||
import java.time.format.DateTimeFormatter; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class DataPersistence { |
|||
private static final String DATA_FOLDER = "data"; |
|||
private static final String INDEX_FILE = DATA_FOLDER + File.separator + "index.txt"; |
|||
private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
|
|||
static { |
|||
File folder = new File(DATA_FOLDER); |
|||
if (!folder.exists()) { |
|||
folder.mkdirs(); |
|||
} |
|||
} |
|||
|
|||
public static void saveArticles(List<Article> articles) { |
|||
try { |
|||
for (Article article : articles) { |
|||
saveSingleArticle(article); |
|||
} |
|||
saveIndex(articles); |
|||
System.out.println(ColorUtil.green("✓ Saved " + articles.size() + " articles to '" + DATA_FOLDER + "' folder")); |
|||
} catch (Exception e) { |
|||
System.err.println(ColorUtil.red("✗ Failed to save articles: " + e.getMessage())); |
|||
} |
|||
} |
|||
|
|||
private static void saveSingleArticle(Article article) throws IOException { |
|||
String filename = DATA_FOLDER + File.separator + "article_" + article.getId() + ".txt"; |
|||
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), StandardCharsets.UTF_8))) { |
|||
writer.write("========================================\n"); |
|||
writer.write(" 文章详细信息\n"); |
|||
writer.write("========================================\n\n"); |
|||
writer.write("ID: " + article.getId() + "\n"); |
|||
writer.write("标题: " + article.getTitle() + "\n"); |
|||
writer.write("URL: " + article.getUrl() + "\n"); |
|||
if (article.getAuthor() != null) { |
|||
writer.write("作者: " + article.getAuthor() + "\n"); |
|||
} |
|||
if (article.getSource() != null) { |
|||
writer.write("来源: " + article.getSource() + "\n"); |
|||
} |
|||
if (article.getPublishDate() != null) { |
|||
writer.write("发布时间: " + article.getPublishDate().format(DATE_FORMATTER) + "\n"); |
|||
} |
|||
writer.write("爬取时间: " + article.getCrawlDate().format(DATE_FORMATTER) + "\n"); |
|||
writer.write("\n========================================\n"); |
|||
writer.write(" 文章内容\n"); |
|||
writer.write("========================================\n"); |
|||
if (article.getContent() != null) { |
|||
writer.write(article.getContent()); |
|||
} |
|||
writer.write("\n\n"); |
|||
} |
|||
} |
|||
|
|||
private static void saveIndex(List<Article> articles) throws IOException { |
|||
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(INDEX_FILE), StandardCharsets.UTF_8))) { |
|||
writer.write("========================================\n"); |
|||
writer.write(" 文章索引\n"); |
|||
writer.write("========================================\n\n"); |
|||
writer.write("共有 " + articles.size() + " 篇文章\n\n"); |
|||
|
|||
for (Article article : articles) { |
|||
writer.write("[" + article.getId() + "] " + article.getTitle() + "\n"); |
|||
writer.write(" URL: " + article.getUrl() + "\n"); |
|||
writer.write(" 文件名: article_" + article.getId() + ".txt\n"); |
|||
if (article.getCrawlDate() != null) { |
|||
writer.write(" 爬取时间: " + article.getCrawlDate().format(DATE_FORMATTER) + "\n"); |
|||
} |
|||
writer.write("\n"); |
|||
} |
|||
} |
|||
} |
|||
|
|||
public static List<Article> loadArticles() { |
|||
List<Article> articles = new ArrayList<>(); |
|||
File indexFile = new File(INDEX_FILE); |
|||
if (!indexFile.exists()) { |
|||
return articles; |
|||
} |
|||
|
|||
File folder = new File(DATA_FOLDER); |
|||
File[] files = folder.listFiles((dir, name) -> name.startsWith("article_") && name.endsWith(".txt")); |
|||
|
|||
if (files != null) { |
|||
for (File file : files) { |
|||
try { |
|||
Article article = loadSingleArticle(file); |
|||
if (article != null) { |
|||
articles.add(article); |
|||
} |
|||
} catch (Exception e) { |
|||
System.err.println(ColorUtil.yellow("⚠ 无法加载文件: " + file.getName())); |
|||
} |
|||
} |
|||
} |
|||
|
|||
System.out.println(ColorUtil.green("✓ Loaded " + articles.size() + " articles from '" + DATA_FOLDER + "' folder")); |
|||
return articles; |
|||
} |
|||
|
|||
private static Article loadSingleArticle(File file) throws IOException { |
|||
Article article = new Article(); |
|||
try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) { |
|||
String line; |
|||
StringBuilder content = new StringBuilder(); |
|||
boolean inContent = false; |
|||
|
|||
while ((line = reader.readLine()) != null) { |
|||
if (line.contains("文章内容")) { |
|||
inContent = true; |
|||
// Skip the next separator line
|
|||
reader.readLine(); |
|||
continue; |
|||
} |
|||
|
|||
if (!inContent) { |
|||
if (line.startsWith("ID: ")) { |
|||
article.setId(line.substring(4)); |
|||
} else if (line.startsWith("标题: ")) { |
|||
article.setTitle(line.substring(4)); |
|||
} else if (line.startsWith("URL: ")) { |
|||
article.setUrl(line.substring(5)); |
|||
} else if (line.startsWith("作者: ")) { |
|||
article.setAuthor(line.substring(4)); |
|||
} else if (line.startsWith("来源: ")) { |
|||
article.setSource(line.substring(4)); |
|||
} |
|||
} else { |
|||
if (content.length() > 0) { |
|||
content.append("\n"); |
|||
} |
|||
content.append(line); |
|||
} |
|||
} |
|||
|
|||
if (content.length() > 0) { |
|||
article.setContent(content.toString()); |
|||
} |
|||
} |
|||
return article; |
|||
} |
|||
} |
|||
Loading…
Reference in new issue