diff --git a/w11/exception/CrawlerException.java b/w11/exception/CrawlerException.java
new file mode 100644
index 0000000..ff2583c
--- /dev/null
+++ b/w11/exception/CrawlerException.java
@@ -0,0 +1,11 @@
+package com.crawler.exception;
+
+public class CrawlerException extends RuntimeException { // Unchecked base type; NetworkException, ParseException and UrlFormatException below all extend it.
+    public CrawlerException(String message) { // Creates the exception with a detail message and no cause.
+        super(message);
+    }
+
+    public CrawlerException(String message, Throwable cause) { // Creates the exception with a detail message and the underlying cause.
+        super(message, cause);
+    }
+}
diff --git a/w11/exception/NetworkException.java b/w11/exception/NetworkException.java
new file mode 100644
index 0000000..8d8b9e7
--- /dev/null
+++ b/w11/exception/NetworkException.java
@@ -0,0 +1,11 @@
+package com.crawler.exception;
+
+public class NetworkException extends CrawlerException { // CrawlerException subtype; presumably raised for network/HTTP failures (usage not visible here, confirm at call sites).
+    public NetworkException(String message) { // Message-only variant; delegates to CrawlerException.
+        super(message);
+    }
+
+    public NetworkException(String message, Throwable cause) { // Message-plus-cause variant; delegates to CrawlerException.
+        super(message, cause);
+    }
+}
diff --git a/w11/exception/ParseException.java b/w11/exception/ParseException.java
new file mode 100644
index 0000000..9248f23
--- /dev/null
+++ b/w11/exception/ParseException.java
@@ -0,0 +1,11 @@
+package com.crawler.exception;
+
+public class ParseException extends CrawlerException { // CrawlerException subtype; unchecked, unlike java.text.ParseException which shares its simple name. Presumably raised for parsing failures (confirm at call sites).
+    public ParseException(String message) { // Message-only variant; delegates to CrawlerException.
+        super(message);
+    }
+
+    public ParseException(String message, Throwable cause) { // Message-plus-cause variant; delegates to CrawlerException.
+        super(message, cause);
+    }
+}
diff --git a/w11/exception/UrlFormatException.java b/w11/exception/UrlFormatException.java
new file mode 100644
index 0000000..3a60fab
--- /dev/null
+++ b/w11/exception/UrlFormatException.java
@@ -0,0 +1,11 @@
+package com.crawler.exception;
+
+public class UrlFormatException extends CrawlerException { // CrawlerException subtype; presumably raised for malformed URLs (usage not visible here, confirm at call sites).
+    public UrlFormatException(String message) { // Message-only variant; delegates to CrawlerException.
+        super(message);
+    }
+
+    public UrlFormatException(String message, Throwable cause) { // Message-plus-cause variant; delegates to CrawlerException.
+        super(message, cause);
+    }
+}
diff --git a/w11/util/ColorUtil.java b/w11/util/ColorUtil.java
new file mode 100644
index 0000000..6003abc
--- /dev/null
+++ b/w11/util/ColorUtil.java
@@ -0,0 +1,54 @@
/**
 * Static helpers that wrap text in ANSI escape sequences for coloured console output.
 *
 * <p>Every helper appends {@link #RESET} after the text so the styling does not leak into
 * whatever is printed next. The class holds no state and is not meant to be instantiated.
 */
public final class ColorUtil {
    /** Clears every colour/style attribute set by a previous escape sequence. */
    public static final String RESET = "\u001B[0m";

    // Foreground colours.
    public static final String BLACK = "\u001B[30m";
    public static final String RED = "\u001B[31m";
    public static final String GREEN = "\u001B[32m";
    public static final String YELLOW = "\u001B[33m";
    public static final String BLUE = "\u001B[34m";
    public static final String PURPLE = "\u001B[35m";
    public static final String CYAN = "\u001B[36m";
    public static final String WHITE = "\u001B[37m";

    // Background colours.
    public static final String BLACK_BG = "\u001B[40m";
    public static final String RED_BG = "\u001B[41m";
    public static final String GREEN_BG = "\u001B[42m";
    public static final String YELLOW_BG = "\u001B[43m";
    public static final String BLUE_BG = "\u001B[44m";
    public static final String PURPLE_BG = "\u001B[45m";
    public static final String CYAN_BG = "\u001B[46m";
    public static final String WHITE_BG = "\u001B[47m";

    /** Bold/bright text attribute; used by {@link #bold(String)}. */
    public static final String BOLD = "\u001B[1m";

    /** Utility class: static members only, so instantiation is blocked. */
    private ColorUtil() {
    }

    /**
     * Wraps {@code text} in the given escape sequence followed by {@link #RESET}.
     *
     * @param text  the text to style; a {@code null} value is rendered as the string "null"
     * @param color one of the escape-sequence constants of this class; when {@code null} the text
     *              is returned without any escape sequence instead of being prefixed with "null"
     * @return the styled text, never {@code null}
     */
    public static String colorize(String text, String color) {
        if (color == null) {
            // Plain string concatenation would emit the literal "null" in front of the text.
            return String.valueOf(text);
        }
        return color + text + RESET;
    }

    /** Returns {@code text} in green. */
    public static String green(String text) {
        return colorize(text, GREEN);
    }

    /** Returns {@code text} in red. */
    public static String red(String text) {
        return colorize(text, RED);
    }

    /** Returns {@code text} in yellow. */
    public static String yellow(String text) {
        return colorize(text, YELLOW);
    }

    /** Returns {@code text} in blue. */
    public static String blue(String text) {
        return colorize(text, BLUE);
    }

    /** Returns {@code text} in cyan. */
    public static String cyan(String text) {
        return colorize(text, CYAN);
    }

    /** Returns {@code text} in purple. */
    public static String purple(String text) {
        return colorize(text, PURPLE);
    }

    /** Returns {@code text} in bold. */
    public static String bold(String text) {
        return colorize(text, BOLD);
    }
}
java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.List; + +public class DataPersistence { + private static final String DATA_FOLDER = "data"; + private static final String INDEX_FILE = DATA_FOLDER + File.separator + "index.txt"; + private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); + + static { + File folder = new File(DATA_FOLDER); + if (!folder.exists()) { + folder.mkdirs(); + } + } + + public static void saveArticles(List
articles) { + try { + for (Article article : articles) { + saveSingleArticle(article); + } + saveIndex(articles); + System.out.println(ColorUtil.green("✓ Saved " + articles.size() + " articles to '" + DATA_FOLDER + "' folder")); + } catch (Exception e) { + System.err.println(ColorUtil.red("✗ Failed to save articles: " + e.getMessage())); + } + } + + private static void saveSingleArticle(Article article) throws IOException { + String filename = DATA_FOLDER + File.separator + "article_" + article.getId() + ".txt"; + try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), StandardCharsets.UTF_8))) { + writer.write("========================================\n"); + writer.write(" 文章详细信息\n"); + writer.write("========================================\n\n"); + writer.write("ID: " + article.getId() + "\n"); + writer.write("标题: " + article.getTitle() + "\n"); + writer.write("URL: " + article.getUrl() + "\n"); + if (article.getAuthor() != null) { + writer.write("作者: " + article.getAuthor() + "\n"); + } + if (article.getSource() != null) { + writer.write("来源: " + article.getSource() + "\n"); + } + if (article.getPublishDate() != null) { + writer.write("发布时间: " + article.getPublishDate().format(DATE_FORMATTER) + "\n"); + } + writer.write("爬取时间: " + article.getCrawlDate().format(DATE_FORMATTER) + "\n"); + writer.write("\n========================================\n"); + writer.write(" 文章内容\n"); + writer.write("========================================\n"); + if (article.getContent() != null) { + writer.write(article.getContent()); + } + writer.write("\n\n"); + } + } + + private static void saveIndex(List
articles) throws IOException { + try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(INDEX_FILE), StandardCharsets.UTF_8))) { + writer.write("========================================\n"); + writer.write(" 文章索引\n"); + writer.write("========================================\n\n"); + writer.write("共有 " + articles.size() + " 篇文章\n\n"); + + for (Article article : articles) { + writer.write("[" + article.getId() + "] " + article.getTitle() + "\n"); + writer.write(" URL: " + article.getUrl() + "\n"); + writer.write(" 文件名: article_" + article.getId() + ".txt\n"); + if (article.getCrawlDate() != null) { + writer.write(" 爬取时间: " + article.getCrawlDate().format(DATE_FORMATTER) + "\n"); + } + writer.write("\n"); + } + } + } + + public static List
loadArticles() { + List
articles = new ArrayList<>(); + File indexFile = new File(INDEX_FILE); + if (!indexFile.exists()) { + return articles; + } + + File folder = new File(DATA_FOLDER); + File[] files = folder.listFiles((dir, name) -> name.startsWith("article_") && name.endsWith(".txt")); + + if (files != null) { + for (File file : files) { + try { + Article article = loadSingleArticle(file); + if (article != null) { + articles.add(article); + } + } catch (Exception e) { + System.err.println(ColorUtil.yellow("⚠ 无法加载文件: " + file.getName())); + } + } + } + + System.out.println(ColorUtil.green("✓ Loaded " + articles.size() + " articles from '" + DATA_FOLDER + "' folder")); + return articles; + } + + private static Article loadSingleArticle(File file) throws IOException { + Article article = new Article(); + try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) { + String line; + StringBuilder content = new StringBuilder(); + boolean inContent = false; + + while ((line = reader.readLine()) != null) { + if (line.contains("文章内容")) { + inContent = true; + // Skip the next separator line + reader.readLine(); + continue; + } + + if (!inContent) { + if (line.startsWith("ID: ")) { + article.setId(line.substring(4)); + } else if (line.startsWith("标题: ")) { + article.setTitle(line.substring(4)); + } else if (line.startsWith("URL: ")) { + article.setUrl(line.substring(5)); + } else if (line.startsWith("作者: ")) { + article.setAuthor(line.substring(4)); + } else if (line.startsWith("来源: ")) { + article.setSource(line.substring(4)); + } + } else { + if (content.length() > 0) { + content.append("\n"); + } + content.append(line); + } + } + + if (content.length() > 0) { + article.setContent(content.toString()); + } + } + return article; + } +}