From 4b8cda09c81642a626cbd5dd7288509cd423f71f Mon Sep 17 00:00:00 2001
From: XuJiexian <3445002374@qq.com>
Date: Sat, 30 May 2026 21:27:35 +0800
Subject: [PATCH] =?UTF-8?q?=E6=8F=90=E4=BA=A4=20w11=20=E4=BD=9C=E4=B8=9A?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (this section is ignored by git am):
- Line structure, generic type arguments, <url> placeholders and the XML
  markup of pom.xml / logback.xml were restored from a tag-stripped copy.
- Hunk bodies were revised during review, so the blob ids on the "index"
  lines still refer to the original commit, not to the content below.

 w11/java-cli/.gitignore                       |   4 +
 w11/java-cli/README.md                        |  17 ++
 w11/java-cli/pom.xml                          |  64 +++++++++
 .../java/com/example/datacollect/Main.java    |  41 ++++++
 .../example/datacollect/command/Command.java  |  17 ++
 .../datacollect/command/CrawlCommand.java     | 126 ++++++++++++++++++
 .../datacollect/command/ExitCommand.java      |  28 ++++
 .../datacollect/command/HelpCommand.java      |  27 ++++
 .../datacollect/command/HistoryCommand.java   |  67 ++++++++++
 .../datacollect/command/ListCommand.java      |  27 ++++
 .../controller/CrawlerController.java         |  72 ++++++++++
 .../exception/CrawlerException.java           |  12 ++
 .../exception/NetworkException.java           |  12 ++
 .../datacollect/exception/ParseException.java |  12 ++
 .../example/datacollect/model/Article.java    |  80 +++++++++++
 .../repository/ArticleRepository.java         |  52 +++++++
 .../datacollect/strategy/BlogStrategy.java    |  59 ++++++++
 .../datacollect/strategy/CrawlStrategy.java   |  22 +++
 .../datacollect/strategy/HnuNewsStrategy.java |  71 ++++++++++
 .../datacollect/strategy/NewsStrategy.java    |  34 +++++
 .../datacollect/strategy/StrategyFactory.java |  42 ++++++
 .../example/datacollect/view/ConsoleView.java |  52 +++++++
 w11/java-cli/src/main/resources/logback.xml   |  26 ++++
 w11/logs/crawler.log                          |   3 +
 24 files changed, 967 insertions(+)
 create mode 100644 w11/java-cli/.gitignore
 create mode 100644 w11/java-cli/README.md
 create mode 100644 w11/java-cli/pom.xml
 create mode 100644 w11/java-cli/src/main/java/com/example/datacollect/Main.java
 create mode 100644 w11/java-cli/src/main/java/com/example/datacollect/command/Command.java
 create mode 100644 w11/java-cli/src/main/java/com/example/datacollect/command/CrawlCommand.java
 create mode 100644 w11/java-cli/src/main/java/com/example/datacollect/command/ExitCommand.java
 create mode 100644 w11/java-cli/src/main/java/com/example/datacollect/command/HelpCommand.java
 create mode 100644 w11/java-cli/src/main/java/com/example/datacollect/command/HistoryCommand.java
 create mode 100644 w11/java-cli/src/main/java/com/example/datacollect/command/ListCommand.java
 create mode 100644 w11/java-cli/src/main/java/com/example/datacollect/controller/CrawlerController.java
 create mode 100644 w11/java-cli/src/main/java/com/example/datacollect/exception/CrawlerException.java
 create mode 100644 w11/java-cli/src/main/java/com/example/datacollect/exception/NetworkException.java
 create mode 100644 w11/java-cli/src/main/java/com/example/datacollect/exception/ParseException.java
 create mode 100644 w11/java-cli/src/main/java/com/example/datacollect/model/Article.java
 create mode 100644 w11/java-cli/src/main/java/com/example/datacollect/repository/ArticleRepository.java
 create mode 100644 w11/java-cli/src/main/java/com/example/datacollect/strategy/BlogStrategy.java
 create mode 100644 w11/java-cli/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java
 create mode 100644 w11/java-cli/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java
 create mode 100644 w11/java-cli/src/main/java/com/example/datacollect/strategy/NewsStrategy.java
 create mode 100644 w11/java-cli/src/main/java/com/example/datacollect/strategy/StrategyFactory.java
 create mode 100644 w11/java-cli/src/main/java/com/example/datacollect/view/ConsoleView.java
 create mode 100644 w11/java-cli/src/main/resources/logback.xml
 create mode 100644 w11/logs/crawler.log

diff --git a/w11/java-cli/.gitignore b/w11/java-cli/.gitignore
new file mode 100644
index 0000000..0ebcf1a
--- /dev/null
+++ b/w11/java-cli/.gitignore
@@ -0,0 +1,4 @@
+target/
+*.jar
+*.class
+*.log
\ No newline at end of file
diff --git a/w11/java-cli/README.md b/w11/java-cli/README.md
new file mode 100644
index 0000000..3ea02ec
--- /dev/null
+++ b/w11/java-cli/README.md
@@ -0,0 +1,17 @@
+# DataCollect 教学项目 — 最小可运行版本
+
+这是一个最小可用的 Java CLI 爬虫演示工程:启动后进入交互式命令行,支持 `crawl <url>`、`list`、`history`、`help`、`exit` 命令。
+
+构建:
+```bash
+mvn -q package
+```
+
+运行(示例):
+```bash
+java -jar target/datacollect-cli-0.1.0-jar-with-dependencies.jar
+```
+
+项目结构(最小):
+- `src/main/java/com/example/datacollect/Main.java` — CLI 入口,启动交互式命令循环
+- `pom.xml` — Maven 构建配置,生成可执行 jar
diff --git a/w11/java-cli/pom.xml b/w11/java-cli/pom.xml
new file mode 100644
index 0000000..a738b40
--- /dev/null
+++ b/w11/java-cli/pom.xml
@@ -0,0 +1,64 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <!-- NOTE(review): element markup reconstructed from a tag-stripped copy; verify against the repository. -->
+    <modelVersion>4.0.0</modelVersion>
+    <groupId>com.example</groupId>
+    <artifactId>datacollect-cli</artifactId>
+    <version>0.1.0</version>
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <maven.compiler.source>11</maven.compiler.source>
+        <maven.compiler.target>11</maven.compiler.target>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+            <version>1.14.3</version>
+        </dependency>
+        <dependency>
+            <groupId>ch.qos.logback</groupId>
+            <artifactId>logback-classic</artifactId>
+            <version>1.2.11</version>
+        </dependency>
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-api</artifactId>
+            <version>1.7.36</version>
+        </dependency>
+    </dependencies>
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.8.1</version>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-assembly-plugin</artifactId>
+                <version>3.3.0</version>
+                <configuration>
+                    <archive>
+                        <manifest>
+                            <mainClass>com.example.datacollect.Main</mainClass>
+                        </manifest>
+                    </archive>
+                    <descriptorRefs>
+                        <descriptorRef>jar-with-dependencies</descriptorRef>
+                    </descriptorRefs>
+                </configuration>
+                <executions>
+                    <execution>
+                        <id>make-assembly</id>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>single</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+</project>
\ No newline at end of file
diff --git a/w11/java-cli/src/main/java/com/example/datacollect/Main.java b/w11/java-cli/src/main/java/com/example/datacollect/Main.java
new file mode 100644
index 0000000..7839ef5
--- /dev/null
+++ b/w11/java-cli/src/main/java/com/example/datacollect/Main.java
@@ -0,0 +1,41 @@
+package com.example.datacollect;
+
+import com.example.datacollect.controller.CrawlerController;
+import com.example.datacollect.repository.ArticleRepository;
+import com.example.datacollect.strategy.StrategyFactory;
+import com.example.datacollect.view.ConsoleView;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Entry point of the interactive crawler CLI: wires the view, repository and
+ * strategies into a {@link CrawlerController} and feeds it console input.
+ */
+public class Main {
+    private static final Logger logger = LoggerFactory.getLogger(Main.class);
+
+    /**
+     * Runs the read-dispatch loop until the {@code exit} command terminates the
+     * JVM or standard input is exhausted.
+     *
+     * @param args command-line arguments (currently ignored)
+     */
+    public static void main(String[] args) {
+        logger.info("Starting CLI Crawler W11");
+        ConsoleView view = new ConsoleView();
+        ArticleRepository repository = new ArticleRepository();
+        StrategyFactory strategyFactory = new StrategyFactory();
+        CrawlerController controller = new CrawlerController(view, repository, strategyFactory);
+
+        view.printSuccess("Welcome to CLI Crawler (W11)! Type help for commands.");
+        while (true) {
+            String line = view.readLine();
+            if (line == null) {
+                // End of input (Ctrl-D or a finished pipe): stop instead of crashing.
+                logger.info("Input stream closed, exiting");
+                break;
+            }
+            controller.handle(line);
+        }
+    }
+}
\ No newline at end of file
diff --git a/w11/java-cli/src/main/java/com/example/datacollect/command/Command.java b/w11/java-cli/src/main/java/com/example/datacollect/command/Command.java
new file mode 100644
index 0000000..029cadc
--- /dev/null
+++ b/w11/java-cli/src/main/java/com/example/datacollect/command/Command.java
@@ -0,0 +1,17 @@
+package com.example.datacollect.command;
+
+import com.example.datacollect.repository.ArticleRepository;
+
+/** A named CLI command dispatched by the controller. */
+public interface Command {
+    /** Returns the lower-case keyword the user types to invoke this command. */
+    String getName();
+
+    /**
+     * Runs the command.
+     *
+     * @param args       whitespace-split input line; {@code args[0]} is the command name
+     * @param repository article store shared by all commands
+     */
+    void execute(String[] args, ArticleRepository repository);
+}
diff --git a/w11/java-cli/src/main/java/com/example/datacollect/command/CrawlCommand.java b/w11/java-cli/src/main/java/com/example/datacollect/command/CrawlCommand.java
new file mode 100644
index 0000000..8a541c7
--- /dev/null
+++ b/w11/java-cli/src/main/java/com/example/datacollect/command/CrawlCommand.java
@@ -0,0 +1,126 @@
+package com.example.datacollect.command;
+
+import com.example.datacollect.exception.CrawlerException;
+import com.example.datacollect.exception.NetworkException;
+import com.example.datacollect.exception.ParseException;
+import com.example.datacollect.model.Article;
+import com.example.datacollect.repository.ArticleRepository;
+import com.example.datacollect.strategy.CrawlStrategy;
+import com.example.datacollect.strategy.StrategyFactory;
+import com.example.datacollect.view.ConsoleView;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * {@code crawl <url>}: downloads a page, parses it with the strategy that
+ * supports the URL and stores the resulting articles in the repository.
+ */
+public class CrawlCommand implements Command {
+    private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class);
+    private static final int MAX_RETRIES = 3;
+    private static final int RETRY_DELAY_MS = 1000;
+
+    private final ConsoleView view;
+    private final StrategyFactory strategyFactory;
+
+    public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) {
+        this.view = view;
+        this.strategyFactory = strategyFactory;
+    }
+
+    @Override
+    public String getName() {
+        return "crawl";
+    }
+
+    /**
+     * Fetches {@code args[1]} with up to {@code MAX_RETRIES} attempts. Only
+     * network failures are retried, after a linearly growing delay; any other
+     * failure aborts at once. The outcome is reported through the view.
+     */
+    @Override
+    public void execute(String[] args, ArticleRepository repository) {
+        if (args.length < 2) {
+            view.printError("Usage: crawl <url>");
+            return;
+        }
+
+        String url = args[1];
+        CrawlStrategy strategy = strategyFactory.getStrategy(url);
+
+        if (strategy == null) {
+            view.printError("No strategy found for URL: " + url);
+            return;
+        }
+
+        int attempts = 0;
+        Exception lastException = null;
+
+        while (attempts < MAX_RETRIES) {
+            attempts++;
+            try {
+                Document doc = fetchWithRetry(url, attempts);
+                List<Article> articles = strategy.parse(url, doc);
+
+                for (Article article : articles) {
+                    repository.add(article);
+                }
+
+                logger.info("Successfully crawled {} - {} article(s)", url, articles.size());
+                view.printSuccess("Crawled " + articles.size() + " article(s) from " + url);
+                return;
+            } catch (NetworkException e) {
+                lastException = e;
+                logger.warn("Network error fetching {} (attempt {}/{}): {}",
+                        url, attempts, MAX_RETRIES, e.getMessage());
+                if (attempts < MAX_RETRIES) {
+                    try {
+                        Thread.sleep(RETRY_DELAY_MS * attempts);
+                    } catch (InterruptedException ie) {
+                        Thread.currentThread().interrupt();
+                        break;
+                    }
+                }
+            } catch (ParseException e) {
+                lastException = e;
+                logger.error("Parse error for {} (attempt {}/{}): {}",
+                        url, attempts, MAX_RETRIES, e.getMessage());
+                break;
+            } catch (CrawlerException e) {
+                lastException = e;
+                logger.error("Crawler error for {}: {}", url, e.getMessage());
+                break;
+            } catch (Exception e) {
+                lastException = e;
+                logger.error("Unexpected error fetching {}: {}", url, e.getMessage(), e);
+                break;
+            }
+        }
+
+        logger.error("Failed to crawl {} after {} attempts", url, attempts);
+        view.printError("Failed to crawl: " + (lastException != null ? lastException.getMessage() : "Unknown error"));
+    }
+
+    /**
+     * Performs a single HTTP GET; retrying is left to {@link #execute}.
+     *
+     * @throws NetworkException if the request fails with an I/O error
+     */
+    private Document fetchWithRetry(String url, int attempt) throws NetworkException {
+        try {
+            logger.debug("Fetching {} (attempt {})", url, attempt);
+            return Jsoup.connect(url)
+                    .userAgent("Mozilla/5.0")
+                    .timeout(5000)
+                    .get();
+        } catch (IOException e) {
+            // Only I/O failures are worth retrying. A malformed URL surfaces as an
+            // IllegalArgumentException and now propagates, so execute() fails fast.
+            throw new NetworkException("Failed to fetch " + url, e);
+        }
+    }
+}
\ No newline at end of file
diff --git a/w11/java-cli/src/main/java/com/example/datacollect/command/ExitCommand.java b/w11/java-cli/src/main/java/com/example/datacollect/command/ExitCommand.java
new file mode 100644
index 0000000..fc1ccdb
--- /dev/null
+++ b/w11/java-cli/src/main/java/com/example/datacollect/command/ExitCommand.java
@@ -0,0 +1,28 @@
+package com.example.datacollect.command;
+
+import com.example.datacollect.repository.ArticleRepository;
+import com.example.datacollect.view.ConsoleView;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** {@code exit}: prints a farewell message and terminates the JVM with status 0. */
+public class ExitCommand implements Command {
+    private static final Logger logger = LoggerFactory.getLogger(ExitCommand.class);
+    private final ConsoleView view;
+
+    public ExitCommand(ConsoleView view) {
+        this.view = view;
+    }
+
+    @Override
+    public String getName() {
+        return "exit";
+    }
+
+    @Override
+    public void execute(String[] args, ArticleRepository repository) {
+        logger.info("User exiting application");
+        view.printSuccess("Bye!");
+        System.exit(0);
+    }
+}
\ No newline at end of file
diff --git a/w11/java-cli/src/main/java/com/example/datacollect/command/HelpCommand.java b/w11/java-cli/src/main/java/com/example/datacollect/command/HelpCommand.java
new file mode 100644
index 0000000..eae0377
--- /dev/null
+++ b/w11/java-cli/src/main/java/com/example/datacollect/command/HelpCommand.java
@@ -0,0 +1,27 @@
+package com.example.datacollect.command;
+
+import com.example.datacollect.repository.ArticleRepository;
+import com.example.datacollect.view.ConsoleView;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** {@code help}: prints the list of available commands. */
+public class HelpCommand implements Command {
+    private static final Logger logger = LoggerFactory.getLogger(HelpCommand.class);
+    private final ConsoleView view;
+
+    public HelpCommand(ConsoleView view) {
+        this.view = view;
+    }
+
+    @Override
+    public String getName() {
+        return "help";
+    }
+
+    @Override
+    public void execute(String[] args, ArticleRepository repository) {
+        logger.debug("Showing help");
+        view.printInfo("Commands: crawl <url>, list, help, history, exit");
+    }
+}
\ No newline at end of file
diff --git a/w11/java-cli/src/main/java/com/example/datacollect/command/HistoryCommand.java b/w11/java-cli/src/main/java/com/example/datacollect/command/HistoryCommand.java
new file mode 100644
index 0000000..82a4535
--- /dev/null
+++ b/w11/java-cli/src/main/java/com/example/datacollect/command/HistoryCommand.java
@@ -0,0 +1,67 @@
+package com.example.datacollect.command;
+
+import com.example.datacollect.repository.ArticleRepository;
+import com.example.datacollect.view.ConsoleView;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * {@code history}: records every non-empty input line (appended by the
+ * controller) and prints them as a 1-based numbered list.
+ */
+public class HistoryCommand implements Command {
+    private static final Logger logger = LoggerFactory.getLogger(HistoryCommand.class);
+    private final ConsoleView view;
+    private final List<String> commandHistory;
+
+    public HistoryCommand(ConsoleView view) {
+        this.view = view;
+        this.commandHistory = new ArrayList<>();
+    }
+
+    /** Appends one raw input line to the history. */
+    public void addCommand(String command) {
+        commandHistory.add(command);
+    }
+
+    /** Returns a copy of the history, oldest entry first. */
+    public List<String> getAllHistory() {
+        return new ArrayList<>(commandHistory);
+    }
+
+    /** Returns the entry at the 0-based {@code index}, or {@code null} if out of range. */
+    public String getCommand(int index) {
+        if (index < 0 || index >= commandHistory.size()) {
+            return null;
+        }
+        return commandHistory.get(index);
+    }
+
+    public void clearHistory() {
+        commandHistory.clear();
+    }
+
+    public int getHistorySize() {
+        return commandHistory.size();
+    }
+
+    @Override
+    public String getName() {
+        return "history";
+    }
+
+    @Override
+    public void execute(String[] args, ArticleRepository repository) {
+        if (commandHistory.isEmpty()) {
+            view.printInfo("No command history.");
+            return;
+        }
+
+        view.printInfo("Command History:");
+        for (int i = 0; i < commandHistory.size(); i++) {
+            view.printInfo((i + 1) + ". " + commandHistory.get(i));
+        }
+    }
+}
\ No newline at end of file
diff --git a/w11/java-cli/src/main/java/com/example/datacollect/command/ListCommand.java b/w11/java-cli/src/main/java/com/example/datacollect/command/ListCommand.java
new file mode 100644
index 0000000..9d7c650
--- /dev/null
+++ b/w11/java-cli/src/main/java/com/example/datacollect/command/ListCommand.java
@@ -0,0 +1,27 @@
+package com.example.datacollect.command;
+
+import com.example.datacollect.repository.ArticleRepository;
+import com.example.datacollect.view.ConsoleView;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** {@code list}: displays every article currently held by the repository. */
+public class ListCommand implements Command {
+    private static final Logger logger = LoggerFactory.getLogger(ListCommand.class);
+    private final ConsoleView view;
+
+    public ListCommand(ConsoleView view) {
+        this.view = view;
+    }
+
+    @Override
+    public String getName() {
+        return "list";
+    }
+
+    @Override
+    public void execute(String[] args, ArticleRepository repository) {
+        logger.debug("Listing articles");
+        view.display(repository.getAll());
+    }
+}
\ No newline at end of file
diff --git a/w11/java-cli/src/main/java/com/example/datacollect/controller/CrawlerController.java b/w11/java-cli/src/main/java/com/example/datacollect/controller/CrawlerController.java
new file mode 100644
index 0000000..6a83224
--- /dev/null
+++ b/w11/java-cli/src/main/java/com/example/datacollect/controller/CrawlerController.java
@@ -0,0 +1,72 @@
+package com.example.datacollect.controller;
+
+import com.example.datacollect.command.Command;
+import com.example.datacollect.command.CrawlCommand;
+import com.example.datacollect.command.ExitCommand;
+import com.example.datacollect.command.HelpCommand;
+import com.example.datacollect.command.HistoryCommand;
+import com.example.datacollect.command.ListCommand;
+import com.example.datacollect.repository.ArticleRepository;
+import com.example.datacollect.strategy.StrategyFactory;
+import com.example.datacollect.view.ConsoleView;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+
+/**
+ * Maps command names to {@link Command} instances and dispatches console
+ * input lines to them.
+ */
+public class CrawlerController {
+    private static final Logger logger = LoggerFactory.getLogger(CrawlerController.class);
+    private final Map<String, Command> commands = new HashMap<>();
+    private final ConsoleView view;
+    private final ArticleRepository repository;
+    private final HistoryCommand historyCommand;
+
+    public CrawlerController(ConsoleView view, ArticleRepository repository, StrategyFactory strategyFactory) {
+        this.view = view;
+        this.repository = repository;
+        register(new HelpCommand(view));
+        register(new ListCommand(view));
+        register(new CrawlCommand(view, strategyFactory));
+        register(new ExitCommand(view));
+        historyCommand = new HistoryCommand(view);
+        register(historyCommand);
+        logger.info("CrawlerController initialized");
+    }
+
+    private void register(Command command) {
+        commands.put(command.getName(), command);
+    }
+
+    /**
+     * Parses one input line and dispatches it to the matching command.
+     * Blank or {@code null} input is ignored; every other line is recorded in
+     * the history before dispatch, including unknown commands.
+     *
+     * @param input raw line read from the console, may be {@code null}
+     */
+    public void handle(String input) {
+        String text = input == null ? "" : input.trim();
+        if (text.isEmpty()) {
+            return;
+        }
+
+        historyCommand.addCommand(text);
+
+        String[] args = text.split("\\s+");
+        // Locale.ROOT keeps the lookup stable under locales with special casing rules (e.g. Turkish).
+        String cmdName = args[0].toLowerCase(Locale.ROOT);
+        Command command = commands.get(cmdName);
+        if (command == null) {
+            logger.warn("Unknown command: {}", cmdName);
+            view.printError("Unknown command: " + cmdName);
+            return;
+        }
+        logger.debug("Executing command: {}", cmdName);
+        command.execute(args, repository);
+    }
+}
\ No newline at end of file
diff --git a/w11/java-cli/src/main/java/com/example/datacollect/exception/CrawlerException.java b/w11/java-cli/src/main/java/com/example/datacollect/exception/CrawlerException.java
new file mode 100644
index 0000000..e81c3c9
--- /dev/null
+++ b/w11/java-cli/src/main/java/com/example/datacollect/exception/CrawlerException.java
@@ -0,0 +1,12 @@
+package com.example.datacollect.exception;
+
+/** Base checked exception for failures raised while crawling. */
+public class CrawlerException extends Exception {
+    public CrawlerException(String message) {
+        super(message);
+    }
+
+    public CrawlerException(String message, Throwable cause) {
+        super(message, cause);
+    }
+}
\ No newline at end of file
diff --git a/w11/java-cli/src/main/java/com/example/datacollect/exception/NetworkException.java b/w11/java-cli/src/main/java/com/example/datacollect/exception/NetworkException.java
new file mode 100644
index 0000000..0fb8e5e
--- /dev/null
+++ b/w11/java-cli/src/main/java/com/example/datacollect/exception/NetworkException.java
@@ -0,0 +1,12 @@
+package com.example.datacollect.exception;
+
+/** Signals that a page could not be downloaded. */
+public class NetworkException extends CrawlerException {
+    public NetworkException(String message) {
+        super(message);
+    }
+
+    public NetworkException(String message, Throwable cause) {
+        super(message, cause);
+    }
+}
\ No newline at end of file
diff --git a/w11/java-cli/src/main/java/com/example/datacollect/exception/ParseException.java b/w11/java-cli/src/main/java/com/example/datacollect/exception/ParseException.java
new file mode 100644
index 0000000..205665a
--- /dev/null
+++ b/w11/java-cli/src/main/java/com/example/datacollect/exception/ParseException.java
@@ -0,0 +1,12 @@
+package com.example.datacollect.exception;
+
+/** Signals that a downloaded page could not be parsed into articles. */
+public class ParseException extends CrawlerException {
+    public ParseException(String message) {
+        super(message);
+    }
+
+    public ParseException(String message, Throwable cause) {
+        super(message, cause);
+    }
+}
\ No newline at end of file
diff --git a/w11/java-cli/src/main/java/com/example/datacollect/model/Article.java b/w11/java-cli/src/main/java/com/example/datacollect/model/Article.java
new file mode 100644
index 0000000..f3b0ca8
--- /dev/null
+++ b/w11/java-cli/src/main/java/com/example/datacollect/model/Article.java
@@ -0,0 +1,80 @@
+package com.example.datacollect.model;
+
+import java.time.LocalDate;
+
+/**
+ * Mutable value object for one crawled article. {@code author} and
+ * {@code publishDate} are optional and may be {@code null}.
+ */
+public class Article {
+    private String title;
+    private String url;
+    private String content;
+    private String author;
+    private LocalDate publishDate;
+
+    /** Creates an article without author or publish date (both {@code null}). */
+    public Article(String title, String url, String content) {
+        this(title, url, content, null, null);
+    }
+
+    /** Creates an article with all fields set. */
+    public Article(String title, String url, String content, String author, LocalDate publishDate) {
+        this.title = title;
+        this.url = url;
+        this.content = content;
+        this.author = author;
+        this.publishDate = publishDate;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public void setTitle(String title) {
+        this.title = title;
+    }
+
+    public String getUrl() {
+        return url;
+    }
+
+    public void setUrl(String url) {
+        this.url = url;
+    }
+
+    public String getContent() {
+        return content;
+    }
+
+    public void setContent(String content) {
+        this.content = content;
+    }
+
+    public String getAuthor() {
+        return author;
+    }
+
+    public void setAuthor(String author) {
+        this.author = author;
+    }
+
+    public LocalDate getPublishDate() {
+        return publishDate;
+    }
+
+    public void setPublishDate(LocalDate publishDate) {
+        this.publishDate = publishDate;
+    }
+
+    /** Returns a debug representation; the (potentially long) content is omitted. */
+    @Override
+    public String toString() {
+        return "Article{" +
+                "title='" + title + '\'' +
+                ", url='" + url + '\'' +
+                ", author='" + author + '\'' +
+                ", publishDate=" + publishDate +
+                '}';
+    }
+}
diff --git a/w11/java-cli/src/main/java/com/example/datacollect/repository/ArticleRepository.java b/w11/java-cli/src/main/java/com/example/datacollect/repository/ArticleRepository.java
new file mode 100644
index 0000000..42be8b2
--- /dev/null
+++ b/w11/java-cli/src/main/java/com/example/datacollect/repository/ArticleRepository.java
@@ -0,0 +1,52 @@
+package com.example.datacollect.repository;
+
+import com.example.datacollect.model.Article;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+/** In-memory, insertion-ordered article store capped at {@code MAX_CAPACITY} entries. */
+public class ArticleRepository {
+    private static final Logger logger = LoggerFactory.getLogger(ArticleRepository.class);
+    private static final int MAX_CAPACITY = 10000;
+    private final List<Article> articles = new ArrayList<>();
+
+    /**
+     * Stores a validated article.
+     *
+     * @throws IllegalArgumentException if the article, its title or its URL is null or blank
+     * @throws IllegalStateException    if the repository is already full
+     */
+    public void add(Article article) {
+        if (article == null) {
+            throw new IllegalArgumentException("Article cannot be null");
+        }
+        if (article.getTitle() == null || article.getTitle().trim().isEmpty()) {
+            throw new IllegalArgumentException("Article title cannot be null or empty");
+        }
+        if (article.getUrl() == null || article.getUrl().trim().isEmpty()) {
+            throw new IllegalArgumentException("Article URL cannot be null or empty");
+        }
+        if (articles.size() >= MAX_CAPACITY) {
+            throw new IllegalStateException("Repository capacity exceeded: " + MAX_CAPACITY);
+        }
+        articles.add(article);
+        logger.debug("Added article: {}", article.getTitle());
+    }
+
+    /** Returns a read-only live view of the stored articles. */
+    public List<Article> getAll() {
+        return Collections.unmodifiableList(articles);
+    }
+
+    public int size() {
+        return articles.size();
+    }
+
+    public void clear() {
+        articles.clear();
+        logger.debug("Cleared all articles");
+    }
+}
\ No newline at end of file
diff --git a/w11/java-cli/src/main/java/com/example/datacollect/strategy/BlogStrategy.java b/w11/java-cli/src/main/java/com/example/datacollect/strategy/BlogStrategy.java
new file mode 100644
index 0000000..238d407
--- /dev/null
+++ b/w11/java-cli/src/main/java/com/example/datacollect/strategy/BlogStrategy.java
@@ -0,0 +1,59 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.exception.ParseException;
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Parses blog listing pages on {@code blog.example.com}: every
+ * {@code .post-item} element with a non-empty {@code .post-title} becomes an article.
+ */
+public class BlogStrategy implements CrawlStrategy {
+    @Override
+    public boolean supports(String url) {
+        return url.contains("blog.example.com");
+    }
+
+    @Override
+    public List<Article> parse(String url, Document doc) throws ParseException {
+        List<Article> articles = new ArrayList<>();
+        Elements postItems = doc.select(".post-item");
+
+        for (Element item : postItems) {
+            Element titleEl = item.selectFirst(".post-title");
+            Element linkEl = item.selectFirst("a");
+            Element contentEl = item.selectFirst(".post-excerpt");
+
+            if (titleEl == null) {
+                continue;
+            }
+
+            String title = titleEl.text().trim();
+            String articleUrl = resolveLink(linkEl, url);
+            String content = contentEl != null ? contentEl.text().trim() : "";
+
+            if (!title.isEmpty()) {
+                articles.add(new Article(title, articleUrl, content));
+            }
+        }
+
+        return articles;
+    }
+
+    /**
+     * Returns the absolute target of {@code link}, falling back to the page URL
+     * when there is no link or it cannot be resolved, so an article never ends
+     * up with an empty or relative URL.
+     */
+    private static String resolveLink(Element link, String pageUrl) {
+        if (link == null) {
+            return pageUrl;
+        }
+        String absolute = link.absUrl("href");
+        return absolute.isEmpty() ? pageUrl : absolute;
+    }
+}
\ No newline at end of file
diff --git a/w11/java-cli/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java b/w11/java-cli/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java
new file mode 100644
index 0000000..3758b21
--- /dev/null
+++ b/w11/java-cli/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java
@@ -0,0 +1,22 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.exception.ParseException;
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import java.util.List;
+
+/** Site-specific parser that turns a fetched page into {@link Article}s. */
+public interface CrawlStrategy {
+    /**
+     * Extracts articles from an already downloaded page.
+     *
+     * @param url the URL the page was fetched from
+     * @param doc the parsed page
+     * @return the extracted articles, possibly empty
+     * @throws ParseException if the page cannot be parsed
+     */
+    List<Article> parse(String url, Document doc) throws ParseException;
+
+    /** Returns {@code true} if this strategy can handle {@code url}. */
+    boolean supports(String url);
+}
\ No newline at end of file
diff --git a/w11/java-cli/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java b/w11/java-cli/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java
new file mode 100644
index 0000000..c32fac9
--- /dev/null
+++ b/w11/java-cli/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java
@@ -0,0 +1,71 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.exception.ParseException;
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Parses news list pages on {@code news.hnu.edu.cn}: each {@code ul.list11 li}
+ * entry with a link and a non-empty headline becomes an article.
+ */
+public class HnuNewsStrategy implements CrawlStrategy {
+    private static final String BASE_URL = "https://news.hnu.edu.cn";
+
+    @Override
+    public boolean supports(String url) {
+        return url.contains("news.hnu.edu.cn");
+    }
+
+    @Override
+    public List<Article> parse(String url, Document doc) throws ParseException {
+        List<Article> articles = new ArrayList<>();
+        Elements listItems = doc.select("ul.list11 li");
+
+        for (Element li : listItems) {
+            Element link = li.selectFirst("a");
+            if (link == null) {
+                continue;
+            }
+
+            String articleUrl = toAbsoluteUrl(link.attr("href"));
+
+            String title = "";
+            Element titleEl = link.selectFirst("h4.l2.h4s2");
+            if (titleEl != null) {
+                title = titleEl.text().trim();
+            }
+
+            String content = "";
+            Element contentEl = link.selectFirst("p.l3.ps3");
+            if (contentEl != null) {
+                content = contentEl.text().trim();
+            }
+
+            if (!title.isEmpty()) {
+                articles.add(new Article(title, articleUrl, content));
+            }
+        }
+
+        return articles;
+    }
+
+    /**
+     * Turns a site-relative {@code href} into an absolute URL under the site
+     * root. {@code ..} segments are dropped and a missing leading slash is
+     * added, so {@code info/1.htm} no longer yields a URL with the path glued to the host.
+     */
+    private static String toAbsoluteUrl(String href) {
+        if (href.startsWith("http")) {
+            return href;
+        }
+        String path = href.replace("..", "");
+        if (!path.startsWith("/")) {
+            path = "/" + path;
+        }
+        return BASE_URL + path;
+    }
+}
\ No newline at end of file
diff --git a/w11/java-cli/src/main/java/com/example/datacollect/strategy/NewsStrategy.java b/w11/java-cli/src/main/java/com/example/datacollect/strategy/NewsStrategy.java
new file mode 100644
index 0000000..4abe877
--- /dev/null
+++ b/w11/java-cli/src/main/java/com/example/datacollect/strategy/NewsStrategy.java
@@ -0,0 +1,34 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.exception.ParseException;
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Parses pages on {@code news.example.com}: every non-blank
+ * {@code .article-headline} becomes an article pointing at the page URL.
+ */
+public class NewsStrategy implements CrawlStrategy {
+    @Override
+    public boolean supports(String url) {
+        return url.contains("news.example.com");
+    }
+
+    @Override
+    public List<Article> parse(String url, Document doc) throws ParseException {
+        List<Article> articles = new ArrayList<>();
+        Elements items = doc.select(".article-headline");
+        for (Element e : items) {
+            String title = e.text().trim();
+            // Skip blank headlines: the repository rejects empty titles, which would abort the crawl.
+            if (!title.isEmpty()) {
+                articles.add(new Article(title, url, ""));
+            }
+        }
+        return articles;
+    }
+}
\ No newline at end of file
diff --git a/w11/java-cli/src/main/java/com/example/datacollect/strategy/StrategyFactory.java b/w11/java-cli/src/main/java/com/example/datacollect/strategy/StrategyFactory.java
new file mode 100644
index 0000000..22bbc21
--- /dev/null
+++ b/w11/java-cli/src/main/java/com/example/datacollect/strategy/StrategyFactory.java
@@ -0,0 +1,42 @@
+package com.example.datacollect.strategy;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+/** Registry that picks the first {@link CrawlStrategy} supporting a given URL. */
+public class StrategyFactory {
+    private static final Logger logger = LoggerFactory.getLogger(StrategyFactory.class);
+    private final List<CrawlStrategy> strategies = new ArrayList<>();
+
+    public StrategyFactory() {
+        strategies.add(new HnuNewsStrategy());
+        strategies.add(new BlogStrategy());
+        strategies.add(new NewsStrategy());
+        logger.info("StrategyFactory initialized with {} strategies", strategies.size());
+    }
+
+    /**
+     * Returns the first registered strategy whose {@code supports(url)} is true.
+     *
+     * @return the matching strategy, or {@code null} if none supports {@code url}
+     */
+    public CrawlStrategy getStrategy(String url) {
+        for (CrawlStrategy s : strategies) {
+            if (s.supports(url)) {
+                logger.debug("Found strategy {} for URL: {}", s.getClass().getSimpleName(), url);
+                return s;
+            }
+        }
+        logger.warn("No strategy found for URL: {}", url);
+        return null;
+    }
+
+    /** Appends a strategy; it is consulted after all previously registered ones. */
+    public void register(CrawlStrategy strategy) {
+        strategies.add(Objects.requireNonNull(strategy, "strategy"));
+        logger.info("Registered new strategy: {}", strategy.getClass().getSimpleName());
+    }
+}
\ No newline at end of file
diff --git a/w11/java-cli/src/main/java/com/example/datacollect/view/ConsoleView.java b/w11/java-cli/src/main/java/com/example/datacollect/view/ConsoleView.java
new file mode 100644
index 0000000..22b4f3a
--- /dev/null
+++ b/w11/java-cli/src/main/java/com/example/datacollect/view/ConsoleView.java
@@ -0,0 +1,52 @@
+package com.example.datacollect.view;
+
+import com.example.datacollect.model.Article;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import java.util.List;
+import java.util.Scanner;
+
+/** Console I/O for the CLI: reads input lines and prints ANSI-coloured messages. */
+public class ConsoleView {
+    private static final Logger logger = LoggerFactory.getLogger(ConsoleView.class);
+    private static final String ANSI_RESET = "\u001B[0m";
+    private static final String ANSI_GREEN = "\u001B[32m";
+    private static final String ANSI_RED = "\u001B[31m";
+    private static final String ANSI_BLUE = "\u001B[34m";
+
+    private final Scanner scanner = new Scanner(System.in);
+
+    /**
+     * Prompts with {@code "> "} and reads one line from standard input.
+     *
+     * @return the line without its terminator, or {@code null} at end of input
+     */
+    public String readLine() {
+        System.out.print("> ");
+        return scanner.hasNextLine() ? scanner.nextLine() : null;
+    }
+
+    public void printSuccess(String msg) {
+        System.out.println(ANSI_GREEN + msg + ANSI_RESET);
+    }
+
+    public void printError(String msg) {
+        System.out.println(ANSI_RED + msg + ANSI_RESET);
+    }
+
+    public void printInfo(String msg) {
+        System.out.println(ANSI_BLUE + msg + ANSI_RESET);
+    }
+
+    /** Prints the articles as a 1-based numbered list, or a hint when there are none. */
+    public void display(List<Article> articles) {
+        if (articles.isEmpty()) {
+            printInfo("暂无文章,请先执行 crawl。");
+            return;
+        }
+        for (int i = 0; i < articles.size(); i++) {
+            Article a = articles.get(i);
+            System.out.println((i + 1) + ". " + a.getTitle() + " | " + a.getUrl());
+        }
+    }
+}
\ No newline at end of file
diff --git a/w11/java-cli/src/main/resources/logback.xml b/w11/java-cli/src/main/resources/logback.xml
new file mode 100644
index 0000000..893d39f
--- /dev/null
+++ b/w11/java-cli/src/main/resources/logback.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<configuration>
+    <!-- NOTE(review): markup reconstructed from a tag-stripped copy; appender
+         names and the root level are assumptions - confirm against the repository. -->
+    <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
+        <encoder>
+            <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
+        </encoder>
+    </appender>
+
+    <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
+        <file>logs/crawler.log</file>
+        <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
+            <fileNamePattern>logs/crawler-%d{yyyy-MM-dd}.log</fileNamePattern>
+            <maxHistory>7</maxHistory>
+        </rollingPolicy>
+        <encoder>
+            <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
+        </encoder>
+    </appender>
+
+    <root level="INFO">
+        <appender-ref ref="CONSOLE"/>
+        <appender-ref ref="FILE"/>
+    </root>
+</configuration>
\ No newline at end of file
diff --git a/w11/logs/crawler.log b/w11/logs/crawler.log
new file mode 100644
index 0000000..20c4eca
--- /dev/null
+++ b/w11/logs/crawler.log
@@ -0,0 +1,3 @@
+2026-05-30 21:21:52.964 [main] INFO  com.example.datacollect.Main - Starting CLI Crawler W11
+2026-05-30 21:21:53.090 [main] INFO  c.e.d.strategy.StrategyFactory - StrategyFactory initialized with 3 strategies
+2026-05-30 21:21:53.245 [main] INFO  c.e.d.controller.CrawlerController - CrawlerController initialized