diff --git a/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/Main.java b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/Main.java
new file mode 100644
index 0000000..d179115
--- /dev/null
+++ b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/Main.java
@@ -0,0 +1,21 @@
+package com.example.datacollect;
+
+import com.example.datacollect.controller.CrawlerController;
+import com.example.datacollect.repository.ArticleRepository;
+import com.example.datacollect.strategy.StrategyFactory;
+import com.example.datacollect.view.ConsoleView;
+
+public class Main {
+
+    public static void main(String[] args) {
+        ConsoleView view = new ConsoleView();
+        ArticleRepository repository = new ArticleRepository();
+        StrategyFactory strategyFactory = new StrategyFactory();
+        CrawlerController controller = new CrawlerController(view, repository, strategyFactory);
+
+        view.printSuccess("Welcome to CLI Crawler (w10_3)! Type help for commands.");
+        while (true) {
+            controller.handle(view.readLine());
+        }
+    }
+}
diff --git a/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/command/AnalyzeCommand.java b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/command/AnalyzeCommand.java
new file mode 100644
index 0000000..8b70a80
--- /dev/null
+++ b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/command/AnalyzeCommand.java
@@ -0,0 +1,93 @@
+package com.example.datacollect.command;
+
+import com.example.datacollect.model.Article;
+import com.example.datacollect.repository.ArticleRepository;
+import com.example.datacollect.strategy.CrawlStrategy;
+import com.example.datacollect.strategy.StrategyFactory;
+import com.example.datacollect.view.ConsoleView;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.net.URL;
+
+public class AnalyzeCommand implements Command {
+    private final ConsoleView view;
+    private final StrategyFactory strategyFactory;
+
+    public AnalyzeCommand(ConsoleView view, StrategyFactory strategyFactory) {
+        this.view = view;
+        this.strategyFactory = strategyFactory;
+    }
+
+    @Override
+    public String getName() {
+        return "analyze";
+    }
+
+    @Override
+    public void execute(String[] args, ArticleRepository repository) {
+        if (args.length < 2) {
+            view.printError("Usage: analyze <url>");
+            return;
+        }
+        String url = args[1];
+
+        CrawlStrategy strategy = strategyFactory.getStrategy(url);
+        if (strategy == null) {
+            view.printError("No strategy found for: " + url);
+            return;
+        }
+
+        try {
+            view.printInfo("Analyzing: " + url);
+            Document doc = Jsoup.connect(url).get();
+            var articles = strategy.parse(url, doc);
+
+            view.printSuccess("Analysis completed (not stored)");
+            view.printInfo("Total articles found: " + articles.size());
+
+            if (!articles.isEmpty()) {
+                Map<String, Integer> sourceCounts = new HashMap<>();
+                int totalLength = 0;
+
+                for (Article article : articles) {
+                    String domain = extractDomain(article.getUrl());
+                    sourceCounts.merge(domain, 1, Integer::sum);
+                    if (article.getContent() != null) {
+                        totalLength += article.getContent().length();
+                    }
+                }
+
+                view.printInfo("Domain distribution:");
+                for (Map.Entry<String, Integer> entry : sourceCounts.entrySet()) {
+                    view.printInfo(" - " + entry.getKey() + ": " + entry.getValue() + " articles");
+                }
+
+                if (articles.size() > 0) {
+                    int avgLength = totalLength / articles.size();
+                    view.printInfo("Average content length: " + avgLength + " characters");
+                }
+
+                view.printInfo("First 3 article titles:");
+                int count = Math.min(3, articles.size());
+                for (int i = 0; i < count; i++) {
+                    view.printInfo(" " + (i + 1) + ". " + articles.get(i).getTitle());
+                }
+            }
+        } catch (Exception e) {
+            view.printError("Failed to analyze: " + e.getMessage());
+        }
+    }
+
+    private String extractDomain(String url) {
+        try {
+            URL urlObj = new URL(url);
+            return urlObj.getHost();
+        } catch (Exception e) {
+            return "Unknown";
+        }
+    }
+}
\ No newline at end of file
diff --git a/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/command/Command.java b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/command/Command.java
new file mode 100644
index 0000000..029cadc
--- /dev/null
+++ b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/command/Command.java
@@ -0,0 +1,8 @@
+package com.example.datacollect.command;
+
+import com.example.datacollect.repository.ArticleRepository;
+
+public interface Command {
+    String getName();
+    void execute(String[] args, ArticleRepository repository);
+}
diff --git a/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/command/CrawlCommand.java b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/command/CrawlCommand.java
new file mode 100644
index 0000000..0841d57
--- /dev/null
+++ b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/command/CrawlCommand.java
@@ -0,0 +1,50 @@
+package com.example.datacollect.command;
+
+import com.example.datacollect.repository.ArticleRepository;
+import com.example.datacollect.strategy.CrawlStrategy;
+import com.example.datacollect.strategy.StrategyFactory;
+import com.example.datacollect.view.ConsoleView;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+public class CrawlCommand implements Command {
+    private final ConsoleView view;
+    private final StrategyFactory strategyFactory;
+
+    public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) {
+        this.view = view;
+        this.strategyFactory = strategyFactory;
+    }
+
+    @Override
+    public String getName() {
+        return "crawl";
+    }
+
+    @Override
+    public void execute(String[] args, ArticleRepository repository) {
+        if (args.length < 2) {
+            view.printError("Usage: crawl <url>");
+            return;
+        }
+        String url = args[1];
+
+        CrawlStrategy strategy = strategyFactory.getStrategy(url);
+        if (strategy == null) {
+            view.printError("No strategy found for: " + url);
+            return;
+        }
+
+        try {
+            view.printInfo("Crawling: " + url);
+            Document doc = Jsoup.connect(url).get();
+            var articles = strategy.parse(url, doc);
+            for (var article : articles) {
+                repository.add(article);
+            }
+            view.printSuccess("Crawled " + articles.size() + " articles.");
+        } catch (Exception e) {
+            view.printError("Failed to crawl: " + e.getMessage());
+        }
+    }
+}
diff --git a/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/command/ExitCommand.java b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/command/ExitCommand.java
new file mode 100644
index 0000000..eafcd1d
--- /dev/null
+++ b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/command/ExitCommand.java
@@ -0,0 +1,23 @@
+package com.example.datacollect.command;
+
+import com.example.datacollect.repository.ArticleRepository;
+import com.example.datacollect.view.ConsoleView;
+
+public class ExitCommand implements Command {
+    private final ConsoleView view;
+
+    public ExitCommand(ConsoleView view) {
+        this.view = view;
+    }
+
+    @Override
+    public String getName() {
+        return "exit";
+    }
+
+    @Override
+    public void execute(String[] args, ArticleRepository repository) {
+        view.printSuccess("Bye!");
+        System.exit(0);
+    }
+}
diff --git a/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/command/HelpCommand.java b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/command/HelpCommand.java
new file mode 100644
index 0000000..e984366
--- /dev/null
+++ b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/command/HelpCommand.java
@@ -0,0 +1,22 @@
+package com.example.datacollect.command;
+
+import com.example.datacollect.repository.ArticleRepository;
+import com.example.datacollect.view.ConsoleView;
+
+public class HelpCommand implements Command {
+    private final ConsoleView view;
+
+    public HelpCommand(ConsoleView view) {
+        this.view = view;
+    }
+
+    @Override
+    public String getName() {
+        return "help";
+    }
+
+    @Override
+    public void execute(String[] args, ArticleRepository repository) {
+        view.printInfo("Commands: crawl <url>, analyze <url>, list, help, exit");
+    }
+}
diff --git a/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/command/ListCommand.java b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/command/ListCommand.java
new file mode 100644
index 0000000..8147be8
--- /dev/null
+++ b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/command/ListCommand.java
@@ -0,0 +1,22 @@
+package com.example.datacollect.command;
+
+import com.example.datacollect.repository.ArticleRepository;
+import com.example.datacollect.view.ConsoleView;
+
+public class ListCommand implements Command {
+    private final ConsoleView view;
+
+    public ListCommand(ConsoleView view) {
+        this.view = view;
+    }
+
+    @Override
+    public String getName() {
+        return "list";
+    }
+
+    @Override
+    public void execute(String[] args, ArticleRepository repository) {
+        view.display(repository.getAll());
+    }
+}
diff --git a/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/controller/CrawlerController.java b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/controller/CrawlerController.java
new file mode 100644
index 0000000..a043ac4
--- /dev/null
+++ b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/controller/CrawlerController.java
@@ -0,0 +1,49 @@
+package com.example.datacollect.controller;
+
+import com.example.datacollect.command.AnalyzeCommand;
+import com.example.datacollect.command.Command;
+import com.example.datacollect.command.CrawlCommand;
+import com.example.datacollect.command.ExitCommand;
+import com.example.datacollect.command.HelpCommand;
+import com.example.datacollect.command.ListCommand;
+import com.example.datacollect.repository.ArticleRepository;
+import com.example.datacollect.strategy.StrategyFactory;
+import com.example.datacollect.view.ConsoleView;
+import java.util.HashMap;
+import java.util.Map;
+
+public class CrawlerController {
+    private final Map<String, Command> commands = new HashMap<>();
+    private final ConsoleView view;
+    private final ArticleRepository repository;
+
+    public CrawlerController(ConsoleView view, ArticleRepository repository, StrategyFactory strategyFactory) {
+        this.view = view;
+        this.repository = repository;
+        register(new HelpCommand(view));
+        register(new ListCommand(view));
+        register(new CrawlCommand(view, strategyFactory));
+        register(new AnalyzeCommand(view, strategyFactory));
+        register(new ExitCommand(view));
+    }
+
+    private void register(Command command) {
+        commands.put(command.getName(), command);
+    }
+
+    public void handle(String input) {
+        String text = input == null ? "" : input.trim();
+        if (text.isEmpty()) {
+            return;
+        }
+
+        String[] args = text.split("\\s+");
+        String cmdName = args[0].toLowerCase();
+        Command command = commands.get(cmdName);
+        if (command == null) {
+            view.printError("Unknown command: " + cmdName);
+            return;
+        }
+        command.execute(args, repository);
+    }
+}
diff --git a/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/model/Article.java b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/model/Article.java
new file mode 100644
index 0000000..147dbe6
--- /dev/null
+++ b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/model/Article.java
@@ -0,0 +1,45 @@
+package com.example.datacollect.model;
+
+public class Article {
+    private String title;
+    private String url;
+    private String content;
+
+    public Article(String title, String url, String content) {
+        this.title = title;
+        this.url = url;
+        this.content = content;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public void setTitle(String title) {
+        this.title = title;
+    }
+
+    public String getUrl() {
+        return url;
+    }
+
+    public void setUrl(String url) {
+        this.url = url;
+    }
+
+    public String getContent() {
+        return content;
+    }
+
+    public void setContent(String content) {
+        this.content = content;
+    }
+
+    @Override
+    public String toString() {
+        return "Article{" +
+                "title='" + title + '\'' +
+                ", url='" + url + '\'' +
+                '}';
+    }
+}
diff --git a/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/repository/ArticleRepository.java b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/repository/ArticleRepository.java
new file mode 100644
index 0000000..cafa5ab
--- /dev/null
+++ b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/repository/ArticleRepository.java
@@ -0,0 +1,41 @@
+package com.example.datacollect.repository;
+
+import com.example.datacollect.model.Article;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+public class ArticleRepository {
+    private final List<Article> articles = new ArrayList<>();
+
+    public void add(Article article) {
+        if (article == null) {
+            throw new IllegalArgumentException("Article cannot be null");
+        }
+        articles.add(article);
+    }
+
+    public void addAll(List<Article> articleList) {
+        if (articleList == null) {
+            throw new IllegalArgumentException("Article list cannot be null");
+        }
+        for (Article article : articleList) {
+            if (article == null) {
+                throw new IllegalArgumentException("Article cannot be null");
+            }
+            articles.add(article);
+        }
+    }
+
+    public List<Article> getAll() {
+        return Collections.unmodifiableList(articles);
+    }
+
+    public int size() {
+        return articles.size();
+    }
+
+    public void clear() {
+        articles.clear();
+    }
+}
diff --git a/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/strategy/BlogStrategy.java b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/strategy/BlogStrategy.java
new file mode 100644
index 0000000..a771ef5
--- /dev/null
+++ b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/strategy/BlogStrategy.java
@@ -0,0 +1,32 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import java.util.ArrayList;
+import java.util.List;
+
+public class BlogStrategy implements CrawlStrategy {
+    private static final int PRIORITY = 5;
+
+    @Override
+    public boolean supports(String url) {
+        return url.contains("blog.example.com");
+    }
+
+    @Override
+    public int getPriority() {
+        return PRIORITY;
+    }
+
+    @Override
+    public List<Article> parse(String url, Document doc) {
+        List<Article> articles = new ArrayList<>();
+        Elements titles = doc.select(".post-title");
+        for (Element e : titles) {
+            articles.add(new Article(e.text(), url, ""));
+        }
+        return articles;
+    }
+}
diff --git a/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java
new file mode 100644
index 0000000..2586429
--- /dev/null
+++ b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java
@@ -0,0 +1,11 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import java.util.List;
+
+public interface CrawlStrategy {
+    List<Article> parse(String url, Document doc);
+    boolean supports(String url);
+    int getPriority();
+}
diff --git a/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java
new file mode 100644
index 0000000..98c2de5
--- /dev/null
+++ b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java
@@ -0,0 +1,56 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import java.util.ArrayList;
+import java.util.List;
+
+public class HnuNewsStrategy implements CrawlStrategy {
+    private static final int PRIORITY = 10;
+
+    @Override
+    public boolean supports(String url) {
+        return url.contains("news.hnu.edu.cn");
+    }
+
+    @Override
+    public int getPriority() {
+        return PRIORITY;
+    }
+
+    @Override
+    public List<Article> parse(String url, Document doc) {
+        List<Article> articles = new ArrayList<>();
+        Elements listItems = doc.select("ul.list11 li");
+
+        for (Element li : listItems) {
+            Element link = li.selectFirst("a");
+            if (link == null) continue;
+
+            String articleUrl = link.attr("href");
+            if (!articleUrl.startsWith("http")) {
+                articleUrl = "https://news.hnu.edu.cn" + articleUrl.replace("..", "");
+            }
+
+            String title = "";
+            Element titleEl = link.selectFirst("h4.l2.h4s2");
+            if (titleEl != null) {
+                title = titleEl.text().trim();
+            }
+
+            String content = "";
+            Element contentEl = link.selectFirst("p.l3.ps3");
+            if (contentEl != null) {
+                content = contentEl.text().trim();
+            }
+
+            if (!title.isEmpty()) {
+                articles.add(new Article(title, articleUrl, content));
+            }
+        }
+
+        return articles;
+    }
+}
diff --git a/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/strategy/NewsStrategy.java b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/strategy/NewsStrategy.java
new file mode 100644
index 0000000..b92e81b
--- /dev/null
+++ b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/strategy/NewsStrategy.java
@@ -0,0 +1,32 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import java.util.ArrayList;
+import java.util.List;
+
+public class NewsStrategy implements CrawlStrategy {
+    private static final int PRIORITY = 5;
+
+    @Override
+    public boolean supports(String url) {
+        return url.contains("news.example.com");
+    }
+
+    @Override
+    public int getPriority() {
+        return PRIORITY;
+    }
+
+    @Override
+    public List<Article> parse(String url, Document doc) {
+        List<Article> articles = new ArrayList<>();
+        Elements items = doc.select(".article-headline");
+        for (Element e : items) {
+            articles.add(new Article(e.text(), url, ""));
+        }
+        return articles;
+    }
+}
diff --git a/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/strategy/StrategyFactory.java b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/strategy/StrategyFactory.java
new file mode 100644
index 0000000..618b81d
--- /dev/null
+++ b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/strategy/StrategyFactory.java
@@ -0,0 +1,51 @@
+package com.example.datacollect.strategy;
+
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.regex.Pattern;
+
+public class StrategyFactory {
+    private final List<CrawlStrategy> strategies = new ArrayList<>();
+    private CrawlStrategy defaultStrategy;
+
+    public StrategyFactory() {
+        strategies.add(new HnuNewsStrategy());
+        strategies.add(new BlogStrategy());
+        strategies.add(new NewsStrategy());
+        strategies.sort(Comparator.comparingInt(CrawlStrategy::getPriority).reversed());
+    }
+
+    public CrawlStrategy getStrategy(String url) {
+        CrawlStrategy bestMatch = null;
+        int highestPriority = Integer.MIN_VALUE;
+
+        for (CrawlStrategy s : strategies) {
+            if (s.supports(url)) {
+                if (s.getPriority() > highestPriority) {
+                    highestPriority = s.getPriority();
+                    bestMatch = s;
+                }
+            }
+        }
+
+        return bestMatch != null ? bestMatch : defaultStrategy;
+    }
+
+    public void register(CrawlStrategy strategy) {
+        strategies.add(strategy);
+        strategies.sort(Comparator.comparingInt(CrawlStrategy::getPriority).reversed());
+    }
+
+    public void setDefaultStrategy(CrawlStrategy strategy) {
+        this.defaultStrategy = strategy;
+    }
+
+    public static boolean matchesPattern(String url, String pattern) {
+        try {
+            return Pattern.matches(pattern, url);
+        } catch (Exception e) {
+            return false;
+        }
+    }
+}
diff --git a/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/view/ConsoleView.java b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/view/ConsoleView.java
new file mode 100644
index 0000000..3c1d47a
--- /dev/null
+++ b/w10/java-cli-10/java-cli/src/main/java/com/example/datacollect/view/ConsoleView.java
@@ -0,0 +1,42 @@
+package com.example.datacollect.view;
+
+import com.example.datacollect.model.Article;
+import java.util.List;
+import java.util.Scanner;
+
+public class ConsoleView {
+    private static final String ANSI_RESET = "\u001B[0m";
+    private static final String ANSI_GREEN = "\u001B[32m";
+    private static final String ANSI_RED = "\u001B[31m";
+    private static final String ANSI_BLUE = "\u001B[34m";
+
+    private final Scanner scanner = new Scanner(System.in);
+
+    public String readLine() {
+        System.out.print("> ");
+        return scanner.nextLine();
+    }
+
+    public void printSuccess(String msg) {
+        System.out.println(ANSI_GREEN + msg + ANSI_RESET);
+    }
+
+    public void printError(String msg) {
+        System.out.println(ANSI_RED + msg + ANSI_RESET);
+    }
+
+    public void printInfo(String msg) {
+        System.out.println(ANSI_BLUE + msg + ANSI_RESET);
+    }
+
+    public void display(List<Article> articles) {
+        if (articles.isEmpty()) {
+            printInfo("暂无文章,请先执行 crawl。");
+            return;
+        }
+        for (int i = 0; i < articles.size(); i++) {
+            Article a = articles.get(i);
+            System.out.println((i + 1) + ". " + a.getTitle() + " | " + a.getUrl());
+        }
+    }
+}