diff --git a/w10/w10ai架构审计.docx b/w10/w10ai架构审计.docx new file mode 100644 index 0000000..f247f85 Binary files /dev/null and b/w10/w10ai架构审计.docx differ diff --git a/w10/w10datacollect/Main.java b/w10/w10datacollect/Main.java new file mode 100644 index 0000000..d179115 --- /dev/null +++ b/w10/w10datacollect/Main.java @@ -0,0 +1,21 @@ +package com.example.datacollect; + +import com.example.datacollect.controller.CrawlerController; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; + +public class Main { + + public static void main(String[] args) { + ConsoleView view = new ConsoleView(); + ArticleRepository repository = new ArticleRepository(); + StrategyFactory strategyFactory = new StrategyFactory(); + CrawlerController controller = new CrawlerController(view, repository, strategyFactory); + + view.printSuccess("Welcome to CLI Crawler (w10_3)! Type help for commands."); + while (true) { + controller.handle(view.readLine()); + } + } +} diff --git a/w10/w10datacollect/command/AnalyzeCommand.java b/w10/w10datacollect/command/AnalyzeCommand.java new file mode 100644 index 0000000..47ca3e1 --- /dev/null +++ b/w10/w10datacollect/command/AnalyzeCommand.java @@ -0,0 +1,76 @@ +package com.example.datacollect.command; + +import com.example.datacollect.model.Article; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.CrawlStrategy; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import com.example.datacollect.command.Command; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import java.util.List; + +public class AnalyzeCommand implements Command { + private final ConsoleView view; + private final StrategyFactory strategyFactory; + + + public AnalyzeCommand(ConsoleView view, StrategyFactory strategyFactory) { + this.view = view; + this.strategyFactory = strategyFactory; + } + + @Override + public String getName() { + return "analyze"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + if (args.length < 2) { + view.printError("Usage:analyze "); + return; + } + String url = args[1]; + + try { + CrawlStrategy strategy = strategyFactory.getStrategy(url); + if (strategy == null) { + view.printError("No strategy found for: " + url); + return; + } + Document doc = Jsoup.connect(url) + .userAgent("Mozilla/5.0") + .timeout(5000) + .get(); + + List
articles = strategy.parse(doc); + + + + int total = articles.size(); + int totalTitleLen = 0; + int totalContentLen = 0; + + for (Article a : articles) { + totalTitleLen += a.getTitle() == null ? 0 : a.getTitle().length(); + totalContentLen += a.getContent() == null ? 0 : a.getContent().length(); + } + + + view.printInfo("===== 分析统计结果 ====="); + view.printInfo("文章总数:" + total + " 篇"); + view.printInfo("标题总长度:" + totalTitleLen); + view.printInfo("内容总长度:" + totalContentLen); + if (total > 0) { + view.printInfo("平均标题长度:" + (totalTitleLen / total)); + view.printInfo("平均内容长度:" + (totalContentLen / total)); + } + view.printInfo("======================"); + view.printSuccess("分析完成(数据未保存)"); + + } catch (Exception e) { + view.printError("分析失败:" + e.getMessage()); + } + } +} \ No newline at end of file diff --git a/w10/w10datacollect/command/Command.java b/w10/w10datacollect/command/Command.java new file mode 100644 index 0000000..029cadc --- /dev/null +++ b/w10/w10datacollect/command/Command.java @@ -0,0 +1,8 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; + +public interface Command { + String getName(); + void execute(String[] args, ArticleRepository repository); +} diff --git a/w10/w10datacollect/command/CrawlCommand.java b/w10/w10datacollect/command/CrawlCommand.java new file mode 100644 index 0000000..0841d57 --- /dev/null +++ b/w10/w10datacollect/command/CrawlCommand.java @@ -0,0 +1,50 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.CrawlStrategy; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + +public class CrawlCommand implements Command { + private final ConsoleView view; + private final StrategyFactory strategyFactory; + + public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) { + this.view = view; + this.strategyFactory = strategyFactory; + } + + @Override + public String getName() { + return "crawl"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + if (args.length < 2) { + view.printError("Usage: crawl "); + return; + } + String url = args[1]; + + CrawlStrategy strategy = strategyFactory.getStrategy(url); + if (strategy == null) { + view.printError("No strategy found for: " + url); + return; + } + + try { + view.printInfo("Crawling: " + url); + Document doc = Jsoup.connect(url).get(); + var articles = strategy.parse(url, doc); + for (var article : articles) { + repository.add(article); + } + view.printSuccess("Crawled " + articles.size() + " articles."); + } catch (Exception e) { + view.printError("Failed to crawl: " + e.getMessage()); + } + } +} diff --git a/w10/w10datacollect/command/ExitCommand.java b/w10/w10datacollect/command/ExitCommand.java new file mode 100644 index 0000000..eafcd1d --- /dev/null +++ b/w10/w10datacollect/command/ExitCommand.java @@ -0,0 +1,23 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; + +public class ExitCommand implements Command { + private final ConsoleView view; + + public ExitCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "exit"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + view.printSuccess("Bye!"); + System.exit(0); + } +} diff --git a/w10/w10datacollect/command/HelpCommand.java b/w10/w10datacollect/command/HelpCommand.java new file mode 100644 index 0000000..cef04cc --- /dev/null +++ b/w10/w10datacollect/command/HelpCommand.java @@ -0,0 +1,22 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; + +public class HelpCommand implements Command { + private final ConsoleView view; + + public HelpCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "help"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + view.printInfo("Commands: crawl , list, help, exit,analyze"); + } +} diff --git a/w10/w10datacollect/command/ListCommand.java b/w10/w10datacollect/command/ListCommand.java new file mode 100644 index 0000000..8147be8 --- /dev/null +++ b/w10/w10datacollect/command/ListCommand.java @@ -0,0 +1,22 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; + +public class ListCommand implements Command { + private final ConsoleView view; + + public ListCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "list"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + view.display(repository.getAll()); + } +} diff --git a/w10/w10datacollect/controller/CrawlerController.java b/w10/w10datacollect/controller/CrawlerController.java new file mode 100644 index 0000000..2067cdb --- /dev/null +++ b/w10/w10datacollect/controller/CrawlerController.java @@ -0,0 +1,48 @@ +package com.example.datacollect.controller; + +import com.example.datacollect.command.Command; +import com.example.datacollect.command.CrawlCommand; +import com.example.datacollect.command.ExitCommand; +import com.example.datacollect.command.HelpCommand; +import com.example.datacollect.command.ListCommand; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import java.util.HashMap; +import java.util.Map; + +public class CrawlerController { + private final Map commands = new HashMap<>(); + private final ConsoleView view; + private final ArticleRepository repository; + + public CrawlerController(ConsoleView view, ArticleRepository repository, StrategyFactory strategyFactory) { + this.view = view; + this.repository = repository; + register(new HelpCommand(view)); + register(new ListCommand(view)); + register(new CrawlCommand(view, strategyFactory)); + register(new ExitCommand(view)); + register(new AnalyzeCommand(view,strategyFactory)); + } + + private void register(Command command) { + commands.put(command.getName(), command); + } + + public void handle(String input) { + String text = input == null ? "" : input.trim(); + if (text.isEmpty()) { + return; + } + + String[] args = text.split("\\s+"); + String cmdName = args[0].toLowerCase(); + Command command = commands.get(cmdName); + if (command == null) { + view.printError("Unknown command: " + cmdName); + return; + } + command.execute(args, repository); + } +} diff --git a/w10/w10datacollect/model/Article.java b/w10/w10datacollect/model/Article.java new file mode 100644 index 0000000..147dbe6 --- /dev/null +++ b/w10/w10datacollect/model/Article.java @@ -0,0 +1,45 @@ +package com.example.datacollect.model; + +public class Article { + private String title; + private String url; + private String content; + + public Article(String title, String url, String content) { + this.title = title; + this.url = url; + this.content = content; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getUrl() { + return url; + } + + public void setUrl(String url) { + this.url = url; + } + + public String getContent() { + return content; + } + + public void setContent(String content) { + this.content = content; + } + + @Override + public String toString() { + return "Article{" + + "title='" + title + '\'' + + ", url='" + url + '\'' + + '}'; + } +} diff --git a/w10/w10datacollect/repository/ArticleRepository.java b/w10/w10datacollect/repository/ArticleRepository.java new file mode 100644 index 0000000..698d907 --- /dev/null +++ b/w10/w10datacollect/repository/ArticleRepository.java @@ -0,0 +1,40 @@ +package com.example.datacollect.repository; + +import com.example.datacollect.model.Article; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class ArticleRepository { + private final List
articles = new ArrayList<>(); + + public void add(Article article) { + if (article == null) { + throw new IllegalArgumentException("Article cannot be null"); + } + articles.add(article); + } + + public void addALL(List
articleList){ + if (articleList ==null){ + throw new IllegalArgumentException("List cannot be null"); + } + for (Article article : articleList){ + if (article !=null){ + articles.add(article); + } + } + } + + public List
getAll() { + return Collections.unmodifiableList(articles); + } + + public int size() { + return articles.size(); + } + + public void clear() { + articles.clear(); + } +} diff --git a/w10/w10datacollect/strategy/BlogStrategy.java b/w10/w10datacollect/strategy/BlogStrategy.java new file mode 100644 index 0000000..1e23b2b --- /dev/null +++ b/w10/w10datacollect/strategy/BlogStrategy.java @@ -0,0 +1,25 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; + +public class BlogStrategy implements CrawlStrategy { + @Override + public boolean supports(String url) { + return url.contains("blog.example.com"); + } + + @Override + public List
parse(String url, Document doc) { + List
articles = new ArrayList<>(); + Elements titles = doc.select(".post-title"); + for (Element e : titles) { + articles.add(new Article(e.text(), url, "")); + } + return articles; + } +} diff --git a/w10/w10datacollect/strategy/CrawlStrategy.java b/w10/w10datacollect/strategy/CrawlStrategy.java new file mode 100644 index 0000000..8b3cbe0 --- /dev/null +++ b/w10/w10datacollect/strategy/CrawlStrategy.java @@ -0,0 +1,10 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import java.util.List; + +public interface CrawlStrategy { + List
parse(String url, Document doc); + boolean supports(String url); +} diff --git a/w10/w10datacollect/strategy/HnuNewsStrategy.java b/w10/w10datacollect/strategy/HnuNewsStrategy.java new file mode 100644 index 0000000..5ad3866 --- /dev/null +++ b/w10/w10datacollect/strategy/HnuNewsStrategy.java @@ -0,0 +1,49 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; + +public class HnuNewsStrategy implements CrawlStrategy { + @Override + public boolean supports(String url) { + return url.contains("news.hnu.edu.cn"); + } + + @Override + public List
parse(String url, Document doc) { + List
articles = new ArrayList<>(); + Elements listItems = doc.select("ul.list11 li"); + + for (Element li : listItems) { + Element link = li.selectFirst("a"); + if (link == null) continue; + + String articleUrl = link.attr("href"); + if (!articleUrl.startsWith("http")) { + articleUrl = "https://news.hnu.edu.cn" + articleUrl.replace("..", ""); + } + + String title = ""; + Element titleEl = link.selectFirst("h4.l2.h4s2"); + if (titleEl != null) { + title = titleEl.text().trim(); + } + + String content = ""; + Element contentEl = link.selectFirst("p.l3.ps3"); + if (contentEl != null) { + content = contentEl.text().trim(); + } + + if (!title.isEmpty()) { + articles.add(new Article(title, articleUrl, content)); + } + } + + return articles; + } +} diff --git a/w10/w10datacollect/strategy/NewsStrategy.java b/w10/w10datacollect/strategy/NewsStrategy.java new file mode 100644 index 0000000..f6eb4bd --- /dev/null +++ b/w10/w10datacollect/strategy/NewsStrategy.java @@ -0,0 +1,25 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; + +public class NewsStrategy implements CrawlStrategy { + @Override + public boolean supports(String url) { + return url.contains("news.example.com"); + } + + @Override + public List
parse(String url, Document doc) { + List
articles = new ArrayList<>(); + Elements items = doc.select(".article-headline"); + for (Element e : items) { + articles.add(new Article(e.text(), url, "")); + } + return articles; + } +} diff --git a/w10/w10datacollect/strategy/StrategyFactory.java b/w10/w10datacollect/strategy/StrategyFactory.java new file mode 100644 index 0000000..b66c696 --- /dev/null +++ b/w10/w10datacollect/strategy/StrategyFactory.java @@ -0,0 +1,27 @@ +package com.example.datacollect.strategy; + +import java.util.ArrayList; +import java.util.List; + +public class StrategyFactory { + private final List strategies = new ArrayList<>(); + + public StrategyFactory() { + strategies.add(new HnuNewsStrategy()); + strategies.add(new BlogStrategy()); + strategies.add(new NewsStrategy()); + } + + public CrawlStrategy getStrategy(String url) { + for (CrawlStrategy s : strategies) { + if (s.supports(url)) { + return s; + } + } + return null; + } + + public void register(CrawlStrategy strategy) { + strategies.add(strategy); + } +} diff --git a/w10/w10datacollect/view/ConsoleView.java b/w10/w10datacollect/view/ConsoleView.java new file mode 100644 index 0000000..3c1d47a --- /dev/null +++ b/w10/w10datacollect/view/ConsoleView.java @@ -0,0 +1,42 @@ +package com.example.datacollect.view; + +import com.example.datacollect.model.Article; +import java.util.List; +import java.util.Scanner; + +public class ConsoleView { + private static final String ANSI_RESET = "\u001B[0m"; + private static final String ANSI_GREEN = "\u001B[32m"; + private static final String ANSI_RED = "\u001B[31m"; + private static final String ANSI_BLUE = "\u001B[34m"; + + private final Scanner scanner = new Scanner(System.in); + + public String readLine() { + System.out.print("> "); + return scanner.nextLine(); + } + + public void printSuccess(String msg) { + System.out.println(ANSI_GREEN + msg + ANSI_RESET); + } + + public void printError(String msg) { + System.out.println(ANSI_RED + msg + ANSI_RESET); + } + + public void printInfo(String msg) { + System.out.println(ANSI_BLUE + msg + ANSI_RESET); + } + + public void display(List
articles) { + if (articles.isEmpty()) { + printInfo("暂无文章,请先执行 crawl。"); + return; + } + for (int i = 0; i < articles.size(); i++) { + Article a = articles.get(i); + System.out.println((i + 1) + ". " + a.getTitle() + " | " + a.getUrl()); + } + } +}