diff --git a/w10/AI协同升级.png b/w10/AI协同升级.png new file mode 100644 index 0000000..0b83ff7 Binary files /dev/null and b/w10/AI协同升级.png differ diff --git a/w10/java-cli/.gitignore b/w10/java-cli/.gitignore new file mode 100644 index 0000000..0ebcf1a --- /dev/null +++ b/w10/java-cli/.gitignore @@ -0,0 +1,4 @@ +*.jar +*.jar +*.class +*.log \ No newline at end of file diff --git a/w10/java-cli/pom.xml b/w10/java-cli/pom.xml new file mode 100644 index 0000000..24624f6 --- /dev/null +++ b/w10/java-cli/pom.xml @@ -0,0 +1,52 @@ + + 4.0.0 + com.example + datacollect-cli + 0.1.0 + + 11 + 11 + + + + org.jsoup + jsoup + 1.17.2 + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + + org.apache.maven.plugins + maven-assembly-plugin + 3.3.0 + + + + com.example.datacollect.Main + + + + jar-with-dependencies + + + + + make-assembly + package + + single + + + + + + + diff --git a/w10/java-cli/src/main/java/com/example/datacollect/Main.java b/w10/java-cli/src/main/java/com/example/datacollect/Main.java new file mode 100644 index 0000000..d179115 --- /dev/null +++ b/w10/java-cli/src/main/java/com/example/datacollect/Main.java @@ -0,0 +1,21 @@ +package com.example.datacollect; + +import com.example.datacollect.controller.CrawlerController; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; + +public class Main { + + public static void main(String[] args) { + ConsoleView view = new ConsoleView(); + ArticleRepository repository = new ArticleRepository(); + StrategyFactory strategyFactory = new StrategyFactory(); + CrawlerController controller = new CrawlerController(view, repository, strategyFactory); + + view.printSuccess("Welcome to CLI Crawler (w10_3)! Type help for commands."); + while (true) { + controller.handle(view.readLine()); + } + } +} diff --git a/w10/java-cli/src/main/java/com/example/datacollect/command/AnalyzeCommand.java b/w10/java-cli/src/main/java/com/example/datacollect/command/AnalyzeCommand.java new file mode 100644 index 0000000..8a3a66b --- /dev/null +++ b/w10/java-cli/src/main/java/com/example/datacollect/command/AnalyzeCommand.java @@ -0,0 +1,64 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.CrawlStrategy; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + +public class AnalyzeCommand implements Command { + private final ConsoleView view; + private final StrategyFactory strategyFactory; + + public AnalyzeCommand(ConsoleView view, StrategyFactory strategyFactory) { + this.view = view; + this.strategyFactory = strategyFactory; + } + + @Override + public String getName() { + return "analyze"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + if (args.length < 2) { + view.printError("Usage: analyze "); + return; + } + String url = args[1]; + + CrawlStrategy strategy = strategyFactory.getStrategy(url); + + try { + view.printInfo("Analyzing: " + url); + Document doc = Jsoup.connect(url).get(); + var articles = strategy.parse(url, doc); + + int count = articles.size(); + int totalTitleLength = 0; + int totalContentLength = 0; + + for (var article : articles) { + if (article.getTitle() != null) { + totalTitleLength += article.getTitle().length(); + } + if (article.getContent() != null) { + totalContentLength += article.getContent().length(); + } + } + + double avgTitleLength = count > 0 ? (double) totalTitleLength / count : 0; + double avgContentLength = count > 0 ? (double) totalContentLength / count : 0; + + view.printSuccess("Analysis Results:"); + view.printInfo(" Total Articles: " + count); + view.printInfo(" Average Title Length: " + String.format("%.2f", avgTitleLength)); + view.printInfo(" Average Content Length: " + String.format("%.2f", avgContentLength)); + view.printInfo(" Strategy Used: " + strategy.getClass().getSimpleName()); + } catch (Exception e) { + view.printError("Failed to analyze: " + e.getMessage()); + } + } +} diff --git a/w10/java-cli/src/main/java/com/example/datacollect/command/Command.java b/w10/java-cli/src/main/java/com/example/datacollect/command/Command.java new file mode 100644 index 0000000..029cadc --- /dev/null +++ b/w10/java-cli/src/main/java/com/example/datacollect/command/Command.java @@ -0,0 +1,8 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; + +public interface Command { + String getName(); + void execute(String[] args, ArticleRepository repository); +} diff --git a/w10/java-cli/src/main/java/com/example/datacollect/command/CrawlCommand.java b/w10/java-cli/src/main/java/com/example/datacollect/command/CrawlCommand.java new file mode 100644 index 0000000..ac11a70 --- /dev/null +++ b/w10/java-cli/src/main/java/com/example/datacollect/command/CrawlCommand.java @@ -0,0 +1,44 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.CrawlStrategy; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + +public class CrawlCommand implements Command { + private final ConsoleView view; + private final StrategyFactory strategyFactory; + + public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) { + this.view = view; + this.strategyFactory = strategyFactory; + } + + @Override + public String getName() { + return "crawl"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + if (args.length < 2) { + view.printError("Usage: crawl "); + return; + } + String url = args[1]; + + CrawlStrategy strategy = strategyFactory.getStrategy(url); + + try { + view.printInfo("Crawling: " + url); + Document doc = Jsoup.connect(url).get(); + var articles = strategy.parse(url, doc); + repository.addAll(articles); + view.printSuccess("Crawled " + articles.size() + " articles."); + } catch (Exception e) { + view.printError("Failed to crawl: " + e.getMessage()); + } + } +} diff --git a/w10/java-cli/src/main/java/com/example/datacollect/command/ExitCommand.java b/w10/java-cli/src/main/java/com/example/datacollect/command/ExitCommand.java new file mode 100644 index 0000000..eafcd1d --- /dev/null +++ b/w10/java-cli/src/main/java/com/example/datacollect/command/ExitCommand.java @@ -0,0 +1,23 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; + +public class ExitCommand implements Command { + private final ConsoleView view; + + public ExitCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "exit"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + view.printSuccess("Bye!"); + System.exit(0); + } +} diff --git a/w10/java-cli/src/main/java/com/example/datacollect/command/HelpCommand.java b/w10/java-cli/src/main/java/com/example/datacollect/command/HelpCommand.java new file mode 100644 index 0000000..e984366 --- /dev/null +++ b/w10/java-cli/src/main/java/com/example/datacollect/command/HelpCommand.java @@ -0,0 +1,22 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; + +public class HelpCommand implements Command { + private final ConsoleView view; + + public HelpCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "help"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + view.printInfo("Commands: crawl , analyze , list, help, exit"); + } +} diff --git a/w10/java-cli/src/main/java/com/example/datacollect/command/ListCommand.java b/w10/java-cli/src/main/java/com/example/datacollect/command/ListCommand.java new file mode 100644 index 0000000..8147be8 --- /dev/null +++ b/w10/java-cli/src/main/java/com/example/datacollect/command/ListCommand.java @@ -0,0 +1,22 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; + +public class ListCommand implements Command { + private final ConsoleView view; + + public ListCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "list"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + view.display(repository.getAll()); + } +} diff --git a/w10/java-cli/src/main/java/com/example/datacollect/controller/CrawlerController.java b/w10/java-cli/src/main/java/com/example/datacollect/controller/CrawlerController.java new file mode 100644 index 0000000..a043ac4 --- /dev/null +++ b/w10/java-cli/src/main/java/com/example/datacollect/controller/CrawlerController.java @@ -0,0 +1,49 @@ +package com.example.datacollect.controller; + +import com.example.datacollect.command.AnalyzeCommand; +import com.example.datacollect.command.Command; +import com.example.datacollect.command.CrawlCommand; +import com.example.datacollect.command.ExitCommand; +import com.example.datacollect.command.HelpCommand; +import com.example.datacollect.command.ListCommand; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import java.util.HashMap; +import java.util.Map; + +public class CrawlerController { + private final Map commands = new HashMap<>(); + private final ConsoleView view; + private final ArticleRepository repository; + + public CrawlerController(ConsoleView view, ArticleRepository repository, StrategyFactory strategyFactory) { + this.view = view; + this.repository = repository; + register(new HelpCommand(view)); + register(new ListCommand(view)); + register(new CrawlCommand(view, strategyFactory)); + register(new AnalyzeCommand(view, strategyFactory)); + register(new ExitCommand(view)); + } + + private void register(Command command) { + commands.put(command.getName(), command); + } + + public void handle(String input) { + String text = input == null ? "" : input.trim(); + if (text.isEmpty()) { + return; + } + + String[] args = text.split("\\s+"); + String cmdName = args[0].toLowerCase(); + Command command = commands.get(cmdName); + if (command == null) { + view.printError("Unknown command: " + cmdName); + return; + } + command.execute(args, repository); + } +} diff --git a/w10/java-cli/src/main/java/com/example/datacollect/model/Article.java b/w10/java-cli/src/main/java/com/example/datacollect/model/Article.java new file mode 100644 index 0000000..147dbe6 --- /dev/null +++ b/w10/java-cli/src/main/java/com/example/datacollect/model/Article.java @@ -0,0 +1,45 @@ +package com.example.datacollect.model; + +public class Article { + private String title; + private String url; + private String content; + + public Article(String title, String url, String content) { + this.title = title; + this.url = url; + this.content = content; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getUrl() { + return url; + } + + public void setUrl(String url) { + this.url = url; + } + + public String getContent() { + return content; + } + + public void setContent(String content) { + this.content = content; + } + + @Override + public String toString() { + return "Article{" + + "title='" + title + '\'' + + ", url='" + url + '\'' + + '}'; + } +} diff --git a/w10/java-cli/src/main/java/com/example/datacollect/repository/ArticleRepository.java b/w10/java-cli/src/main/java/com/example/datacollect/repository/ArticleRepository.java new file mode 100644 index 0000000..ae8afe6 --- /dev/null +++ b/w10/java-cli/src/main/java/com/example/datacollect/repository/ArticleRepository.java @@ -0,0 +1,41 @@ +package com.example.datacollect.repository; + +import com.example.datacollect.model.Article; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class ArticleRepository { + private final List
articles = new ArrayList<>(); + + public void add(Article article) { + if (article == null) { + throw new IllegalArgumentException("Article cannot be null"); + } + articles.add(article); + } + + public void addAll(List
articleList) { + if (articleList == null) { + throw new IllegalArgumentException("Article list cannot be null"); + } + for (Article article : articleList) { + if (article == null) { + throw new IllegalArgumentException("Article in list cannot be null"); + } + } + articles.addAll(articleList); + } + + public List
getAll() { + return Collections.unmodifiableList(articles); + } + + public int size() { + return articles.size(); + } + + public void clear() { + articles.clear(); + } +} diff --git a/w10/java-cli/src/main/java/com/example/datacollect/strategy/BlogStrategy.java b/w10/java-cli/src/main/java/com/example/datacollect/strategy/BlogStrategy.java new file mode 100644 index 0000000..a54be11 --- /dev/null +++ b/w10/java-cli/src/main/java/com/example/datacollect/strategy/BlogStrategy.java @@ -0,0 +1,27 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; + +public class BlogStrategy extends PriorityStrategy { + private static final int PRIORITY = 100; + private static final String URL_PATTERN = ".*blog\\.example\\.com.*"; + + public BlogStrategy() { + super(PRIORITY, URL_PATTERN); + } + + @Override + public List
parse(String url, Document doc) { + List
articles = new ArrayList<>(); + Elements titles = doc.select(".post-title"); + for (Element e : titles) { + articles.add(new Article(e.text(), url, "")); + } + return articles; + } +} diff --git a/w10/java-cli/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java b/w10/java-cli/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java new file mode 100644 index 0000000..8b3cbe0 --- /dev/null +++ b/w10/java-cli/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java @@ -0,0 +1,10 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import java.util.List; + +public interface CrawlStrategy { + List
parse(String url, Document doc); + boolean supports(String url); +} diff --git a/w10/java-cli/src/main/java/com/example/datacollect/strategy/DefaultStrategy.java b/w10/java-cli/src/main/java/com/example/datacollect/strategy/DefaultStrategy.java new file mode 100644 index 0000000..fff7530 --- /dev/null +++ b/w10/java-cli/src/main/java/com/example/datacollect/strategy/DefaultStrategy.java @@ -0,0 +1,37 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; + +public class DefaultStrategy implements CrawlStrategy { + + @Override + public boolean supports(String url) { + return true; + } + + @Override + public List
parse(String url, Document doc) { + List
articles = new ArrayList<>(); + + Elements links = doc.select("a[href]"); + for (Element link : links) { + String title = link.text().trim(); + String href = link.attr("abs:href"); + + if (!title.isEmpty() && title.length() > 5) { + articles.add(new Article(title, href.isEmpty() ? url : href, "")); + } + + if (articles.size() >= 20) { + break; + } + } + + return articles; + } +} diff --git a/w10/java-cli/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java b/w10/java-cli/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java new file mode 100644 index 0000000..b465581 --- /dev/null +++ b/w10/java-cli/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java @@ -0,0 +1,51 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; + +public class HnuNewsStrategy extends PriorityStrategy { + private static final int PRIORITY = 200; + private static final String URL_PATTERN = ".*news\\.hnu\\.edu\\.cn.*"; + + public HnuNewsStrategy() { + super(PRIORITY, URL_PATTERN); + } + + @Override + public List
parse(String url, Document doc) { + List
articles = new ArrayList<>(); + Elements listItems = doc.select("ul.list11 li"); + + for (Element li : listItems) { + Element link = li.selectFirst("a"); + if (link == null) continue; + + String articleUrl = link.attr("href"); + if (!articleUrl.startsWith("http")) { + articleUrl = "https://news.hnu.edu.cn" + articleUrl.replace("..", ""); + } + + String title = ""; + Element titleEl = link.selectFirst("h4.l2.h4s2"); + if (titleEl != null) { + title = titleEl.text().trim(); + } + + String content = ""; + Element contentEl = link.selectFirst("p.l3.ps3"); + if (contentEl != null) { + content = contentEl.text().trim(); + } + + if (!title.isEmpty()) { + articles.add(new Article(title, articleUrl, content)); + } + } + + return articles; + } +} diff --git a/w10/java-cli/src/main/java/com/example/datacollect/strategy/NewsStrategy.java b/w10/java-cli/src/main/java/com/example/datacollect/strategy/NewsStrategy.java new file mode 100644 index 0000000..9854752 --- /dev/null +++ b/w10/java-cli/src/main/java/com/example/datacollect/strategy/NewsStrategy.java @@ -0,0 +1,27 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; + +public class NewsStrategy extends PriorityStrategy { + private static final int PRIORITY = 100; + private static final String URL_PATTERN = ".*news\\.example\\.com.*"; + + public NewsStrategy() { + super(PRIORITY, URL_PATTERN); + } + + @Override + public List
parse(String url, Document doc) { + List
articles = new ArrayList<>(); + Elements items = doc.select(".article-headline"); + for (Element e : items) { + articles.add(new Article(e.text(), url, "")); + } + return articles; + } +} diff --git a/w10/java-cli/src/main/java/com/example/datacollect/strategy/PriorityStrategy.java b/w10/java-cli/src/main/java/com/example/datacollect/strategy/PriorityStrategy.java new file mode 100644 index 0000000..ded1239 --- /dev/null +++ b/w10/java-cli/src/main/java/com/example/datacollect/strategy/PriorityStrategy.java @@ -0,0 +1,27 @@ +package com.example.datacollect.strategy; + +import java.util.regex.Pattern; + +public abstract class PriorityStrategy implements CrawlStrategy, Comparable { + private final int priority; + private final Pattern urlPattern; + + public PriorityStrategy(int priority, String regexPattern) { + this.priority = priority; + this.urlPattern = Pattern.compile(regexPattern); + } + + @Override + public boolean supports(String url) { + return urlPattern.matcher(url).matches(); + } + + @Override + public int compareTo(PriorityStrategy other) { + return Integer.compare(other.priority, this.priority); + } + + public int getPriority() { + return priority; + } +} diff --git a/w10/java-cli/src/main/java/com/example/datacollect/view/ConsoleView.java b/w10/java-cli/src/main/java/com/example/datacollect/view/ConsoleView.java new file mode 100644 index 0000000..3c1d47a --- /dev/null +++ b/w10/java-cli/src/main/java/com/example/datacollect/view/ConsoleView.java @@ -0,0 +1,42 @@ +package com.example.datacollect.view; + +import com.example.datacollect.model.Article; +import java.util.List; +import java.util.Scanner; + +public class ConsoleView { + private static final String ANSI_RESET = "\u001B[0m"; + private static final String ANSI_GREEN = "\u001B[32m"; + private static final String ANSI_RED = "\u001B[31m"; + private static final String ANSI_BLUE = "\u001B[34m"; + + private final Scanner scanner = new Scanner(System.in); + + public String readLine() { + System.out.print("> "); + return scanner.nextLine(); + } + + public void printSuccess(String msg) { + System.out.println(ANSI_GREEN + msg + ANSI_RESET); + } + + public void printError(String msg) { + System.out.println(ANSI_RED + msg + ANSI_RESET); + } + + public void printInfo(String msg) { + System.out.println(ANSI_BLUE + msg + ANSI_RESET); + } + + public void display(List
articles) { + if (articles.isEmpty()) { + printInfo("暂无文章,请先执行 crawl。"); + return; + } + for (int i = 0; i < articles.size(); i++) { + Article a = articles.get(i); + System.out.println((i + 1) + ". " + a.getTitle() + " | " + a.getUrl()); + } + } +} diff --git a/w10/思考题.png b/w10/思考题.png new file mode 100644 index 0000000..04e35d4 Binary files /dev/null and b/w10/思考题.png differ diff --git a/w10/进阶探究.png b/w10/进阶探究.png new file mode 100644 index 0000000..b850640 Binary files /dev/null and b/w10/进阶探究.png differ