diff --git a/w11/java-cli - 副本/.gitignore b/w11/java-cli - 副本/.gitignore new file mode 100644 index 0000000..0ebcf1a --- /dev/null +++ b/w11/java-cli - 副本/.gitignore @@ -0,0 +1,4 @@ +*.jar +*.jar +*.class +*.log \ No newline at end of file diff --git a/w11/java-cli - 副本/.vscode/settings.json b/w11/java-cli - 副本/.vscode/settings.json new file mode 100644 index 0000000..c5f3f6b --- /dev/null +++ b/w11/java-cli - 副本/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "java.configuration.updateBuildConfiguration": "interactive" +} \ No newline at end of file diff --git a/w11/java-cli - 副本/pom.xml b/w11/java-cli - 副本/pom.xml new file mode 100644 index 0000000..c5ae7b7 --- /dev/null +++ b/w11/java-cli - 副本/pom.xml @@ -0,0 +1,57 @@ + + 4.0.0 + com.example + datacollect-cli + 0.1.0 + + 11 + 11 + + + + org.jsoup + jsoup + 1.17.2 + + + ch.qos.logback + logback-classic + 1.4.14 + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + + org.apache.maven.plugins + maven-assembly-plugin + 3.3.0 + + + + com.example.datacollect.Main + + + + jar-with-dependencies + + + + + make-assembly + package + + single + + + + + + + diff --git a/w11/java-cli - 副本/src/main/java/com/example/datacollect/Main.java b/w11/java-cli - 副本/src/main/java/com/example/datacollect/Main.java new file mode 100644 index 0000000..ef65b3e --- /dev/null +++ b/w11/java-cli - 副本/src/main/java/com/example/datacollect/Main.java @@ -0,0 +1,25 @@ +package com.example.datacollect; + +import com.example.datacollect.controller.CrawlerController; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class Main { + private static final Logger logger = LoggerFactory.getLogger(Main.class); + + public static void main(String[] args) { + logger.info("Starting CLI Crawler application"); + ConsoleView view = new ConsoleView(); + ArticleRepository repository = new ArticleRepository(); + StrategyFactory strategyFactory = new StrategyFactory(); + CrawlerController controller = new CrawlerController(view, repository, strategyFactory); + + view.printSuccess("Welcome to CLI Crawler (w10_3)! Type help for commands."); + while (true) { + controller.handle(view.readLine()); + } + } +} diff --git a/w11/java-cli - 副本/src/main/java/com/example/datacollect/command/AnalyzeCommand.java b/w11/java-cli - 副本/src/main/java/com/example/datacollect/command/AnalyzeCommand.java new file mode 100644 index 0000000..d73f855 --- /dev/null +++ b/w11/java-cli - 副本/src/main/java/com/example/datacollect/command/AnalyzeCommand.java @@ -0,0 +1,75 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.CrawlStrategy; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class AnalyzeCommand implements Command { + private static final Logger logger = LoggerFactory.getLogger(AnalyzeCommand.class); + + private final ConsoleView view; + private final StrategyFactory strategyFactory; + + public AnalyzeCommand(ConsoleView view, StrategyFactory strategyFactory) { + this.view = view; + this.strategyFactory = strategyFactory; + } + + @Override + public String getName() { + return "analyze"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + if (args.length < 2) { + logger.warn("Analyze command called without URL argument"); + view.printError("Usage: analyze "); + return; + } + String url = args[1]; + logger.info("Analyzing URL: {}", url); + + CrawlStrategy strategy = strategyFactory.getStrategy(url); + logger.debug("Using strategy: {}", strategy.getClass().getSimpleName()); + + try { + view.printInfo("Analyzing: " + url); + Document doc = Jsoup.connect(url).get(); + var articles = strategy.parse(url, doc); + + int count = articles.size(); + int totalTitleLength = 0; + int totalContentLength = 0; + + for (var article : articles) { + if (article.getTitle() != null) { + totalTitleLength += article.getTitle().length(); + } + if (article.getContent() != null) { + totalContentLength += article.getContent().length(); + } + } + + double avgTitleLength = count > 0 ? (double) totalTitleLength / count : 0; + double avgContentLength = count > 0 ? (double) totalContentLength / count : 0; + + logger.info("Analysis complete - Articles: {}, Avg Title Length: {:.2f}, Avg Content Length: {:.2f}", + count, avgTitleLength, avgContentLength); + + view.printSuccess("Analysis Results:"); + view.printInfo(" Total Articles: " + count); + view.printInfo(" Average Title Length: " + String.format("%.2f", avgTitleLength)); + view.printInfo(" Average Content Length: " + String.format("%.2f", avgContentLength)); + view.printInfo(" Strategy Used: " + strategy.getClass().getSimpleName()); + } catch (Exception e) { + logger.error("Failed to analyze URL {}: {}", url, e.getMessage(), e); + view.printError("Failed to analyze: " + e.getMessage()); + } + } +} diff --git a/w11/java-cli - 副本/src/main/java/com/example/datacollect/command/Command.java b/w11/java-cli - 副本/src/main/java/com/example/datacollect/command/Command.java new file mode 100644 index 0000000..029cadc --- /dev/null +++ b/w11/java-cli - 副本/src/main/java/com/example/datacollect/command/Command.java @@ -0,0 +1,8 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; + +public interface Command { + String getName(); + void execute(String[] args, ArticleRepository repository); +} diff --git a/w11/java-cli - 副本/src/main/java/com/example/datacollect/command/CrawlCommand.java b/w11/java-cli - 副本/src/main/java/com/example/datacollect/command/CrawlCommand.java new file mode 100644 index 0000000..13f5b3d --- /dev/null +++ b/w11/java-cli - 副本/src/main/java/com/example/datacollect/command/CrawlCommand.java @@ -0,0 +1,88 @@ +package com.example.datacollect.command; + +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.CrawlStrategy; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.io.IOException; + +public class CrawlCommand implements Command { + private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class); + private static final int MAX_RETRIES = 3; + private static final long RETRY_DELAY_MS = 1000; + + private final ConsoleView view; + private final StrategyFactory strategyFactory; + + public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) { + this.view = view; + this.strategyFactory = strategyFactory; + } + + @Override + public String getName() { + return "crawl"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + if (args.length < 2) { + logger.warn("Crawl command called without URL argument"); + view.printError("Usage: crawl "); + return; + } + String url = args[1]; + logger.info("Starting crawl for URL: {}", url); + + CrawlStrategy strategy = strategyFactory.getStrategy(url); + logger.debug("Using strategy: {}", strategy.getClass().getSimpleName()); + + int retryCount = 0; + boolean success = false; + + while (retryCount < MAX_RETRIES && !success) { + try { + view.printInfo("Crawling: " + url + (retryCount > 0 ? " (attempt " + (retryCount + 1) + ")" : "")); + logger.debug("Attempt {} to fetch URL: {}", retryCount + 1, url); + Document doc = Jsoup.connect(url).get(); + var articles = strategy.parse(url, doc); + repository.addAll(articles); + logger.info("Successfully crawled {} articles from {}", articles.size(), url); + view.printSuccess("Crawled " + articles.size() + " articles."); + success = true; + } catch (IOException e) { + retryCount++; + logger.error("Network error on attempt {} for URL {}: {}", retryCount, url, e.getMessage()); + if (retryCount < MAX_RETRIES) { + view.printWarning("Network error: " + e.getMessage() + ", retrying..."); + sleep(RETRY_DELAY_MS); + } else { + logger.error("Failed to crawl URL {} after {} attempts", url, MAX_RETRIES); + view.printError("Failed to crawl after " + MAX_RETRIES + " attempts: " + e.getMessage()); + } + } catch (ParseException e) { + logger.error("Parse error for URL {}: {}", url, e.getMessage()); + view.printError("Parse error: " + e.getMessage()); + break; + } catch (Exception e) { + logger.error("Unexpected error for URL {}: {}", url, e.getMessage(), e); + view.printError("Unexpected error: " + e.getMessage()); + break; + } + } + } + + private void sleep(long millis) { + try { + Thread.sleep(millis); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + logger.warn("Sleep interrupted"); + } + } +} diff --git a/w11/java-cli - 副本/src/main/java/com/example/datacollect/command/ExitCommand.java b/w11/java-cli - 副本/src/main/java/com/example/datacollect/command/ExitCommand.java new file mode 100644 index 0000000..51ee001 --- /dev/null +++ b/w11/java-cli - 副本/src/main/java/com/example/datacollect/command/ExitCommand.java @@ -0,0 +1,28 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class ExitCommand implements Command { + private static final Logger logger = LoggerFactory.getLogger(ExitCommand.class); + + private final ConsoleView view; + + public ExitCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "exit"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + logger.info("Exiting application"); + view.printSuccess("Bye!"); + System.exit(0); + } +} diff --git a/w11/java-cli - 副本/src/main/java/com/example/datacollect/command/HelpCommand.java b/w11/java-cli - 副本/src/main/java/com/example/datacollect/command/HelpCommand.java new file mode 100644 index 0000000..ee03cfb --- /dev/null +++ b/w11/java-cli - 副本/src/main/java/com/example/datacollect/command/HelpCommand.java @@ -0,0 +1,27 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class HelpCommand implements Command { + private static final Logger logger = LoggerFactory.getLogger(HelpCommand.class); + + private final ConsoleView view; + + public HelpCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "help"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + logger.debug("Displaying help information"); + view.printInfo("Commands: crawl , analyze , list, help, exit"); + } +} diff --git a/w11/java-cli - 副本/src/main/java/com/example/datacollect/command/ListCommand.java b/w11/java-cli - 副本/src/main/java/com/example/datacollect/command/ListCommand.java new file mode 100644 index 0000000..ea383a8 --- /dev/null +++ b/w11/java-cli - 副本/src/main/java/com/example/datacollect/command/ListCommand.java @@ -0,0 +1,27 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class ListCommand implements Command { + private static final Logger logger = LoggerFactory.getLogger(ListCommand.class); + + private final ConsoleView view; + + public ListCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "list"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + logger.debug("Listing {} articles", repository.size()); + view.display(repository.getAll()); + } +} diff --git a/w11/java-cli - 副本/src/main/java/com/example/datacollect/controller/CrawlerController.java b/w11/java-cli - 副本/src/main/java/com/example/datacollect/controller/CrawlerController.java new file mode 100644 index 0000000..0ea7179 --- /dev/null +++ b/w11/java-cli - 副本/src/main/java/com/example/datacollect/controller/CrawlerController.java @@ -0,0 +1,57 @@ +package com.example.datacollect.controller; + +import com.example.datacollect.command.AnalyzeCommand; +import com.example.datacollect.command.Command; +import com.example.datacollect.command.CrawlCommand; +import com.example.datacollect.command.ExitCommand; +import com.example.datacollect.command.HelpCommand; +import com.example.datacollect.command.ListCommand; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.HashMap; +import java.util.Map; + +public class CrawlerController { + private static final Logger logger = LoggerFactory.getLogger(CrawlerController.class); + + private final Map commands = new HashMap<>(); + private final ConsoleView view; + private final ArticleRepository repository; + + public CrawlerController(ConsoleView view, ArticleRepository repository, StrategyFactory strategyFactory) { + this.view = view; + this.repository = repository; + register(new HelpCommand(view)); + register(new ListCommand(view)); + register(new CrawlCommand(view, strategyFactory)); + register(new AnalyzeCommand(view, strategyFactory)); + register(new ExitCommand(view)); + logger.info("CrawlerController initialized with {} commands", commands.size()); + } + + private void register(Command command) { + commands.put(command.getName(), command); + logger.debug("Registered command: {}", command.getName()); + } + + public void handle(String input) { + String text = input == null ? "" : input.trim(); + if (text.isEmpty()) { + return; + } + + String[] args = text.split("\\s+"); + String cmdName = args[0].toLowerCase(); + Command command = commands.get(cmdName); + if (command == null) { + logger.warn("Unknown command: {}", cmdName); + view.printError("Unknown command: " + cmdName); + return; + } + logger.info("Executing command: {}", cmdName); + command.execute(args, repository); + } +} diff --git a/w11/java-cli - 副本/src/main/java/com/example/datacollect/exception/CrawlerException.java b/w11/java-cli - 副本/src/main/java/com/example/datacollect/exception/CrawlerException.java new file mode 100644 index 0000000..e81c3c9 --- /dev/null +++ b/w11/java-cli - 副本/src/main/java/com/example/datacollect/exception/CrawlerException.java @@ -0,0 +1,11 @@ +package com.example.datacollect.exception; + +public class CrawlerException extends Exception { + public CrawlerException(String message) { + super(message); + } + + public CrawlerException(String message, Throwable cause) { + super(message, cause); + } +} \ No newline at end of file diff --git a/w11/java-cli - 副本/src/main/java/com/example/datacollect/exception/NetworkException.java b/w11/java-cli - 副本/src/main/java/com/example/datacollect/exception/NetworkException.java new file mode 100644 index 0000000..0fb8e5e --- /dev/null +++ b/w11/java-cli - 副本/src/main/java/com/example/datacollect/exception/NetworkException.java @@ -0,0 +1,11 @@ +package com.example.datacollect.exception; + +public class NetworkException extends CrawlerException { + public NetworkException(String message) { + super(message); + } + + public NetworkException(String message, Throwable cause) { + super(message, cause); + } +} \ No newline at end of file diff --git a/w11/java-cli - 副本/src/main/java/com/example/datacollect/exception/ParseException.java b/w11/java-cli - 副本/src/main/java/com/example/datacollect/exception/ParseException.java new file mode 100644 index 0000000..205665a --- /dev/null +++ b/w11/java-cli - 副本/src/main/java/com/example/datacollect/exception/ParseException.java @@ -0,0 +1,11 @@ +package com.example.datacollect.exception; + +public class ParseException extends CrawlerException { + public ParseException(String message) { + super(message); + } + + public ParseException(String message, Throwable cause) { + super(message, cause); + } +} \ No newline at end of file diff --git a/w11/java-cli - 副本/src/main/java/com/example/datacollect/model/Article.java b/w11/java-cli - 副本/src/main/java/com/example/datacollect/model/Article.java new file mode 100644 index 0000000..147dbe6 --- /dev/null +++ b/w11/java-cli - 副本/src/main/java/com/example/datacollect/model/Article.java @@ -0,0 +1,45 @@ +package com.example.datacollect.model; + +public class Article { + private String title; + private String url; + private String content; + + public Article(String title, String url, String content) { + this.title = title; + this.url = url; + this.content = content; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getUrl() { + return url; + } + + public void setUrl(String url) { + this.url = url; + } + + public String getContent() { + return content; + } + + public void setContent(String content) { + this.content = content; + } + + @Override + public String toString() { + return "Article{" + + "title='" + title + '\'' + + ", url='" + url + '\'' + + '}'; + } +} diff --git a/w11/java-cli - 副本/src/main/java/com/example/datacollect/repository/ArticleRepository.java b/w11/java-cli - 副本/src/main/java/com/example/datacollect/repository/ArticleRepository.java new file mode 100644 index 0000000..ad717cc --- /dev/null +++ b/w11/java-cli - 副本/src/main/java/com/example/datacollect/repository/ArticleRepository.java @@ -0,0 +1,76 @@ +package com.example.datacollect.repository; + +import com.example.datacollect.model.Article; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class ArticleRepository { + private static final Logger logger = LoggerFactory.getLogger(ArticleRepository.class); + + private final List
articles = new ArrayList<>(); + + public void add(Article article) { + if (article == null) { + logger.error("Attempted to add null article"); + throw new IllegalArgumentException("Article cannot be null"); + } + if (article.getTitle() == null || article.getTitle().trim().isEmpty()) { + logger.warn("Attempted to add article with empty title"); + throw new IllegalArgumentException("Article title cannot be null or empty"); + } + if (article.getUrl() == null || article.getUrl().trim().isEmpty()) { + logger.warn("Attempted to add article with empty URL"); + throw new IllegalArgumentException("Article URL cannot be null or empty"); + } + articles.add(article); + logger.debug("Added article: {}", article.getTitle()); + } + + public void addAll(List
articleList) { + if (articleList == null) { + logger.error("Attempted to add null article list"); + throw new IllegalArgumentException("Article list cannot be null"); + } + if (articleList.isEmpty()) { + logger.debug("Attempted to add empty article list"); + return; + } + + for (int i = 0; i < articleList.size(); i++) { + Article article = articleList.get(i); + if (article == null) { + logger.warn("Skipping null article at index {}", i); + throw new IllegalArgumentException("Article in list cannot be null at index " + i); + } + if (article.getTitle() == null || article.getTitle().trim().isEmpty()) { + logger.warn("Skipping article with empty title at index {}", i); + throw new IllegalArgumentException("Article title cannot be null or empty at index " + i); + } + if (article.getUrl() == null || article.getUrl().trim().isEmpty()) { + logger.warn("Skipping article with empty URL at index {}", i); + throw new IllegalArgumentException("Article URL cannot be null or empty at index " + i); + } + } + + articles.addAll(articleList); + logger.info("Added {} articles to repository", articleList.size()); + } + + public List
getAll() { + logger.debug("Retrieving all articles, count: {}", articles.size()); + return Collections.unmodifiableList(articles); + } + + public int size() { + return articles.size(); + } + + public void clear() { + int size = articles.size(); + articles.clear(); + logger.info("Cleared repository, removed {} articles", size); + } +} diff --git a/w11/java-cli - 副本/src/main/java/com/example/datacollect/strategy/BlogStrategy.java b/w11/java-cli - 副本/src/main/java/com/example/datacollect/strategy/BlogStrategy.java new file mode 100644 index 0000000..66d5b16 --- /dev/null +++ b/w11/java-cli - 副本/src/main/java/com/example/datacollect/strategy/BlogStrategy.java @@ -0,0 +1,28 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; + +public class BlogStrategy extends PriorityStrategy { + private static final int PRIORITY = 100; + private static final String URL_PATTERN = ".*blog\\.example\\.com.*"; + + public BlogStrategy() { + super(PRIORITY, URL_PATTERN); + } + + @Override + public List
parse(String url, Document doc) throws ParseException { + List
articles = new ArrayList<>(); + Elements titles = doc.select(".post-title"); + for (Element e : titles) { + articles.add(new Article(e.text(), url, "")); + } + return articles; + } +} diff --git a/w11/java-cli - 副本/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java b/w11/java-cli - 副本/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java new file mode 100644 index 0000000..ed69e19 --- /dev/null +++ b/w11/java-cli - 副本/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java @@ -0,0 +1,11 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import java.util.List; + +public interface CrawlStrategy { + List
parse(String url, Document doc) throws ParseException; + boolean supports(String url); +} diff --git a/w11/java-cli - 副本/src/main/java/com/example/datacollect/strategy/DefaultStrategy.java b/w11/java-cli - 副本/src/main/java/com/example/datacollect/strategy/DefaultStrategy.java new file mode 100644 index 0000000..fb46b0d --- /dev/null +++ b/w11/java-cli - 副本/src/main/java/com/example/datacollect/strategy/DefaultStrategy.java @@ -0,0 +1,38 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; + +public class DefaultStrategy implements CrawlStrategy { + + @Override + public boolean supports(String url) { + return true; + } + + @Override + public List
parse(String url, Document doc) throws ParseException { + List
articles = new ArrayList<>(); + + Elements links = doc.select("a[href]"); + for (Element link : links) { + String title = link.text().trim(); + String href = link.attr("abs:href"); + + if (!title.isEmpty() && title.length() > 5) { + articles.add(new Article(title, href.isEmpty() ? url : href, "")); + } + + if (articles.size() >= 20) { + break; + } + } + + return articles; + } +} diff --git a/w11/java-cli - 副本/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java b/w11/java-cli - 副本/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java new file mode 100644 index 0000000..bbf56b2 --- /dev/null +++ b/w11/java-cli - 副本/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java @@ -0,0 +1,52 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; + +public class HnuNewsStrategy extends PriorityStrategy { + private static final int PRIORITY = 200; + private static final String URL_PATTERN = ".*news\\.hnu\\.edu\\.cn.*"; + + public HnuNewsStrategy() { + super(PRIORITY, URL_PATTERN); + } + + @Override + public List
parse(String url, Document doc) throws ParseException { + List
articles = new ArrayList<>(); + Elements listItems = doc.select("ul.list11 li"); + + for (Element li : listItems) { + Element link = li.selectFirst("a"); + if (link == null) continue; + + String articleUrl = link.attr("href"); + if (!articleUrl.startsWith("http")) { + articleUrl = "https://news.hnu.edu.cn" + articleUrl.replace("..", ""); + } + + String title = ""; + Element titleEl = link.selectFirst("h4.l2.h4s2"); + if (titleEl != null) { + title = titleEl.text().trim(); + } + + String content = ""; + Element contentEl = link.selectFirst("p.l3.ps3"); + if (contentEl != null) { + content = contentEl.text().trim(); + } + + if (!title.isEmpty()) { + articles.add(new Article(title, articleUrl, content)); + } + } + + return articles; + } +} diff --git a/w11/java-cli - 副本/src/main/java/com/example/datacollect/strategy/NewsStrategy.java b/w11/java-cli - 副本/src/main/java/com/example/datacollect/strategy/NewsStrategy.java new file mode 100644 index 0000000..c6c6b98 --- /dev/null +++ b/w11/java-cli - 副本/src/main/java/com/example/datacollect/strategy/NewsStrategy.java @@ -0,0 +1,28 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; + +public class NewsStrategy extends PriorityStrategy { + private static final int PRIORITY = 100; + private static final String URL_PATTERN = ".*news\\.example\\.com.*"; + + public NewsStrategy() { + super(PRIORITY, URL_PATTERN); + } + + @Override + public List
parse(String url, Document doc) throws ParseException { + List
articles = new ArrayList<>(); + Elements items = doc.select(".article-headline"); + for (Element e : items) { + articles.add(new Article(e.text(), url, "")); + } + return articles; + } +} diff --git a/w11/java-cli - 副本/src/main/java/com/example/datacollect/strategy/PriorityStrategy.java b/w11/java-cli - 副本/src/main/java/com/example/datacollect/strategy/PriorityStrategy.java new file mode 100644 index 0000000..ded1239 --- /dev/null +++ b/w11/java-cli - 副本/src/main/java/com/example/datacollect/strategy/PriorityStrategy.java @@ -0,0 +1,27 @@ +package com.example.datacollect.strategy; + +import java.util.regex.Pattern; + +public abstract class PriorityStrategy implements CrawlStrategy, Comparable { + private final int priority; + private final Pattern urlPattern; + + public PriorityStrategy(int priority, String regexPattern) { + this.priority = priority; + this.urlPattern = Pattern.compile(regexPattern); + } + + @Override + public boolean supports(String url) { + return urlPattern.matcher(url).matches(); + } + + @Override + public int compareTo(PriorityStrategy other) { + return Integer.compare(other.priority, this.priority); + } + + public int getPriority() { + return priority; + } +} diff --git a/w11/java-cli - 副本/src/main/java/com/example/datacollect/strategy/StrategyFactory.java b/w11/java-cli - 副本/src/main/java/com/example/datacollect/strategy/StrategyFactory.java new file mode 100644 index 0000000..26d217c --- /dev/null +++ b/w11/java-cli - 副本/src/main/java/com/example/datacollect/strategy/StrategyFactory.java @@ -0,0 +1,49 @@ +package com.example.datacollect.strategy; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class StrategyFactory { + private static final Logger logger = LoggerFactory.getLogger(StrategyFactory.class); + + private final List strategies = new ArrayList<>(); + private final CrawlStrategy defaultStrategy; + + public StrategyFactory() { + strategies.add(new HnuNewsStrategy()); + strategies.add(new BlogStrategy()); + strategies.add(new NewsStrategy()); + Collections.sort(strategies); + this.defaultStrategy = new DefaultStrategy(); + logger.info("StrategyFactory initialized with {} strategies", strategies.size()); + } + + public CrawlStrategy getStrategy(String url) { + if (url == null || url.trim().isEmpty()) { + logger.debug("Empty URL provided, using default strategy"); + return defaultStrategy; + } + + for (PriorityStrategy s : strategies) { + if (s.supports(url)) { + logger.debug("URL {} matched strategy: {}", url, s.getClass().getSimpleName()); + return s; + } + } + logger.debug("URL {} did not match any specific strategy, using default", url); + return defaultStrategy; + } + + public void register(PriorityStrategy strategy) { + strategies.add(strategy); + Collections.sort(strategies); + logger.info("Registered new strategy: {}", strategy.getClass().getSimpleName()); + } + + public CrawlStrategy getDefaultStrategy() { + return defaultStrategy; + } +} diff --git a/w11/java-cli - 副本/src/main/java/com/example/datacollect/view/ConsoleView.java b/w11/java-cli - 副本/src/main/java/com/example/datacollect/view/ConsoleView.java new file mode 100644 index 0000000..6c058ba --- /dev/null +++ b/w11/java-cli - 副本/src/main/java/com/example/datacollect/view/ConsoleView.java @@ -0,0 +1,47 @@ +package com.example.datacollect.view; + +import com.example.datacollect.model.Article; +import java.util.List; +import java.util.Scanner; + +public class ConsoleView { + private static final String ANSI_RESET = "\u001B[0m"; + private static final String ANSI_GREEN = "\u001B[32m"; + private static final String ANSI_RED = "\u001B[31m"; + private static final String ANSI_BLUE = "\u001B[34m"; + private static final String ANSI_YELLOW = "\u001B[33m"; + + private final Scanner scanner = new Scanner(System.in); + + public String readLine() { + System.out.print("> "); + return scanner.nextLine(); + } + + public void printSuccess(String msg) { + System.out.println(ANSI_GREEN + msg + ANSI_RESET); + } + + public void printError(String msg) { + System.out.println(ANSI_RED + msg + ANSI_RESET); + } + + public void printInfo(String msg) { + System.out.println(ANSI_BLUE + msg + ANSI_RESET); + } + + public void printWarning(String msg) { + System.out.println(ANSI_YELLOW + msg + ANSI_RESET); + } + + public void display(List
articles) { + if (articles.isEmpty()) { + printInfo("暂无文章,请先执行 crawl。"); + return; + } + for (int i = 0; i < articles.size(); i++) { + Article a = articles.get(i); + System.out.println((i + 1) + ". " + a.getTitle() + " | " + a.getUrl()); + } + } +} diff --git a/w11/java-cli - 副本/src/main/resources/logback.xml b/w11/java-cli - 副本/src/main/resources/logback.xml new file mode 100644 index 0000000..8a3b75d --- /dev/null +++ b/w11/java-cli - 副本/src/main/resources/logback.xml @@ -0,0 +1,26 @@ + + + + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + logs/crawler.log + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + logs/crawler.%d{yyyy-MM-dd}.log + 30 + + + + + + + + + + \ No newline at end of file diff --git a/w11/java-cli - 副本/target/classes/logback.xml b/w11/java-cli - 副本/target/classes/logback.xml new file mode 100644 index 0000000..8a3b75d --- /dev/null +++ b/w11/java-cli - 副本/target/classes/logback.xml @@ -0,0 +1,26 @@ + + + + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + logs/crawler.log + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + logs/crawler.%d{yyyy-MM-dd}.log + 30 + + + + + + + + + + \ No newline at end of file diff --git a/w11/java-cli - 副本/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst b/w11/java-cli - 副本/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst new file mode 100644 index 0000000..a9870f6 --- /dev/null +++ b/w11/java-cli - 副本/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst @@ -0,0 +1,21 @@ +com\example\datacollect\strategy\DefaultStrategy.class +com\example\datacollect\strategy\PriorityStrategy.class +com\example\datacollect\command\ListCommand.class +com\example\datacollect\command\CrawlCommand.class +com\example\datacollect\strategy\BlogStrategy.class +com\example\datacollect\repository\ArticleRepository.class +com\example\datacollect\Main.class +com\example\datacollect\view\ConsoleView.class +com\example\datacollect\command\ExitCommand.class +com\example\datacollect\command\HelpCommand.class +com\example\datacollect\strategy\NewsStrategy.class +com\example\datacollect\command\Command.class +com\example\datacollect\controller\CrawlerController.class +com\example\datacollect\exception\CrawlerException.class +com\example\datacollect\exception\NetworkException.class +com\example\datacollect\command\AnalyzeCommand.class +com\example\datacollect\strategy\StrategyFactory.class +com\example\datacollect\strategy\HnuNewsStrategy.class +com\example\datacollect\exception\ParseException.class +com\example\datacollect\strategy\CrawlStrategy.class +com\example\datacollect\model\Article.class diff --git a/w11/java-cli - 副本/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst b/w11/java-cli - 副本/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst new file mode 100644 index 0000000..99bc177 --- /dev/null +++ b/w11/java-cli - 副本/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst @@ -0,0 +1,21 @@ +D:\桌面\java-cli - 副本\src\main\java\com\example\datacollect\strategy\DefaultStrategy.java +D:\桌面\java-cli - 副本\src\main\java\com\example\datacollect\command\Command.java +D:\桌面\java-cli - 副本\src\main\java\com\example\datacollect\exception\ParseException.java +D:\桌面\java-cli - 副本\src\main\java\com\example\datacollect\command\ListCommand.java +D:\桌面\java-cli - 副本\src\main\java\com\example\datacollect\exception\NetworkException.java +D:\桌面\java-cli - 副本\src\main\java\com\example\datacollect\command\AnalyzeCommand.java +D:\桌面\java-cli - 副本\src\main\java\com\example\datacollect\command\HelpCommand.java +D:\桌面\java-cli - 副本\src\main\java\com\example\datacollect\strategy\NewsStrategy.java +D:\桌面\java-cli - 副本\src\main\java\com\example\datacollect\Main.java +D:\桌面\java-cli - 副本\src\main\java\com\example\datacollect\command\ExitCommand.java +D:\桌面\java-cli - 副本\src\main\java\com\example\datacollect\strategy\CrawlStrategy.java +D:\桌面\java-cli - 副本\src\main\java\com\example\datacollect\exception\CrawlerException.java +D:\桌面\java-cli - 副本\src\main\java\com\example\datacollect\command\CrawlCommand.java +D:\桌面\java-cli - 副本\src\main\java\com\example\datacollect\strategy\BlogStrategy.java +D:\桌面\java-cli - 副本\src\main\java\com\example\datacollect\controller\CrawlerController.java +D:\桌面\java-cli - 副本\src\main\java\com\example\datacollect\model\Article.java +D:\桌面\java-cli - 副本\src\main\java\com\example\datacollect\repository\ArticleRepository.java +D:\桌面\java-cli - 副本\src\main\java\com\example\datacollect\strategy\HnuNewsStrategy.java +D:\桌面\java-cli - 副本\src\main\java\com\example\datacollect\view\ConsoleView.java +D:\桌面\java-cli - 副本\src\main\java\com\example\datacollect\strategy\PriorityStrategy.java +D:\桌面\java-cli - 副本\src\main\java\com\example\datacollect\strategy\StrategyFactory.java