Browse Source

宋瑞-202506050301

main
Songrui 3 weeks ago
parent
commit
f6deacf725
  1. 4
      w12/java-cli/.gitignore
  2. 3
      w12/java-cli/.vscode/settings.json
  3. 67
      w12/java-cli/pom.xml
  4. 25
      w12/java-cli/src/main/java/com/example/datacollect/Main.java
  5. 75
      w12/java-cli/src/main/java/com/example/datacollect/command/AnalyzeCommand.java
  6. 8
      w12/java-cli/src/main/java/com/example/datacollect/command/Command.java
  7. 88
      w12/java-cli/src/main/java/com/example/datacollect/command/CrawlCommand.java
  8. 28
      w12/java-cli/src/main/java/com/example/datacollect/command/ExitCommand.java
  9. 27
      w12/java-cli/src/main/java/com/example/datacollect/command/HelpCommand.java
  10. 84
      w12/java-cli/src/main/java/com/example/datacollect/command/JsonExporterCommand.java
  11. 118
      w12/java-cli/src/main/java/com/example/datacollect/command/JsonImporterCommand.java
  12. 27
      w12/java-cli/src/main/java/com/example/datacollect/command/ListCommand.java
  13. 61
      w12/java-cli/src/main/java/com/example/datacollect/controller/CrawlerController.java
  14. 11
      w12/java-cli/src/main/java/com/example/datacollect/exception/CrawlerException.java
  15. 11
      w12/java-cli/src/main/java/com/example/datacollect/exception/NetworkException.java
  16. 11
      w12/java-cli/src/main/java/com/example/datacollect/exception/ParseException.java
  17. 65
      w12/java-cli/src/main/java/com/example/datacollect/model/Article.java
  18. 76
      w12/java-cli/src/main/java/com/example/datacollect/repository/ArticleRepository.java
  19. 28
      w12/java-cli/src/main/java/com/example/datacollect/strategy/BlogStrategy.java
  20. 11
      w12/java-cli/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java
  21. 38
      w12/java-cli/src/main/java/com/example/datacollect/strategy/DefaultStrategy.java
  22. 52
      w12/java-cli/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java
  23. 28
      w12/java-cli/src/main/java/com/example/datacollect/strategy/NewsStrategy.java
  24. 27
      w12/java-cli/src/main/java/com/example/datacollect/strategy/PriorityStrategy.java
  25. 49
      w12/java-cli/src/main/java/com/example/datacollect/strategy/StrategyFactory.java
  26. 47
      w12/java-cli/src/main/java/com/example/datacollect/view/ConsoleView.java
  27. 26
      w12/java-cli/src/main/resources/logback.xml
  28. 26
      w12/java-cli/target/classes/logback.xml
  29. 3
      w12/java-cli/target/maven-archiver/pom.properties
  30. 21
      w12/java-cli/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst
  31. 23
      w12/java-cli/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst

4
w12/java-cli/.gitignore

@ -0,0 +1,4 @@
*.jar
*.jar
*.class
*.log

3
w12/java-cli/.vscode/settings.json

@ -0,0 +1,3 @@
{
"java.configuration.updateBuildConfiguration": "interactive"
}

67
w12/java-cli/pom.xml

@ -0,0 +1,67 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.example</groupId>
<artifactId>datacollect-cli</artifactId>
<version>0.1.0</version>
<properties>
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
</properties>
<dependencies>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.4.14</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.15.3</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.datatype</groupId>
<artifactId>jackson-datatype-jsr310</artifactId>
<version>2.15.3</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.3.0</version>
<configuration>
<archive>
<manifest>
<mainClass>com.example.datacollect.Main</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

25
w12/java-cli/src/main/java/com/example/datacollect/Main.java

@ -0,0 +1,25 @@
package com.example.datacollect;
import com.example.datacollect.controller.CrawlerController;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.strategy.StrategyFactory;
import com.example.datacollect.view.ConsoleView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Entry point of the CLI crawler.
 *
 * <p>Wires a {@link ConsoleView}, an {@link ArticleRepository} and a {@link StrategyFactory}
 * into a {@link CrawlerController} and then feeds the controller one console line at a time.
 */
public class Main {
    private static final Logger logger = LoggerFactory.getLogger(Main.class);

    /**
     * Runs the interactive read-dispatch loop.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        logger.info("Starting CLI Crawler application");
        ConsoleView view = new ConsoleView();
        ArticleRepository repository = new ArticleRepository();
        StrategyFactory strategyFactory = new StrategyFactory();
        CrawlerController controller = new CrawlerController(view, repository, strategyFactory);
        view.printSuccess("Welcome to CLI Crawler (w10_3)! Type help for commands.");
        while (true) {
            String line = view.readLine();
            if (line == null) {
                // CrawlerController.handle silently ignores null, so a null line would make this
                // loop spin without ever blocking again.
                // NOTE(review): assumes readLine() returns null only at end of input - confirm.
                logger.info("Input closed, leaving command loop");
                break;
            }
            controller.handle(line);
        }
    }
}

75
w12/java-cli/src/main/java/com/example/datacollect/command/AnalyzeCommand.java

@ -0,0 +1,75 @@
package com.example.datacollect.command;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.strategy.CrawlStrategy;
import com.example.datacollect.strategy.StrategyFactory;
import com.example.datacollect.view.ConsoleView;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Command bound to {@code analyze}: fetches a page, parses it with the strategy selected for the
 * URL and prints simple statistics (article count, average title and content length).
 *
 * <p>Nothing is stored; the repository passed to {@link #execute} is not touched.
 */
public class AnalyzeCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(AnalyzeCommand.class);
    private final ConsoleView view;
    private final StrategyFactory strategyFactory;

    /**
     * @param view            console used for all user-facing output
     * @param strategyFactory source of the parsing strategy for a given URL
     */
    public AnalyzeCommand(ConsoleView view, StrategyFactory strategyFactory) {
        this.view = view;
        this.strategyFactory = strategyFactory;
    }

    @Override
    public String getName() {
        return "analyze";
    }

    /**
     * Analyzes the page at {@code args[1]}.
     *
     * @param args       tokenized command line; {@code args[0]} is the command name and
     *                   {@code args[1]} the URL to analyze
     * @param repository not used by this command
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        if (args.length < 2) {
            logger.warn("Analyze command called without URL argument");
            view.printError("Usage: analyze <url>");
            return;
        }
        String url = args[1];
        logger.info("Analyzing URL: {}", url);
        CrawlStrategy strategy = strategyFactory.getStrategy(url);
        logger.debug("Using strategy: {}", strategy.getClass().getSimpleName());
        try {
            view.printInfo("Analyzing: " + url);
            Document doc = Jsoup.connect(url).get();
            var articles = strategy.parse(url, doc);
            int count = articles.size();
            int totalTitleLength = 0;
            int totalContentLength = 0;
            for (var article : articles) {
                if (article.getTitle() != null) {
                    totalTitleLength += article.getTitle().length();
                }
                if (article.getContent() != null) {
                    totalContentLength += article.getContent().length();
                }
            }
            // Guard against division by zero when the page yielded no articles.
            double avgTitleLength = count > 0 ? (double) totalTitleLength / count : 0;
            double avgContentLength = count > 0 ? (double) totalContentLength / count : 0;
            String avgTitleText = String.format("%.2f", avgTitleLength);
            String avgContentText = String.format("%.2f", avgContentLength);
            // SLF4J only substitutes the literal "{}" placeholder; a "{:.2f}" token would be
            // printed verbatim and its argument dropped, so the values are formatted up front.
            logger.info("Analysis complete - Articles: {}, Avg Title Length: {}, Avg Content Length: {}",
                    count, avgTitleText, avgContentText);
            view.printSuccess("Analysis Results:");
            view.printInfo(" Total Articles: " + count);
            view.printInfo(" Average Title Length: " + avgTitleText);
            view.printInfo(" Average Content Length: " + avgContentText);
            view.printInfo(" Strategy Used: " + strategy.getClass().getSimpleName());
        } catch (Exception e) {
            // Command boundary: report any fetch or parse failure instead of propagating it.
            logger.error("Failed to analyze URL {}: {}", url, e.getMessage(), e);
            view.printError("Failed to analyze: " + e.getMessage());
        }
    }
}

8
w12/java-cli/src/main/java/com/example/datacollect/command/Command.java

@ -0,0 +1,8 @@
package com.example.datacollect.command;
import com.example.datacollect.repository.ArticleRepository;
/**
 * A single CLI command that CrawlerController can register and dispatch to.
 */
public interface Command {
/**
 * Returns the keyword that selects this command. CrawlerController uses it as the
 * registry key and looks commands up by the lower-cased first token of the input line.
 */
String getName();
/**
 * Runs the command.
 *
 * @param args       the input line split on whitespace; args[0] is the command keyword
 * @param repository the article store shared by all commands
 */
void execute(String[] args, ArticleRepository repository);
}

88
w12/java-cli/src/main/java/com/example/datacollect/command/CrawlCommand.java

@ -0,0 +1,88 @@
package com.example.datacollect.command;
import com.example.datacollect.exception.ParseException;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.strategy.CrawlStrategy;
import com.example.datacollect.strategy.StrategyFactory;
import com.example.datacollect.view.ConsoleView;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
/**
 * Command bound to {@code crawl}: fetches a page, parses it with the strategy selected for the
 * URL and stores the resulting articles in the repository.
 *
 * <p>Network failures ({@link IOException}) are retried up to {@link #MAX_RETRIES} attempts with
 * a fixed pause of {@link #RETRY_DELAY_MS} milliseconds in between; any other failure aborts
 * immediately.
 */
public class CrawlCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class);
    private static final int MAX_RETRIES = 3;
    private static final long RETRY_DELAY_MS = 1000;
    private final ConsoleView view;
    private final StrategyFactory strategyFactory;

    /**
     * @param view            console used for all user-facing output
     * @param strategyFactory source of the parsing strategy for a given URL
     */
    public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) {
        this.view = view;
        this.strategyFactory = strategyFactory;
    }

    @Override
    public String getName() {
        return "crawl";
    }

    /**
     * Crawls the page at {@code args[1]} and adds the parsed articles to {@code repository}.
     *
     * @param args       tokenized command line; {@code args[0]} is the command name and
     *                   {@code args[1]} the URL to crawl
     * @param repository destination for the parsed articles
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        if (args.length < 2) {
            logger.warn("Crawl command called without URL argument");
            view.printError("Usage: crawl <url>");
            return;
        }
        String url = args[1];
        logger.info("Starting crawl for URL: {}", url);
        CrawlStrategy strategy = strategyFactory.getStrategy(url);
        logger.debug("Using strategy: {}", strategy.getClass().getSimpleName());
        int retryCount = 0;
        boolean success = false;
        while (retryCount < MAX_RETRIES && !success) {
            try {
                view.printInfo("Crawling: " + url + (retryCount > 0 ? " (attempt " + (retryCount + 1) + ")" : ""));
                logger.debug("Attempt {} to fetch URL: {}", retryCount + 1, url);
                Document doc = Jsoup.connect(url).get();
                var articles = strategy.parse(url, doc);
                repository.addAll(articles);
                logger.info("Successfully crawled {} articles from {}", articles.size(), url);
                view.printSuccess("Crawled " + articles.size() + " articles.");
                success = true;
            } catch (IOException e) {
                retryCount++;
                logger.error("Network error on attempt {} for URL {}: {}", retryCount, url, e.getMessage());
                if (retryCount < MAX_RETRIES) {
                    view.printWarning("Network error: " + e.getMessage() + ", retrying...");
                    if (!sleep(RETRY_DELAY_MS)) {
                        // The interrupt flag is set again by sleep(); further attempts would run
                        // without any back-off, so stop here.
                        logger.warn("Crawl of URL {} aborted: interrupted while waiting to retry", url);
                        view.printError("Crawl interrupted while waiting to retry.");
                        break;
                    }
                } else {
                    logger.error("Failed to crawl URL {} after {} attempts", url, MAX_RETRIES, e);
                    view.printError("Failed to crawl after " + MAX_RETRIES + " attempts: " + e.getMessage());
                }
            } catch (ParseException e) {
                // Retrying cannot fix a page the strategy does not understand.
                logger.error("Parse error for URL {}: {}", url, e.getMessage(), e);
                view.printError("Parse error: " + e.getMessage());
                break;
            } catch (Exception e) {
                // Command boundary: e.g. an invalid URL or a batch rejected by the repository.
                logger.error("Unexpected error for URL {}: {}", url, e.getMessage(), e);
                view.printError("Unexpected error: " + e.getMessage());
                break;
            }
        }
    }

    /**
     * Pauses the current thread.
     *
     * @param millis pause length in milliseconds
     * @return {@code true} if the full pause elapsed, {@code false} if the thread was interrupted
     *         (the interrupt flag is restored in that case)
     */
    private boolean sleep(long millis) {
        try {
            Thread.sleep(millis);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            logger.warn("Sleep interrupted");
            return false;
        }
    }
}

28
w12/java-cli/src/main/java/com/example/datacollect/command/ExitCommand.java

@ -0,0 +1,28 @@
package com.example.datacollect.command;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.view.ConsoleView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Command bound to {@code exit}: prints a farewell and terminates the JVM with status 0.
 */
public class ExitCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(ExitCommand.class);
    private static final String COMMAND_NAME = "exit";
    private static final int STATUS_OK = 0;

    private final ConsoleView view;

    /**
     * @param view console used to print the farewell message
     */
    public ExitCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return COMMAND_NAME;
    }

    /**
     * Logs the shutdown, says goodbye and exits; this method does not return.
     *
     * @param args       ignored
     * @param repository ignored
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        logger.info("Exiting application");
        view.printSuccess("Bye!");
        System.exit(STATUS_OK);
    }
}

27
w12/java-cli/src/main/java/com/example/datacollect/command/HelpCommand.java

@ -0,0 +1,27 @@
package com.example.datacollect.command;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.view.ConsoleView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Command bound to {@code help}: prints a one-line summary of the available commands.
 */
public class HelpCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(HelpCommand.class);
    private static final String COMMAND_NAME = "help";
    private static final String HELP_TEXT =
            "Commands: crawl <url>, analyze <url>, list, export [--format json], import <filename>, help, exit";

    private final ConsoleView view;

    /**
     * @param view console the help text is printed to
     */
    public HelpCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return COMMAND_NAME;
    }

    /**
     * Prints the command summary.
     *
     * @param args       ignored
     * @param repository ignored
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        logger.debug("Displaying help information");
        view.printInfo(HELP_TEXT);
    }
}

84
w12/java-cli/src/main/java/com/example/datacollect/command/JsonExporterCommand.java

@ -0,0 +1,84 @@
package com.example.datacollect.command;
import com.example.datacollect.model.Article;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.view.ConsoleView;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Command bound to {@code export}: writes every article in the repository to a JSON file.
 *
 * <p>The output document has three keys: {@code articles}, {@code count} and {@code exportedAt}.
 * The file is always encoded as UTF-8 so that it can be read back by the importer, which reads
 * UTF-8, regardless of the platform default charset.
 */
public class JsonExporterCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(JsonExporterCommand.class);
    private static final String DEFAULT_FILENAME = "articles.json";
    private final ConsoleView view;

    /**
     * @param view console used for all user-facing output
     */
    public JsonExporterCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return "export";
    }

    /**
     * Exports the repository contents.
     *
     * <p>Recognized arguments after the command name: {@code --format <name>} (only
     * {@code json} is accepted) and a bare token that is taken as the output filename; when
     * several bare tokens are given the last one wins. Other tokens starting with {@code -}
     * are ignored.
     *
     * @param args       tokenized command line; {@code args[0]} is the command name
     * @param repository source of the articles to export
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        String filename = DEFAULT_FILENAME;
        String format = null;
        for (int i = 1; i < args.length; i++) {
            if (args[i].equals("--format") && i + 1 < args.length) {
                format = args[i + 1];
                i++; // the format value has been consumed
            } else if (!args[i].startsWith("-")) {
                filename = args[i];
            }
        }
        if (format != null && !format.equals("json")) {
            logger.warn("Unsupported export format: {}", format);
            view.printError("Unsupported format: " + format + ". Only 'json' is supported.");
            return;
        }
        List<Article> articles = repository.getAll();
        if (articles.isEmpty()) {
            logger.warn("Attempted to export empty repository");
            view.printWarning("No articles to export.");
            return;
        }
        logger.info("Exporting {} articles to JSON file: {}", articles.size(), filename);
        ObjectMapper mapper = new ObjectMapper();
        mapper.registerModule(new JavaTimeModule());
        mapper.enable(SerializationFeature.INDENT_OUTPUT);
        // Emit java.time values as ISO-8601 strings rather than numeric arrays.
        mapper.disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS);
        Map<String, Object> exportData = new HashMap<>();
        exportData.put("articles", articles);
        exportData.put("count", articles.size());
        exportData.put("exportedAt", java.time.LocalDateTime.now().toString());
        Path path = Paths.get(filename);
        try {
            // The File overload lets Jackson open, encode (UTF-8) and close the file itself; a
            // FileWriter would use the platform default charset on Java 11.
            mapper.writeValue(path.toFile(), exportData);
            logger.info("Successfully exported articles to {}", path.toAbsolutePath());
            view.printSuccess("Exported " + articles.size() + " articles to " + filename);
        } catch (IOException e) {
            logger.error("Failed to export articles to {}: {}", filename, e.getMessage(), e);
            view.printError("Failed to export: " + e.getMessage());
        }
    }
}

118
w12/java-cli/src/main/java/com/example/datacollect/command/JsonImporterCommand.java

@ -0,0 +1,118 @@
package com.example.datacollect.command;
import com.example.datacollect.model.Article;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.view.ConsoleView;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * Command bound to {@code import}: loads articles from a JSON file into the repository.
 *
 * <p>The file is expected to hold an object with an {@code articles} array whose elements carry
 * {@code title}, {@code url}, optional {@code content} and optional {@code crawledAt}. Entries
 * are validated one by one, so a single malformed entry is skipped instead of aborting the
 * whole import. Entries whose URL is already present in the repository, or appeared earlier in
 * the same file, are skipped as duplicates.
 */
public class JsonImporterCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(JsonImporterCommand.class);
    private final ConsoleView view;

    /**
     * @param view console used for all user-facing output
     */
    public JsonImporterCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return "import";
    }

    /**
     * Imports the file named by {@code args[1]}.
     *
     * @param args       tokenized command line; {@code args[0]} is the command name and
     *                   {@code args[1]} the file to read
     * @param repository destination for the imported articles
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        if (args.length < 2) {
            logger.warn("Import command called without filename argument");
            view.printError("Usage: import <filename>");
            return;
        }
        String filename = args[1];
        Path path = Paths.get(filename);
        if (!Files.exists(path)) {
            logger.error("Import file does not exist: {}", filename);
            view.printError("File not found: " + filename);
            return;
        }
        ObjectMapper mapper = new ObjectMapper();
        mapper.registerModule(new JavaTimeModule());
        mapper.disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS);
        try {
            String content = Files.readString(path);
            Map<?, ?> data = mapper.readValue(content, Map.class);
            // A JSON "null" document deserializes to null; treat it like a missing array.
            Object rawArticles = data == null ? null : data.get("articles");
            if (!(rawArticles instanceof List) || ((List<?>) rawArticles).isEmpty()) {
                logger.warn("No articles found in import file");
                view.printWarning("No articles found in file.");
                return;
            }
            Set<String> existingUrls = new HashSet<>();
            for (Article article : repository.getAll()) {
                existingUrls.add(article.getUrl());
            }
            int importedCount = 0;
            int skippedCount = 0;
            for (Object entry : (List<?>) rawArticles) {
                if (!(entry instanceof Map)) {
                    logger.warn("Skipping article entry that is not a JSON object");
                    skippedCount++;
                    continue;
                }
                Map<?, ?> articleMap = (Map<?, ?>) entry;
                String title = asText(articleMap.get("title"));
                String url = asText(articleMap.get("url"));
                String contentStr = asText(articleMap.get("content"));
                // Blank values are rejected here because ArticleRepository.add throws on them,
                // which would otherwise abort the import halfway through.
                if (isBlank(title) || isBlank(url)) {
                    logger.warn("Skipping article with missing title or url");
                    skippedCount++;
                    continue;
                }
                if (existingUrls.contains(url)) {
                    logger.debug("Skipping duplicate article with url: {}", url);
                    skippedCount++;
                    continue;
                }
                repository.add(buildArticle(title, url, contentStr, articleMap.get("crawledAt")));
                existingUrls.add(url);
                importedCount++;
            }
            logger.info("Imported {} articles, skipped {} duplicates", importedCount, skippedCount);
            view.printSuccess("Imported " + importedCount + " articles, skipped " + skippedCount + " duplicates.");
        } catch (IOException e) {
            // Covers both unreadable files and malformed JSON (Jackson's parse errors are IOExceptions).
            logger.error("Failed to import articles from {}: {}", filename, e.getMessage(), e);
            view.printError("Failed to import: " + e.getMessage());
        } catch (Exception e) {
            logger.error("Error parsing import file {}: {}", filename, e.getMessage(), e);
            view.printError("Invalid JSON format: " + e.getMessage());
        }
    }

    /** Returns {@code value} when it is a string, otherwise {@code null}. */
    private static String asText(Object value) {
        return value instanceof String ? (String) value : null;
    }

    /** Returns whether {@code value} is {@code null} or contains only whitespace. */
    private static boolean isBlank(String value) {
        return value == null || value.trim().isEmpty();
    }

    /**
     * Creates the article for one entry, keeping the stored crawl time when it is a parseable
     * ISO-8601 local date-time and falling back to the current time otherwise.
     */
    private Article buildArticle(String title, String url, String content, Object rawCrawledAt) {
        if (rawCrawledAt instanceof String) {
            try {
                return new Article(title, url, content, java.time.LocalDateTime.parse((String) rawCrawledAt));
            } catch (java.time.format.DateTimeParseException e) {
                logger.warn("Ignoring unparseable crawledAt '{}' for url {}", rawCrawledAt, url);
            }
        }
        return new Article(title, url, content);
    }
}

27
w12/java-cli/src/main/java/com/example/datacollect/command/ListCommand.java

@ -0,0 +1,27 @@
package com.example.datacollect.command;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.view.ConsoleView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Command bound to {@code list}: shows every article currently held in the repository.
 */
public class ListCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(ListCommand.class);
    private static final String COMMAND_NAME = "list";

    private final ConsoleView view;

    /**
     * @param view console the article list is rendered on
     */
    public ListCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return COMMAND_NAME;
    }

    /**
     * Hands the repository contents to the view for display.
     *
     * @param args       ignored
     * @param repository source of the articles to show
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        int total = repository.size();
        logger.debug("Listing {} articles", total);
        var stored = repository.getAll();
        view.display(stored);
    }
}

61
w12/java-cli/src/main/java/com/example/datacollect/controller/CrawlerController.java

@ -0,0 +1,61 @@
package com.example.datacollect.controller;
import com.example.datacollect.command.AnalyzeCommand;
import com.example.datacollect.command.Command;
import com.example.datacollect.command.CrawlCommand;
import com.example.datacollect.command.ExitCommand;
import com.example.datacollect.command.HelpCommand;
import com.example.datacollect.command.JsonExporterCommand;
import com.example.datacollect.command.JsonImporterCommand;
import com.example.datacollect.command.ListCommand;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.strategy.StrategyFactory;
import com.example.datacollect.view.ConsoleView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap;
import java.util.Map;
/**
 * Dispatches console input lines to the registered {@link Command}s.
 *
 * <p>Commands are keyed by {@link Command#getName()}; the first whitespace-separated token of an
 * input line selects the command, case-insensitively.
 */
public class CrawlerController {
    private static final Logger logger = LoggerFactory.getLogger(CrawlerController.class);
    private final Map<String, Command> commands = new HashMap<>();
    private final ConsoleView view;
    private final ArticleRepository repository;

    /**
     * Creates the controller and registers the built-in commands.
     *
     * @param view            console used for error output and handed to every command
     * @param repository      article store passed to each executed command
     * @param strategyFactory strategy source for the crawl and analyze commands
     */
    public CrawlerController(ConsoleView view, ArticleRepository repository, StrategyFactory strategyFactory) {
        this.view = view;
        this.repository = repository;
        register(new HelpCommand(view));
        register(new ListCommand(view));
        register(new CrawlCommand(view, strategyFactory));
        register(new AnalyzeCommand(view, strategyFactory));
        register(new ExitCommand(view));
        register(new JsonExporterCommand(view));
        register(new JsonImporterCommand(view));
        logger.info("CrawlerController initialized with {} commands", commands.size());
    }

    /** Adds {@code command} to the registry under its own name, replacing any previous entry. */
    private void register(Command command) {
        commands.put(command.getName(), command);
        logger.debug("Registered command: {}", command.getName());
    }

    /**
     * Parses one input line and runs the matching command.
     *
     * <p>{@code null}, empty and whitespace-only input is ignored; an unknown command name is
     * reported through the view.
     *
     * @param input raw line as typed by the user, may be {@code null}
     */
    public void handle(String input) {
        String text = input == null ? "" : input.trim();
        if (text.isEmpty()) {
            return;
        }
        String[] args = text.split("\\s+");
        // Locale.ROOT keeps the lookup independent of the default locale; under a Turkish
        // locale, for instance, "LIST".toLowerCase() does not yield "list".
        String cmdName = args[0].toLowerCase(java.util.Locale.ROOT);
        Command command = commands.get(cmdName);
        if (command == null) {
            logger.warn("Unknown command: {}", cmdName);
            view.printError("Unknown command: " + cmdName);
            return;
        }
        logger.info("Executing command: {}", cmdName);
        command.execute(args, repository);
    }
}

11
w12/java-cli/src/main/java/com/example/datacollect/exception/CrawlerException.java

@ -0,0 +1,11 @@
package com.example.datacollect.exception;
/**
 * Checked base exception of this package; NetworkException and ParseException extend it.
 */
public class CrawlerException extends Exception {
/**
 * @param message description of the failure
 */
public CrawlerException(String message) {
super(message);
}
/**
 * @param message description of the failure
 * @param cause   underlying exception, kept as the cause
 */
public CrawlerException(String message, Throwable cause) {
super(message, cause);
}
}

11
w12/java-cli/src/main/java/com/example/datacollect/exception/NetworkException.java

@ -0,0 +1,11 @@
package com.example.datacollect.exception;
/**
 * CrawlerException subtype for network-related failures (per its name; no visible code
 * in this change set throws it).
 */
public class NetworkException extends CrawlerException {
/**
 * @param message description of the failure
 */
public NetworkException(String message) {
super(message);
}
/**
 * @param message description of the failure
 * @param cause   underlying exception, kept as the cause
 */
public NetworkException(String message, Throwable cause) {
super(message, cause);
}
}

11
w12/java-cli/src/main/java/com/example/datacollect/exception/ParseException.java

@ -0,0 +1,11 @@
package com.example.datacollect.exception;
/**
 * CrawlerException subtype declared by CrawlStrategy.parse; CrawlCommand reports it as a
 * parse error and does not retry.
 */
public class ParseException extends CrawlerException {
/**
 * @param message description of the failure
 */
public ParseException(String message) {
super(message);
}
/**
 * @param message description of the failure
 * @param cause   underlying exception, kept as the cause
 */
public ParseException(String message, Throwable cause) {
super(message, cause);
}
}

65
w12/java-cli/src/main/java/com/example/datacollect/model/Article.java

@ -0,0 +1,65 @@
package com.example.datacollect.model;
import java.time.LocalDateTime;
public class Article {
private String title;
private String url;
private String content;
private LocalDateTime crawledAt;
public Article(String title, String url, String content) {
this.title = title;
this.url = url;
this.content = content;
this.crawledAt = LocalDateTime.now();
}
public Article(String title, String url, String content, LocalDateTime crawledAt) {
this.title = title;
this.url = url;
this.content = content;
this.crawledAt = crawledAt;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}
public LocalDateTime getCrawledAt() {
return crawledAt;
}
public void setCrawledAt(LocalDateTime crawledAt) {
this.crawledAt = crawledAt;
}
@Override
public String toString() {
return "Article{"
+ "title='" + title + '\''
+ ", url='" + url + '\''
+ ", crawledAt=" + crawledAt
+ '}';
}
}

76
w12/java-cli/src/main/java/com/example/datacollect/repository/ArticleRepository.java

@ -0,0 +1,76 @@
package com.example.datacollect.repository;
import com.example.datacollect.model.Article;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * In-memory store of crawled {@link Article}s, kept in insertion order.
 *
 * <p>Every stored article is guaranteed to be non-null with a non-blank title and URL; both
 * {@link #add} and {@link #addAll} reject anything else with an
 * {@link IllegalArgumentException}.
 */
public class ArticleRepository {
    private static final Logger logger = LoggerFactory.getLogger(ArticleRepository.class);
    private final List<Article> articles = new ArrayList<>();

    /**
     * Stores a single article.
     *
     * @param article the article to store
     * @throws IllegalArgumentException if {@code article} is {@code null} or its title or URL
     *                                  is {@code null} or blank
     */
    public void add(Article article) {
        if (article == null) {
            logger.error("Attempted to add null article");
            throw new IllegalArgumentException("Article cannot be null");
        }
        if (isBlank(article.getTitle())) {
            logger.warn("Attempted to add article with empty title");
            throw new IllegalArgumentException("Article title cannot be null or empty");
        }
        if (isBlank(article.getUrl())) {
            logger.warn("Attempted to add article with empty URL");
            throw new IllegalArgumentException("Article URL cannot be null or empty");
        }
        articles.add(article);
        logger.debug("Added article: {}", article.getTitle());
    }

    /**
     * Stores a batch of articles, all or nothing: the whole list is validated first and
     * nothing is stored if any element is invalid. An empty list is a no-op.
     *
     * @param articleList the articles to store
     * @throws IllegalArgumentException if the list is {@code null}, or any element is
     *                                  {@code null} or has a {@code null} or blank title or URL
     */
    public void addAll(List<Article> articleList) {
        if (articleList == null) {
            logger.error("Attempted to add null article list");
            throw new IllegalArgumentException("Article list cannot be null");
        }
        if (articleList.isEmpty()) {
            logger.debug("Attempted to add empty article list");
            return;
        }
        // Validate everything before touching the store so a bad element cannot leave a
        // half-applied batch behind.
        for (int i = 0; i < articleList.size(); i++) {
            Article article = articleList.get(i);
            if (article == null) {
                logger.warn("Rejecting batch: null article at index {}", i);
                throw new IllegalArgumentException("Article in list cannot be null at index " + i);
            }
            if (isBlank(article.getTitle())) {
                logger.warn("Rejecting batch: article with empty title at index {}", i);
                throw new IllegalArgumentException("Article title cannot be null or empty at index " + i);
            }
            if (isBlank(article.getUrl())) {
                logger.warn("Rejecting batch: article with empty URL at index {}", i);
                throw new IllegalArgumentException("Article URL cannot be null or empty at index " + i);
            }
        }
        articles.addAll(articleList);
        logger.info("Added {} articles to repository", articleList.size());
    }

    /**
     * Returns a read-only view of the stored articles; later additions show through it.
     *
     * @return unmodifiable view of all articles in insertion order
     */
    public List<Article> getAll() {
        logger.debug("Retrieving all articles, count: {}", articles.size());
        return Collections.unmodifiableList(articles);
    }

    /** @return the number of stored articles */
    public int size() {
        return articles.size();
    }

    /** Removes every stored article. */
    public void clear() {
        int size = articles.size();
        articles.clear();
        logger.info("Cleared repository, removed {} articles", size);
    }

    /** Returns whether {@code value} is {@code null} or contains only whitespace. */
    private static boolean isBlank(String value) {
        return value == null || value.trim().isEmpty();
    }
}

28
w12/java-cli/src/main/java/com/example/datacollect/strategy/BlogStrategy.java

@ -0,0 +1,28 @@
package com.example.datacollect.strategy;
import com.example.datacollect.exception.ParseException;
import com.example.datacollect.model.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Strategy for URLs containing {@code blog.example.com}: every element matching
 * {@code .post-title} becomes one article whose URL is the page URL and whose content is empty.
 */
public class BlogStrategy extends PriorityStrategy {
    private static final int PRIORITY = 100;
    private static final String URL_PATTERN = ".*blog\\.example\\.com.*";

    public BlogStrategy() {
        super(PRIORITY, URL_PATTERN);
    }

    /**
     * Extracts the post titles from {@code doc}.
     *
     * @param url URL the document was fetched from; used as the URL of every article
     * @param doc parsed page
     * @return one article per non-blank {@code .post-title} element, in document order
     * @throws ParseException declared by the interface; not thrown by this implementation
     */
    @Override
    public List<Article> parse(String url, Document doc) throws ParseException {
        List<Article> articles = new ArrayList<>();
        Elements titles = doc.select(".post-title");
        for (Element e : titles) {
            String title = e.text().trim();
            if (title.isEmpty()) {
                // ArticleRepository.addAll rejects the whole batch on a blank title, so one
                // empty heading would otherwise discard every article from the page.
                continue;
            }
            articles.add(new Article(title, url, ""));
        }
        return articles;
    }
}

11
w12/java-cli/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java

@ -0,0 +1,11 @@
package com.example.datacollect.strategy;
import com.example.datacollect.exception.ParseException;
import com.example.datacollect.model.Article;
import org.jsoup.nodes.Document;
import java.util.List;
/**
 * Turns a fetched page into a list of articles. StrategyFactory selects an implementation
 * per URL by asking supports(url).
 */
public interface CrawlStrategy {
/**
 * Extracts articles from a page.
 *
 * @param url URL the document was fetched from
 * @param doc parsed page
 * @return the articles found on the page
 * @throws ParseException declared for implementations that cannot interpret the page
 */
List<Article> parse(String url, Document doc) throws ParseException;
/**
 * @param url URL about to be crawled
 * @return whether this strategy handles the given URL
 */
boolean supports(String url);
}

38
w12/java-cli/src/main/java/com/example/datacollect/strategy/DefaultStrategy.java

@ -0,0 +1,38 @@
package com.example.datacollect.strategy;
import com.example.datacollect.exception.ParseException;
import com.example.datacollect.model.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Catch-all strategy: accepts every URL and treats links with a reasonably long label as
 * articles.
 */
public class DefaultStrategy implements CrawlStrategy {
    private static final int MAX_ARTICLES = 20;
    private static final int SHORTEST_IGNORED_LABEL = 5;

    /** Always {@code true}; this strategy is the fallback for any URL. */
    @Override
    public boolean supports(String url) {
        return true;
    }

    /**
     * Collects up to 20 links whose trimmed text is longer than 5 characters.
     *
     * @param url URL the document was fetched from; used when a link has no absolute target
     * @param doc parsed page
     * @return the collected articles in document order, each with empty content
     * @throws ParseException declared by the interface; not thrown by this implementation
     */
    @Override
    public List<Article> parse(String url, Document doc) throws ParseException {
        List<Article> collected = new ArrayList<>();
        Elements anchors = doc.select("a[href]");
        for (Element anchor : anchors) {
            String label = anchor.text().trim();
            String target = anchor.attr("abs:href");
            if (label.length() <= SHORTEST_IGNORED_LABEL) {
                continue;
            }
            String articleUrl = target.isEmpty() ? url : target;
            collected.add(new Article(label, articleUrl, ""));
            // The size only grows on an add, so checking here is enough to cap the result.
            if (collected.size() >= MAX_ARTICLES) {
                break;
            }
        }
        return collected;
    }
}

52
w12/java-cli/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java

@ -0,0 +1,52 @@
package com.example.datacollect.strategy;
import com.example.datacollect.exception.ParseException;
import com.example.datacollect.model.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Strategy for URLs containing {@code news.hnu.edu.cn}: reads the {@code ul.list11 li} news
 * list, taking the title from {@code h4.l2.h4s2} and the summary from {@code p.l3.ps3} inside
 * each item's first link.
 */
public class HnuNewsStrategy extends PriorityStrategy {
    private static final int PRIORITY = 200;
    private static final String URL_PATTERN = ".*news\\.hnu\\.edu\\.cn.*";
    private static final String SITE_ROOT = "https://news.hnu.edu.cn";

    public HnuNewsStrategy() {
        super(PRIORITY, URL_PATTERN);
    }

    /**
     * Extracts the news list entries from {@code doc}.
     *
     * @param url URL the document was fetched from
     * @param doc parsed page
     * @return one article per list item that has a link and a non-empty title
     * @throws ParseException declared by the interface; not thrown by this implementation
     */
    @Override
    public List<Article> parse(String url, Document doc) throws ParseException {
        List<Article> articles = new ArrayList<>();
        Elements listItems = doc.select("ul.list11 li");
        for (Element li : listItems) {
            Element link = li.selectFirst("a");
            if (link == null) {
                continue;
            }
            // Let jsoup resolve the href against the document's base URI; this handles
            // "../x", "../../x" and "x" alike, which plain concatenation does not.
            String articleUrl = link.absUrl("href");
            if (articleUrl.isEmpty()) {
                // absUrl yields "" when no absolute URL can be built (e.g. no base URI).
                articleUrl = resolveAgainstSiteRoot(link.attr("href"));
            }
            String title = "";
            Element titleEl = link.selectFirst("h4.l2.h4s2");
            if (titleEl != null) {
                title = titleEl.text().trim();
            }
            String content = "";
            Element contentEl = link.selectFirst("p.l3.ps3");
            if (contentEl != null) {
                content = contentEl.text().trim();
            }
            if (!title.isEmpty()) {
                articles.add(new Article(title, articleUrl, content));
            }
        }
        return articles;
    }

    /**
     * Fallback resolution: drops {@code ..} segments and prefixes the site root, inserting the
     * separating slash when the path lacks one.
     */
    private static String resolveAgainstSiteRoot(String href) {
        if (href.startsWith("http")) {
            return href;
        }
        String path = href.replace("..", "");
        if (path.isEmpty() || path.startsWith("/")) {
            return SITE_ROOT + path;
        }
        return SITE_ROOT + "/" + path;
    }
}

28
w12/java-cli/src/main/java/com/example/datacollect/strategy/NewsStrategy.java

@ -0,0 +1,28 @@
package com.example.datacollect.strategy;
import com.example.datacollect.exception.ParseException;
import com.example.datacollect.model.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Strategy for URLs containing {@code news.example.com}: every element matching
 * {@code .article-headline} becomes one article whose URL is the page URL and whose content is
 * empty.
 */
public class NewsStrategy extends PriorityStrategy {
    private static final int PRIORITY = 100;
    private static final String URL_PATTERN = ".*news\\.example\\.com.*";

    public NewsStrategy() {
        super(PRIORITY, URL_PATTERN);
    }

    /**
     * Extracts the headlines from {@code doc}.
     *
     * @param url URL the document was fetched from; used as the URL of every article
     * @param doc parsed page
     * @return one article per non-blank {@code .article-headline} element, in document order
     * @throws ParseException declared by the interface; not thrown by this implementation
     */
    @Override
    public List<Article> parse(String url, Document doc) throws ParseException {
        List<Article> articles = new ArrayList<>();
        Elements items = doc.select(".article-headline");
        for (Element e : items) {
            String title = e.text().trim();
            if (title.isEmpty()) {
                // ArticleRepository.addAll rejects the whole batch on a blank title, so one
                // empty headline would otherwise discard every article from the page.
                continue;
            }
            articles.add(new Article(title, url, ""));
        }
        return articles;
    }
}

27
w12/java-cli/src/main/java/com/example/datacollect/strategy/PriorityStrategy.java

@ -0,0 +1,27 @@
package com.example.datacollect.strategy;
import java.util.regex.Pattern;
/**
 * Base class for strategies that are selected by a URL regular expression and ranked by a
 * numeric priority. The natural ordering puts higher priorities first.
 */
public abstract class PriorityStrategy implements CrawlStrategy, Comparable<PriorityStrategy> {
    private final Pattern urlPattern;
    private final int priority;

    /**
     * @param priority     rank of this strategy; larger values sort earlier
     * @param regexPattern regular expression a URL must match in full to be supported
     */
    public PriorityStrategy(int priority, String regexPattern) {
        this.urlPattern = Pattern.compile(regexPattern);
        this.priority = priority;
    }

    /** @return the rank given at construction */
    public int getPriority() {
        return this.priority;
    }

    /**
     * Orders strategies by descending priority.
     */
    @Override
    public int compareTo(PriorityStrategy other) {
        int theirs = other.priority;
        int mine = this.priority;
        return Integer.compare(theirs, mine);
    }

    /**
     * @param url URL to test; must not be {@code null}
     * @return whether the entire URL matches this strategy's pattern
     */
    @Override
    public boolean supports(String url) {
        var matcher = urlPattern.matcher(url);
        return matcher.matches();
    }
}

49
w12/java-cli/src/main/java/com/example/datacollect/strategy/StrategyFactory.java

@ -0,0 +1,49 @@
package com.example.datacollect.strategy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Chooses the {@link CrawlStrategy} responsible for a URL.
 *
 * <p>The factory keeps its {@link PriorityStrategy} instances sorted by their natural
 * ordering and returns the first one that supports the URL. When none matches, or the
 * URL is missing or blank, a {@link DefaultStrategy} is returned instead.
 */
public class StrategyFactory {
    private static final Logger logger = LoggerFactory.getLogger(StrategyFactory.class);

    private final List<PriorityStrategy> strategies = new ArrayList<>();
    private final CrawlStrategy defaultStrategy;

    /** Creates a factory pre-populated with the built-in strategies. */
    public StrategyFactory() {
        strategies.add(new HnuNewsStrategy());
        strategies.add(new BlogStrategy());
        strategies.add(new NewsStrategy());
        Collections.sort(strategies);
        this.defaultStrategy = new DefaultStrategy();
        logger.info("StrategyFactory initialized with {} strategies", strategies.size());
    }

    /**
     * Finds the strategy to use for the given URL.
     *
     * @param url the URL to crawl; may be {@code null} or blank
     * @return the first registered strategy (in sorted order) that supports the URL, or the
     *     default strategy when the URL is {@code null}, blank, or unsupported
     */
    public CrawlStrategy getStrategy(String url) {
        if (url == null || url.trim().isEmpty()) {
            logger.debug("Empty URL provided, using default strategy");
            return defaultStrategy;
        }
        for (PriorityStrategy s : strategies) {
            if (s.supports(url)) {
                logger.debug("URL {} matched strategy: {}", url, s.getClass().getSimpleName());
                return s;
            }
        }
        logger.debug("URL {} did not match any specific strategy, using default", url);
        return defaultStrategy;
    }

    /**
     * Adds a strategy and re-sorts the registered strategies.
     *
     * @param strategy the strategy to add; must not be {@code null}
     * @throws NullPointerException if {@code strategy} is {@code null}
     */
    public void register(PriorityStrategy strategy) {
        // Validate before mutating: a null element would make the sort fail only after it
        // was already stored, leaving the list broken for every later lookup.
        if (strategy == null) {
            throw new NullPointerException("strategy must not be null");
        }
        strategies.add(strategy);
        Collections.sort(strategies);
        logger.info("Registered new strategy: {}", strategy.getClass().getSimpleName());
    }

    /**
     * Returns the fallback strategy.
     *
     * @return the strategy used when no registered strategy supports a URL
     */
    public CrawlStrategy getDefaultStrategy() {
        return defaultStrategy;
    }
}

47
w12/java-cli/src/main/java/com/example/datacollect/view/ConsoleView.java

@ -0,0 +1,47 @@
package com.example.datacollect.view;
import com.example.datacollect.model.Article;
import java.util.List;
import java.util.Scanner;
/**
 * Console front end of the application: reads lines from standard input and writes
 * ANSI-coloured messages and article listings to standard output.
 */
public class ConsoleView {
    private static final String ANSI_RESET = "\u001B[0m";
    private static final String ANSI_GREEN = "\u001B[32m";
    private static final String ANSI_RED = "\u001B[31m";
    private static final String ANSI_BLUE = "\u001B[34m";
    private static final String ANSI_YELLOW = "\u001B[33m";

    private final Scanner scanner = new Scanner(System.in);

    /**
     * Prints the prompt and waits for one line of input.
     *
     * @return the line entered by the user, without the line terminator
     */
    public String readLine() {
        System.out.print("> ");
        String input = scanner.nextLine();
        return input;
    }

    /**
     * Prints a message in green.
     *
     * @param msg the text to print
     */
    public void printSuccess(String msg) {
        printColored(ANSI_GREEN, msg);
    }

    /**
     * Prints a message in red.
     *
     * @param msg the text to print
     */
    public void printError(String msg) {
        printColored(ANSI_RED, msg);
    }

    /**
     * Prints a message in blue.
     *
     * @param msg the text to print
     */
    public void printInfo(String msg) {
        printColored(ANSI_BLUE, msg);
    }

    /**
     * Prints a message in yellow.
     *
     * @param msg the text to print
     */
    public void printWarning(String msg) {
        printColored(ANSI_YELLOW, msg);
    }

    /**
     * Prints a numbered list of articles, one per line, as {@code N. title | url}.
     * An informational hint is printed instead when the list is empty.
     *
     * @param articles the articles to show
     */
    public void display(List<Article> articles) {
        if (!articles.isEmpty()) {
            int position = 0;
            for (Article article : articles) {
                position++; // numbering shown to the user starts at 1
                String row = position + ". " + article.getTitle() + " | " + article.getUrl();
                System.out.println(row);
            }
        } else {
            printInfo("暂无文章,请先执行 crawl。");
        }
    }

    /** Writes one line wrapped in the given colour code followed by the reset code. */
    private void printColored(String color, String msg) {
        System.out.println(color + msg + ANSI_RESET);
    }
}

26
w12/java-cli/src/main/resources/logback.xml

@ -0,0 +1,26 @@
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
<!-- Logging setup: every event goes to the console and to a daily-rolled file under logs/. -->
<!-- Console appender. -->
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<!-- Layout: timestamp, thread, level padded to 5 characters, logger name abbreviated to about 36 characters, message, newline. -->
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
</encoder>
</appender>
<!-- File appender: the active file is logs/crawler.log (path is relative to the working directory). -->
<appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>logs/crawler.log</file>
<encoder>
<!-- Same layout as the console appender. -->
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
</encoder>
<!-- Time-based rollover: the yyyy-MM-dd token in the archive name yields one archive per day; maxHistory keeps at most 30 of them. -->
<rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
<fileNamePattern>logs/crawler.%d{yyyy-MM-dd}.log</fileNamePattern>
<maxHistory>30</maxHistory>
</rollingPolicy>
</appender>
<!-- Default threshold for all loggers is INFO; both appenders are attached at the root. -->
<root level="INFO">
<appender-ref ref="STDOUT" />
<appender-ref ref="FILE" />
</root>
<!-- Application packages log at DEBUG. No appender is declared here, so events reach the root appenders (logback's default additivity). -->
<logger name="com.example.datacollect" level="DEBUG" />
</configuration>

26
w12/java-cli/target/classes/logback.xml

@ -0,0 +1,26 @@
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
<!-- Logging setup: every event goes to the console and to a daily-rolled file under logs/. -->
<!-- Console appender. -->
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<!-- Layout: timestamp, thread, level padded to 5 characters, logger name abbreviated to about 36 characters, message, newline. -->
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
</encoder>
</appender>
<!-- File appender: the active file is logs/crawler.log (path is relative to the working directory). -->
<appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>logs/crawler.log</file>
<encoder>
<!-- Same layout as the console appender. -->
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
</encoder>
<!-- Time-based rollover: the yyyy-MM-dd token in the archive name yields one archive per day; maxHistory keeps at most 30 of them. -->
<rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
<fileNamePattern>logs/crawler.%d{yyyy-MM-dd}.log</fileNamePattern>
<maxHistory>30</maxHistory>
</rollingPolicy>
</appender>
<!-- Default threshold for all loggers is INFO; both appenders are attached at the root. -->
<root level="INFO">
<appender-ref ref="STDOUT" />
<appender-ref ref="FILE" />
</root>
<!-- Application packages log at DEBUG. No appender is declared here, so events reach the root appenders (logback's default additivity). -->
<logger name="com.example.datacollect" level="DEBUG" />
</configuration>

3
w12/java-cli/target/maven-archiver/pom.properties

@ -0,0 +1,3 @@
artifactId=datacollect-cli
groupId=com.example
version=0.1.0

21
w12/java-cli/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst

@ -0,0 +1,21 @@
com\example\datacollect\strategy\DefaultStrategy.class
com\example\datacollect\strategy\PriorityStrategy.class
com\example\datacollect\command\ListCommand.class
com\example\datacollect\command\CrawlCommand.class
com\example\datacollect\strategy\BlogStrategy.class
com\example\datacollect\repository\ArticleRepository.class
com\example\datacollect\Main.class
com\example\datacollect\view\ConsoleView.class
com\example\datacollect\command\ExitCommand.class
com\example\datacollect\command\HelpCommand.class
com\example\datacollect\strategy\NewsStrategy.class
com\example\datacollect\command\Command.class
com\example\datacollect\controller\CrawlerController.class
com\example\datacollect\exception\CrawlerException.class
com\example\datacollect\exception\NetworkException.class
com\example\datacollect\command\AnalyzeCommand.class
com\example\datacollect\strategy\StrategyFactory.class
com\example\datacollect\strategy\HnuNewsStrategy.class
com\example\datacollect\exception\ParseException.class
com\example\datacollect\strategy\CrawlStrategy.class
com\example\datacollect\model\Article.class

23
w12/java-cli/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst

@ -0,0 +1,23 @@
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\exception\ParseException.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\strategy\CrawlStrategy.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\command\JsonImporterCommand.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\command\CrawlCommand.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\strategy\BlogStrategy.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\command\JsonExporterCommand.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\command\AnalyzeCommand.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\strategy\HnuNewsStrategy.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\command\HelpCommand.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\command\ExitCommand.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\strategy\DefaultStrategy.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\strategy\NewsStrategy.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\strategy\StrategyFactory.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\Main.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\controller\CrawlerController.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\strategy\PriorityStrategy.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\exception\CrawlerException.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\exception\NetworkException.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\command\ListCommand.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\command\Command.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\model\Article.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\repository\ArticleRepository.java
D:\桌面\java-cli - 副本 - 副本\src\main\java\com\example\datacollect\view\ConsoleView.java
Loading…
Cancel
Save