Browse Source

上传project

main
86150 3 weeks ago
parent
commit
0ed7e9e639
  1. BIN
      project/202506050230-焦妍-期末实验报告.docx
  2. BIN
      project/java-cli/.DS_Store
  3. 4
      project/java-cli/.gitignore
  4. 17
      project/java-cli/README.md
  5. 69
      project/java-cli/pom.xml
  6. BIN
      project/java-cli/src/.DS_Store
  7. BIN
      project/java-cli/src/main/.DS_Store
  8. BIN
      project/java-cli/src/main/java/.DS_Store
  9. BIN
      project/java-cli/src/main/java/com/.DS_Store
  10. BIN
      project/java-cli/src/main/java/com/example/.DS_Store
  11. 31
      project/java-cli/src/main/java/com/example/datacollect/Main.java
  12. 130
      project/java-cli/src/main/java/com/example/datacollect/command/AnalyzeCommand.java
  13. 15
      project/java-cli/src/main/java/com/example/datacollect/command/Command.java
  14. 85
      project/java-cli/src/main/java/com/example/datacollect/command/CrawlCommand.java
  15. 55
      project/java-cli/src/main/java/com/example/datacollect/command/ExitCommand.java
  16. 61
      project/java-cli/src/main/java/com/example/datacollect/command/HelpCommand.java
  17. 80
      project/java-cli/src/main/java/com/example/datacollect/command/HistoryCommand.java
  18. 65
      project/java-cli/src/main/java/com/example/datacollect/command/ListCommand.java
  19. 85
      project/java-cli/src/main/java/com/example/datacollect/command/SaveCommand.java
  20. 60
      project/java-cli/src/main/java/com/example/datacollect/controller/CrawlerController.java
  21. 11
      project/java-cli/src/main/java/com/example/datacollect/exception/CrawlerException.java
  22. 11
      project/java-cli/src/main/java/com/example/datacollect/exception/NetworkException.java
  23. 11
      project/java-cli/src/main/java/com/example/datacollect/exception/ParseException.java
  24. 75
      project/java-cli/src/main/java/com/example/datacollect/model/Article.java
  25. 52
      project/java-cli/src/main/java/com/example/datacollect/repository/ArticleRepository.java
  26. 56
      project/java-cli/src/main/java/com/example/datacollect/service/ScraperService.java
  27. 93
      project/java-cli/src/main/java/com/example/datacollect/strategy/BlogStrategy.java
  28. 12
      project/java-cli/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java
  29. 118
      project/java-cli/src/main/java/com/example/datacollect/strategy/NewsStrategy.java
  30. 27
      project/java-cli/src/main/java/com/example/datacollect/strategy/StrategyFactory.java
  31. 105
      project/java-cli/src/main/java/com/example/datacollect/strategy/TechStrategy.java
  32. 53
      project/java-cli/src/main/java/com/example/datacollect/view/ConsoleView.java
  33. 22
      project/java-cli/src/main/resources/logback.xml
  34. BIN
      project/java-cli/target/classes/.DS_Store
  35. BIN
      project/java-cli/target/classes/com/.DS_Store
  36. BIN
      project/java-cli/target/classes/com/example/.DS_Store
  37. 22
      project/java-cli/target/classes/logback.xml
  38. 0
      project/java-cli/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst
  39. 22
      project/java-cli/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst

BIN
project/202506050230-焦妍-期末实验报告.docx

Binary file not shown.

BIN
project/java-cli/.DS_Store

Binary file not shown.

4
project/java-cli/.gitignore

@ -0,0 +1,4 @@
*.jar
*.jar
*.class
*.log

17
project/java-cli/README.md

@ -0,0 +1,17 @@
# DataCollect 教学项目 — 最小可运行版本
这是一个最小可用的 Java CLI 演示工程,目标:打印帮助信息以验证运行环境。
构建:
```bash
mvn -q package
```
运行(示例):
```bash
java -jar target/datacollect-cli-0.1.0-jar-with-dependencies.jar --help
```
项目结构(最小):
- `src/main/java/com/example/datacollect/Main.java` — CLI 入口,启动交互式命令循环(输入 `help` 查看可用命令)
- `pom.xml` — Maven 构建配置,生成可执行 jar

69
project/java-cli/pom.xml

@ -0,0 +1,69 @@
<!-- Maven build descriptor for the datacollect-cli command-line crawler. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.example</groupId>
<artifactId>datacollect-cli</artifactId>
<version>0.1.0</version>
<properties>
<!-- Compile for Java 11, both source level and bytecode target. -->
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
</properties>
<dependencies>
<!-- HTML fetching and parsing used by the crawl strategies. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>
<!-- Logging backend; the sources log through the SLF4J API. -->
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.4.14</version>
</dependency>
<!-- JSON serialization used by the save command. -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.15.2</version>
</dependency>
<!-- java.time support for Jackson (registered as JavaTimeModule by the save command). -->
<dependency>
<groupId>com.fasterxml.jackson.datatype</groupId>
<artifactId>jackson-datatype-jsr310</artifactId>
<version>2.15.2</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
</plugin>
<!-- Packages a jar-with-dependencies archive during the package phase, with Main as the manifest main class. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.3.0</version>
<configuration>
<archive>
<manifest>
<mainClass>com.example.datacollect.Main</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

BIN
project/java-cli/src/.DS_Store

Binary file not shown.

BIN
project/java-cli/src/main/.DS_Store

Binary file not shown.

BIN
project/java-cli/src/main/java/.DS_Store

Binary file not shown.

BIN
project/java-cli/src/main/java/com/.DS_Store

Binary file not shown.

BIN
project/java-cli/src/main/java/com/example/.DS_Store

Binary file not shown.

31
project/java-cli/src/main/java/com/example/datacollect/Main.java

@ -0,0 +1,31 @@
package com.example.datacollect;
import com.example.datacollect.controller.CrawlerController;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.view.ConsoleView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Entry point of the CLI crawler: wires the console view, the article repository and the
 * controller together, then runs the interactive read-and-dispatch loop.
 */
public class Main {
    private static final Logger logger = LoggerFactory.getLogger(Main.class);

    /**
     * Starts the application and feeds console input to the controller line by line.
     *
     * @param args command-line arguments; not read by this method
     */
    public static void main(String[] args) {
        logger.info("Starting CLI Crawler application");
        ConsoleView view = new ConsoleView();
        ArticleRepository repository = new ArticleRepository();
        CrawlerController controller = new CrawlerController(view, repository);
        view.printSuccess("Welcome to CLI Crawler (W11)! Type help for commands.");
        logger.info("Application started successfully");
        while (true) {
            try {
                String line = view.readLine();
                if (line == null) {
                    // The controller treats null as an empty command and returns at once, so a
                    // null line would otherwise make this loop spin forever.
                    // NOTE(review): assumes ConsoleView.readLine() reports end-of-input as
                    // null - confirm against ConsoleView.
                    logger.info("Input stream closed, shutting down");
                    break;
                }
                controller.handle(line);
            } catch (Exception e) {
                // Boundary catch: a failing command must not terminate the interactive session.
                logger.error("Error processing command", e);
                view.printError("Error: " + e.getMessage());
            }
        }
    }
}

130
project/java-cli/src/main/java/com/example/datacollect/command/AnalyzeCommand.java

@ -0,0 +1,130 @@
package com.example.datacollect.command;
import com.example.datacollect.model.Article;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.strategy.CrawlStrategy;
import com.example.datacollect.strategy.StrategyFactory;
import com.example.datacollect.view.ConsoleView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
/**
 * The {@code analyze} command: prints simple statistics either for a URL crawled on the spot
 * (the result is not stored) or for the articles already held in the repository.
 */
public class AnalyzeCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(AnalyzeCommand.class);
    private final ConsoleView view;

    /**
     * @param view console used for all user-facing output
     */
    public AnalyzeCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return "analyze";
    }

    /**
     * Analyzes {@code args[1]} as a URL when it is present, otherwise the repository contents.
     *
     * @param args       tokenized input; index 0 is the command name
     * @param repository stored articles, used only when no URL is given
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        logger.debug("Executing analyze command");
        if (args.length < 2) {
            analyzeRepository(repository);
            return;
        }
        analyzeUrl(args[1]);
    }

    /** Crawls {@code url} with the matching strategy and prints statistics without storing. */
    private void analyzeUrl(String url) {
        logger.info("Analyzing URL: {}", url);
        final CrawlStrategy chosen = StrategyFactory.getStrategy(url);
        if (chosen == null) {
            logger.error("No strategy found for URL: {}", url);
            view.printError("No strategy found for URL: " + url);
            return;
        }
        view.printInfo("Analyzing URL: " + url);
        view.printInfo("Using strategy: " + chosen.getClass().getSimpleName());
        printAnalysis(chosen.crawl(url));
        logger.info("Analysis completed for URL: {}", url);
        view.printInfo("Note: Analysis results are NOT stored.");
    }

    /** Prints statistics for everything currently in {@code repository}. */
    private void analyzeRepository(ArticleRepository repository) {
        final List<Article> stored = repository.getAll();
        if (stored.isEmpty()) {
            logger.info("No articles to analyze");
            view.printInfo("No articles to analyze. Use 'analyze <url>' to analyze a URL without storing.");
            return;
        }
        final int storedCount = stored.size();
        logger.info("Analyzing {} articles from repository", storedCount);
        view.printInfo("Analyzing " + storedCount + " articles from repository:");
        printAnalysis(stored);
    }

    /** Computes and prints counts and the average content length for {@code articles}. */
    private void printAnalysis(List<Article> articles) {
        if (articles.isEmpty()) {
            logger.info("No articles found for analysis");
            view.printInfo("No articles found.");
            return;
        }

        final int count = articles.size();
        int contentChars = 0;
        int withAuthor = 0;
        int withDate = 0;
        for (Article current : articles) {
            String body = current.getContent();
            if (body != null) {
                contentChars += body.length();
            }
            String writer = current.getAuthor();
            if (writer != null && !writer.isEmpty()) {
                withAuthor++;
            }
            if (current.getPublishDate() != null) {
                withDate++;
            }
        }

        double mean;
        if (count > 0) {
            mean = (double) contentChars / count;
        } else {
            mean = 0;
        }

        logger.info("Analysis results: {} articles, {} avg length", count, mean);
        view.printInfo("=== Analysis Results ===");
        view.printInfo("Total articles: " + count);
        view.printInfo("Total content length: " + contentChars);
        view.printInfo("Average content length: " + String.format("%.2f", mean));
        view.printInfo("Articles with author: " + withAuthor);
        view.printInfo("Articles with publish date: " + withDate);
    }

    // The remaining Command methods are unimplemented stubs; each one always throws.

    @Override
    public String getName1() {
        throw new UnsupportedOperationException("Unimplemented method 'getName1'");
    }

    @Override
    public void execute(String[] args, List<Article> articles) {
        throw new UnsupportedOperationException("Unimplemented method 'execute'");
    }

    @Override
    public void execute1(String[] args, List<Article> articles) {
        throw new UnsupportedOperationException("Unimplemented method 'execute1'");
    }

    @Override
    public void execute1(String[] args, ArticleRepository repository) {
        throw new UnsupportedOperationException("Unimplemented method 'execute1'");
    }
}

15
project/java-cli/src/main/java/com/example/datacollect/command/Command.java

@ -0,0 +1,15 @@
package com.example.datacollect.command;
import com.example.datacollect.model.Article;
import com.example.datacollect.repository.ArticleRepository;
import java.util.List;
public interface Command {
String getName1();
void execute(String[] args, List<Article> articles);
void execute(String[] args, ArticleRepository repository);
String getName();
void execute1(String[] args, List<Article> articles);
void execute1(String[] args, ArticleRepository repository);
}

85
project/java-cli/src/main/java/com/example/datacollect/command/CrawlCommand.java

@ -0,0 +1,85 @@
package com.example.datacollect.command;
import com.example.datacollect.model.Article;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.service.ScraperService;
import com.example.datacollect.strategy.CrawlStrategy;
import com.example.datacollect.strategy.StrategyFactory;
import com.example.datacollect.view.ConsoleView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
/**
 * The {@code crawl <url>} command: picks a strategy for the URL, scrapes it with retry and
 * stores the resulting articles in the repository.
 */
public class CrawlCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class);
    private final ConsoleView view;
    private final ScraperService scraperService;

    /**
     * @param view console used for all user-facing output
     */
    public CrawlCommand(ConsoleView view) {
        this.view = view;
        this.scraperService = new ScraperService();
    }

    @Override
    public String getName() {
        return "crawl";
    }

    /**
     * Crawls {@code args[1]} and adds the result to {@code repository}. Any exception raised
     * while scraping or storing is logged and reported to the user instead of propagating.
     *
     * @param args       tokenized input; index 1 must hold the URL
     * @param repository destination for the crawled articles
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        if (args.length < 2) {
            logger.warn("Missing URL argument");
            view.printError("Usage: crawl <url>");
            return;
        }

        final String target = args[1];
        logger.info("Crawl started for: {}", target);

        final CrawlStrategy chosen = StrategyFactory.getStrategy(target);
        if (chosen == null) {
            logger.error("No strategy found for URL: {}", target);
            view.printError("No strategy found for URL: " + target);
            return;
        }

        final String strategyName = chosen.getClass().getSimpleName();
        logger.info("Using strategy: {}", strategyName);
        view.printInfo("Crawling " + target + " with strategy: " + strategyName);

        try {
            List<Article> fetched = scraperService.scrapeWithRetry(chosen, target);
            repository.addAll(fetched);
            int total = fetched.size();
            logger.info("Crawled {} articles successfully", total);
            view.printSuccess("Crawled " + total + " articles");
        } catch (Exception e) {
            logger.error("Error crawling URL: {}", target, e);
            view.printError("Error: " + e.getMessage());
        }
    }

    // The remaining Command methods are unimplemented stubs; each one always throws.

    @Override
    public String getName1() {
        throw new UnsupportedOperationException("Unimplemented method 'getName1'");
    }

    @Override
    public void execute(String[] args, List<Article> articles) {
        throw new UnsupportedOperationException("Unimplemented method 'execute'");
    }

    @Override
    public void execute1(String[] args, List<Article> articles) {
        throw new UnsupportedOperationException("Unimplemented method 'execute1'");
    }

    @Override
    public void execute1(String[] args, ArticleRepository repository) {
        throw new UnsupportedOperationException("Unimplemented method 'execute1'");
    }
}

55
project/java-cli/src/main/java/com/example/datacollect/command/ExitCommand.java

@ -0,0 +1,55 @@
package com.example.datacollect.command;
import com.example.datacollect.model.Article;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.view.ConsoleView;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * The {@code exit} command: says goodbye and terminates the JVM with status 0.
 */
public class ExitCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(ExitCommand.class);
    private final ConsoleView view;

    /**
     * @param view console used for the farewell message
     */
    public ExitCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return "exit";
    }

    /**
     * Prints the farewell message and exits the process; does not return.
     *
     * @param args       ignored
     * @param repository ignored
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        logger.info("User requested exit");
        view.printSuccess("Goodbye!");
        final int successStatus = 0;
        System.exit(successStatus);
    }

    // The remaining Command methods are unimplemented stubs; each one always throws.

    @Override
    public String getName1() {
        throw new UnsupportedOperationException("Unimplemented method 'getName1'");
    }

    @Override
    public void execute(String[] args, List<Article> articles) {
        throw new UnsupportedOperationException("Unimplemented method 'execute'");
    }

    @Override
    public void execute1(String[] args, List<Article> articles) {
        throw new UnsupportedOperationException("Unimplemented method 'execute1'");
    }

    @Override
    public void execute1(String[] args, ArticleRepository repository) {
        throw new UnsupportedOperationException("Unimplemented method 'execute1'");
    }
}

61
project/java-cli/src/main/java/com/example/datacollect/command/HelpCommand.java

@ -0,0 +1,61 @@
package com.example.datacollect.command;
import com.example.datacollect.model.Article;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.view.ConsoleView;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * The {@code help} command: prints the list of available commands.
 */
public class HelpCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(HelpCommand.class);

    /**
     * Help lines in display order. The {@code analyze} entry documents the optional URL
     * argument, which the analyze command accepts as its second token.
     */
    private static final String[] HELP_LINES = {
        "Commands:",
        " crawl <url> - 爬取指定 URL 的文章",
        " list - 列出已爬取的文章",
        " analyze [url] - 分析已存储文章的统计信息;指定 URL 时直接分析该 URL(结果不保存)",
        " history - 显示命令历史记录",
        " save [file] - 保存文章到 JSON 文件(默认 articles.json)",
        " help - 显示此帮助信息",
        " exit - 退出程序",
    };

    private final ConsoleView view;

    /**
     * @param view console used to print the help text
     */
    public HelpCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return "help";
    }

    /**
     * Prints every help line through the view.
     *
     * @param args       ignored
     * @param repository ignored
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        logger.debug("Executing help command");
        for (String line : HELP_LINES) {
            view.printInfo(line);
        }
    }

    // The remaining Command methods are unimplemented stubs; each one always throws.

    @Override
    public String getName1() {
        throw new UnsupportedOperationException("Unimplemented method 'getName1'");
    }

    @Override
    public void execute(String[] args, List<Article> articles) {
        throw new UnsupportedOperationException("Unimplemented method 'execute'");
    }

    @Override
    public void execute1(String[] args, List<Article> articles) {
        throw new UnsupportedOperationException("Unimplemented method 'execute1'");
    }

    @Override
    public void execute1(String[] args, ArticleRepository repository) {
        throw new UnsupportedOperationException("Unimplemented method 'execute1'");
    }
}

80
project/java-cli/src/main/java/com/example/datacollect/command/HistoryCommand.java

@ -0,0 +1,80 @@
package com.example.datacollect.command;
import com.example.datacollect.model.Article;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.view.ConsoleView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
/**
 * The {@code history} command: prints the input lines recorded so far. The history itself
 * is a static list shared by the whole process and filled via {@link #addCommand(String)}.
 */
public class HistoryCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(HistoryCommand.class);
    private static final List<String> commandHistory = new ArrayList<>();
    private final ConsoleView view;

    /**
     * @param view console used to print the history
     */
    public HistoryCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return "history";
    }

    /**
     * Prints the recorded commands as a 1-based numbered list, or a notice when none exist.
     *
     * @param args       ignored
     * @param repository ignored
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        logger.debug("Executing history command");
        if (commandHistory.isEmpty()) {
            logger.info("Command history is empty");
            view.printInfo("No command history.");
            return;
        }

        logger.info("Showing {} command history items", commandHistory.size());
        view.printInfo("Command History:");
        int position = 0;
        for (String entry : commandHistory) {
            position++;
            view.printInfo(position + ". " + entry);
        }
    }

    /**
     * Appends {@code command} to the shared history.
     *
     * @param command the raw input line to remember
     */
    public static void addCommand(String command) {
        commandHistory.add(command);
        logger.debug("Command added to history: {}", command);
    }

    /**
     * @return a new mutable copy of the history; changes to it do not affect the stored list
     */
    public static List<String> getCommandHistory() {
        List<String> snapshot = new ArrayList<>(commandHistory);
        return snapshot;
    }

    /** Removes every recorded command. */
    public static void clearHistory() {
        commandHistory.clear();
        logger.info("Command history cleared");
    }

    // The remaining Command methods are unimplemented stubs; each one always throws.

    @Override
    public String getName1() {
        throw new UnsupportedOperationException("Unimplemented method 'getName1'");
    }

    @Override
    public void execute(String[] args, List<Article> articles) {
        throw new UnsupportedOperationException("Unimplemented method 'execute'");
    }

    @Override
    public void execute1(String[] args, List<Article> articles) {
        throw new UnsupportedOperationException("Unimplemented method 'execute1'");
    }

    @Override
    public void execute1(String[] args, ArticleRepository repository) {
        throw new UnsupportedOperationException("Unimplemented method 'execute1'");
    }
}

65
project/java-cli/src/main/java/com/example/datacollect/command/ListCommand.java

@ -0,0 +1,65 @@
package com.example.datacollect.command;
import com.example.datacollect.model.Article;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.view.ConsoleView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
/**
 * The {@code list} command: prints every article currently stored in the repository.
 */
public class ListCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(ListCommand.class);
    private final ConsoleView view;

    /**
     * @param view console used to print the listing
     */
    public ListCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return "list";
    }

    /**
     * Prints a header with the article count followed by one line per article, or a hint
     * when the repository is empty.
     *
     * @param args       ignored
     * @param repository source of the articles to list
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        logger.debug("Executing list command");
        final List<Article> stored = repository.getAll();
        if (stored.isEmpty()) {
            logger.info("No articles found");
            view.printInfo("No articles yet. Use 'crawl <url>' to get started.");
            return;
        }

        final int total = stored.size();
        logger.info("Listing {} articles", total);
        view.printInfo("=== Articles (" + total + ") ===");
        stored.forEach(entry -> view.printInfo(entry.toString()));
    }

    // The remaining Command methods are unimplemented stubs; each one always throws.

    @Override
    public String getName1() {
        throw new UnsupportedOperationException("Unimplemented method 'getName1'");
    }

    @Override
    public void execute(String[] args, List<Article> articles) {
        throw new UnsupportedOperationException("Unimplemented method 'execute'");
    }

    @Override
    public void execute1(String[] args, List<Article> articles) {
        throw new UnsupportedOperationException("Unimplemented method 'execute1'");
    }

    @Override
    public void execute1(String[] args, ArticleRepository repository) {
        throw new UnsupportedOperationException("Unimplemented method 'execute1'");
    }
}

85
project/java-cli/src/main/java/com/example/datacollect/command/SaveCommand.java

@ -0,0 +1,85 @@
package com.example.datacollect.command;
import com.example.datacollect.model.Article;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.view.ConsoleView;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.util.List;
/**
 * The {@code save [file]} command: writes all stored articles to a JSON file using an
 * indenting Jackson mapper with the java.time module registered.
 */
public class SaveCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(SaveCommand.class);
    private final ConsoleView view;
    private final ObjectMapper objectMapper;

    /**
     * @param view console used for success and error messages
     */
    public SaveCommand(ConsoleView view) {
        this.view = view;
        this.objectMapper = new ObjectMapper()
                .registerModule(new JavaTimeModule())
                .enable(SerializationFeature.INDENT_OUTPUT);
    }

    @Override
    public String getName() {
        return "save";
    }

    /**
     * Saves the repository contents to {@code args[1]}, or to {@code articles.json} when no
     * file name is given. A {@code .json} suffix is appended if the name lacks one. Nothing
     * is written when the repository is empty.
     *
     * @param args       tokenized input; optional file name at index 1
     * @param repository source of the articles to write
     */
    @Override
    public void execute(String[] args, ArticleRepository repository) {
        logger.debug("Executing save command");

        String requested = "articles.json";
        if (args.length >= 2) {
            requested = args[1];
        }
        final String fileName = requested.endsWith(".json") ? requested : requested + ".json";

        final List<Article> stored = repository.getAll();
        if (stored.isEmpty()) {
            logger.warn("No articles to save");
            view.printError("No articles to save. Use 'crawl <url>' first.");
            return;
        }

        try {
            objectMapper.writeValue(new File(fileName), stored);
            logger.info("Successfully saved {} articles to {}", stored.size(), fileName);
            view.printSuccess("Saved " + stored.size() + " articles to " + fileName);
        } catch (IOException e) {
            logger.error("Failed to save articles to {}", fileName, e);
            view.printError("Error saving articles: " + e.getMessage());
        }
    }

    // The remaining Command methods are unimplemented stubs; each one always throws.

    @Override
    public String getName1() {
        throw new UnsupportedOperationException("Unimplemented method 'getName1'");
    }

    @Override
    public void execute(String[] args, List<Article> articles) {
        throw new UnsupportedOperationException("Unimplemented method 'execute'");
    }

    @Override
    public void execute1(String[] args, List<Article> articles) {
        throw new UnsupportedOperationException("Unimplemented method 'execute1'");
    }

    @Override
    public void execute1(String[] args, ArticleRepository repository) {
        throw new UnsupportedOperationException("Unimplemented method 'execute1'");
    }
}

60
project/java-cli/src/main/java/com/example/datacollect/controller/CrawlerController.java

@ -0,0 +1,60 @@
package com.example.datacollect.controller;
import com.example.datacollect.command.*;
import com.example.datacollect.repository.ArticleRepository;
import com.example.datacollect.view.ConsoleView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap;
import java.util.Map;
/**
 * Dispatches console input to the registered {@link Command} implementations.
 */
public class CrawlerController {
    private static final Logger logger = LoggerFactory.getLogger(CrawlerController.class);
    private final Map<String, Command> commands = new HashMap<>();
    private final ConsoleView view;
    private final ArticleRepository repository;

    /**
     * Creates the controller and registers the built-in commands.
     *
     * @param view       console handed to every command and used for error output
     * @param repository article store passed to each executed command
     */
    public CrawlerController(ConsoleView view, ArticleRepository repository) {
        this.view = view;
        this.repository = repository;
        Command[] builtIns = {
            new HelpCommand(view),
            new ListCommand(view),
            new CrawlCommand(view),
            new ExitCommand(view),
            new HistoryCommand(view),
            new AnalyzeCommand(view),
            new SaveCommand(view),
        };
        // Log the real number of commands instead of a hand-maintained constant.
        logger.info("Initializing CrawlerController with {} commands", builtIns.length);
        for (Command builtIn : builtIns) {
            register(builtIn);
        }
        logger.info("CrawlerController initialized successfully");
    }

    /** Makes {@code command} reachable under the name it reports. */
    private void register(Command command) {
        commands.put(command.getName(), command);
        logger.debug("Registered command: {}", command.getName());
    }

    /**
     * Parses one input line and runs the matching command. Blank or {@code null} input is
     * ignored. Every non-blank line is recorded in the command history, including unknown
     * commands.
     *
     * @param input raw line typed by the user; may be {@code null}
     */
    public void handle(String input) {
        String text = input == null ? "" : input.trim();
        if (text.isEmpty()) {
            return;
        }
        logger.debug("Handling input: {}", text);
        // Record the command in the history before dispatching.
        HistoryCommand.addCommand(text);
        String[] args = text.split("\\s+");
        // Locale.ROOT keeps the lookup independent of the default locale; with a Turkish
        // default locale "EXIT".toLowerCase() would not produce "exit".
        String cmdName = args[0].toLowerCase(java.util.Locale.ROOT);
        Command command = commands.get(cmdName);
        if (command == null) {
            logger.warn("Unknown command: {}", cmdName);
            view.printError("Unknown command: " + cmdName);
            return;
        }
        logger.info("Executing command: {}", cmdName);
        command.execute(args, repository);
    }
}

11
project/java-cli/src/main/java/com/example/datacollect/exception/CrawlerException.java

@ -0,0 +1,11 @@
package com.example.datacollect.exception;
/**
 * Base unchecked exception of the crawler; the network and parse exceptions extend it.
 */
public class CrawlerException extends RuntimeException {

    /**
     * @param message description of the failure
     * @param cause   underlying exception that triggered this one
     */
    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param message description of the failure
     */
    public CrawlerException(String message) {
        super(message);
    }
}

11
project/java-cli/src/main/java/com/example/datacollect/exception/NetworkException.java

@ -0,0 +1,11 @@
package com.example.datacollect.exception;
/**
 * Crawler exception subtype for network failures; the scraper service retries on it.
 */
public class NetworkException extends CrawlerException {

    /**
     * @param message description of the failure
     * @param cause   underlying exception that triggered this one
     */
    public NetworkException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param message description of the failure
     */
    public NetworkException(String message) {
        super(message);
    }
}

11
project/java-cli/src/main/java/com/example/datacollect/exception/ParseException.java

@ -0,0 +1,11 @@
package com.example.datacollect.exception;
/**
 * Crawler exception subtype for failures while parsing a fetched document.
 */
public class ParseException extends CrawlerException {

    /**
     * @param message description of the failure
     * @param cause   underlying exception that triggered this one
     */
    public ParseException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param message description of the failure
     */
    public ParseException(String message) {
        super(message);
    }
}

75
project/java-cli/src/main/java/com/example/datacollect/model/Article.java

@ -0,0 +1,75 @@
package com.example.datacollect.model;
import java.time.LocalDate;
public class Article {
private String title;
private String url;
private String content;
private String author;
private LocalDate publishDate;
public Article(String title, String url, String content) {
this.title = title;
this.url = url;
this.content = content;
}
public Article(String title, String url, String content, String author, LocalDate publishDate) {
this.title = title;
this.url = url;
this.content = content;
this.author = author;
this.publishDate = publishDate;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}
public String getAuthor() {
return author;
}
public void setAuthor(String author) {
this.author = author;
}
public LocalDate getPublishDate() {
return publishDate;
}
public void setPublishDate(LocalDate publishDate) {
this.publishDate = publishDate;
}
@Override
public String toString() {
return "Article{"
+ "title='" + title + '\''
+ ", url='" + url + '\''
+ ", author='" + author + '\''
+ ", publishDate=" + publishDate
+ '}';
}
}

52
project/java-cli/src/main/java/com/example/datacollect/repository/ArticleRepository.java

@ -0,0 +1,52 @@
package com.example.datacollect.repository;
import com.example.datacollect.model.Article;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * In-memory store of crawled articles, kept in insertion order.
 */
public class ArticleRepository {
    private static final Logger logger = LoggerFactory.getLogger(ArticleRepository.class);
    private final List<Article> articles = new ArrayList<>();

    /**
     * Stores {@code article}. A {@code null} article or one whose title is {@code null} or
     * empty is skipped with a warning instead of being stored.
     *
     * @param article the article to store
     */
    public void add(Article article) {
        if (article == null) {
            logger.warn("Attempted to add null article");
            return;
        }
        if (article.getTitle() == null || article.getTitle().isEmpty()) {
            logger.warn("Attempted to add article with empty title");
            return;
        }
        articles.add(article);
        logger.debug("Added article: {}", article.getTitle());
    }

    /**
     * Stores every acceptable article of {@code articleList} via {@link #add(Article)}.
     * A {@code null} list is ignored with a warning.
     *
     * @param articleList the articles to store; may be {@code null}
     */
    public void addAll(List<Article> articleList) {
        if (articleList == null) {
            logger.warn("Attempted to add null article list");
            return;
        }
        int before = articles.size();
        for (Article article : articleList) {
            add(article);
        }
        // add() silently skips invalid entries, so report what was really stored rather
        // than the size of the incoming list.
        int stored = articles.size() - before;
        logger.info("Added {} articles ({} skipped)", stored, articleList.size() - stored);
    }

    /**
     * @return a read-only view of the stored articles; it reflects later additions and
     *     rejects modification
     */
    public List<Article> getAll() {
        return Collections.unmodifiableList(articles);
    }

    /** Removes every stored article. */
    public void clear() {
        int size = articles.size();
        articles.clear();
        logger.info("Cleared {} articles from repository", size);
    }

    /**
     * @return the number of stored articles
     */
    public int size() {
        return articles.size();
    }
}

56
project/java-cli/src/main/java/com/example/datacollect/service/ScraperService.java

@ -0,0 +1,56 @@
package com.example.datacollect.service;
import com.example.datacollect.exception.CrawlerException;
import com.example.datacollect.exception.NetworkException;
import com.example.datacollect.model.Article;
import com.example.datacollect.strategy.CrawlStrategy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
/**
 * Runs a crawl strategy with retry and exponential backoff on network errors.
 */
public class ScraperService {
    private static final Logger logger = LoggerFactory.getLogger(ScraperService.class);
    private static final int MAX_RETRY = 3;
    private static final long INITIAL_DELAY_MS = 1000;
    private static final double BACKOFF_MULTIPLIER = 2.0;

    /**
     * Crawls {@code url} with {@code strategy}, retrying up to {@code MAX_RETRY} attempts
     * when a {@link NetworkException} is thrown. The wait before each retry starts at
     * {@code INITIAL_DELAY_MS} and is multiplied by {@code BACKOFF_MULTIPLIER} afterwards.
     * Any other exception from the strategy propagates immediately.
     *
     * @param strategy the strategy performing the crawl
     * @param url      the address to crawl
     * @return the articles returned by the first successful attempt
     * @throws CrawlerException if every attempt failed with a network error, or if the
     *     thread was interrupted while waiting for a retry
     */
    public List<Article> scrapeWithRetry(CrawlStrategy strategy, String url) {
        long waitMs = INITIAL_DELAY_MS;
        for (int attempt = 1; attempt <= MAX_RETRY; attempt++) {
            logger.info("Attempt {}/{} to crawl {}", attempt, MAX_RETRY, url);
            try {
                List<Article> result = strategy.crawl(url);
                if (attempt > 1) {
                    logger.info("Successfully crawled {} on attempt {}", url, attempt);
                }
                return result;
            } catch (NetworkException e) {
                logger.warn("Network error on attempt {} for {}: {}", attempt, url, e.getMessage());
                if (attempt >= MAX_RETRY) {
                    logger.error("Failed to crawl {} after {} attempts due to network errors", url, MAX_RETRY);
                    throw new CrawlerException("Failed to crawl " + url + " after " + MAX_RETRY + " attempts", e);
                }
                try {
                    logger.info("Retrying after {}ms...", waitMs);
                    Thread.sleep(waitMs);
                    waitMs = (long) (waitMs * BACKOFF_MULTIPLIER);
                } catch (InterruptedException ie) {
                    // Restore the interrupt flag before giving up.
                    Thread.currentThread().interrupt();
                    throw new CrawlerException("Interrupted during retry wait", ie);
                }
            }
        }
        throw new CrawlerException("Unexpected error: max retry attempts exhausted");
    }
}

93
project/java-cli/src/main/java/com/example/datacollect/strategy/BlogStrategy.java

@ -0,0 +1,93 @@
package com.example.datacollect.strategy;
import com.example.datacollect.model.Article;
import com.example.datacollect.exception.NetworkException;
import com.example.datacollect.exception.ParseException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class BlogStrategy implements CrawlStrategy {
/**
 * Reports whether {@code url} contains one of the blog keywords
 * ({@code blog}, {@code wordpress}, {@code lofter}, {@code hexo}).
 *
 * @param url the address to test; {@code null} yields {@code false}
 * @return {@code true} when a keyword occurs in {@code url}
 */
@Override
public boolean supports(String url) {
    if (url == null) {
        return false;
    }
    for (String marker : new String[] {"blog", "wordpress", "lofter", "hexo"}) {
        if (url.contains(marker)) {
            return true;
        }
    }
    return false;
}
@Override
public List<Article> crawl(String url) {
List<Article> articles = new ArrayList<>();
try {
Document doc = Jsoup.connect(url)
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
.timeout(10000)
.get();
articles = parse(doc, url);
} catch (IOException e) {
throw new NetworkException("网络请求失败:" + e.getMessage(), e);
} catch (ParseException e) {
throw e;
} catch (Exception e) {
articles.add(new Article("爬取失败", url, "错误:" + e.getMessage(), "系统", null));
}
return articles;
}
@Override
public List<Article> parse(Document doc, String url) throws ParseException {
List<Article> articles = new ArrayList<>();
try {
if (url.contains("lofter")) {
crawlLofter(doc, url, articles);
} else if (url.contains("wordpress")) {
crawlWordpress(doc, url, articles);
} else {
crawlGenericBlog(doc, url, articles);
}
} catch (Exception e) {
throw new ParseException("解析博客网站失败:" + e.getMessage(), e);
}
return articles;
}
private void crawlLofter(Document doc, String url, List<Article> articles) {
Elements items = doc.select(".m-post");
for (Element item : items) {
String title = item.select(".m-post-title a").text();
String link = item.select(".m-post-title a").attr("href");
String author = item.select(".m-user-name").text();
String summary = item.select(".m-post-content").text();
if (!title.isEmpty()) {
articles.add(new Article(title, link, summary.length() > 300 ? summary.substring(0, 300) : summary, author, null));
}
}
}
private void crawlWordpress(Document doc, String url, List<Article> articles) {
Elements items = doc.select(".post");
for (Element item : items) {
String title = item.select(".entry-title a").text();
String link = item.select(".entry-title a").attr("href");
String author = item.select(".author").text();
String summary = item.select(".entry-summary").text();
if (!title.isEmpty()) {
articles.add(new Article(title, link, summary.length() > 300 ? summary.substring(0, 300) : summary, author, null));
}
}
}
private void crawlGenericBlog(Document doc, String url, List<Article> articles) {
Elements items = doc.select(".article, .post, .blog-post");
for (Element item : items) {
String title = item.select("h1, h2, .title").text();
String content = item.select(".content, .post-content").text();
if (!title.isEmpty()) {
articles.add(new Article(title, url, content.length() > 300 ? content.substring(0, 300) : content, "未知作者", null));
}
}
}
}

12
project/java-cli/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java

@ -0,0 +1,12 @@
package com.example.datacollect.strategy;
import com.example.datacollect.model.Article;
import com.example.datacollect.exception.ParseException;
import org.jsoup.nodes.Document;
import java.util.List;
/**
 * Contract for a site-specific crawler.
 *
 * <p>{@code StrategyFactory.getStrategy} hands out the first registered strategy whose
 * {@link #supports(String)} returns {@code true} for a URL.
 */
public interface CrawlStrategy {
/**
 * Reports whether this strategy handles the given URL.
 *
 * @param url address to test; the implementations in this package return {@code false} for
 *     {@code null}
 * @return {@code true} if this strategy should be used for the URL
 */
boolean supports(String url);
/**
 * Fetches the page at {@code url} and returns the articles found on it.
 *
 * <p>The implementations in this package download the page with Jsoup, delegate to
 * {@link #parse(Document, String)}, and throw {@code NetworkException} when the request fails
 * with an {@code IOException}.
 *
 * @param url page to crawl
 * @return the articles extracted from the page
 */
List<Article> crawl(String url);
/**
 * Extracts articles from an already loaded document.
 *
 * @param doc document to read
 * @param url address the document was loaded from
 * @return the extracted articles
 * @throws ParseException if the document cannot be parsed
 */
List<Article> parse(Document doc, String url) throws ParseException;
}

118
project/java-cli/src/main/java/com/example/datacollect/strategy/NewsStrategy.java

@ -0,0 +1,118 @@
package com.example.datacollect.strategy;
import com.example.datacollect.model.Article;
import com.example.datacollect.exception.NetworkException;
import com.example.datacollect.exception.ParseException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawl strategy for news portals.
 *
 * <p>A URL is accepted when it contains one of the substrings {@code news}, {@code sina},
 * {@code 163}, {@code sohu} or {@code qq.com}. Sina, NetEase, Sohu and QQ pages have dedicated
 * selectors; every other accepted URL is parsed with a generic set of selectors.
 *
 * <p>Article links are resolved to absolute URLs (see
 * {@link #resolveLink(String, String, String)}) instead of being glued onto a fixed prefix, so
 * relative, protocol-relative and already-absolute hrefs all produce a usable link.
 */
public class NewsStrategy implements CrawlStrategy {

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";
    private static final int TIMEOUT_MS = 10000;
    private static final int SUMMARY_LIMIT = 300;

    /**
     * Reports whether the URL looks like a news site.
     *
     * @param url address to test; {@code null} yields {@code false}
     * @return {@code true} when the URL contains one of the recognised substrings
     */
    @Override
    public boolean supports(String url) {
        return url != null
                && (url.contains("news")
                        || url.contains("sina")
                        || url.contains("163")
                        || url.contains("sohu")
                        || url.contains("qq.com"));
    }

    /**
     * Downloads the page and extracts its articles.
     *
     * @param url page to fetch
     * @return the parsed articles, or a single placeholder article describing the error when an
     *     exception other than an I/O or parse failure occurs
     * @throws NetworkException if the HTTP request fails with an {@link IOException}
     * @throws ParseException if the downloaded document cannot be parsed
     */
    @Override
    public List<Article> crawl(String url) {
        List<Article> articles = new ArrayList<>();
        try {
            Document doc = Jsoup.connect(url)
                    .userAgent(USER_AGENT)
                    .timeout(TIMEOUT_MS)
                    .get();
            articles = parse(doc, url);
        } catch (IOException e) {
            throw new NetworkException("网络请求失败:" + e.getMessage(), e);
        } catch (ParseException e) {
            throw e;
        } catch (Exception e) {
            // Best effort: any other error is reported as a placeholder article, not an exception.
            articles.add(new Article("爬取失败", url, "错误:" + e.getMessage(), "系统", null));
        }
        return articles;
    }

    /**
     * Extracts articles from an already loaded document, choosing selectors by URL substring.
     *
     * @param doc document to read
     * @param url address the document was loaded from; also used to resolve relative links when
     *     the document carries no base URI
     * @return the extracted articles; possibly empty
     * @throws ParseException if any exception is raised while extracting
     */
    @Override
    public List<Article> parse(Document doc, String url) throws ParseException {
        List<Article> articles = new ArrayList<>();
        try {
            if (url.contains("sina")) {
                crawlSina(doc, url, articles);
            } else if (url.contains("163") || url.contains("netease")) {
                crawlNetease(doc, url, articles);
            } else if (url.contains("sohu")) {
                crawlSohu(doc, url, articles);
            } else if (url.contains("qq")) {
                crawlQQ(doc, url, articles);
            } else {
                crawlGenericNews(doc, url, articles);
            }
        } catch (Exception e) {
            throw new ParseException("解析新闻网站失败:" + e.getMessage(), e);
        }
        return articles;
    }

    /** Reads each {@code .news-item}; the summary is capped at {@code SUMMARY_LIMIT} chars. */
    private void crawlSina(Document doc, String url, List<Article> articles) {
        for (Element item : doc.select(".news-item")) {
            Elements anchors = item.select("a");
            String title = anchors.text();
            if (title.isEmpty()) {
                continue;
            }
            String link = resolveLink(anchors.attr("abs:href"), anchors.attr("href"), url);
            String summary = truncate(item.select(".news-summary").text(), SUMMARY_LIMIT);
            articles.add(new Article(title, link, summary, "新浪新闻", null));
        }
    }

    /** Reads each {@code .news-list li}; links are resolved rather than prefixed with a fixed host. */
    private void crawlNetease(Document doc, String url, List<Article> articles) {
        for (Element item : doc.select(".news-list li")) {
            Elements anchors = item.select("a");
            String title = anchors.text();
            if (title.isEmpty()) {
                continue;
            }
            String link = resolveLink(anchors.attr("abs:href"), anchors.attr("href"), url);
            articles.add(new Article(title, link, "", "网易新闻", null));
        }
    }

    /** Reads each {@code .news-item h3 a} anchor. */
    private void crawlSohu(Document doc, String url, List<Article> articles) {
        for (Element anchor : doc.select(".news-item h3 a")) {
            String title = anchor.text();
            if (title.isEmpty()) {
                continue;
            }
            String link = resolveLink(anchor.attr("abs:href"), anchor.attr("href"), url);
            articles.add(new Article(title, link, "", "搜狐新闻", null));
        }
    }

    /** Reads each {@code .list li a} anchor. */
    private void crawlQQ(Document doc, String url, List<Article> articles) {
        for (Element anchor : doc.select(".list li a")) {
            String title = anchor.text();
            if (title.isEmpty()) {
                continue;
            }
            String link = resolveLink(anchor.attr("abs:href"), anchor.attr("href"), url);
            articles.add(new Article(title, link, "", "腾讯新闻", null));
        }
    }

    /** Fallback parser for {@code .news} / {@code .article-item} elements. */
    private void crawlGenericNews(Document doc, String url, List<Article> articles) {
        for (Element item : doc.select(".news, .article-item")) {
            String title = item.select("h2, h3, .title").text();
            if (title.isEmpty()) {
                continue;
            }
            Elements anchors = item.select("a");
            // Previously "url + href", which is only valid when url is a bare site root.
            String link = resolveLink(anchors.attr("abs:href"), anchors.attr("href"), url);
            articles.add(new Article(title, link, "", "新闻网站", null));
        }
    }

    /**
     * Chooses the best available link for an article.
     *
     * @param absoluteHref the href as resolved by jsoup against the document's base URI
     *     ({@code attr("abs:href")}); empty when jsoup could not resolve it
     * @param rawHref the href exactly as written in the page; empty when absent
     * @param pageUrl URL of the page being parsed, used as the base for manual resolution
     * @return {@code absoluteHref} when available; otherwise {@code rawHref} resolved against
     *     {@code pageUrl}; {@code pageUrl} itself when the element has no href; or the raw href
     *     unchanged when it cannot be resolved
     */
    private static String resolveLink(String absoluteHref, String rawHref, String pageUrl) {
        if (!absoluteHref.isEmpty()) {
            return absoluteHref;
        }
        if (rawHref.isEmpty()) {
            return pageUrl;
        }
        try {
            return new java.net.URL(new java.net.URL(pageUrl), rawHref).toExternalForm();
        } catch (java.net.MalformedURLException e) {
            // e.g. "javascript:" hrefs or a malformed page URL: keep what the page said.
            return rawHref;
        }
    }

    /** Returns {@code text} unchanged when it has at most {@code limit} chars, else its prefix. */
    private static String truncate(String text, int limit) {
        return text.length() > limit ? text.substring(0, limit) : text;
    }
}

27
project/java-cli/src/main/java/com/example/datacollect/strategy/StrategyFactory.java

@ -0,0 +1,27 @@
package com.example.datacollect.strategy;
import java.util.ArrayList;
import java.util.List;
/**
 * Registry of the available {@link CrawlStrategy} implementations.
 *
 * <p>Lookup is first-match, so registration order matters: a strategy with broad matching rules
 * must come after the more specific ones it would otherwise shadow.
 */
public class StrategyFactory {
    private static final List<CrawlStrategy> strategies = new ArrayList<>();

    static {
        // TechStrategy goes first: BlogStrategy accepts any URL containing "blog", which includes
        // every "cnblogs" URL and "blog.csdn.net". Registered after BlogStrategy, TechStrategy
        // would never be selected for those sites and their dedicated parsers would be dead code.
        strategies.add(new TechStrategy());
        strategies.add(new BlogStrategy());
        strategies.add(new NewsStrategy());
    }

    /**
     * Finds the strategy responsible for a URL.
     *
     * @param url address to look up
     * @return the first registered strategy whose {@code supports(url)} is {@code true}, or
     *     {@code null} when no strategy accepts the URL
     */
    public static CrawlStrategy getStrategy(String url) {
        for (CrawlStrategy strategy : strategies) {
            if (strategy.supports(url)) {
                return strategy;
            }
        }
        return null;
    }

    /**
     * Lists every registered strategy in lookup order.
     *
     * @return a new list; modifying it does not affect the registry
     */
    public static List<CrawlStrategy> getAllStrategies() {
        return new ArrayList<>(strategies);
    }
}

105
project/java-cli/src/main/java/com/example/datacollect/strategy/TechStrategy.java

@ -0,0 +1,105 @@
package com.example.datacollect.strategy;
import com.example.datacollect.model.Article;
import com.example.datacollect.exception.NetworkException;
import com.example.datacollect.exception.ParseException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawl strategy for developer/tech sites.
 *
 * <p>A URL is accepted when it contains one of the substrings {@code csdn}, {@code oschina},
 * {@code iteye} or {@code cnblogs}. CSDN, cnblogs and OSChina pages have dedicated selectors;
 * every other accepted URL is reduced to a single article built from the page title and body.
 *
 * <p>Article links are resolved to absolute URLs (see
 * {@link #resolveLink(String, String, String)}) instead of being glued onto a fixed prefix, so
 * relative and already-absolute hrefs both produce a usable link.
 */
public class TechStrategy implements CrawlStrategy {

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";
    private static final int TIMEOUT_MS = 10000;
    private static final int GENERIC_CONTENT_LIMIT = 500;

    /**
     * Reports whether the URL looks like a supported tech site.
     *
     * @param url address to test; {@code null} yields {@code false}
     * @return {@code true} when the URL contains one of the recognised substrings
     */
    @Override
    public boolean supports(String url) {
        return url != null
                && (url.contains("csdn")
                        || url.contains("oschina")
                        || url.contains("iteye")
                        || url.contains("cnblogs"));
    }

    /**
     * Downloads the page and extracts its articles.
     *
     * @param url page to fetch
     * @return the parsed articles, or a single placeholder article describing the error when an
     *     exception other than an I/O or parse failure occurs
     * @throws NetworkException if the HTTP request fails with an {@link IOException}
     * @throws ParseException if the downloaded document cannot be parsed
     */
    @Override
    public List<Article> crawl(String url) {
        List<Article> articles = new ArrayList<>();
        try {
            Document doc = Jsoup.connect(url)
                    .userAgent(USER_AGENT)
                    .timeout(TIMEOUT_MS)
                    .get();
            articles = parse(doc, url);
        } catch (IOException e) {
            throw new NetworkException("网络请求失败:" + e.getMessage(), e);
        } catch (ParseException e) {
            throw e;
        } catch (Exception e) {
            // Best effort: any other error is reported as a placeholder article, not an exception.
            articles.add(new Article("爬取失败", url, "错误:" + e.getMessage(), "系统", null));
        }
        return articles;
    }

    /**
     * Extracts articles from an already loaded document, choosing selectors by URL substring.
     *
     * @param doc document to read
     * @param url address the document was loaded from; also used to resolve relative links when
     *     the document carries no base URI
     * @return the extracted articles; possibly empty
     * @throws ParseException if any exception is raised while extracting
     */
    @Override
    public List<Article> parse(Document doc, String url) throws ParseException {
        List<Article> articles = new ArrayList<>();
        try {
            if (url.contains("csdn")) {
                crawlCsdn(doc, url, articles);
            } else if (url.contains("cnblogs")) {
                crawlCnblogs(doc, url, articles);
            } else if (url.contains("oschina")) {
                crawlOschina(doc, url, articles);
            } else {
                crawlGeneric(doc, url, articles);
            }
        } catch (Exception e) {
            throw new ParseException("解析技术网站失败:" + e.getMessage(), e);
        }
        return articles;
    }

    /** Reads each {@code .article-item-box}; entries without a title are skipped. */
    private void crawlCsdn(Document doc, String url, List<Article> articles) {
        for (Element item : doc.select(".article-item-box")) {
            Elements heading = item.select("h4 a");
            String title = heading.text();
            if (title.isEmpty()) {
                continue;
            }
            String link = resolveLink(heading.attr("abs:href"), heading.attr("href"), url);
            String author = item.select(".name").text();
            String summary = item.select(".content").text();
            articles.add(new Article(title, link, summary, author, null));
        }
    }

    /** Reads each {@code .post-item}; entries without a title are skipped. */
    private void crawlCnblogs(Document doc, String url, List<Article> articles) {
        for (Element item : doc.select(".post-item")) {
            Elements heading = item.select(".post-item-title a");
            String title = heading.text();
            if (title.isEmpty()) {
                continue;
            }
            String link = resolveLink(heading.attr("abs:href"), heading.attr("href"), url);
            String author = item.select(".post-item-author a").text();
            String summary = item.select(".post-item-summary").text();
            articles.add(new Article(title, link, summary, author, null));
        }
    }

    /** Reads each {@code .news-list .news-item}; entries without a title are skipped. */
    private void crawlOschina(Document doc, String url, List<Article> articles) {
        for (Element item : doc.select(".news-list .news-item")) {
            Elements heading = item.select(".title a");
            String title = heading.text();
            if (title.isEmpty()) {
                continue;
            }
            // Previously "https://www.oschina.net" + href, which corrupted absolute hrefs
            // ("https://www.oschina.nethttps://...").
            String link = resolveLink(heading.attr("abs:href"), heading.attr("href"), url);
            String author = item.select(".author").text();
            String summary = item.select(".description").text();
            articles.add(new Article(title, link, summary, author, null));
        }
    }

    /** Fallback: one article from the page title and the first 500 chars of its body text. */
    private void crawlGeneric(Document doc, String url, List<Article> articles) {
        String title = doc.title();
        if (title.isEmpty()) {
            return;
        }
        String content = doc.select("article, .article-content, .post-content").text();
        if (content.length() > GENERIC_CONTENT_LIMIT) {
            content = content.substring(0, GENERIC_CONTENT_LIMIT);
        }
        articles.add(new Article(title, url, content, "未知", null));
    }

    /**
     * Chooses the best available link for an article.
     *
     * @param absoluteHref the href as resolved by jsoup against the document's base URI
     *     ({@code attr("abs:href")}); empty when jsoup could not resolve it
     * @param rawHref the href exactly as written in the page; empty when absent
     * @param pageUrl URL of the page being parsed, used as the base for manual resolution
     * @return {@code absoluteHref} when available; otherwise {@code rawHref} resolved against
     *     {@code pageUrl}; {@code pageUrl} itself when the element has no href; or the raw href
     *     unchanged when it cannot be resolved
     */
    private static String resolveLink(String absoluteHref, String rawHref, String pageUrl) {
        if (!absoluteHref.isEmpty()) {
            return absoluteHref;
        }
        if (rawHref.isEmpty()) {
            return pageUrl;
        }
        try {
            return new java.net.URL(new java.net.URL(pageUrl), rawHref).toExternalForm();
        } catch (java.net.MalformedURLException e) {
            // e.g. "javascript:" hrefs or a malformed page URL: keep what the page said.
            return rawHref;
        }
    }
}

53
project/java-cli/src/main/java/com/example/datacollect/view/ConsoleView.java

@ -0,0 +1,53 @@
package com.example.datacollect.view;
import com.example.datacollect.model.Article;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
import java.util.Scanner;
/**
 * Console front end: reads commands from standard input and prints colour-coded messages and
 * article listings to standard output. Every operation is also written to the class logger.
 */
public class ConsoleView {
    private static final Logger logger = LoggerFactory.getLogger(ConsoleView.class);

    // ANSI escape sequences used to colour a line and to reset the terminal afterwards.
    private static final String ANSI_RESET = "\u001B[0m";
    private static final String ANSI_GREEN = "\u001B[32m";
    private static final String ANSI_RED = "\u001B[31m";
    private static final String ANSI_BLUE = "\u001B[34m";

    private final Scanner scanner = new Scanner(System.in);

    /**
     * Shows the {@code "> "} prompt and reads one line from standard input.
     *
     * @return the line that was read
     */
    public String readLine() {
        System.out.print("> ");
        String input = scanner.nextLine();
        logger.debug("User input: {}", input);
        return input;
    }

    /** Logs the message at INFO level and prints it in green. */
    public void printSuccess(String msg) {
        logger.info("Success: {}", msg);
        printColored(ANSI_GREEN, msg);
    }

    /** Logs the message at ERROR level and prints it in red. */
    public void printError(String msg) {
        logger.error("Error: {}", msg);
        printColored(ANSI_RED, msg);
    }

    /** Logs the message at DEBUG level and prints it in blue. */
    public void printInfo(String msg) {
        logger.debug("Info: {}", msg);
        printColored(ANSI_BLUE, msg);
    }

    /**
     * Prints a numbered {@code title | url} line per article, or a hint when the list is empty.
     *
     * @param articles articles to list
     */
    public void display(List<Article> articles) {
        if (articles.isEmpty()) {
            logger.info("No articles to display");
            printInfo("暂无文章,请先执行 crawl。");
            return;
        }
        logger.info("Displaying {} articles", articles.size());
        int position = 0;
        for (Article article : articles) {
            position++;
            System.out.println(position + ". " + article.getTitle() + " | " + article.getUrl());
        }
    }

    /** Writes {@code msg} to standard output wrapped in the colour code and a reset. */
    private void printColored(String colorCode, String msg) {
        System.out.println(colorCode + msg + ANSI_RESET);
    }
}

22
project/java-cli/src/main/resources/logback.xml

@ -0,0 +1,22 @@
<configuration>
<!-- Logging setup for the crawler CLI: every event goes to the console and to a log file. -->
<!-- Console output: time of day only, then thread, level, logger name and message. -->
<appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
</encoder>
</appender>
<!-- File output: same layout with the full date; written to the relative path logs/crawler.log. -->
<appender name="FILE" class="ch.qos.logback.core.FileAppender">
<file>logs/crawler.log</file>
<encoder>
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
</encoder>
</appender>
<!-- Default threshold is INFO; both appenders are attached to the root logger. -->
<root level="INFO">
<appender-ref ref="CONSOLE" />
<appender-ref ref="FILE" />
</root>
<!-- Application packages log at DEBUG; jsoup is limited to WARN and above. -->
<logger name="com.example.datacollect" level="DEBUG" />
<logger name="org.jsoup" level="WARN" />
</configuration>

BIN
project/java-cli/target/classes/.DS_Store

Binary file not shown.

BIN
project/java-cli/target/classes/com/.DS_Store

Binary file not shown.

BIN
project/java-cli/target/classes/com/example/.DS_Store

Binary file not shown.

22
project/java-cli/target/classes/logback.xml

@ -0,0 +1,22 @@
<configuration>
<appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
</encoder>
</appender>
<appender name="FILE" class="ch.qos.logback.core.FileAppender">
<file>logs/crawler.log</file>
<encoder>
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
</encoder>
</appender>
<root level="INFO">
<appender-ref ref="CONSOLE" />
<appender-ref ref="FILE" />
</root>
<logger name="com.example.datacollect" level="DEBUG" />
<logger name="org.jsoup" level="WARN" />
</configuration>

0
project/java-cli/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst

22
project/java-cli/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst

@ -0,0 +1,22 @@
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\model\Article.java
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\exception\CrawlerException.java
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\strategy\TechStrategy.java
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\repository\ArticleRepository.java
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\Main.java
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\strategy\BlogStrategy.java
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\Command.java
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\ExitCommand.java
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\HelpCommand.java
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\CrawlCommand.java
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\HistoryCommand.java
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\strategy\NewsStrategy.java
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\exception\NetworkException.java
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\SaveCommand.java
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\controller\CrawlerController.java
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\strategy\CrawlStrategy.java
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\ListCommand.java
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\service\ScraperService.java
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\exception\ParseException.java
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\view\ConsoleView.java
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\strategy\StrategyFactory.java
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\AnalyzeCommand.java
Loading…
Cancel
Save