39 changed files with 1447 additions and 0 deletions
Binary file not shown.
Binary file not shown.
@ -0,0 +1,4 @@ |
|||
*.jar |
|||
*.jar |
|||
*.class |
|||
*.log |
|||
@ -0,0 +1,17 @@ |
|||
# DataCollect 教学项目 — 最小可运行版本 |
|||
|
|||
这是一个最小可用的 Java CLI 演示工程,目标:打印帮助信息以验证运行环境。 |
|||
|
|||
构建: |
|||
```bash |
|||
mvn -q package |
|||
``` |
|||
|
|||
运行(示例): |
|||
```bash |
|||
java -jar target/datacollect-cli-0.1.0-jar-with-dependencies.jar --help |
|||
``` |
|||
|
|||
项目结构(最小): |
|||
- `src/main/java/com/example/datacollect/Main.java` — CLI 入口,启动交互式命令循环(输入 `help` 查看可用命令) |
|||
- `pom.xml` — Maven 构建配置,生成可执行 jar |
|||
@ -0,0 +1,69 @@ |
|||
<project xmlns="http://maven.apache.org/POM/4.0.0" |
|||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
|||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> |
|||
<modelVersion>4.0.0</modelVersion> |
|||
<groupId>com.example</groupId> |
|||
<artifactId>datacollect-cli</artifactId> |
|||
<version>0.1.0</version> |
|||
<properties> |
|||
<maven.compiler.source>11</maven.compiler.source> |
|||
<maven.compiler.target>11</maven.compiler.target> |
|||
</properties> |
|||
|
|||
<dependencies> |
|||
<dependency> |
|||
<groupId>org.jsoup</groupId> |
|||
<artifactId>jsoup</artifactId> |
|||
<version>1.17.2</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>ch.qos.logback</groupId> |
|||
<artifactId>logback-classic</artifactId> |
|||
<version>1.4.14</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>com.fasterxml.jackson.core</groupId> |
|||
<artifactId>jackson-databind</artifactId> |
|||
<version>2.15.2</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>com.fasterxml.jackson.datatype</groupId> |
|||
<artifactId>jackson-datatype-jsr310</artifactId> |
|||
<version>2.15.2</version> |
|||
</dependency> |
|||
</dependencies> |
|||
|
|||
<build> |
|||
<plugins> |
|||
<plugin> |
|||
<groupId>org.apache.maven.plugins</groupId> |
|||
<artifactId>maven-compiler-plugin</artifactId> |
|||
<version>3.8.1</version> |
|||
</plugin> |
|||
<plugin> |
|||
<groupId>org.apache.maven.plugins</groupId> |
|||
<artifactId>maven-assembly-plugin</artifactId> |
|||
<version>3.3.0</version> |
|||
<configuration> |
|||
<archive> |
|||
<manifest> |
|||
<mainClass>com.example.datacollect.Main</mainClass> |
|||
</manifest> |
|||
</archive> |
|||
<descriptorRefs> |
|||
<descriptorRef>jar-with-dependencies</descriptorRef> |
|||
</descriptorRefs> |
|||
</configuration> |
|||
<executions> |
|||
<execution> |
|||
<id>make-assembly</id> |
|||
<phase>package</phase> |
|||
<goals> |
|||
<goal>single</goal> |
|||
</goals> |
|||
</execution> |
|||
</executions> |
|||
</plugin> |
|||
</plugins> |
|||
</build> |
|||
</project> |
|||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,31 @@ |
|||
package com.example.datacollect; |
|||
|
|||
import com.example.datacollect.controller.CrawlerController; |
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
public class Main { |
|||
private static final Logger logger = LoggerFactory.getLogger(Main.class); |
|||
|
|||
public static void main(String[] args) { |
|||
logger.info("Starting CLI Crawler application"); |
|||
|
|||
ConsoleView view = new ConsoleView(); |
|||
ArticleRepository repository = new ArticleRepository(); |
|||
CrawlerController controller = new CrawlerController(view, repository); |
|||
|
|||
view.printSuccess("Welcome to CLI Crawler (W11)! Type help for commands."); |
|||
logger.info("Application started successfully"); |
|||
|
|||
while (true) { |
|||
try { |
|||
controller.handle(view.readLine()); |
|||
} catch (Exception e) { |
|||
logger.error("Error processing command", e); |
|||
view.printError("Error: " + e.getMessage()); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,130 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.strategy.CrawlStrategy; |
|||
import com.example.datacollect.strategy.StrategyFactory; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
import java.util.List; |
|||
|
|||
public class AnalyzeCommand implements Command { |
|||
private static final Logger logger = LoggerFactory.getLogger(AnalyzeCommand.class); |
|||
private final ConsoleView view; |
|||
|
|||
public AnalyzeCommand(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "analyze"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, ArticleRepository repository) { |
|||
logger.debug("Executing analyze command"); |
|||
|
|||
if (args.length >= 2) { |
|||
String url = args[1]; |
|||
analyzeUrl(url); |
|||
} else { |
|||
analyzeRepository(repository); |
|||
} |
|||
} |
|||
|
|||
private void analyzeUrl(String url) { |
|||
logger.info("Analyzing URL: {}", url); |
|||
CrawlStrategy strategy = StrategyFactory.getStrategy(url); |
|||
|
|||
if (strategy == null) { |
|||
logger.error("No strategy found for URL: {}", url); |
|||
view.printError("No strategy found for URL: " + url); |
|||
return; |
|||
} |
|||
|
|||
view.printInfo("Analyzing URL: " + url); |
|||
view.printInfo("Using strategy: " + strategy.getClass().getSimpleName()); |
|||
|
|||
List<Article> articles = strategy.crawl(url); |
|||
|
|||
printAnalysis(articles); |
|||
logger.info("Analysis completed for URL: {}", url); |
|||
view.printInfo("Note: Analysis results are NOT stored."); |
|||
} |
|||
|
|||
private void analyzeRepository(ArticleRepository repository) { |
|||
List<Article> articles = repository.getAll(); |
|||
|
|||
if (articles.isEmpty()) { |
|||
logger.info("No articles to analyze"); |
|||
view.printInfo("No articles to analyze. Use 'analyze <url>' to analyze a URL without storing."); |
|||
return; |
|||
} |
|||
|
|||
logger.info("Analyzing {} articles from repository", articles.size()); |
|||
view.printInfo("Analyzing " + articles.size() + " articles from repository:"); |
|||
printAnalysis(articles); |
|||
} |
|||
|
|||
private void printAnalysis(List<Article> articles) { |
|||
if (articles.isEmpty()) { |
|||
logger.info("No articles found for analysis"); |
|||
view.printInfo("No articles found."); |
|||
return; |
|||
} |
|||
|
|||
int totalArticles = articles.size(); |
|||
int totalContentLength = 0; |
|||
int articlesWithAuthor = 0; |
|||
int articlesWithDate = 0; |
|||
|
|||
for (Article article : articles) { |
|||
if (article.getContent() != null) { |
|||
totalContentLength += article.getContent().length(); |
|||
} |
|||
if (article.getAuthor() != null && !article.getAuthor().isEmpty()) { |
|||
articlesWithAuthor++; |
|||
} |
|||
if (article.getPublishDate() != null) { |
|||
articlesWithDate++; |
|||
} |
|||
} |
|||
|
|||
double avgContentLength = totalArticles > 0 ? (double) totalContentLength / totalArticles : 0; |
|||
|
|||
logger.info("Analysis results: {} articles, {} avg length", totalArticles, avgContentLength); |
|||
view.printInfo("=== Analysis Results ==="); |
|||
view.printInfo("Total articles: " + totalArticles); |
|||
view.printInfo("Total content length: " + totalContentLength); |
|||
view.printInfo("Average content length: " + String.format("%.2f", avgContentLength)); |
|||
view.printInfo("Articles with author: " + articlesWithAuthor); |
|||
view.printInfo("Articles with publish date: " + articlesWithDate); |
|||
} |
|||
|
|||
@Override |
|||
public String getName1() { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'getName1'"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, List<Article> articles) { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'execute'"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute1(String[] args, List<Article> articles) { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute1(String[] args, ArticleRepository repository) { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
|||
} |
|||
} |
|||
@ -0,0 +1,15 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
|
|||
import java.util.List; |
|||
|
|||
public interface Command { |
|||
String getName1(); |
|||
void execute(String[] args, List<Article> articles); |
|||
void execute(String[] args, ArticleRepository repository); |
|||
String getName(); |
|||
void execute1(String[] args, List<Article> articles); |
|||
void execute1(String[] args, ArticleRepository repository); |
|||
} |
|||
@ -0,0 +1,85 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.service.ScraperService; |
|||
import com.example.datacollect.strategy.CrawlStrategy; |
|||
import com.example.datacollect.strategy.StrategyFactory; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
import java.util.List; |
|||
|
|||
public class CrawlCommand implements Command { |
|||
private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class); |
|||
private final ConsoleView view; |
|||
private final ScraperService scraperService; |
|||
|
|||
public CrawlCommand(ConsoleView view) { |
|||
this.view = view; |
|||
this.scraperService = new ScraperService(); |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "crawl"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, ArticleRepository repository) { |
|||
if (args.length < 2) { |
|||
logger.warn("Missing URL argument"); |
|||
view.printError("Usage: crawl <url>"); |
|||
return; |
|||
} |
|||
|
|||
String url = args[1]; |
|||
logger.info("Crawl started for: {}", url); |
|||
|
|||
CrawlStrategy strategy = StrategyFactory.getStrategy(url); |
|||
|
|||
if (strategy == null) { |
|||
logger.error("No strategy found for URL: {}", url); |
|||
view.printError("No strategy found for URL: " + url); |
|||
return; |
|||
} |
|||
|
|||
logger.info("Using strategy: {}", strategy.getClass().getSimpleName()); |
|||
view.printInfo("Crawling " + url + " with strategy: " + strategy.getClass().getSimpleName()); |
|||
|
|||
try { |
|||
List<Article> articles = scraperService.scrapeWithRetry(strategy, url); |
|||
repository.addAll(articles); |
|||
logger.info("Crawled {} articles successfully", articles.size()); |
|||
view.printSuccess("Crawled " + articles.size() + " articles"); |
|||
} catch (Exception e) { |
|||
logger.error("Error crawling URL: {}", url, e); |
|||
view.printError("Error: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public String getName1() { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'getName1'"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, List<Article> articles) { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'execute'"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute1(String[] args, List<Article> articles) { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute1(String[] args, ArticleRepository repository) { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
|||
} |
|||
} |
|||
@ -0,0 +1,55 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
|
|||
import java.util.List; |
|||
|
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
public class ExitCommand implements Command { |
|||
private static final Logger logger = LoggerFactory.getLogger(ExitCommand.class); |
|||
private final ConsoleView view; |
|||
|
|||
public ExitCommand(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "exit"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, ArticleRepository repository) { |
|||
logger.info("User requested exit"); |
|||
view.printSuccess("Goodbye!"); |
|||
System.exit(0); |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, List<Article> articles) { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'execute'"); |
|||
} |
|||
|
|||
@Override |
|||
public String getName1() { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'getName1'"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute1(String[] args, List<Article> articles) { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute1(String[] args, ArticleRepository repository) { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
|||
} |
|||
} |
|||
@ -0,0 +1,61 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
|
|||
import java.util.List; |
|||
|
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
public class HelpCommand implements Command { |
|||
private static final Logger logger = LoggerFactory.getLogger(HelpCommand.class); |
|||
private final ConsoleView view; |
|||
|
|||
public HelpCommand(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "help"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, ArticleRepository repository) { |
|||
logger.debug("Executing help command"); |
|||
view.printInfo("Commands:"); |
|||
view.printInfo(" crawl <url> - 爬取指定 URL 的文章"); |
|||
view.printInfo(" list - 列出已爬取的文章"); |
|||
view.printInfo(" analyze - 分析文章统计信息"); |
|||
view.printInfo(" history - 显示命令历史记录"); |
|||
view.printInfo(" save [file] - 保存文章到 JSON 文件(默认 articles.json)"); |
|||
view.printInfo(" help - 显示此帮助信息"); |
|||
view.printInfo(" exit - 退出程序"); |
|||
} |
|||
|
|||
@Override |
|||
public String getName1() { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'getName1'"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, List<Article> articles) { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'execute'"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute1(String[] args, List<Article> articles) { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute1(String[] args, ArticleRepository repository) { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
|||
} |
|||
} |
|||
@ -0,0 +1,80 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class HistoryCommand implements Command { |
|||
private static final Logger logger = LoggerFactory.getLogger(HistoryCommand.class); |
|||
private static final List<String> commandHistory = new ArrayList<>(); |
|||
private final ConsoleView view; |
|||
|
|||
public HistoryCommand(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "history"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, ArticleRepository repository) { |
|||
logger.debug("Executing history command"); |
|||
|
|||
if (commandHistory.isEmpty()) { |
|||
logger.info("Command history is empty"); |
|||
view.printInfo("No command history."); |
|||
return; |
|||
} |
|||
|
|||
logger.info("Showing {} command history items", commandHistory.size()); |
|||
view.printInfo("Command History:"); |
|||
for (int i = 0; i < commandHistory.size(); i++) { |
|||
view.printInfo((i + 1) + ". " + commandHistory.get(i)); |
|||
} |
|||
} |
|||
|
|||
public static void addCommand(String command) { |
|||
commandHistory.add(command); |
|||
logger.debug("Command added to history: {}", command); |
|||
} |
|||
|
|||
public static List<String> getCommandHistory() { |
|||
return new ArrayList<>(commandHistory); |
|||
} |
|||
|
|||
public static void clearHistory() { |
|||
commandHistory.clear(); |
|||
logger.info("Command history cleared"); |
|||
} |
|||
|
|||
@Override |
|||
public String getName1() { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'getName1'"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, List<Article> articles) { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'execute'"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute1(String[] args, List<Article> articles) { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute1(String[] args, ArticleRepository repository) { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
|||
} |
|||
} |
|||
@ -0,0 +1,65 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
import java.util.List; |
|||
|
|||
public class ListCommand implements Command { |
|||
private static final Logger logger = LoggerFactory.getLogger(ListCommand.class); |
|||
private final ConsoleView view; |
|||
|
|||
public ListCommand(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "list"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, ArticleRepository repository) { |
|||
logger.debug("Executing list command"); |
|||
List<Article> articles = repository.getAll(); |
|||
|
|||
if (articles.isEmpty()) { |
|||
logger.info("No articles found"); |
|||
view.printInfo("No articles yet. Use 'crawl <url>' to get started."); |
|||
return; |
|||
} |
|||
|
|||
logger.info("Listing {} articles", articles.size()); |
|||
view.printInfo("=== Articles (" + articles.size() + ") ==="); |
|||
for (Article article : articles) { |
|||
view.printInfo(article.toString()); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public String getName1() { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'getName1'"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, List<Article> articles) { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'execute'"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute1(String[] args, List<Article> articles) { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute1(String[] args, ArticleRepository repository) { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
|||
} |
|||
} |
|||
@ -0,0 +1,85 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
import com.fasterxml.jackson.databind.ObjectMapper; |
|||
import com.fasterxml.jackson.databind.SerializationFeature; |
|||
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
import java.io.File; |
|||
import java.io.IOException; |
|||
import java.util.List; |
|||
|
|||
public class SaveCommand implements Command { |
|||
private static final Logger logger = LoggerFactory.getLogger(SaveCommand.class); |
|||
private final ConsoleView view; |
|||
private final ObjectMapper objectMapper; |
|||
|
|||
public SaveCommand(ConsoleView view) { |
|||
this.view = view; |
|||
this.objectMapper = new ObjectMapper(); |
|||
this.objectMapper.registerModule(new JavaTimeModule()); |
|||
this.objectMapper.enable(SerializationFeature.INDENT_OUTPUT); |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "save"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, ArticleRepository repository) { |
|||
logger.debug("Executing save command"); |
|||
|
|||
String fileName = args.length >= 2 ? args[1] : "articles.json"; |
|||
|
|||
if (!fileName.endsWith(".json")) { |
|||
fileName = fileName + ".json"; |
|||
} |
|||
|
|||
List<Article> articles = repository.getAll(); |
|||
|
|||
if (articles.isEmpty()) { |
|||
logger.warn("No articles to save"); |
|||
view.printError("No articles to save. Use 'crawl <url>' first."); |
|||
return; |
|||
} |
|||
|
|||
try { |
|||
File file = new File(fileName); |
|||
objectMapper.writeValue(file, articles); |
|||
logger.info("Successfully saved {} articles to {}", articles.size(), fileName); |
|||
view.printSuccess("Saved " + articles.size() + " articles to " + fileName); |
|||
} catch (IOException e) { |
|||
logger.error("Failed to save articles to {}", fileName, e); |
|||
view.printError("Error saving articles: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public String getName1() { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'getName1'"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args, List<Article> articles) { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'execute'"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute1(String[] args, List<Article> articles) { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute1(String[] args, ArticleRepository repository) { |
|||
// TODO Auto-generated method stub
|
|||
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
|||
} |
|||
} |
|||
@ -0,0 +1,60 @@ |
|||
package com.example.datacollect.controller; |
|||
|
|||
import com.example.datacollect.command.*; |
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
import java.util.HashMap; |
|||
import java.util.Map; |
|||
|
|||
public class CrawlerController { |
|||
private static final Logger logger = LoggerFactory.getLogger(CrawlerController.class); |
|||
private final Map<String, Command> commands = new HashMap<>(); |
|||
private final ConsoleView view; |
|||
private final ArticleRepository repository; |
|||
|
|||
public CrawlerController(ConsoleView view, ArticleRepository repository) { |
|||
this.view = view; |
|||
this.repository = repository; |
|||
logger.info("Initializing CrawlerController with {} commands", 7); |
|||
register(new HelpCommand(view)); |
|||
register(new ListCommand(view)); |
|||
register(new CrawlCommand(view)); |
|||
register(new ExitCommand(view)); |
|||
register(new HistoryCommand(view)); |
|||
register(new AnalyzeCommand(view)); |
|||
register(new SaveCommand(view)); |
|||
logger.info("CrawlerController initialized successfully"); |
|||
} |
|||
|
|||
private void register(Command command) { |
|||
commands.put(command.getName(), command); |
|||
logger.debug("Registered command: {}", command.getName()); |
|||
} |
|||
|
|||
public void handle(String input) { |
|||
String text = input == null ? "" : input.trim(); |
|||
if (text.isEmpty()) { |
|||
return; |
|||
} |
|||
|
|||
logger.debug("Handling input: {}", text); |
|||
|
|||
// 记录命令历史
|
|||
HistoryCommand.addCommand(text); |
|||
|
|||
String[] args = text.split("\\s+"); |
|||
String cmdName = args[0].toLowerCase(); |
|||
Command command = commands.get(cmdName); |
|||
if (command == null) { |
|||
logger.warn("Unknown command: {}", cmdName); |
|||
view.printError("Unknown command: " + cmdName); |
|||
return; |
|||
} |
|||
|
|||
logger.info("Executing command: {}", cmdName); |
|||
command.execute(args, repository); |
|||
} |
|||
} |
|||
@ -0,0 +1,11 @@ |
|||
package com.example.datacollect.exception; |
|||
|
|||
/**
 * Unchecked base exception for failures raised by the crawler.
 */
public class CrawlerException extends RuntimeException {

    /**
     * @param message human-readable description of the failure
     */
    public CrawlerException(String message) {
        super(message);
    }

    /**
     * @param message human-readable description of the failure
     * @param cause   underlying exception that triggered this one
     */
    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }
}
|||
@ -0,0 +1,11 @@ |
|||
package com.example.datacollect.exception; |
|||
|
|||
public class NetworkException extends CrawlerException { |
|||
public NetworkException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public NetworkException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,11 @@ |
|||
package com.example.datacollect.exception; |
|||
|
|||
public class ParseException extends CrawlerException { |
|||
public ParseException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public ParseException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,75 @@ |
|||
package com.example.datacollect.model; |
|||
|
|||
import java.time.LocalDate; |
|||
|
|||
public class Article { |
|||
private String title; |
|||
private String url; |
|||
private String content; |
|||
private String author; |
|||
private LocalDate publishDate; |
|||
|
|||
public Article(String title, String url, String content) { |
|||
this.title = title; |
|||
this.url = url; |
|||
this.content = content; |
|||
} |
|||
|
|||
public Article(String title, String url, String content, String author, LocalDate publishDate) { |
|||
this.title = title; |
|||
this.url = url; |
|||
this.content = content; |
|||
this.author = author; |
|||
this.publishDate = publishDate; |
|||
} |
|||
|
|||
public String getTitle() { |
|||
return title; |
|||
} |
|||
|
|||
public void setTitle(String title) { |
|||
this.title = title; |
|||
} |
|||
|
|||
public String getUrl() { |
|||
return url; |
|||
} |
|||
|
|||
public void setUrl(String url) { |
|||
this.url = url; |
|||
} |
|||
|
|||
public String getContent() { |
|||
return content; |
|||
} |
|||
|
|||
public void setContent(String content) { |
|||
this.content = content; |
|||
} |
|||
|
|||
public String getAuthor() { |
|||
return author; |
|||
} |
|||
|
|||
public void setAuthor(String author) { |
|||
this.author = author; |
|||
} |
|||
|
|||
public LocalDate getPublishDate() { |
|||
return publishDate; |
|||
} |
|||
|
|||
public void setPublishDate(LocalDate publishDate) { |
|||
this.publishDate = publishDate; |
|||
} |
|||
|
|||
@Override |
|||
public String toString() { |
|||
return "Article{" |
|||
+ "title='" + title + '\'' |
|||
+ ", url='" + url + '\'' |
|||
+ ", author='" + author + '\'' |
|||
+ ", publishDate=" + publishDate |
|||
+ '}'; |
|||
} |
|||
} |
|||
@ -0,0 +1,52 @@ |
|||
package com.example.datacollect.repository; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.Collections; |
|||
import java.util.List; |
|||
|
|||
public class ArticleRepository { |
|||
private static final Logger logger = LoggerFactory.getLogger(ArticleRepository.class); |
|||
private final List<Article> articles = new ArrayList<>(); |
|||
|
|||
public void add(Article article) { |
|||
if (article == null) { |
|||
logger.warn("Attempted to add null article"); |
|||
return; |
|||
} |
|||
if (article.getTitle() == null || article.getTitle().isEmpty()) { |
|||
logger.warn("Attempted to add article with empty title"); |
|||
return; |
|||
} |
|||
articles.add(article); |
|||
logger.debug("Added article: {}", article.getTitle()); |
|||
} |
|||
|
|||
public void addAll(List<Article> articleList) { |
|||
if (articleList == null) { |
|||
logger.warn("Attempted to add null article list"); |
|||
return; |
|||
} |
|||
for (Article article : articleList) { |
|||
add(article); |
|||
} |
|||
logger.info("Added {} articles", articleList.size()); |
|||
} |
|||
|
|||
public List<Article> getAll() { |
|||
return Collections.unmodifiableList(articles); |
|||
} |
|||
|
|||
public void clear() { |
|||
int size = articles.size(); |
|||
articles.clear(); |
|||
logger.info("Cleared {} articles from repository", size); |
|||
} |
|||
|
|||
public int size() { |
|||
return articles.size(); |
|||
} |
|||
} |
|||
@ -0,0 +1,56 @@ |
|||
package com.example.datacollect.service; |
|||
|
|||
import com.example.datacollect.exception.CrawlerException; |
|||
import com.example.datacollect.exception.NetworkException; |
|||
import com.example.datacollect.model.Article; |
|||
import com.example.datacollect.strategy.CrawlStrategy; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
import java.util.List; |
|||
|
|||
public class ScraperService { |
|||
private static final Logger logger = LoggerFactory.getLogger(ScraperService.class); |
|||
|
|||
private static final int MAX_RETRY = 3; |
|||
private static final long INITIAL_DELAY_MS = 1000; |
|||
private static final double BACKOFF_MULTIPLIER = 2.0; |
|||
|
|||
public List<Article> scrapeWithRetry(CrawlStrategy strategy, String url) { |
|||
int attempt = 0; |
|||
long delay = INITIAL_DELAY_MS; |
|||
|
|||
while (attempt < MAX_RETRY) { |
|||
try { |
|||
attempt++; |
|||
logger.info("Attempt {}/{} to crawl {}", attempt, MAX_RETRY, url); |
|||
|
|||
List<Article> articles = strategy.crawl(url); |
|||
|
|||
if (attempt > 1) { |
|||
logger.info("Successfully crawled {} on attempt {}", url, attempt); |
|||
} |
|||
return articles; |
|||
|
|||
} catch (NetworkException e) { |
|||
logger.warn("Network error on attempt {} for {}: {}", attempt, url, e.getMessage()); |
|||
|
|||
if (attempt < MAX_RETRY) { |
|||
try { |
|||
logger.info("Retrying after {}ms...", delay); |
|||
Thread.sleep(delay); |
|||
delay = (long) (delay * BACKOFF_MULTIPLIER); |
|||
} catch (InterruptedException ie) { |
|||
Thread.currentThread().interrupt(); |
|||
throw new CrawlerException("Interrupted during retry wait", ie); |
|||
} |
|||
} else { |
|||
logger.error("Failed to crawl {} after {} attempts due to network errors", url, MAX_RETRY); |
|||
throw new CrawlerException("Failed to crawl " + url + " after " + MAX_RETRY + " attempts", e); |
|||
} |
|||
} |
|||
} |
|||
|
|||
throw new CrawlerException("Unexpected error: max retry attempts exhausted"); |
|||
} |
|||
} |
|||
@ -0,0 +1,93 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import com.example.datacollect.exception.NetworkException; |
|||
import com.example.datacollect.exception.ParseException; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class BlogStrategy implements CrawlStrategy { |
|||
@Override |
|||
public boolean supports(String url) { |
|||
return url != null && (url.contains("blog") || url.contains("wordpress") || url.contains("lofter") || url.contains("hexo")); |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> crawl(String url) { |
|||
List<Article> articles = new ArrayList<>(); |
|||
try { |
|||
Document doc = Jsoup.connect(url) |
|||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") |
|||
.timeout(10000) |
|||
.get(); |
|||
articles = parse(doc, url); |
|||
} catch (IOException e) { |
|||
throw new NetworkException("网络请求失败:" + e.getMessage(), e); |
|||
} catch (ParseException e) { |
|||
throw e; |
|||
} catch (Exception e) { |
|||
articles.add(new Article("爬取失败", url, "错误:" + e.getMessage(), "系统", null)); |
|||
} |
|||
return articles; |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> parse(Document doc, String url) throws ParseException { |
|||
List<Article> articles = new ArrayList<>(); |
|||
try { |
|||
if (url.contains("lofter")) { |
|||
crawlLofter(doc, url, articles); |
|||
} else if (url.contains("wordpress")) { |
|||
crawlWordpress(doc, url, articles); |
|||
} else { |
|||
crawlGenericBlog(doc, url, articles); |
|||
} |
|||
} catch (Exception e) { |
|||
throw new ParseException("解析博客网站失败:" + e.getMessage(), e); |
|||
} |
|||
return articles; |
|||
} |
|||
|
|||
private void crawlLofter(Document doc, String url, List<Article> articles) { |
|||
Elements items = doc.select(".m-post"); |
|||
for (Element item : items) { |
|||
String title = item.select(".m-post-title a").text(); |
|||
String link = item.select(".m-post-title a").attr("href"); |
|||
String author = item.select(".m-user-name").text(); |
|||
String summary = item.select(".m-post-content").text(); |
|||
if (!title.isEmpty()) { |
|||
articles.add(new Article(title, link, summary.length() > 300 ? summary.substring(0, 300) : summary, author, null)); |
|||
} |
|||
} |
|||
} |
|||
|
|||
private void crawlWordpress(Document doc, String url, List<Article> articles) { |
|||
Elements items = doc.select(".post"); |
|||
for (Element item : items) { |
|||
String title = item.select(".entry-title a").text(); |
|||
String link = item.select(".entry-title a").attr("href"); |
|||
String author = item.select(".author").text(); |
|||
String summary = item.select(".entry-summary").text(); |
|||
if (!title.isEmpty()) { |
|||
articles.add(new Article(title, link, summary.length() > 300 ? summary.substring(0, 300) : summary, author, null)); |
|||
} |
|||
} |
|||
} |
|||
|
|||
private void crawlGenericBlog(Document doc, String url, List<Article> articles) { |
|||
Elements items = doc.select(".article, .post, .blog-post"); |
|||
for (Element item : items) { |
|||
String title = item.select("h1, h2, .title").text(); |
|||
String content = item.select(".content, .post-content").text(); |
|||
if (!title.isEmpty()) { |
|||
articles.add(new Article(title, url, content.length() > 300 ? content.substring(0, 300) : content, "未知作者", null)); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,12 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import com.example.datacollect.exception.ParseException; |
|||
import org.jsoup.nodes.Document; |
|||
import java.util.List; |
|||
|
|||
/**
 * Contract for a site-family-specific crawler: decides whether it handles a URL,
 * fetches it, and turns the fetched document into articles.
 */
public interface CrawlStrategy { |
|||
/**
 * Reports whether this strategy is willing to handle {@code url}.
 */
boolean supports(String url); |
|||
/**
 * Fetches {@code url} and returns the articles extracted from it.
 */
List<Article> crawl(String url); |
|||
/**
 * Extracts articles from an already-fetched document.
 *
 * @param doc the parsed HTML document
 * @param url the address the document came from
 * @throws ParseException if extraction fails
 */
List<Article> parse(Document doc, String url) throws ParseException; |
|||
} |
|||
@ -0,0 +1,118 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import com.example.datacollect.exception.NetworkException; |
|||
import com.example.datacollect.exception.ParseException; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class NewsStrategy implements CrawlStrategy { |
|||
@Override |
|||
public boolean supports(String url) { |
|||
return url != null && (url.contains("news") || url.contains("sina") || url.contains("163") || url.contains("sohu") || url.contains("qq.com")); |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> crawl(String url) { |
|||
List<Article> articles = new ArrayList<>(); |
|||
try { |
|||
Document doc = Jsoup.connect(url) |
|||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") |
|||
.timeout(10000) |
|||
.get(); |
|||
articles = parse(doc, url); |
|||
} catch (IOException e) { |
|||
throw new NetworkException("网络请求失败:" + e.getMessage(), e); |
|||
} catch (ParseException e) { |
|||
throw e; |
|||
} catch (Exception e) { |
|||
articles.add(new Article("爬取失败", url, "错误:" + e.getMessage(), "系统", null)); |
|||
} |
|||
return articles; |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> parse(Document doc, String url) throws ParseException { |
|||
List<Article> articles = new ArrayList<>(); |
|||
try { |
|||
if (url.contains("sina")) { |
|||
crawlSina(doc, url, articles); |
|||
} else if (url.contains("163") || url.contains("netease")) { |
|||
crawlNetease(doc, url, articles); |
|||
} else if (url.contains("sohu")) { |
|||
crawlSohu(doc, url, articles); |
|||
} else if (url.contains("qq")) { |
|||
crawlQQ(doc, url, articles); |
|||
} else { |
|||
crawlGenericNews(doc, url, articles); |
|||
} |
|||
} catch (Exception e) { |
|||
throw new ParseException("解析新闻网站失败:" + e.getMessage(), e); |
|||
} |
|||
return articles; |
|||
} |
|||
|
|||
private void crawlSina(Document doc, String url, List<Article> articles) { |
|||
Elements items = doc.select(".news-item"); |
|||
for (Element item : items) { |
|||
String title = item.select("a").text(); |
|||
String link = item.select("a").attr("href"); |
|||
String summary = item.select(".news-summary").text(); |
|||
if (!title.isEmpty()) { |
|||
articles.add(new Article(title, link, summary.length() > 300 ? summary.substring(0, 300) : summary, "新浪新闻", null)); |
|||
} |
|||
} |
|||
} |
|||
|
|||
private void crawlNetease(Document doc, String url, List<Article> articles) { |
|||
Elements items = doc.select(".news-list li"); |
|||
for (Element item : items) { |
|||
String title = item.select("a").text(); |
|||
String link = item.select("a").attr("href"); |
|||
if (!link.startsWith("http")) link = "https://news.163.com" + link; |
|||
if (!title.isEmpty()) { |
|||
articles.add(new Article(title, link, "", "网易新闻", null)); |
|||
} |
|||
} |
|||
} |
|||
|
|||
private void crawlSohu(Document doc, String url, List<Article> articles) { |
|||
Elements items = doc.select(".news-item h3 a"); |
|||
for (Element item : items) { |
|||
String title = item.text(); |
|||
String link = item.attr("href"); |
|||
if (!title.isEmpty()) { |
|||
articles.add(new Article(title, link, "", "搜狐新闻", null)); |
|||
} |
|||
} |
|||
} |
|||
|
|||
private void crawlQQ(Document doc, String url, List<Article> articles) { |
|||
Elements items = doc.select(".list li a"); |
|||
for (Element item : items) { |
|||
String title = item.text(); |
|||
String link = item.attr("href"); |
|||
if (!title.isEmpty()) { |
|||
articles.add(new Article(title, link, "", "腾讯新闻", null)); |
|||
} |
|||
} |
|||
} |
|||
|
|||
private void crawlGenericNews(Document doc, String url, List<Article> articles) { |
|||
Elements items = doc.select(".news, .article-item"); |
|||
for (Element item : items) { |
|||
String title = item.select("h2, h3, .title").text(); |
|||
String link = item.select("a").attr("href"); |
|||
if (!link.startsWith("http")) link = url + link; |
|||
if (!title.isEmpty()) { |
|||
articles.add(new Article(title, link, "", "新闻网站", null)); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,27 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class StrategyFactory { |
|||
private static final List<CrawlStrategy> strategies = new ArrayList<>(); |
|||
|
|||
static { |
|||
strategies.add(new BlogStrategy()); |
|||
strategies.add(new NewsStrategy()); |
|||
strategies.add(new TechStrategy()); |
|||
} |
|||
|
|||
public static CrawlStrategy getStrategy(String url) { |
|||
for (CrawlStrategy strategy : strategies) { |
|||
if (strategy.supports(url)) { |
|||
return strategy; |
|||
} |
|||
} |
|||
return null; |
|||
} |
|||
|
|||
public static List<CrawlStrategy> getAllStrategies() { |
|||
return new ArrayList<>(strategies); |
|||
} |
|||
} |
|||
@ -0,0 +1,105 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import com.example.datacollect.exception.NetworkException; |
|||
import com.example.datacollect.exception.ParseException; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class TechStrategy implements CrawlStrategy { |
|||
@Override |
|||
public boolean supports(String url) { |
|||
return url != null && (url.contains("csdn") || url.contains("oschina") || url.contains("iteye") || url.contains("cnblogs")); |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> crawl(String url) { |
|||
List<Article> articles = new ArrayList<>(); |
|||
try { |
|||
Document doc = Jsoup.connect(url) |
|||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") |
|||
.timeout(10000) |
|||
.get(); |
|||
articles = parse(doc, url); |
|||
} catch (IOException e) { |
|||
throw new NetworkException("网络请求失败:" + e.getMessage(), e); |
|||
} catch (ParseException e) { |
|||
throw e; |
|||
} catch (Exception e) { |
|||
articles.add(new Article("爬取失败", url, "错误:" + e.getMessage(), "系统", null)); |
|||
} |
|||
return articles; |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> parse(Document doc, String url) throws ParseException { |
|||
List<Article> articles = new ArrayList<>(); |
|||
try { |
|||
if (url.contains("csdn")) { |
|||
crawlCsdn(doc, url, articles); |
|||
} else if (url.contains("cnblogs")) { |
|||
crawlCnblogs(doc, url, articles); |
|||
} else if (url.contains("oschina")) { |
|||
crawlOschina(doc, url, articles); |
|||
} else { |
|||
crawlGeneric(doc, url, articles); |
|||
} |
|||
} catch (Exception e) { |
|||
throw new ParseException("解析技术网站失败:" + e.getMessage(), e); |
|||
} |
|||
return articles; |
|||
} |
|||
|
|||
private void crawlCsdn(Document doc, String url, List<Article> articles) { |
|||
Elements items = doc.select(".article-item-box"); |
|||
for (Element item : items) { |
|||
String title = item.select("h4 a").text(); |
|||
String link = item.select("h4 a").attr("href"); |
|||
String author = item.select(".name").text(); |
|||
String summary = item.select(".content").text(); |
|||
if (!title.isEmpty()) { |
|||
articles.add(new Article(title, link, summary, author, null)); |
|||
} |
|||
} |
|||
} |
|||
|
|||
private void crawlCnblogs(Document doc, String url, List<Article> articles) { |
|||
Elements items = doc.select(".post-item"); |
|||
for (Element item : items) { |
|||
String title = item.select(".post-item-title a").text(); |
|||
String link = item.select(".post-item-title a").attr("href"); |
|||
String author = item.select(".post-item-author a").text(); |
|||
String summary = item.select(".post-item-summary").text(); |
|||
if (!title.isEmpty()) { |
|||
articles.add(new Article(title, link, summary, author, null)); |
|||
} |
|||
} |
|||
} |
|||
|
|||
private void crawlOschina(Document doc, String url, List<Article> articles) { |
|||
Elements items = doc.select(".news-list .news-item"); |
|||
for (Element item : items) { |
|||
String title = item.select(".title a").text(); |
|||
String link = "https://www.oschina.net" + item.select(".title a").attr("href"); |
|||
String author = item.select(".author").text(); |
|||
String summary = item.select(".description").text(); |
|||
if (!title.isEmpty()) { |
|||
articles.add(new Article(title, link, summary, author, null)); |
|||
} |
|||
} |
|||
} |
|||
|
|||
private void crawlGeneric(Document doc, String url, List<Article> articles) { |
|||
String title = doc.title(); |
|||
String content = doc.select("article, .article-content, .post-content").text(); |
|||
if (!title.isEmpty()) { |
|||
articles.add(new Article(title, url, content.length() > 500 ? content.substring(0, 500) : content, "未知", null)); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,53 @@ |
|||
package com.example.datacollect.view; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
import java.util.List; |
|||
import java.util.Scanner; |
|||
|
|||
public class ConsoleView { |
|||
private static final Logger logger = LoggerFactory.getLogger(ConsoleView.class); |
|||
private static final String ANSI_RESET = "\u001B[0m"; |
|||
private static final String ANSI_GREEN = "\u001B[32m"; |
|||
private static final String ANSI_RED = "\u001B[31m"; |
|||
private static final String ANSI_BLUE = "\u001B[34m"; |
|||
|
|||
private final Scanner scanner = new Scanner(System.in); |
|||
|
|||
public String readLine() { |
|||
System.out.print("> "); |
|||
String line = scanner.nextLine(); |
|||
logger.debug("User input: {}", line); |
|||
return line; |
|||
} |
|||
|
|||
public void printSuccess(String msg) { |
|||
logger.info("Success: {}", msg); |
|||
System.out.println(ANSI_GREEN + msg + ANSI_RESET); |
|||
} |
|||
|
|||
public void printError(String msg) { |
|||
logger.error("Error: {}", msg); |
|||
System.out.println(ANSI_RED + msg + ANSI_RESET); |
|||
} |
|||
|
|||
public void printInfo(String msg) { |
|||
logger.debug("Info: {}", msg); |
|||
System.out.println(ANSI_BLUE + msg + ANSI_RESET); |
|||
} |
|||
|
|||
public void display(List<Article> articles) { |
|||
if (articles.isEmpty()) { |
|||
logger.info("No articles to display"); |
|||
printInfo("暂无文章,请先执行 crawl。"); |
|||
return; |
|||
} |
|||
logger.info("Displaying {} articles", articles.size()); |
|||
for (int i = 0; i < articles.size(); i++) { |
|||
Article a = articles.get(i); |
|||
System.out.println((i + 1) + ". " + a.getTitle() + " | " + a.getUrl()); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,22 @@ |
|||
<configuration> |
|||
<!-- Console appender: time-of-day timestamp, thread, level, logger name, message. -->
<appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender"> |
|||
<encoder> |
|||
<pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> |
|||
</encoder> |
|||
</appender> |
|||
|
|||
<!-- File appender: writes to logs/crawler.log with a full date in the timestamp. -->
<appender name="FILE" class="ch.qos.logback.core.FileAppender"> |
|||
<file>logs/crawler.log</file> |
|||
<encoder> |
|||
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> |
|||
</encoder> |
|||
</appender> |
|||
|
|||
<!-- Root logger: INFO threshold, output sent to both appenders. -->
<root level="INFO"> |
|||
<appender-ref ref="CONSOLE" /> |
|||
<appender-ref ref="FILE" /> |
|||
</root> |
|||
|
|||
<!-- Per-package overrides: application code logs at DEBUG, jsoup only at WARN and above. -->
<logger name="com.example.datacollect" level="DEBUG" /> |
|||
<logger name="org.jsoup" level="WARN" /> |
|||
</configuration> |
|||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,22 @@ |
|||
<configuration> |
|||
<!-- Console appender: time-of-day timestamp, thread, level, logger name, message. -->
<appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender"> |
|||
<encoder> |
|||
<pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> |
|||
</encoder> |
|||
</appender> |
|||
|
|||
<!-- File appender: writes to logs/crawler.log with a full date in the timestamp. -->
<appender name="FILE" class="ch.qos.logback.core.FileAppender"> |
|||
<file>logs/crawler.log</file> |
|||
<encoder> |
|||
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> |
|||
</encoder> |
|||
</appender> |
|||
|
|||
<!-- Root logger: INFO threshold, output sent to both appenders. -->
<root level="INFO"> |
|||
<appender-ref ref="CONSOLE" /> |
|||
<appender-ref ref="FILE" /> |
|||
</root> |
|||
|
|||
<!-- Per-package overrides: application code logs at DEBUG, jsoup only at WARN and above. -->
<logger name="com.example.datacollect" level="DEBUG" /> |
|||
<logger name="org.jsoup" level="WARN" /> |
|||
</configuration> |
|||
@ -0,0 +1,22 @@ |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\model\Article.java |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\exception\CrawlerException.java |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\strategy\TechStrategy.java |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\repository\ArticleRepository.java |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\Main.java |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\strategy\BlogStrategy.java |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\Command.java |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\ExitCommand.java |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\HelpCommand.java |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\CrawlCommand.java |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\HistoryCommand.java |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\strategy\NewsStrategy.java |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\exception\NetworkException.java |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\SaveCommand.java |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\controller\CrawlerController.java |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\strategy\CrawlStrategy.java |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\ListCommand.java |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\service\ScraperService.java |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\exception\ParseException.java |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\view\ConsoleView.java |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\strategy\StrategyFactory.java |
|||
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\AnalyzeCommand.java |
|||
Loading…
Reference in new issue