39 changed files with 1447 additions and 0 deletions
Binary file not shown.
Binary file not shown.
@ -0,0 +1,4 @@ |
|||||
|
*.jar |
||||
|
*.jar |
||||
|
*.class |
||||
|
*.log |
||||
@ -0,0 +1,17 @@ |
|||||
|
# DataCollect 教学项目 — 最小可运行版本 |
||||
|
|
||||
|
这是一个最小可用的 Java CLI 演示工程,目标:打印帮助信息以验证运行环境。 |
||||
|
|
||||
|
构建: |
||||
|
```bash |
||||
|
mvn -q package |
||||
|
``` |
||||
|
|
||||
|
运行(示例): |
||||
|
```bash |
||||
|
java -jar target/datacollect-cli-0.1.0-jar-with-dependencies.jar --help |
||||
|
``` |
||||
|
|
||||
|
项目结构(最小): |
||||
|
- `src/main/java/com/example/datacollect/Main.java` — CLI 入口,启动交互式命令循环(输入 `help` 查看可用命令) |
||||
|
- `pom.xml` — Maven 构建配置,生成可执行 jar |
||||
@ -0,0 +1,69 @@ |
|||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" |
||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> |
||||
|
<modelVersion>4.0.0</modelVersion> |
||||
|
<groupId>com.example</groupId> |
||||
|
<artifactId>datacollect-cli</artifactId> |
||||
|
<version>0.1.0</version> |
||||
|
<properties>
    <maven.compiler.source>11</maven.compiler.source>
    <maven.compiler.target>11</maven.compiler.target>
    <!-- The sources contain non-ASCII (Chinese) string literals; pin the source encoding
         so the build does not depend on the platform default charset. -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
||||
|
|
||||
|
<dependencies> |
||||
|
<dependency> |
||||
|
<groupId>org.jsoup</groupId> |
||||
|
<artifactId>jsoup</artifactId> |
||||
|
<version>1.17.2</version> |
||||
|
</dependency> |
||||
|
<dependency> |
||||
|
<groupId>ch.qos.logback</groupId> |
||||
|
<artifactId>logback-classic</artifactId> |
||||
|
<version>1.4.14</version> |
||||
|
</dependency> |
||||
|
<dependency> |
||||
|
<groupId>com.fasterxml.jackson.core</groupId> |
||||
|
<artifactId>jackson-databind</artifactId> |
||||
|
<version>2.15.2</version> |
||||
|
</dependency> |
||||
|
<dependency> |
||||
|
<groupId>com.fasterxml.jackson.datatype</groupId> |
||||
|
<artifactId>jackson-datatype-jsr310</artifactId> |
||||
|
<version>2.15.2</version> |
||||
|
</dependency> |
||||
|
</dependencies> |
||||
|
|
||||
|
<build> |
||||
|
<plugins> |
||||
|
<plugin> |
||||
|
<groupId>org.apache.maven.plugins</groupId> |
||||
|
<artifactId>maven-compiler-plugin</artifactId> |
||||
|
<version>3.8.1</version> |
||||
|
</plugin> |
||||
|
<plugin> |
||||
|
<groupId>org.apache.maven.plugins</groupId> |
||||
|
<artifactId>maven-assembly-plugin</artifactId> |
||||
|
<version>3.3.0</version> |
||||
|
<configuration> |
||||
|
<archive> |
||||
|
<manifest> |
||||
|
<mainClass>com.example.datacollect.Main</mainClass> |
||||
|
</manifest> |
||||
|
</archive> |
||||
|
<descriptorRefs> |
||||
|
<descriptorRef>jar-with-dependencies</descriptorRef> |
||||
|
</descriptorRefs> |
||||
|
</configuration> |
||||
|
<executions> |
||||
|
<execution> |
||||
|
<id>make-assembly</id> |
||||
|
<phase>package</phase> |
||||
|
<goals> |
||||
|
<goal>single</goal> |
||||
|
</goals> |
||||
|
</execution> |
||||
|
</executions> |
||||
|
</plugin> |
||||
|
</plugins> |
||||
|
</build> |
||||
|
</project> |
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,31 @@ |
|||||
|
package com.example.datacollect; |
||||
|
|
||||
|
import com.example.datacollect.controller.CrawlerController; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
public class Main { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(Main.class); |
||||
|
|
||||
|
public static void main(String[] args) { |
||||
|
logger.info("Starting CLI Crawler application"); |
||||
|
|
||||
|
ConsoleView view = new ConsoleView(); |
||||
|
ArticleRepository repository = new ArticleRepository(); |
||||
|
CrawlerController controller = new CrawlerController(view, repository); |
||||
|
|
||||
|
view.printSuccess("Welcome to CLI Crawler (W11)! Type help for commands."); |
||||
|
logger.info("Application started successfully"); |
||||
|
|
||||
|
while (true) { |
||||
|
try { |
||||
|
controller.handle(view.readLine()); |
||||
|
} catch (Exception e) { |
||||
|
logger.error("Error processing command", e); |
||||
|
view.printError("Error: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,130 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.strategy.CrawlStrategy; |
||||
|
import com.example.datacollect.strategy.StrategyFactory; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
public class AnalyzeCommand implements Command { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(AnalyzeCommand.class); |
||||
|
private final ConsoleView view; |
||||
|
|
||||
|
public AnalyzeCommand(ConsoleView view) { |
||||
|
this.view = view; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "analyze"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
logger.debug("Executing analyze command"); |
||||
|
|
||||
|
if (args.length >= 2) { |
||||
|
String url = args[1]; |
||||
|
analyzeUrl(url); |
||||
|
} else { |
||||
|
analyzeRepository(repository); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void analyzeUrl(String url) { |
||||
|
logger.info("Analyzing URL: {}", url); |
||||
|
CrawlStrategy strategy = StrategyFactory.getStrategy(url); |
||||
|
|
||||
|
if (strategy == null) { |
||||
|
logger.error("No strategy found for URL: {}", url); |
||||
|
view.printError("No strategy found for URL: " + url); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
view.printInfo("Analyzing URL: " + url); |
||||
|
view.printInfo("Using strategy: " + strategy.getClass().getSimpleName()); |
||||
|
|
||||
|
List<Article> articles = strategy.crawl(url); |
||||
|
|
||||
|
printAnalysis(articles); |
||||
|
logger.info("Analysis completed for URL: {}", url); |
||||
|
view.printInfo("Note: Analysis results are NOT stored."); |
||||
|
} |
||||
|
|
||||
|
private void analyzeRepository(ArticleRepository repository) { |
||||
|
List<Article> articles = repository.getAll(); |
||||
|
|
||||
|
if (articles.isEmpty()) { |
||||
|
logger.info("No articles to analyze"); |
||||
|
view.printInfo("No articles to analyze. Use 'analyze <url>' to analyze a URL without storing."); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
logger.info("Analyzing {} articles from repository", articles.size()); |
||||
|
view.printInfo("Analyzing " + articles.size() + " articles from repository:"); |
||||
|
printAnalysis(articles); |
||||
|
} |
||||
|
|
||||
|
private void printAnalysis(List<Article> articles) { |
||||
|
if (articles.isEmpty()) { |
||||
|
logger.info("No articles found for analysis"); |
||||
|
view.printInfo("No articles found."); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
int totalArticles = articles.size(); |
||||
|
int totalContentLength = 0; |
||||
|
int articlesWithAuthor = 0; |
||||
|
int articlesWithDate = 0; |
||||
|
|
||||
|
for (Article article : articles) { |
||||
|
if (article.getContent() != null) { |
||||
|
totalContentLength += article.getContent().length(); |
||||
|
} |
||||
|
if (article.getAuthor() != null && !article.getAuthor().isEmpty()) { |
||||
|
articlesWithAuthor++; |
||||
|
} |
||||
|
if (article.getPublishDate() != null) { |
||||
|
articlesWithDate++; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
double avgContentLength = totalArticles > 0 ? (double) totalContentLength / totalArticles : 0; |
||||
|
|
||||
|
logger.info("Analysis results: {} articles, {} avg length", totalArticles, avgContentLength); |
||||
|
view.printInfo("=== Analysis Results ==="); |
||||
|
view.printInfo("Total articles: " + totalArticles); |
||||
|
view.printInfo("Total content length: " + totalContentLength); |
||||
|
view.printInfo("Average content length: " + String.format("%.2f", avgContentLength)); |
||||
|
view.printInfo("Articles with author: " + articlesWithAuthor); |
||||
|
view.printInfo("Articles with publish date: " + articlesWithDate); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName1() { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'getName1'"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, List<Article> articles) { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'execute'"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute1(String[] args, List<Article> articles) { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute1(String[] args, ArticleRepository repository) { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,15 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
public interface Command { |
||||
|
String getName1(); |
||||
|
void execute(String[] args, List<Article> articles); |
||||
|
void execute(String[] args, ArticleRepository repository); |
||||
|
String getName(); |
||||
|
void execute1(String[] args, List<Article> articles); |
||||
|
void execute1(String[] args, ArticleRepository repository); |
||||
|
} |
||||
@ -0,0 +1,85 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.service.ScraperService; |
||||
|
import com.example.datacollect.strategy.CrawlStrategy; |
||||
|
import com.example.datacollect.strategy.StrategyFactory; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
public class CrawlCommand implements Command { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class); |
||||
|
private final ConsoleView view; |
||||
|
private final ScraperService scraperService; |
||||
|
|
||||
|
public CrawlCommand(ConsoleView view) { |
||||
|
this.view = view; |
||||
|
this.scraperService = new ScraperService(); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "crawl"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
if (args.length < 2) { |
||||
|
logger.warn("Missing URL argument"); |
||||
|
view.printError("Usage: crawl <url>"); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
String url = args[1]; |
||||
|
logger.info("Crawl started for: {}", url); |
||||
|
|
||||
|
CrawlStrategy strategy = StrategyFactory.getStrategy(url); |
||||
|
|
||||
|
if (strategy == null) { |
||||
|
logger.error("No strategy found for URL: {}", url); |
||||
|
view.printError("No strategy found for URL: " + url); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
logger.info("Using strategy: {}", strategy.getClass().getSimpleName()); |
||||
|
view.printInfo("Crawling " + url + " with strategy: " + strategy.getClass().getSimpleName()); |
||||
|
|
||||
|
try { |
||||
|
List<Article> articles = scraperService.scrapeWithRetry(strategy, url); |
||||
|
repository.addAll(articles); |
||||
|
logger.info("Crawled {} articles successfully", articles.size()); |
||||
|
view.printSuccess("Crawled " + articles.size() + " articles"); |
||||
|
} catch (Exception e) { |
||||
|
logger.error("Error crawling URL: {}", url, e); |
||||
|
view.printError("Error: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName1() { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'getName1'"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, List<Article> articles) { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'execute'"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute1(String[] args, List<Article> articles) { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute1(String[] args, ArticleRepository repository) { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,55 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
public class ExitCommand implements Command { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(ExitCommand.class); |
||||
|
private final ConsoleView view; |
||||
|
|
||||
|
public ExitCommand(ConsoleView view) { |
||||
|
this.view = view; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "exit"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
logger.info("User requested exit"); |
||||
|
view.printSuccess("Goodbye!"); |
||||
|
System.exit(0); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, List<Article> articles) { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'execute'"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName1() { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'getName1'"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute1(String[] args, List<Article> articles) { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute1(String[] args, ArticleRepository repository) { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,61 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
public class HelpCommand implements Command { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(HelpCommand.class); |
||||
|
private final ConsoleView view; |
||||
|
|
||||
|
public HelpCommand(ConsoleView view) { |
||||
|
this.view = view; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "help"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
logger.debug("Executing help command"); |
||||
|
view.printInfo("Commands:"); |
||||
|
view.printInfo(" crawl <url> - 爬取指定 URL 的文章"); |
||||
|
view.printInfo(" list - 列出已爬取的文章"); |
||||
|
view.printInfo(" analyze - 分析文章统计信息"); |
||||
|
view.printInfo(" history - 显示命令历史记录"); |
||||
|
view.printInfo(" save [file] - 保存文章到 JSON 文件(默认 articles.json)"); |
||||
|
view.printInfo(" help - 显示此帮助信息"); |
||||
|
view.printInfo(" exit - 退出程序"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName1() { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'getName1'"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, List<Article> articles) { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'execute'"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute1(String[] args, List<Article> articles) { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute1(String[] args, ArticleRepository repository) { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,80 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class HistoryCommand implements Command { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(HistoryCommand.class); |
||||
|
private static final List<String> commandHistory = new ArrayList<>(); |
||||
|
private final ConsoleView view; |
||||
|
|
||||
|
public HistoryCommand(ConsoleView view) { |
||||
|
this.view = view; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "history"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
logger.debug("Executing history command"); |
||||
|
|
||||
|
if (commandHistory.isEmpty()) { |
||||
|
logger.info("Command history is empty"); |
||||
|
view.printInfo("No command history."); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
logger.info("Showing {} command history items", commandHistory.size()); |
||||
|
view.printInfo("Command History:"); |
||||
|
for (int i = 0; i < commandHistory.size(); i++) { |
||||
|
view.printInfo((i + 1) + ". " + commandHistory.get(i)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static void addCommand(String command) { |
||||
|
commandHistory.add(command); |
||||
|
logger.debug("Command added to history: {}", command); |
||||
|
} |
||||
|
|
||||
|
public static List<String> getCommandHistory() { |
||||
|
return new ArrayList<>(commandHistory); |
||||
|
} |
||||
|
|
||||
|
public static void clearHistory() { |
||||
|
commandHistory.clear(); |
||||
|
logger.info("Command history cleared"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName1() { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'getName1'"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, List<Article> articles) { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'execute'"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute1(String[] args, List<Article> articles) { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute1(String[] args, ArticleRepository repository) { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,65 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
public class ListCommand implements Command { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(ListCommand.class); |
||||
|
private final ConsoleView view; |
||||
|
|
||||
|
public ListCommand(ConsoleView view) { |
||||
|
this.view = view; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "list"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
logger.debug("Executing list command"); |
||||
|
List<Article> articles = repository.getAll(); |
||||
|
|
||||
|
if (articles.isEmpty()) { |
||||
|
logger.info("No articles found"); |
||||
|
view.printInfo("No articles yet. Use 'crawl <url>' to get started."); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
logger.info("Listing {} articles", articles.size()); |
||||
|
view.printInfo("=== Articles (" + articles.size() + ") ==="); |
||||
|
for (Article article : articles) { |
||||
|
view.printInfo(article.toString()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName1() { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'getName1'"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, List<Article> articles) { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'execute'"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute1(String[] args, List<Article> articles) { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute1(String[] args, ArticleRepository repository) { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,85 @@ |
|||||
|
package com.example.datacollect.command; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import com.fasterxml.jackson.databind.ObjectMapper; |
||||
|
import com.fasterxml.jackson.databind.SerializationFeature; |
||||
|
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.io.File; |
||||
|
import java.io.IOException; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class SaveCommand implements Command { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(SaveCommand.class); |
||||
|
private final ConsoleView view; |
||||
|
private final ObjectMapper objectMapper; |
||||
|
|
||||
|
public SaveCommand(ConsoleView view) { |
||||
|
this.view = view; |
||||
|
this.objectMapper = new ObjectMapper(); |
||||
|
this.objectMapper.registerModule(new JavaTimeModule()); |
||||
|
this.objectMapper.enable(SerializationFeature.INDENT_OUTPUT); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "save"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, ArticleRepository repository) { |
||||
|
logger.debug("Executing save command"); |
||||
|
|
||||
|
String fileName = args.length >= 2 ? args[1] : "articles.json"; |
||||
|
|
||||
|
if (!fileName.endsWith(".json")) { |
||||
|
fileName = fileName + ".json"; |
||||
|
} |
||||
|
|
||||
|
List<Article> articles = repository.getAll(); |
||||
|
|
||||
|
if (articles.isEmpty()) { |
||||
|
logger.warn("No articles to save"); |
||||
|
view.printError("No articles to save. Use 'crawl <url>' first."); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
try { |
||||
|
File file = new File(fileName); |
||||
|
objectMapper.writeValue(file, articles); |
||||
|
logger.info("Successfully saved {} articles to {}", articles.size(), fileName); |
||||
|
view.printSuccess("Saved " + articles.size() + " articles to " + fileName); |
||||
|
} catch (IOException e) { |
||||
|
logger.error("Failed to save articles to {}", fileName, e); |
||||
|
view.printError("Error saving articles: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName1() { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'getName1'"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(String[] args, List<Article> articles) { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'execute'"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute1(String[] args, List<Article> articles) { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute1(String[] args, ArticleRepository repository) { |
||||
|
// TODO Auto-generated method stub
|
||||
|
throw new UnsupportedOperationException("Unimplemented method 'execute1'"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,60 @@ |
|||||
|
package com.example.datacollect.controller; |
||||
|
|
||||
|
import com.example.datacollect.command.*; |
||||
|
import com.example.datacollect.repository.ArticleRepository; |
||||
|
import com.example.datacollect.view.ConsoleView; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.HashMap; |
||||
|
import java.util.Map; |
||||
|
|
||||
|
public class CrawlerController { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(CrawlerController.class); |
||||
|
private final Map<String, Command> commands = new HashMap<>(); |
||||
|
private final ConsoleView view; |
||||
|
private final ArticleRepository repository; |
||||
|
|
||||
|
public CrawlerController(ConsoleView view, ArticleRepository repository) { |
||||
|
this.view = view; |
||||
|
this.repository = repository; |
||||
|
logger.info("Initializing CrawlerController with {} commands", 7); |
||||
|
register(new HelpCommand(view)); |
||||
|
register(new ListCommand(view)); |
||||
|
register(new CrawlCommand(view)); |
||||
|
register(new ExitCommand(view)); |
||||
|
register(new HistoryCommand(view)); |
||||
|
register(new AnalyzeCommand(view)); |
||||
|
register(new SaveCommand(view)); |
||||
|
logger.info("CrawlerController initialized successfully"); |
||||
|
} |
||||
|
|
||||
|
private void register(Command command) { |
||||
|
commands.put(command.getName(), command); |
||||
|
logger.debug("Registered command: {}", command.getName()); |
||||
|
} |
||||
|
|
||||
|
public void handle(String input) { |
||||
|
String text = input == null ? "" : input.trim(); |
||||
|
if (text.isEmpty()) { |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
logger.debug("Handling input: {}", text); |
||||
|
|
||||
|
// 记录命令历史
|
||||
|
HistoryCommand.addCommand(text); |
||||
|
|
||||
|
String[] args = text.split("\\s+"); |
||||
|
String cmdName = args[0].toLowerCase(); |
||||
|
Command command = commands.get(cmdName); |
||||
|
if (command == null) { |
||||
|
logger.warn("Unknown command: {}", cmdName); |
||||
|
view.printError("Unknown command: " + cmdName); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
logger.info("Executing command: {}", cmdName); |
||||
|
command.execute(args, repository); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,11 @@ |
|||||
|
package com.example.datacollect.exception; |
||||
|
|
||||
|
/**
 * Unchecked base exception for failures raised by the crawler.
 */
public class CrawlerException extends RuntimeException {

    /**
     * Creates an exception that wraps an underlying failure.
     *
     * @param message description of what went wrong
     * @param cause the failure that triggered this exception
     */
    public CrawlerException(final String message, final Throwable cause) {
        super(message, cause);
    }

    /**
     * Creates an exception without a cause.
     *
     * @param message description of what went wrong
     */
    public CrawlerException(final String message) {
        super(message);
    }
}
||||
@ -0,0 +1,11 @@ |
|||||
|
package com.example.datacollect.exception; |
||||
|
|
||||
|
public class NetworkException extends CrawlerException { |
||||
|
public NetworkException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
|
||||
|
public NetworkException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,11 @@ |
|||||
|
package com.example.datacollect.exception; |
||||
|
|
||||
|
public class ParseException extends CrawlerException { |
||||
|
public ParseException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
|
||||
|
public ParseException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,75 @@ |
|||||
|
package com.example.datacollect.model; |
||||
|
|
||||
|
import java.time.LocalDate; |
||||
|
|
||||
|
/**
 * Mutable data holder for one crawled article: title, URL, content, and an optional author
 * and publish date.
 */
public class Article {
    private String title;
    private String url;
    private String content;
    private String author;
    private LocalDate publishDate;

    /**
     * Creates an article whose author and publish date are {@code null}.
     *
     * @param title article title
     * @param url address of the article
     * @param content article body text
     */
    public Article(String title, String url, String content) {
        this(title, url, content, null, null);
    }

    /**
     * Creates a fully populated article.
     *
     * @param title article title
     * @param url address of the article
     * @param content article body text
     * @param author author name; may be {@code null}
     * @param publishDate publication date; may be {@code null}
     */
    public Article(String title, String url, String content, String author, LocalDate publishDate) {
        this.title = title;
        this.url = url;
        this.content = content;
        this.author = author;
        this.publishDate = publishDate;
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getContent() {
        return this.content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getAuthor() {
        return this.author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public LocalDate getPublishDate() {
        return this.publishDate;
    }

    public void setPublishDate(LocalDate publishDate) {
        this.publishDate = publishDate;
    }

    /**
     * Renders title, URL, author and publish date; the content is deliberately left out.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Article{");
        text.append("title='").append(title).append('\'');
        text.append(", url='").append(url).append('\'');
        text.append(", author='").append(author).append('\'');
        text.append(", publishDate=").append(publishDate);
        text.append('}');
        return text.toString();
    }
}
||||
@ -0,0 +1,52 @@ |
|||||
|
package com.example.datacollect.repository; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.Collections; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class ArticleRepository { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(ArticleRepository.class); |
||||
|
private final List<Article> articles = new ArrayList<>(); |
||||
|
|
||||
|
public void add(Article article) { |
||||
|
if (article == null) { |
||||
|
logger.warn("Attempted to add null article"); |
||||
|
return; |
||||
|
} |
||||
|
if (article.getTitle() == null || article.getTitle().isEmpty()) { |
||||
|
logger.warn("Attempted to add article with empty title"); |
||||
|
return; |
||||
|
} |
||||
|
articles.add(article); |
||||
|
logger.debug("Added article: {}", article.getTitle()); |
||||
|
} |
||||
|
|
||||
|
public void addAll(List<Article> articleList) { |
||||
|
if (articleList == null) { |
||||
|
logger.warn("Attempted to add null article list"); |
||||
|
return; |
||||
|
} |
||||
|
for (Article article : articleList) { |
||||
|
add(article); |
||||
|
} |
||||
|
logger.info("Added {} articles", articleList.size()); |
||||
|
} |
||||
|
|
||||
|
public List<Article> getAll() { |
||||
|
return Collections.unmodifiableList(articles); |
||||
|
} |
||||
|
|
||||
|
public void clear() { |
||||
|
int size = articles.size(); |
||||
|
articles.clear(); |
||||
|
logger.info("Cleared {} articles from repository", size); |
||||
|
} |
||||
|
|
||||
|
public int size() { |
||||
|
return articles.size(); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,56 @@ |
|||||
|
package com.example.datacollect.service; |
||||
|
|
||||
|
import com.example.datacollect.exception.CrawlerException; |
||||
|
import com.example.datacollect.exception.NetworkException; |
||||
|
import com.example.datacollect.model.Article; |
||||
|
import com.example.datacollect.strategy.CrawlStrategy; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
public class ScraperService { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(ScraperService.class); |
||||
|
|
||||
|
private static final int MAX_RETRY = 3; |
||||
|
private static final long INITIAL_DELAY_MS = 1000; |
||||
|
private static final double BACKOFF_MULTIPLIER = 2.0; |
||||
|
|
||||
|
public List<Article> scrapeWithRetry(CrawlStrategy strategy, String url) { |
||||
|
int attempt = 0; |
||||
|
long delay = INITIAL_DELAY_MS; |
||||
|
|
||||
|
while (attempt < MAX_RETRY) { |
||||
|
try { |
||||
|
attempt++; |
||||
|
logger.info("Attempt {}/{} to crawl {}", attempt, MAX_RETRY, url); |
||||
|
|
||||
|
List<Article> articles = strategy.crawl(url); |
||||
|
|
||||
|
if (attempt > 1) { |
||||
|
logger.info("Successfully crawled {} on attempt {}", url, attempt); |
||||
|
} |
||||
|
return articles; |
||||
|
|
||||
|
} catch (NetworkException e) { |
||||
|
logger.warn("Network error on attempt {} for {}: {}", attempt, url, e.getMessage()); |
||||
|
|
||||
|
if (attempt < MAX_RETRY) { |
||||
|
try { |
||||
|
logger.info("Retrying after {}ms...", delay); |
||||
|
Thread.sleep(delay); |
||||
|
delay = (long) (delay * BACKOFF_MULTIPLIER); |
||||
|
} catch (InterruptedException ie) { |
||||
|
Thread.currentThread().interrupt(); |
||||
|
throw new CrawlerException("Interrupted during retry wait", ie); |
||||
|
} |
||||
|
} else { |
||||
|
logger.error("Failed to crawl {} after {} attempts due to network errors", url, MAX_RETRY); |
||||
|
throw new CrawlerException("Failed to crawl " + url + " after " + MAX_RETRY + " attempts", e); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
throw new CrawlerException("Unexpected error: max retry attempts exhausted"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,93 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import com.example.datacollect.exception.NetworkException; |
||||
|
import com.example.datacollect.exception.ParseException; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class BlogStrategy implements CrawlStrategy { |
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
return url != null && (url.contains("blog") || url.contains("wordpress") || url.contains("lofter") || url.contains("hexo")); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> crawl(String url) { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
try { |
||||
|
Document doc = Jsoup.connect(url) |
||||
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") |
||||
|
.timeout(10000) |
||||
|
.get(); |
||||
|
articles = parse(doc, url); |
||||
|
} catch (IOException e) { |
||||
|
throw new NetworkException("网络请求失败:" + e.getMessage(), e); |
||||
|
} catch (ParseException e) { |
||||
|
throw e; |
||||
|
} catch (Exception e) { |
||||
|
articles.add(new Article("爬取失败", url, "错误:" + e.getMessage(), "系统", null)); |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(Document doc, String url) throws ParseException { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
try { |
||||
|
if (url.contains("lofter")) { |
||||
|
crawlLofter(doc, url, articles); |
||||
|
} else if (url.contains("wordpress")) { |
||||
|
crawlWordpress(doc, url, articles); |
||||
|
} else { |
||||
|
crawlGenericBlog(doc, url, articles); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
throw new ParseException("解析博客网站失败:" + e.getMessage(), e); |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
private void crawlLofter(Document doc, String url, List<Article> articles) { |
||||
|
Elements items = doc.select(".m-post"); |
||||
|
for (Element item : items) { |
||||
|
String title = item.select(".m-post-title a").text(); |
||||
|
String link = item.select(".m-post-title a").attr("href"); |
||||
|
String author = item.select(".m-user-name").text(); |
||||
|
String summary = item.select(".m-post-content").text(); |
||||
|
if (!title.isEmpty()) { |
||||
|
articles.add(new Article(title, link, summary.length() > 300 ? summary.substring(0, 300) : summary, author, null)); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void crawlWordpress(Document doc, String url, List<Article> articles) { |
||||
|
Elements items = doc.select(".post"); |
||||
|
for (Element item : items) { |
||||
|
String title = item.select(".entry-title a").text(); |
||||
|
String link = item.select(".entry-title a").attr("href"); |
||||
|
String author = item.select(".author").text(); |
||||
|
String summary = item.select(".entry-summary").text(); |
||||
|
if (!title.isEmpty()) { |
||||
|
articles.add(new Article(title, link, summary.length() > 300 ? summary.substring(0, 300) : summary, author, null)); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void crawlGenericBlog(Document doc, String url, List<Article> articles) { |
||||
|
Elements items = doc.select(".article, .post, .blog-post"); |
||||
|
for (Element item : items) { |
||||
|
String title = item.select("h1, h2, .title").text(); |
||||
|
String content = item.select(".content, .post-content").text(); |
||||
|
if (!title.isEmpty()) { |
||||
|
articles.add(new Article(title, url, content.length() > 300 ? content.substring(0, 300) : content, "未知作者", null)); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,12 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import com.example.datacollect.exception.ParseException; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public interface CrawlStrategy { |
||||
|
boolean supports(String url); |
||||
|
List<Article> crawl(String url); |
||||
|
List<Article> parse(Document doc, String url) throws ParseException; |
||||
|
} |
||||
@ -0,0 +1,118 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import com.example.datacollect.exception.NetworkException; |
||||
|
import com.example.datacollect.exception.ParseException; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class NewsStrategy implements CrawlStrategy { |
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
return url != null && (url.contains("news") || url.contains("sina") || url.contains("163") || url.contains("sohu") || url.contains("qq.com")); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> crawl(String url) { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
try { |
||||
|
Document doc = Jsoup.connect(url) |
||||
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") |
||||
|
.timeout(10000) |
||||
|
.get(); |
||||
|
articles = parse(doc, url); |
||||
|
} catch (IOException e) { |
||||
|
throw new NetworkException("网络请求失败:" + e.getMessage(), e); |
||||
|
} catch (ParseException e) { |
||||
|
throw e; |
||||
|
} catch (Exception e) { |
||||
|
articles.add(new Article("爬取失败", url, "错误:" + e.getMessage(), "系统", null)); |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(Document doc, String url) throws ParseException { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
try { |
||||
|
if (url.contains("sina")) { |
||||
|
crawlSina(doc, url, articles); |
||||
|
} else if (url.contains("163") || url.contains("netease")) { |
||||
|
crawlNetease(doc, url, articles); |
||||
|
} else if (url.contains("sohu")) { |
||||
|
crawlSohu(doc, url, articles); |
||||
|
} else if (url.contains("qq")) { |
||||
|
crawlQQ(doc, url, articles); |
||||
|
} else { |
||||
|
crawlGenericNews(doc, url, articles); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
throw new ParseException("解析新闻网站失败:" + e.getMessage(), e); |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
private void crawlSina(Document doc, String url, List<Article> articles) { |
||||
|
Elements items = doc.select(".news-item"); |
||||
|
for (Element item : items) { |
||||
|
String title = item.select("a").text(); |
||||
|
String link = item.select("a").attr("href"); |
||||
|
String summary = item.select(".news-summary").text(); |
||||
|
if (!title.isEmpty()) { |
||||
|
articles.add(new Article(title, link, summary.length() > 300 ? summary.substring(0, 300) : summary, "新浪新闻", null)); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void crawlNetease(Document doc, String url, List<Article> articles) { |
||||
|
Elements items = doc.select(".news-list li"); |
||||
|
for (Element item : items) { |
||||
|
String title = item.select("a").text(); |
||||
|
String link = item.select("a").attr("href"); |
||||
|
if (!link.startsWith("http")) link = "https://news.163.com" + link; |
||||
|
if (!title.isEmpty()) { |
||||
|
articles.add(new Article(title, link, "", "网易新闻", null)); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void crawlSohu(Document doc, String url, List<Article> articles) { |
||||
|
Elements items = doc.select(".news-item h3 a"); |
||||
|
for (Element item : items) { |
||||
|
String title = item.text(); |
||||
|
String link = item.attr("href"); |
||||
|
if (!title.isEmpty()) { |
||||
|
articles.add(new Article(title, link, "", "搜狐新闻", null)); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void crawlQQ(Document doc, String url, List<Article> articles) { |
||||
|
Elements items = doc.select(".list li a"); |
||||
|
for (Element item : items) { |
||||
|
String title = item.text(); |
||||
|
String link = item.attr("href"); |
||||
|
if (!title.isEmpty()) { |
||||
|
articles.add(new Article(title, link, "", "腾讯新闻", null)); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void crawlGenericNews(Document doc, String url, List<Article> articles) { |
||||
|
Elements items = doc.select(".news, .article-item"); |
||||
|
for (Element item : items) { |
||||
|
String title = item.select("h2, h3, .title").text(); |
||||
|
String link = item.select("a").attr("href"); |
||||
|
if (!link.startsWith("http")) link = url + link; |
||||
|
if (!title.isEmpty()) { |
||||
|
articles.add(new Article(title, link, "", "新闻网站", null)); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,27 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class StrategyFactory { |
||||
|
private static final List<CrawlStrategy> strategies = new ArrayList<>(); |
||||
|
|
||||
|
static { |
||||
|
strategies.add(new BlogStrategy()); |
||||
|
strategies.add(new NewsStrategy()); |
||||
|
strategies.add(new TechStrategy()); |
||||
|
} |
||||
|
|
||||
|
public static CrawlStrategy getStrategy(String url) { |
||||
|
for (CrawlStrategy strategy : strategies) { |
||||
|
if (strategy.supports(url)) { |
||||
|
return strategy; |
||||
|
} |
||||
|
} |
||||
|
return null; |
||||
|
} |
||||
|
|
||||
|
public static List<CrawlStrategy> getAllStrategies() { |
||||
|
return new ArrayList<>(strategies); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,105 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import com.example.datacollect.exception.NetworkException; |
||||
|
import com.example.datacollect.exception.ParseException; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class TechStrategy implements CrawlStrategy { |
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
return url != null && (url.contains("csdn") || url.contains("oschina") || url.contains("iteye") || url.contains("cnblogs")); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> crawl(String url) { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
try { |
||||
|
Document doc = Jsoup.connect(url) |
||||
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") |
||||
|
.timeout(10000) |
||||
|
.get(); |
||||
|
articles = parse(doc, url); |
||||
|
} catch (IOException e) { |
||||
|
throw new NetworkException("网络请求失败:" + e.getMessage(), e); |
||||
|
} catch (ParseException e) { |
||||
|
throw e; |
||||
|
} catch (Exception e) { |
||||
|
articles.add(new Article("爬取失败", url, "错误:" + e.getMessage(), "系统", null)); |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(Document doc, String url) throws ParseException { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
try { |
||||
|
if (url.contains("csdn")) { |
||||
|
crawlCsdn(doc, url, articles); |
||||
|
} else if (url.contains("cnblogs")) { |
||||
|
crawlCnblogs(doc, url, articles); |
||||
|
} else if (url.contains("oschina")) { |
||||
|
crawlOschina(doc, url, articles); |
||||
|
} else { |
||||
|
crawlGeneric(doc, url, articles); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
throw new ParseException("解析技术网站失败:" + e.getMessage(), e); |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
private void crawlCsdn(Document doc, String url, List<Article> articles) { |
||||
|
Elements items = doc.select(".article-item-box"); |
||||
|
for (Element item : items) { |
||||
|
String title = item.select("h4 a").text(); |
||||
|
String link = item.select("h4 a").attr("href"); |
||||
|
String author = item.select(".name").text(); |
||||
|
String summary = item.select(".content").text(); |
||||
|
if (!title.isEmpty()) { |
||||
|
articles.add(new Article(title, link, summary, author, null)); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void crawlCnblogs(Document doc, String url, List<Article> articles) { |
||||
|
Elements items = doc.select(".post-item"); |
||||
|
for (Element item : items) { |
||||
|
String title = item.select(".post-item-title a").text(); |
||||
|
String link = item.select(".post-item-title a").attr("href"); |
||||
|
String author = item.select(".post-item-author a").text(); |
||||
|
String summary = item.select(".post-item-summary").text(); |
||||
|
if (!title.isEmpty()) { |
||||
|
articles.add(new Article(title, link, summary, author, null)); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void crawlOschina(Document doc, String url, List<Article> articles) { |
||||
|
Elements items = doc.select(".news-list .news-item"); |
||||
|
for (Element item : items) { |
||||
|
String title = item.select(".title a").text(); |
||||
|
String link = "https://www.oschina.net" + item.select(".title a").attr("href"); |
||||
|
String author = item.select(".author").text(); |
||||
|
String summary = item.select(".description").text(); |
||||
|
if (!title.isEmpty()) { |
||||
|
articles.add(new Article(title, link, summary, author, null)); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void crawlGeneric(Document doc, String url, List<Article> articles) { |
||||
|
String title = doc.title(); |
||||
|
String content = doc.select("article, .article-content, .post-content").text(); |
||||
|
if (!title.isEmpty()) { |
||||
|
articles.add(new Article(title, url, content.length() > 500 ? content.substring(0, 500) : content, "未知", null)); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,53 @@ |
|||||
|
package com.example.datacollect.view; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.List; |
||||
|
import java.util.Scanner; |
||||
|
|
||||
|
public class ConsoleView { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(ConsoleView.class); |
||||
|
private static final String ANSI_RESET = "\u001B[0m"; |
||||
|
private static final String ANSI_GREEN = "\u001B[32m"; |
||||
|
private static final String ANSI_RED = "\u001B[31m"; |
||||
|
private static final String ANSI_BLUE = "\u001B[34m"; |
||||
|
|
||||
|
private final Scanner scanner = new Scanner(System.in); |
||||
|
|
||||
|
public String readLine() { |
||||
|
System.out.print("> "); |
||||
|
String line = scanner.nextLine(); |
||||
|
logger.debug("User input: {}", line); |
||||
|
return line; |
||||
|
} |
||||
|
|
||||
|
public void printSuccess(String msg) { |
||||
|
logger.info("Success: {}", msg); |
||||
|
System.out.println(ANSI_GREEN + msg + ANSI_RESET); |
||||
|
} |
||||
|
|
||||
|
public void printError(String msg) { |
||||
|
logger.error("Error: {}", msg); |
||||
|
System.out.println(ANSI_RED + msg + ANSI_RESET); |
||||
|
} |
||||
|
|
||||
|
public void printInfo(String msg) { |
||||
|
logger.debug("Info: {}", msg); |
||||
|
System.out.println(ANSI_BLUE + msg + ANSI_RESET); |
||||
|
} |
||||
|
|
||||
|
public void display(List<Article> articles) { |
||||
|
if (articles.isEmpty()) { |
||||
|
logger.info("No articles to display"); |
||||
|
printInfo("暂无文章,请先执行 crawl。"); |
||||
|
return; |
||||
|
} |
||||
|
logger.info("Displaying {} articles", articles.size()); |
||||
|
for (int i = 0; i < articles.size(); i++) { |
||||
|
Article a = articles.get(i); |
||||
|
System.out.println((i + 1) + ". " + a.getTitle() + " | " + a.getUrl()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,22 @@ |
|||||
|
<configuration>
    <!-- Console output: time-only timestamp, intended for interactive use. -->
    <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
        <encoder>
            <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
        </encoder>
    </appender>

    <!-- File output: full date so entries from different runs can be told apart. -->
    <appender name="FILE" class="ch.qos.logback.core.FileAppender">
        <file>logs/crawler.log</file>
        <encoder>
            <!-- Log messages contain Chinese text; pin the file encoding so it does not
                 depend on the platform default charset. -->
            <charset>UTF-8</charset>
            <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
        </encoder>
    </appender>

    <root level="INFO">
        <appender-ref ref="CONSOLE" />
        <appender-ref ref="FILE" />
    </root>

    <!-- Application classes log at DEBUG; jsoup is limited to WARN and above. -->
    <logger name="com.example.datacollect" level="DEBUG" />
    <logger name="org.jsoup" level="WARN" />
</configuration>
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,22 @@ |
|||||
|
<configuration> |
||||
|
<appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender"> |
||||
|
<encoder> |
||||
|
<pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> |
||||
|
</encoder> |
||||
|
</appender> |
||||
|
|
||||
|
<appender name="FILE" class="ch.qos.logback.core.FileAppender"> |
||||
|
<file>logs/crawler.log</file> |
||||
|
<encoder> |
||||
|
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> |
||||
|
</encoder> |
||||
|
</appender> |
||||
|
|
||||
|
<root level="INFO"> |
||||
|
<appender-ref ref="CONSOLE" /> |
||||
|
<appender-ref ref="FILE" /> |
||||
|
</root> |
||||
|
|
||||
|
<logger name="com.example.datacollect" level="DEBUG" /> |
||||
|
<logger name="org.jsoup" level="WARN" /> |
||||
|
</configuration> |
||||
@ -0,0 +1,22 @@ |
|||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\model\Article.java |
||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\exception\CrawlerException.java |
||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\strategy\TechStrategy.java |
||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\repository\ArticleRepository.java |
||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\Main.java |
||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\strategy\BlogStrategy.java |
||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\Command.java |
||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\ExitCommand.java |
||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\HelpCommand.java |
||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\CrawlCommand.java |
||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\HistoryCommand.java |
||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\strategy\NewsStrategy.java |
||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\exception\NetworkException.java |
||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\SaveCommand.java |
||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\controller\CrawlerController.java |
||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\strategy\CrawlStrategy.java |
||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\ListCommand.java |
||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\service\ScraperService.java |
||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\exception\ParseException.java |
||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\view\ConsoleView.java |
||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\strategy\StrategyFactory.java |
||||
|
D:\作业\ZY\W11\java-cli\src\main\java\com\example\datacollect\command\AnalyzeCommand.java |
||||
Loading…
Reference in new issue