16 changed files with 566 additions and 71 deletions
@ -0,0 +1,71 @@ |
|||
<project xmlns="http://maven.apache.org/POM/4.0.0" |
|||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
|||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> |
|||
<modelVersion>4.0.0</modelVersion> |
|||
<groupId>com.example</groupId> |
|||
<artifactId>datacollect-cli</artifactId> |
|||
<version>0.1.0</version> |
|||
<properties> |
|||
<maven.compiler.source>11</maven.compiler.source> |
|||
<maven.compiler.target>11</maven.compiler.target> |
|||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> |
|||
<logback.version>1.4.11</logback.version> |
|||
<slf4j.version>2.0.9</slf4j.version> |
|||
<jsoup.version>1.17.2</jsoup.version> |
|||
</properties> |
|||
|
|||
<dependencies> |
|||
<dependency> |
|||
<groupId>ch.qos.logback</groupId> |
|||
<artifactId>logback-classic</artifactId> |
|||
<version>${logback.version}</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.slf4j</groupId> |
|||
<artifactId>slf4j-api</artifactId> |
|||
<version>${slf4j.version}</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.jsoup</groupId> |
|||
<artifactId>jsoup</artifactId> |
|||
<version>${jsoup.version}</version> |
|||
</dependency> |
|||
</dependencies> |
|||
|
|||
<build> |
|||
<plugins> |
|||
<plugin> |
|||
<groupId>org.apache.maven.plugins</groupId> |
|||
<artifactId>maven-compiler-plugin</artifactId> |
|||
<version>3.8.1</version> |
|||
<configuration> |
|||
<encoding>UTF-8</encoding> |
|||
</configuration> |
|||
</plugin> |
|||
<plugin> |
|||
<groupId>org.apache.maven.plugins</groupId> |
|||
<artifactId>maven-assembly-plugin</artifactId> |
|||
<version>3.3.0</version> |
|||
<configuration> |
|||
<archive> |
|||
<manifest> |
|||
<mainClass>com.example.datacollect.Main</mainClass> |
|||
</manifest> |
|||
</archive> |
|||
<descriptorRefs> |
|||
<descriptorRef>jar-with-dependencies</descriptorRef> |
|||
</descriptorRefs> |
|||
</configuration> |
|||
<executions> |
|||
<execution> |
|||
<id>make-assembly</id> |
|||
<phase>package</phase> |
|||
<goals> |
|||
<goal>single</goal> |
|||
</goals> |
|||
</execution> |
|||
</executions> |
|||
</plugin> |
|||
</plugins> |
|||
</build> |
|||
</project> |
|||
@ -1,19 +1,28 @@ |
|||
package com.example.datacollect; |
|||
|
|||
import com.example.datacollect.controller.CrawlerController; |
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import com.example.datacollect.view.ConsoleView; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
public class Main { |
|||
private static final Logger logger = LoggerFactory.getLogger(Main.class); |
|||
|
|||
public static void main(String[] args) { |
|||
logger.info("启动 CLI Crawler 程序"); |
|||
|
|||
ConsoleView view = new ConsoleView(); |
|||
ArticleRepository repository = new ArticleRepository(); |
|||
CrawlerController controller = new CrawlerController(view, repository); |
|||
CrawlerController controller = new CrawlerController(view); |
|||
|
|||
view.printSuccess("Welcome to CLI Crawler! Type help for commands."); |
|||
|
|||
view.printSuccess("Welcome to CLI Crawler (w10)! Type help for commands."); |
|||
try { |
|||
while (true) { |
|||
controller.handle(view.readLine()); |
|||
} |
|||
} catch (Exception e) { |
|||
logger.error("程序异常退出: {}", e.getMessage(), e); |
|||
view.printError("程序异常退出: " + e.getMessage()); |
|||
} |
|||
} |
|||
} |
|||
@ -1,8 +1,10 @@ |
|||
package com.example.datacollect.command; |
|||
|
|||
import com.example.datacollect.repository.ArticleRepository; |
|||
import java.util.List; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
|
|||
public interface Command { |
|||
String getName(); |
|||
void execute(String[] args, ArticleRepository repository); |
|||
void execute(String[] args, List<Article> articles); |
|||
} |
|||
@ -1,6 +1,6 @@ |
|||
package com.example.crawler.exception; |
|||
package com.example.datacollect.exception; |
|||
|
|||
public class CrawlerException extends Exception { |
|||
public class CrawlerException extends RuntimeException { |
|||
public CrawlerException(String message) { |
|||
super(message); |
|||
} |
|||
@ -0,0 +1,29 @@ |
|||
package com.example.datacollect.exception; |
|||
|
|||
public class NetworkException extends CrawlerException { |
|||
private final String url; |
|||
|
|||
public NetworkException(String message) { |
|||
super(message); |
|||
this.url = null; |
|||
} |
|||
|
|||
public NetworkException(String message, String url) { |
|||
super(message); |
|||
this.url = url; |
|||
} |
|||
|
|||
public NetworkException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
this.url = null; |
|||
} |
|||
|
|||
public NetworkException(String message, String url, Throwable cause) { |
|||
super(message, cause); |
|||
this.url = url; |
|||
} |
|||
|
|||
public String getUrl() { |
|||
return url; |
|||
} |
|||
} |
|||
@ -0,0 +1,29 @@ |
|||
package com.example.datacollect.exception; |
|||
|
|||
public class ParseException extends CrawlerException { |
|||
private final String source; |
|||
|
|||
public ParseException(String message) { |
|||
super(message); |
|||
this.source = null; |
|||
} |
|||
|
|||
public ParseException(String message, String source) { |
|||
super(message); |
|||
this.source = source; |
|||
} |
|||
|
|||
public ParseException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
this.source = null; |
|||
} |
|||
|
|||
public ParseException(String message, String source, Throwable cause) { |
|||
super(message, cause); |
|||
this.source = source; |
|||
} |
|||
|
|||
public String getSource() { |
|||
return source; |
|||
} |
|||
} |
|||
@ -0,0 +1,29 @@ |
|||
package com.example.datacollect.exception; |
|||
|
|||
public class UrlFormatException extends RuntimeException { |
|||
private final String url; |
|||
|
|||
public UrlFormatException(String message) { |
|||
super(message); |
|||
this.url = null; |
|||
} |
|||
|
|||
public UrlFormatException(String message, String url) { |
|||
super(message); |
|||
this.url = url; |
|||
} |
|||
|
|||
public UrlFormatException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
this.url = null; |
|||
} |
|||
|
|||
public UrlFormatException(String message, String url, Throwable cause) { |
|||
super(message, cause); |
|||
this.url = url; |
|||
} |
|||
|
|||
public String getUrl() { |
|||
return url; |
|||
} |
|||
} |
|||
@ -1,42 +1,86 @@ |
|||
package com.example.datacollect.repository; |
|||
|
|||
import com.example.datacollect.exception.UrlFormatException; |
|||
import com.example.datacollect.model.Article; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.Collections; |
|||
import java.util.List; |
|||
import java.util.Optional; |
|||
|
|||
public class ArticleRepository { |
|||
private final List<Article> articles = new ArrayList<>(); |
|||
private static final Logger logger = LoggerFactory.getLogger(ArticleRepository.class); |
|||
private final List<Article> articles; |
|||
|
|||
public ArticleRepository() { |
|||
this.articles = new ArrayList<>(); |
|||
} |
|||
|
|||
public void add(Article article) { |
|||
if (article != null) { |
|||
articles.add(article); |
|||
// 防御检查:标题非空
|
|||
if (article.getTitle() == null || article.getTitle().trim().isEmpty()) { |
|||
logger.error("文章标题不能为空"); |
|||
throw new IllegalArgumentException("文章标题不能为空"); |
|||
} |
|||
|
|||
// 防御检查:URL非空且格式正确
|
|||
if (article.getUrl() == null || article.getUrl().trim().isEmpty()) { |
|||
logger.error("文章URL不能为空"); |
|||
throw new IllegalArgumentException("文章URL不能为空"); |
|||
} |
|||
|
|||
// URL格式校验
|
|||
String url = article.getUrl().trim(); |
|||
if (!url.startsWith("http://") && !url.startsWith("https://")) { |
|||
logger.warn("URL格式不正确,缺少协议头: {}", url); |
|||
throw new UrlFormatException("URL格式不正确,必须以 http:// 或 https:// 开头", url); |
|||
} |
|||
|
|||
public void addAll(List<Article> newArticles) { |
|||
if (newArticles != null && !newArticles.isEmpty()) { |
|||
articles.addAll(newArticles); |
|||
// 验证URL是否为有效格式
|
|||
try { |
|||
java.net.URL validUrl = new java.net.URL(url); |
|||
String host = validUrl.getHost(); |
|||
if (host == null || host.isEmpty()) { |
|||
logger.error("URL主机名无效: {}", url); |
|||
throw new UrlFormatException("URL主机名无效", url); |
|||
} |
|||
} catch (java.net.MalformedURLException e) { |
|||
logger.error("URL格式错误: {}", url); |
|||
throw new UrlFormatException("URL格式错误: " + e.getMessage(), url, e); |
|||
} |
|||
|
|||
articles.add(article); |
|||
logger.info("成功添加文章: {}", article.getTitle()); |
|||
} |
|||
|
|||
public void addAll(List<Article> articles) { |
|||
for (Article article : articles) { |
|||
add(article); |
|||
} |
|||
} |
|||
|
|||
public List<Article> getAll() { |
|||
return Collections.unmodifiableList(articles); |
|||
return new ArrayList<>(articles); |
|||
} |
|||
|
|||
public Optional<Article> findByTitle(String title) { |
|||
return articles.stream() |
|||
.filter(a -> a.getTitle().equals(title)) |
|||
.findFirst(); |
|||
public Article get(int index) { |
|||
if (index < 0 || index >= articles.size()) { |
|||
logger.error("索引越界: {}", index); |
|||
throw new IndexOutOfBoundsException("索引越界: " + index); |
|||
} |
|||
return articles.get(index); |
|||
} |
|||
|
|||
public int count() { |
|||
public int size() { |
|||
return articles.size(); |
|||
} |
|||
|
|||
public boolean isEmpty() { |
|||
return articles.isEmpty(); |
|||
} |
|||
|
|||
public void clear() { |
|||
articles.clear(); |
|||
logger.info("文章列表已清空"); |
|||
} |
|||
} |
|||
@ -0,0 +1,89 @@ |
|||
package com.example.datacollect.util; |
|||
|
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
import java.util.function.Supplier; |
|||
|
|||
public class RetryUtils { |
|||
private static final Logger logger = LoggerFactory.getLogger(RetryUtils.class); |
|||
|
|||
private static final int DEFAULT_MAX_RETRIES = 3; |
|||
private static final long DEFAULT_BASE_WAIT_MS = 500; |
|||
|
|||
private final int maxRetries; |
|||
private final long baseWaitMs; |
|||
|
|||
public RetryUtils() { |
|||
this(DEFAULT_MAX_RETRIES, DEFAULT_BASE_WAIT_MS); |
|||
} |
|||
|
|||
public RetryUtils(int maxRetries, long baseWaitMs) { |
|||
this.maxRetries = maxRetries; |
|||
this.baseWaitMs = baseWaitMs; |
|||
} |
|||
|
|||
public <T> T executeWithRetry(Supplier<T> supplier, String operationName) { |
|||
Exception lastException = null; |
|||
|
|||
for (int attempt = 0; attempt <= maxRetries; attempt++) { |
|||
try { |
|||
logger.debug("[{}] 第 {} 次尝试", operationName, attempt + 1); |
|||
T result = supplier.get(); |
|||
|
|||
if (result != null) { |
|||
logger.info("[{}] 第 {} 次尝试成功", operationName, attempt + 1); |
|||
return result; |
|||
} |
|||
|
|||
logger.warn("[{}] 第 {} 次尝试返回空结果", operationName, attempt + 1); |
|||
|
|||
} catch (Exception e) { |
|||
lastException = e; |
|||
logger.warn("[{}] 第 {} 次尝试失败: {}", operationName, attempt + 1, e.getMessage()); |
|||
|
|||
if (attempt < maxRetries) { |
|||
long waitTime = calculateWaitTime(attempt); |
|||
logger.warn("[{}] 将在 {} ms 后重试", operationName, waitTime); |
|||
|
|||
try { |
|||
Thread.sleep(waitTime); |
|||
} catch (InterruptedException ie) { |
|||
Thread.currentThread().interrupt(); |
|||
logger.error("[{}] 重试等待被中断", operationName); |
|||
throw new RuntimeException("重试等待被中断", ie); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
logger.error("[{}] 已重试 {} 次,全部失败", operationName, maxRetries); |
|||
throw new RuntimeException("操作失败,已重试 " + maxRetries + " 次", lastException); |
|||
} |
|||
|
|||
private long calculateWaitTime(int attempt) { |
|||
// 指数退避: wait = base * 2^attempt
|
|||
double waitTime = baseWaitMs * Math.pow(2, attempt); |
|||
return (long) waitTime; |
|||
} |
|||
|
|||
public static <T> T execute(Supplier<T> supplier, String operationName) { |
|||
return new RetryUtils().executeWithRetry(supplier, operationName); |
|||
} |
|||
|
|||
public static <T> T execute(Supplier<T> supplier, String operationName, int maxRetries) { |
|||
return new RetryUtils(maxRetries, DEFAULT_BASE_WAIT_MS).executeWithRetry(supplier, operationName); |
|||
} |
|||
|
|||
public static <T> T execute(Supplier<T> supplier, String operationName, int maxRetries, long baseWaitMs) { |
|||
return new RetryUtils(maxRetries, baseWaitMs).executeWithRetry(supplier, operationName); |
|||
} |
|||
|
|||
public int getMaxRetries() { |
|||
return maxRetries; |
|||
} |
|||
|
|||
public long getBaseWaitMs() { |
|||
return baseWaitMs; |
|||
} |
|||
} |
|||
Loading…
Reference in new issue