From 5abdee0f76991e7f7be6faeccf1c466684a096ac Mon Sep 17 00:00:00 2001 From: 283375 Date: Sat, 23 May 2026 23:02:14 +0800 Subject: [PATCH] improve exception handling --- src/main/java/internal/hw/crawler/Main.java | 5 ++ .../internal/hw/crawler/MainController.java | 10 +++- .../hw/crawler/commands/CrawlCommand.java | 48 ++++++++++++------- .../strategies/crawl/CrawlException.java | 11 ----- .../crawl/CrawlNetworkException.java | 34 +++++++++++++ .../strategies/crawl/CrawlParseException.java | 43 +++++++++++++++++ .../strategies/crawl/CrawlStrategy.java | 2 +- .../crawl/CrawlStrategyFactory.java | 2 +- .../crawl/CrawlUnsupportedException.java | 9 ++++ .../crawler/strategies/crawl/CrawlUtils.java | 17 +++++-- .../strategies/crawl/CrawlerException.java | 31 ++++++++++++ .../crawler/strategies/crawl/HttpCrawler.java | 21 ++++++-- .../strategies/crawl/IthomeCrawlStrategy.java | 12 ++--- .../crawl/NeteaseNewsCrawlStrategy.java | 12 ++--- .../crawl/PeopleCnCrawlStrategy.java | 12 ++--- 15 files changed, 213 insertions(+), 56 deletions(-) delete mode 100644 src/main/java/internal/hw/crawler/strategies/crawl/CrawlException.java create mode 100644 src/main/java/internal/hw/crawler/strategies/crawl/CrawlNetworkException.java create mode 100644 src/main/java/internal/hw/crawler/strategies/crawl/CrawlParseException.java create mode 100644 src/main/java/internal/hw/crawler/strategies/crawl/CrawlUnsupportedException.java create mode 100644 src/main/java/internal/hw/crawler/strategies/crawl/CrawlerException.java diff --git a/src/main/java/internal/hw/crawler/Main.java b/src/main/java/internal/hw/crawler/Main.java index 92451d9..efc7055 100644 --- a/src/main/java/internal/hw/crawler/Main.java +++ b/src/main/java/internal/hw/crawler/Main.java @@ -3,8 +3,12 @@ package internal.hw.crawler; import internal.hw.crawler.commands.*; import internal.hw.crawler.repositories.ArticleRepository; import internal.hw.crawler.views.ConsoleView; +import org.slf4j.Logger; +import 
org.slf4j.LoggerFactory; public class Main { + private static final Logger log = LoggerFactory.getLogger(Main.class); + public static void main(String[] args) { ConsoleView view = new ConsoleView(); MainController controller = new MainController(view); @@ -27,6 +31,7 @@ public class Main { } controller.handleInput(line); } catch (Exception e) { + log.error("Unhandled exception in REPL loop", e); view.printError("Unexpected error: " + e.getMessage()); } } diff --git a/src/main/java/internal/hw/crawler/MainController.java b/src/main/java/internal/hw/crawler/MainController.java index e9a6460..965d564 100644 --- a/src/main/java/internal/hw/crawler/MainController.java +++ b/src/main/java/internal/hw/crawler/MainController.java @@ -2,7 +2,10 @@ package internal.hw.crawler; import internal.hw.crawler.commands.Command; import internal.hw.crawler.commands.CommandArg; +import internal.hw.crawler.strategies.crawl.CrawlerException; import internal.hw.crawler.views.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.util.Collections; import java.util.HashMap; @@ -10,6 +13,7 @@ import java.util.List; import java.util.Map; public class MainController { + private static final Logger log = LoggerFactory.getLogger(MainController.class); private final Map commands = new HashMap<>(); private final ConsoleView view; @@ -45,8 +49,12 @@ public class MainController { try { command.execute(args); + } catch (CrawlerException e) { + log.warn("Crawler error in command '{}'", cmdName, e); + view.printError(e.getMessage()); } catch (Exception e) { - view.printError("Command failed: " + e.getMessage()); + log.error("Unexpected error in command '{}'", cmdName, e); + view.printError("Internal error: " + e.getMessage()); } } diff --git a/src/main/java/internal/hw/crawler/commands/CrawlCommand.java b/src/main/java/internal/hw/crawler/commands/CrawlCommand.java index 6df7e8d..66347d1 100644 --- a/src/main/java/internal/hw/crawler/commands/CrawlCommand.java +++ 
b/src/main/java/internal/hw/crawler/commands/CrawlCommand.java @@ -5,15 +5,22 @@ import internal.hw.crawler.repositories.ArticleRepository; import internal.hw.crawler.strategies.crawl.CrawlStrategy; import internal.hw.crawler.strategies.crawl.CrawlStrategyFactory; import internal.hw.crawler.strategies.crawl.CrawlUtils; +import internal.hw.crawler.strategies.crawl.CrawlNetworkException; +import internal.hw.crawler.strategies.crawl.CrawlParseException; +import internal.hw.crawler.strategies.crawl.CrawlUnsupportedException; import internal.hw.crawler.views.CommandOutput; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.io.IOException; import java.net.URL; import java.util.List; import java.util.Objects; public class CrawlCommand implements Command { + private static final Logger log = LoggerFactory.getLogger(CrawlCommand.class); private final ArticleRepository repository; private final CommandOutput out; private final CrawlStrategyFactory crawlStrategyFactory = new CrawlStrategyFactory(); @@ -34,24 +41,33 @@ public class CrawlCommand implements Command { } @Override - public void execute(String[] args) throws Exception { + public void execute(String[] args) { String urlRaw = args[1]; - URL url = new URL(urlRaw); - CrawlStrategy strategy = crawlStrategyFactory.getStrategy(url); - if (strategy == null) { - out.error("Unsupported URL: " + urlRaw); - return; - } - - Document doc = Jsoup.connect(url.toString()).timeout(5000).get(); - CrawlUtils.setProgressCallback(msg -> System.out.print("\r" + msg)); try { - List
<Article> articles = strategy.parse(url, doc); - System.out.println(); - articles.stream().filter(Objects::nonNull).forEach(repository::add); - out.success(String.format("Crawled %d articles from %s", articles.size(), urlRaw)); - } finally { - CrawlUtils.clearProgressCallback(); + URL url = new URL(urlRaw); + CrawlStrategy strategy = crawlStrategyFactory.getStrategy(url); + Document doc = Jsoup.connect(url.toString()).timeout(5000).get(); + CrawlUtils.setProgressCallback(msg -> System.out.print("\r" + msg)); + try { + List<Article>
articles = strategy.parse(url, doc); + System.out.println(); + articles.stream().filter(Objects::nonNull).forEach(repository::add); + out.success(String.format("Crawled %d articles from %s", articles.size(), urlRaw)); + } finally { + CrawlUtils.clearProgressCallback(); + } + } catch (CrawlUnsupportedException e) { + out.error(e.getMessage()); + log.warn("Unsupported URL: {}", urlRaw); + } catch (CrawlNetworkException e) { + out.error("Network error: " + e.getMessage()); + log.error("Crawl network failure for {}", urlRaw, e); + } catch (CrawlParseException e) { + out.error("Parse error: " + e.getMessage()); + log.error("Crawl parse failure for {}", urlRaw, e); + } catch (IOException e) { + out.error("I/O error: " + e.getMessage()); + log.error("Crawl I/O failure for {}", urlRaw, e); } } } diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlException.java b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlException.java deleted file mode 100644 index 7ff25df..0000000 --- a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlException.java +++ /dev/null @@ -1,11 +0,0 @@ -package internal.hw.crawler.strategies.crawl; - -public class CrawlException extends Exception { - public CrawlException(String message) { - super(message); - } - - public CrawlException(String message, Throwable cause) { - super(message, cause); - } -} diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlNetworkException.java b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlNetworkException.java new file mode 100644 index 0000000..ea3ef53 --- /dev/null +++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlNetworkException.java @@ -0,0 +1,34 @@ +package internal.hw.crawler.strategies.crawl; + +import java.net.URL; + +public class CrawlNetworkException extends CrawlerException { + private final int statusCode; + private final int attempts; + + public CrawlNetworkException(String message, URL url) { + this(message, null, url, -1, 0); + } + + 
public CrawlNetworkException(String message, Throwable cause, URL url) { + this(message, cause, url, -1, 0); + } + + public CrawlNetworkException(String message, Throwable cause, URL url, int attempts) { + this(message, cause, url, -1, attempts); + } + + public CrawlNetworkException(String message, Throwable cause, URL url, int statusCode, int attempts) { + super(message, url, cause); + this.statusCode = statusCode; + this.attempts = attempts; + } + + public int getStatusCode() { + return statusCode; + } + + public int getAttempts() { + return attempts; + } +} diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlParseException.java b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlParseException.java new file mode 100644 index 0000000..d19227c --- /dev/null +++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlParseException.java @@ -0,0 +1,43 @@ +package internal.hw.crawler.strategies.crawl; + +import java.net.URL; + +public class CrawlParseException extends CrawlerException { + public CrawlParseException(String message) { + super(message); + } + + public CrawlParseException(String message, URL url) { + super(message, url); + } + + public CrawlParseException(String message, URL url, Throwable cause) { + super(message, url, cause); + } + + public static class ElementNotFoundException extends CrawlParseException { + private final String selector; + + public ElementNotFoundException(String selector, URL url) { + super("Missing element '" + selector + "' in page: " + url, url); + this.selector = selector; + } + + public String getSelector() { + return selector; + } + } + + public static class IdExtractionException extends CrawlParseException { + private final String pattern; + + public IdExtractionException(URL url, String pattern) { + super("Cannot determine id for " + url + " (pattern: " + pattern + ")", url); + this.pattern = pattern; + } + + public String getPattern() { + return pattern; + } + } +} diff --git 
a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategy.java b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategy.java index 81e0484..ed2a1b6 100644 --- a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategy.java +++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategy.java @@ -7,7 +7,7 @@ import java.net.URL; import java.util.List; public interface CrawlStrategy { - List
<Article> parse(URL url, Document doc) throws CrawlException; + List<Article>
parse(URL url, Document doc) throws CrawlerException; boolean supports(URL url); } diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java index fef8d8e..b467f6a 100644 --- a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java +++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java @@ -19,7 +19,7 @@ public class CrawlStrategyFactory { return s; } } - return null; + throw new CrawlUnsupportedException(url); } public void register(CrawlStrategy strategy) { diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUnsupportedException.java b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUnsupportedException.java new file mode 100644 index 0000000..6e4d3cd --- /dev/null +++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUnsupportedException.java @@ -0,0 +1,9 @@ +package internal.hw.crawler.strategies.crawl; + +import java.net.URL; + +public class CrawlUnsupportedException extends CrawlerException { + public CrawlUnsupportedException(URL url) { + super("Unsupported site: " + url, url); + } +} diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java index 7942dbd..603ab6e 100644 --- a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java +++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java @@ -6,6 +6,7 @@ import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.net.MalformedURLException; import java.net.URI; import java.net.URL; import java.util.ArrayList; @@ -34,7 +35,7 @@ public class CrawlUtils { } public static List
parseHomepage(Document doc, Pattern idRegex, - BiFunction singleParser) { + BiFunction singleParser) { HttpCrawler crawler = new HttpCrawler(); Consumer callback = progressCallback.get(); @@ -48,6 +49,7 @@ public class CrawlUtils { int total = hrefs.size(); AtomicInteger done = new AtomicInteger(0); + AtomicInteger errors = new AtomicInteger(0); ExecutorService executor = Executors.newFixedThreadPool(THREAD_POOL_SIZE); List> futures = new ArrayList<>(total); @@ -57,8 +59,13 @@ public class CrawlUtils { URL articleUrl = URI.create(href).toURL(); Document articleDoc = crawler.fetch(articleUrl); return singleParser.apply(articleUrl, articleDoc); - } catch (Exception e) { - log.warn("Failed to fetch article: {}", href, e); + } catch (CrawlerException e) { + int failed = errors.incrementAndGet(); + log.warn("Failed [{}/{}]: {} — {}", failed, total, href, e.getMessage()); + return null; + } catch (MalformedURLException e) { + int failed = errors.incrementAndGet(); + log.warn("Failed [{}/{}]: {} — malformed URL", failed, total, href); return null; } finally { int completed = done.incrementAndGet(); @@ -72,6 +79,10 @@ public class CrawlUtils { executor.shutdown(); CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join(); + if (errors.get() > 0) { + log.warn("Crawl completed: {}/{} articles failed", errors.get(), total); + } + return futures.stream() .map(CompletableFuture::join) .filter(Objects::nonNull) diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlerException.java b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlerException.java new file mode 100644 index 0000000..3073d51 --- /dev/null +++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlerException.java @@ -0,0 +1,31 @@ +package internal.hw.crawler.strategies.crawl; + +import java.net.URL; + +public abstract class CrawlerException extends RuntimeException { + private final URL url; + + public CrawlerException(String message) { + super(message); + this.url = null; 
+ } + + public CrawlerException(String message, Throwable cause) { + super(message, cause); + this.url = null; + } + + public CrawlerException(String message, URL url) { + super(message); + this.url = url; + } + + public CrawlerException(String message, URL url, Throwable cause) { + super(message, cause); + this.url = url; + } + + public URL getUrl() { + return url; + } +} diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/HttpCrawler.java b/src/main/java/internal/hw/crawler/strategies/crawl/HttpCrawler.java index 5705daa..8ad8743 100644 --- a/src/main/java/internal/hw/crawler/strategies/crawl/HttpCrawler.java +++ b/src/main/java/internal/hw/crawler/strategies/crawl/HttpCrawler.java @@ -5,6 +5,7 @@ import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.net.URL; import java.util.concurrent.ThreadLocalRandom; @@ -29,26 +30,36 @@ public class HttpCrawler { this.userAgent = userAgent; } - public Document fetch(URL url) throws Exception { + public Document fetch(URL url) throws CrawlNetworkException { + IOException lastError; for (int attempt = 0; attempt <= maxRetries; attempt++) { try { return Jsoup.connect(url.toString()) .timeout(timeoutMillis) .userAgent(userAgent) .get(); - } catch (Exception e) { + } catch (IOException e) { + lastError = e; if (attempt < maxRetries) { long delay = computeBackoff(attempt); log.warn("Failed to fetch {}, attempt {}/{}: {}. 
Retrying in {}ms", url, attempt + 1, maxRetries, e.getMessage(), delay); - Thread.sleep(delay); + try { + Thread.sleep(delay); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + throw new CrawlNetworkException( + "Interrupted while fetching " + url, ie, url, maxRetries + 1); + } } else { log.error("Failed to fetch {} after {} attempts", url, maxRetries + 1); - throw e; + throw new CrawlNetworkException( + "Failed to fetch " + url + " after " + (maxRetries + 1) + " attempts", + lastError, url, maxRetries + 1); } } } - throw new RuntimeException("Unreachable"); + throw new CrawlNetworkException("Unreachable", url); } public void rateLimit() throws InterruptedException { diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java b/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java index 021032f..20b1300 100644 --- a/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java +++ b/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java @@ -24,12 +24,12 @@ public class IthomeCrawlStrategy implements CrawlStrategy { } @Override - public List
<Article> parse(URL url, Document doc) throws CrawlException { + public List<Article>
parse(URL url, Document doc) throws CrawlParseException { if (isHomepage(url)) { return CrawlUtils.parseHomepage(doc, idRegex, (articleUrl, articleDoc) -> { try { return parseSingle(articleUrl, articleDoc); - } catch (CrawlException e) { + } catch (CrawlParseException e) { log.warn("Failed to parse article: {}", articleUrl, e); return null; } @@ -44,23 +44,23 @@ public class IthomeCrawlStrategy implements CrawlStrategy { return (path == null || path.isEmpty() || path.equals("/")); } - private Article parseSingle(URL url, Document doc) throws CrawlException { + private Article parseSingle(URL url, Document doc) throws CrawlParseException { Matcher matcher = idRegex.matcher(url.getPath()); if (!matcher.find()) { - throw new CrawlException(String.format("Cannot determine id for %s", url)); + throw new CrawlParseException.IdExtractionException(url, idRegex.pattern()); } String id = String.format("%s-%s-%s", matcher.group(1), matcher.group(2), matcher.group(3)); Element h1 = doc.selectFirst("h1"); if (h1 == null) { - throw new CrawlException("Missing
<h1>
element in page: " + url); + throw new CrawlParseException.ElementNotFoundException("h1", url); } String title = h1.text(); Element paragraph = doc.selectFirst("#paragraph"); if (paragraph == null) { - throw new CrawlException("Missing #paragraph element in page: " + url); + throw new CrawlParseException.ElementNotFoundException("#paragraph", url); } String content = paragraph.text(); diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/NeteaseNewsCrawlStrategy.java b/src/main/java/internal/hw/crawler/strategies/crawl/NeteaseNewsCrawlStrategy.java index 8bc1e28..82c0041 100644 --- a/src/main/java/internal/hw/crawler/strategies/crawl/NeteaseNewsCrawlStrategy.java +++ b/src/main/java/internal/hw/crawler/strategies/crawl/NeteaseNewsCrawlStrategy.java @@ -25,12 +25,12 @@ public class NeteaseNewsCrawlStrategy implements CrawlStrategy { } @Override - public List
<Article> parse(URL url, Document doc) throws CrawlException { + public List<Article>
parse(URL url, Document doc) throws CrawlParseException { if (isHomepage(url)) { return CrawlUtils.parseHomepage(doc, idRegex, (articleUrl, articleDoc) -> { try { return parseSingle(articleUrl, articleDoc); - } catch (CrawlException e) { + } catch (CrawlParseException e) { log.warn("Failed to parse article: {}", articleUrl, e); return null; } @@ -45,20 +45,20 @@ public class NeteaseNewsCrawlStrategy implements CrawlStrategy { return path == null || path.isEmpty() || path.equals("/"); } - private Article parseSingle(URL url, Document doc) throws CrawlException { + private Article parseSingle(URL url, Document doc) throws CrawlParseException { Matcher matcher = idRegex.matcher(url.getPath()); if (!matcher.find()) { - throw new CrawlException(String.format("Cannot determine id for %s", url)); + throw new CrawlParseException.IdExtractionException(url, idRegex.pattern()); } String id = matcher.group(1); Element titleEl = doc.selectFirst("h1.post_title"); - if (titleEl == null) throw new CrawlException(String.format("Cannot find title for %s", url)); + if (titleEl == null) throw new CrawlParseException.ElementNotFoundException("h1.post_title", url); String title = titleEl.text(); Element contentEl = doc.selectFirst("div.post_body"); - if (contentEl == null) throw new CrawlException(String.format("Cannot find content for %s", url)); + if (contentEl == null) throw new CrawlParseException.ElementNotFoundException("div.post_body", url); String content = contentEl.text(); Article article = new Article(); diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java b/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java index 99d481a..23d0c8a 100644 --- a/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java +++ b/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java @@ -31,12 +31,12 @@ public class PeopleCnCrawlStrategy implements CrawlStrategy { } @Override - public List
<Article> parse(URL url, Document doc) throws CrawlException { + public List<Article>
parse(URL url, Document doc) throws CrawlParseException { if (isHomepage(url)) { return CrawlUtils.parseHomepage(doc, idRegex, (articleUrl, articleDoc) -> { try { return parseSingle(articleUrl, articleDoc); - } catch (CrawlException e) { + } catch (CrawlParseException e) { log.warn("Failed to parse article: {}", articleUrl, e); return null; } @@ -55,15 +55,15 @@ public class PeopleCnCrawlStrategy implements CrawlStrategy { return path == null || path.isEmpty() || path.equals("/"); } - private Article parseSingle(URL url, Document doc) throws CrawlException { + private Article parseSingle(URL url, Document doc) throws CrawlParseException { Matcher matcher = idRegex.matcher(url.getPath()); if (!matcher.find()) { - throw new CrawlException(String.format("Cannot determine id for %s", url)); + throw new CrawlParseException.IdExtractionException(url, idRegex.pattern()); } String id = String.format("%s%s-c%s-%s", matcher.group(1), matcher.group(2), matcher.group(3), matcher.group(4)); Element titleEl = doc.selectFirst(".layout.rm_txt h1"); - if (titleEl == null) throw new CrawlException(String.format("Cannot find title for %s", url)); + if (titleEl == null) throw new CrawlParseException.ElementNotFoundException(".layout.rm_txt h1", url); String title = titleEl.text(); Set authors = new HashSet<>(); @@ -73,7 +73,7 @@ public class PeopleCnCrawlStrategy implements CrawlStrategy { } Element contentEl = doc.selectFirst("div#rm_txt_zw"); - if (contentEl == null) throw new CrawlException(String.format("Cannot find content for %s", url)); + if (contentEl == null) throw new CrawlParseException.ElementNotFoundException("div#rm_txt_zw", url); String content = contentEl.text(); Article article = new Article();