improve exception handling

4 weeks ago · 5abdee0f76
15 changed files with 213 additions and 56 deletions
--- a/src/main/java/internal/hw/crawler/Main.java
+++ b/src/main/java/internal/hw/crawler/Main.java
@ -3,8 +3,12 @@ package internal.hw.crawler;
 import internal.hw.crawler.commands.*;
 import internal.hw.crawler.repositories.ArticleRepository;
 import internal.hw.crawler.views.ConsoleView;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 public class Main {
    private static final Logger log = LoggerFactory.getLogger(Main.class);
    public static void main(String[] args) {
        ConsoleView view = new ConsoleView();
        MainController controller = new MainController(view);
@ -27,6 +31,7 @@ public class Main {
                }
                controller.handleInput(line);
            } catch (Exception e) {
                log.error("Unhandled exception in REPL loop", e);
                view.printError("Unexpected error: " + e.getMessage());
            }
        }
--- a/src/main/java/internal/hw/crawler/MainController.java
+++ b/src/main/java/internal/hw/crawler/MainController.java
@ -2,7 +2,10 @@ package internal.hw.crawler;
 import internal.hw.crawler.commands.Command;
 import internal.hw.crawler.commands.CommandArg;
 import internal.hw.crawler.strategies.crawl.CrawlerException;
 import internal.hw.crawler.views.ConsoleView;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import java.util.Collections;
 import java.util.HashMap;
@ -10,6 +13,7 @@ import java.util.List;
 import java.util.Map;
 public class MainController {
    private static final Logger log = LoggerFactory.getLogger(MainController.class);
    private final Map<String, Command> commands = new HashMap<>();
    private final ConsoleView view;
@ -45,8 +49,12 @@ public class MainController {
        try {
            command.execute(args);
        } catch (CrawlerException e) {
            log.warn("Crawler error in command '{}'", cmdName, e);
            view.printError(e.getMessage());
        } catch (Exception e) {
-            view.printError("Command failed: " + e.getMessage());
+            log.error("Unexpected error in command '{}'", cmdName, e);
            view.printError("Internal error: " + e.getMessage());
        }
    }
--- a/src/main/java/internal/hw/crawler/commands/CrawlCommand.java
+++ b/src/main/java/internal/hw/crawler/commands/CrawlCommand.java
@ -5,15 +5,22 @@ import internal.hw.crawler.repositories.ArticleRepository;
 import internal.hw.crawler.strategies.crawl.CrawlStrategy;
 import internal.hw.crawler.strategies.crawl.CrawlStrategyFactory;
 import internal.hw.crawler.strategies.crawl.CrawlUtils;
 import internal.hw.crawler.strategies.crawl.CrawlNetworkException;
 import internal.hw.crawler.strategies.crawl.CrawlParseException;
 import internal.hw.crawler.strategies.crawl.CrawlUnsupportedException;
 import internal.hw.crawler.views.CommandOutput;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.net.URL;
 import java.util.List;
 import java.util.Objects;
 public class CrawlCommand implements Command {
    private static final Logger log = LoggerFactory.getLogger(CrawlCommand.class);
    private final ArticleRepository repository;
    private final CommandOutput out;
    private final CrawlStrategyFactory crawlStrategyFactory = new CrawlStrategyFactory();
@ -34,24 +41,33 @@ public class CrawlCommand implements Command {
    }
    @Override
-    public void execute(String[] args) throws Exception {
+    public void execute(String[] args) {
        String urlRaw = args[1];
        URL url = new URL(urlRaw);
        CrawlStrategy strategy = crawlStrategyFactory.getStrategy(url);
        if (strategy == null) {
            out.error("Unsupported URL: " + urlRaw);
            return;
        }
        Document doc = Jsoup.connect(url.toString()).timeout(5000).get();
        CrawlUtils.setProgressCallback(msg -> System.out.print("\r" + msg));
        try {
-            List<Article> articles = strategy.parse(url, doc);
+            URL url = new URL(urlRaw);
-            System.out.println();
+            CrawlStrategy strategy = crawlStrategyFactory.getStrategy(url);
-            articles.stream().filter(Objects::nonNull).forEach(repository::add);
+            Document doc = Jsoup.connect(url.toString()).timeout(5000).get();
-            out.success(String.format("Crawled %d articles from %s", articles.size(), urlRaw));
+            CrawlUtils.setProgressCallback(msg -> System.out.print("\r" + msg));
-        } finally {
+            try {
-            CrawlUtils.clearProgressCallback();
+                List<Article> articles = strategy.parse(url, doc);
                System.out.println();
                articles.stream().filter(Objects::nonNull).forEach(repository::add);
                out.success(String.format("Crawled %d articles from %s", articles.size(), urlRaw));
            } finally {
                CrawlUtils.clearProgressCallback();
            }
        } catch (CrawlUnsupportedException e) {
            out.error(e.getMessage());
            log.warn("Unsupported URL: {}", urlRaw);
        } catch (CrawlNetworkException e) {
            out.error("Network error: " + e.getMessage());
            log.error("Crawl network failure for {}", urlRaw, e);
        } catch (CrawlParseException e) {
            out.error("Parse error: " + e.getMessage());
            log.error("Crawl parse failure for {}", urlRaw, e);
        } catch (IOException e) {
            out.error("I/O error: " + e.getMessage());
            log.error("Crawl I/O failure for {}", urlRaw, e);
        }
    }
 }
--- a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlException.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlException.java
@ -1,11 +0,0 @@
 package internal.hw.crawler.strategies.crawl;
 /**
  * Checked exception type for crawl failures, carrying a detail message and
  * an optional underlying cause.
  */
 public class CrawlException extends Exception {
    /**
     * Creates the exception with a detail message only.
     *
     * @param message description of the failure
     */
    public CrawlException(String message) {
        super(message);
    }
    /**
     * Creates the exception with a detail message and the underlying cause.
     *
     * @param message description of the failure
     * @param cause   the throwable that triggered this failure
     */
    public CrawlException(String message, Throwable cause) {
        super(message, cause);
    }
 }
--- a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlNetworkException.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlNetworkException.java
@ -0,0 +1,34 @@
 package internal.hw.crawler.strategies.crawl;
 import java.net.URL;
 /**
  * Crawler exception describing a failed network fetch.
  *
  * <p>In addition to the message, cause and URL handed to
  * {@link CrawlerException}, it records a status code and the number of
  * attempts. Constructors that are not given one of those values store a
  * fixed default instead.
  */
 public class CrawlNetworkException extends CrawlerException {
    /** Status code stored by constructors that do not receive one. */
    private static final int DEFAULT_STATUS_CODE = -1;
    /** Attempt count stored by constructors that do not receive one. */
    private static final int DEFAULT_ATTEMPTS = 0;

    private final int attempts;
    private final int statusCode;

    /**
     * Creates the exception without a cause, using the default status code
     * and attempt count.
     *
     * @param message description of the failure
     * @param url     URL that was being fetched
     */
    public CrawlNetworkException(String message, URL url) {
        this(message, null, url, DEFAULT_STATUS_CODE, DEFAULT_ATTEMPTS);
    }

    /**
     * Creates the exception with a cause, using the default status code and
     * attempt count.
     *
     * @param message description of the failure
     * @param cause   underlying throwable, may be {@code null}
     * @param url     URL that was being fetched
     */
    public CrawlNetworkException(String message, Throwable cause, URL url) {
        this(message, cause, url, DEFAULT_STATUS_CODE, DEFAULT_ATTEMPTS);
    }

    /**
     * Creates the exception with a cause and an attempt count, using the
     * default status code.
     *
     * @param message  description of the failure
     * @param cause    underlying throwable, may be {@code null}
     * @param url      URL that was being fetched
     * @param attempts number of attempts to record
     */
    public CrawlNetworkException(String message, Throwable cause, URL url, int attempts) {
        this(message, cause, url, DEFAULT_STATUS_CODE, attempts);
    }

    /**
     * Creates the exception with every value supplied explicitly; all other
     * constructors delegate here.
     *
     * @param message    description of the failure
     * @param cause      underlying throwable, may be {@code null}
     * @param url        URL that was being fetched
     * @param statusCode status code to record
     * @param attempts   number of attempts to record
     */
    public CrawlNetworkException(String message, Throwable cause, URL url, int statusCode, int attempts) {
        super(message, url, cause);
        this.attempts = attempts;
        this.statusCode = statusCode;
    }

    /** @return the recorded number of attempts */
    public int getAttempts() {
        return this.attempts;
    }

    /** @return the recorded status code */
    public int getStatusCode() {
        return this.statusCode;
    }
 }
--- a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlParseException.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlParseException.java
@ -0,0 +1,43 @@
 package internal.hw.crawler.strategies.crawl;
 import java.net.URL;
 /**
  * Crawler exception for failures while parsing a fetched page.
  *
  * <p>Two nested subtypes carry extra detail: {@link ElementNotFoundException}
  * keeps the selector that matched nothing, and {@link IdExtractionException}
  * keeps the pattern that failed to yield an id.
  */
 public class CrawlParseException extends CrawlerException {

    /**
     * @param message description of the parse failure
     */
    public CrawlParseException(String message) {
        super(message);
    }

    /**
     * @param message description of the parse failure
     * @param url     URL of the page being parsed
     */
    public CrawlParseException(String message, URL url) {
        super(message, url);
    }

    /**
     * @param message description of the parse failure
     * @param url     URL of the page being parsed
     * @param cause   underlying throwable
     */
    public CrawlParseException(String message, URL url, Throwable cause) {
        super(message, url, cause);
    }

    /** Parse failure raised for a selector that found no element in the page. */
    public static class ElementNotFoundException extends CrawlParseException {
        private final String selector;

        /**
         * @param selector selector that produced no element
         * @param url      URL of the page being parsed
         */
        public ElementNotFoundException(String selector, URL url) {
            super(describeMissingElement(selector, url), url);
            this.selector = selector;
        }

        /** Builds the detail message for a missing element. */
        private static String describeMissingElement(String selector, URL url) {
            return "Missing element '" + selector + "' in page: " + url;
        }

        /** @return the selector that produced no element */
        public String getSelector() {
            return this.selector;
        }
    }

    /** Parse failure raised when no id could be derived for a URL. */
    public static class IdExtractionException extends CrawlParseException {
        private final String pattern;

        /**
         * @param url     URL for which no id could be determined
         * @param pattern pattern text reported in the message
         */
        public IdExtractionException(URL url, String pattern) {
            super(describeMissingId(url, pattern), url);
            this.pattern = pattern;
        }

        /** Builds the detail message for a failed id extraction. */
        private static String describeMissingId(URL url, String pattern) {
            return "Cannot determine id for " + url + " (pattern: " + pattern + ")";
        }

        /** @return the pattern text supplied at construction */
        public String getPattern() {
            return this.pattern;
        }
    }
 }
--- a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategy.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategy.java
@ -7,7 +7,7 @@ import java.net.URL;
 import java.util.List;
 public interface CrawlStrategy {
-    List<Article> parse(URL url, Document doc) throws CrawlException;
+    List<Article> parse(URL url, Document doc) throws CrawlerException;
    boolean supports(URL url);
 }
--- a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java
@ -19,7 +19,7 @@ public class CrawlStrategyFactory {
                return s;
            }
        }
-        return null;
+        throw new CrawlUnsupportedException(url);
    }
    public void register(CrawlStrategy strategy) {
--- a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUnsupportedException.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUnsupportedException.java
@ -0,0 +1,9 @@
 package internal.hw.crawler.strategies.crawl;
 import java.net.URL;
 /**
  * Crawler exception signalling that a URL points at a site the crawler does
  * not support.
  */
 public class CrawlUnsupportedException extends CrawlerException {

    /**
     * @param url the unsupported URL; it is included in the message and kept
     *            on the exception
     */
    public CrawlUnsupportedException(URL url) {
        super(describe(url), url);
    }

    /** Builds the detail message for an unsupported URL. */
    private static String describe(URL url) {
        return "Unsupported site: " + url;
    }
 }
--- a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java
@ -6,6 +6,7 @@ import org.jsoup.nodes.Element;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.net.URL;
 import java.util.ArrayList;
@ -34,7 +35,7 @@ public class CrawlUtils {
    }
    public static List<Article> parseHomepage(Document doc, Pattern idRegex,
-                                              BiFunction<URL, Document, Article> singleParser) {
+                                               BiFunction<URL, Document, Article> singleParser) {
        HttpCrawler crawler = new HttpCrawler();
        Consumer<String> callback = progressCallback.get();
@ -48,6 +49,7 @@ public class CrawlUtils {
        int total = hrefs.size();
        AtomicInteger done = new AtomicInteger(0);
        AtomicInteger errors = new AtomicInteger(0);
        ExecutorService executor = Executors.newFixedThreadPool(THREAD_POOL_SIZE);
        List<CompletableFuture<Article>> futures = new ArrayList<>(total);
@ -57,8 +59,13 @@ public class CrawlUtils {
                    URL articleUrl = URI.create(href).toURL();
                    Document articleDoc = crawler.fetch(articleUrl);
                    return singleParser.apply(articleUrl, articleDoc);
-                } catch (Exception e) {
+                } catch (CrawlerException e) {
-                    log.warn("Failed to fetch article: {}", href, e);
+                    int failed = errors.incrementAndGet();
                    log.warn("Failed [{}/{}]: {} — {}", failed, total, href, e.getMessage());
                    return null;
                } catch (MalformedURLException e) {
                    int failed = errors.incrementAndGet();
                    log.warn("Failed [{}/{}]: {} — malformed URL", failed, total, href);
                    return null;
                } finally {
                    int completed = done.incrementAndGet();
@ -72,6 +79,10 @@ public class CrawlUtils {
        executor.shutdown();
        CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
        if (errors.get() > 0) {
            log.warn("Crawl completed: {}/{} articles failed", errors.get(), total);
        }
        return futures.stream()
                .map(CompletableFuture::join)
                .filter(Objects::nonNull)
--- a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlerException.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlerException.java
@ -0,0 +1,31 @@
 package internal.hw.crawler.strategies.crawl;
 import java.net.URL;
 /**
  * Abstract unchecked base type for crawler failures, optionally tagged with
  * the URL that was being processed.
  */
 public abstract class CrawlerException extends RuntimeException {
    /** URL tied to the failure; {@code null} when the constructor received none. */
    private final URL url;

    /**
     * Creates an exception with a message, no URL and no cause.
     *
     * @param message description of the failure
     */
    public CrawlerException(String message) {
        // The cast selects the (String, URL) overload over (String, Throwable).
        this(message, (URL) null);
    }

    /**
     * Creates an exception with a message and a cause but no URL.
     *
     * @param message description of the failure
     * @param cause   underlying throwable
     */
    public CrawlerException(String message, Throwable cause) {
        this(message, (URL) null, cause);
    }

    /**
     * Creates an exception with a message and a URL; the cause is left unset.
     *
     * @param message description of the failure
     * @param url     URL tied to the failure, may be {@code null}
     */
    public CrawlerException(String message, URL url) {
        // Deliberately the single-argument super call: the cause stays
        // uninitialised rather than being fixed to null.
        super(message);
        this.url = url;
    }

    /**
     * Creates an exception with a message, a URL and a cause.
     *
     * @param message description of the failure
     * @param url     URL tied to the failure, may be {@code null}
     * @param cause   underlying throwable
     */
    public CrawlerException(String message, URL url, Throwable cause) {
        super(message, cause);
        this.url = url;
    }

    /** @return the URL tied to the failure, or {@code null} if none was given */
    public URL getUrl() {
        return this.url;
    }
 }
--- a/src/main/java/internal/hw/crawler/strategies/crawl/HttpCrawler.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/HttpCrawler.java
@ -5,6 +5,7 @@ import org.jsoup.nodes.Document;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.net.URL;
 import java.util.concurrent.ThreadLocalRandom;
@ -29,26 +30,36 @@ public class HttpCrawler {
        this.userAgent = userAgent;
    }
-    public Document fetch(URL url) throws Exception {
+    public Document fetch(URL url) throws CrawlNetworkException {
        IOException lastError;
        for (int attempt = 0; attempt <= maxRetries; attempt++) {
            try {
                return Jsoup.connect(url.toString())
                        .timeout(timeoutMillis)
                        .userAgent(userAgent)
                        .get();
-            } catch (Exception e) {
+            } catch (IOException e) {
                lastError = e;
                if (attempt < maxRetries) {
                    long delay = computeBackoff(attempt);
                    log.warn("Failed to fetch {}, attempt {}/{}: {}. Retrying in {}ms",
                            url, attempt + 1, maxRetries, e.getMessage(), delay);
-                    Thread.sleep(delay);
+                    try {
                        Thread.sleep(delay);
                    } catch (InterruptedException ie) {
                        Thread.currentThread().interrupt();
                        throw new CrawlNetworkException(
                                "Interrupted while fetching " + url, ie, url, maxRetries + 1);
                    }
                } else {
                    log.error("Failed to fetch {} after {} attempts", url, maxRetries + 1);
-                    throw e;
+                    throw new CrawlNetworkException(
                            "Failed to fetch " + url + " after " + (maxRetries + 1) + " attempts",
                            lastError, url, maxRetries + 1);
                }
            }
        }
-        throw new RuntimeException("Unreachable");
+        throw new CrawlNetworkException("Unreachable", url);
    }
    public void rateLimit() throws InterruptedException {
--- a/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java
@ -24,12 +24,12 @@ public class IthomeCrawlStrategy implements CrawlStrategy {
    }
    @Override
-    public List<Article> parse(URL url, Document doc) throws CrawlException {
+    public List<Article> parse(URL url, Document doc) throws CrawlParseException {
        if (isHomepage(url)) {
            return CrawlUtils.parseHomepage(doc, idRegex, (articleUrl, articleDoc) -> {
                try {
                    return parseSingle(articleUrl, articleDoc);
-                } catch (CrawlException e) {
+                } catch (CrawlParseException e) {
                    log.warn("Failed to parse article: {}", articleUrl, e);
                    return null;
                }
@ -44,23 +44,23 @@ public class IthomeCrawlStrategy implements CrawlStrategy {
        return (path == null || path.isEmpty() || path.equals("/"));
    }
-    private Article parseSingle(URL url, Document doc) throws CrawlException {
+    private Article parseSingle(URL url, Document doc) throws CrawlParseException {
        Matcher matcher = idRegex.matcher(url.getPath());
        if (!matcher.find()) {
-            throw new CrawlException(String.format("Cannot determine id for %s", url));
+            throw new CrawlParseException.IdExtractionException(url, idRegex.pattern());
        }
        String id = String.format("%s-%s-%s", matcher.group(1), matcher.group(2), matcher.group(3));
        Element h1 = doc.selectFirst("h1");
        if (h1 == null) {
-            throw new CrawlException("Missing <h1> element in page: " + url);
+            throw new CrawlParseException.ElementNotFoundException("h1", url);
        }
        String title = h1.text();
        Element paragraph = doc.selectFirst("#paragraph");
        if (paragraph == null) {
-            throw new CrawlException("Missing #paragraph element in page: " + url);
+            throw new CrawlParseException.ElementNotFoundException("#paragraph", url);
        }
        String content = paragraph.text();
--- a/src/main/java/internal/hw/crawler/strategies/crawl/NeteaseNewsCrawlStrategy.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/NeteaseNewsCrawlStrategy.java
@ -25,12 +25,12 @@ public class NeteaseNewsCrawlStrategy implements CrawlStrategy {
    }
    @Override
-    public List<Article> parse(URL url, Document doc) throws CrawlException {
+    public List<Article> parse(URL url, Document doc) throws CrawlParseException {
        if (isHomepage(url)) {
            return CrawlUtils.parseHomepage(doc, idRegex, (articleUrl, articleDoc) -> {
                try {
                    return parseSingle(articleUrl, articleDoc);
-                } catch (CrawlException e) {
+                } catch (CrawlParseException e) {
                    log.warn("Failed to parse article: {}", articleUrl, e);
                    return null;
                }
@ -45,20 +45,20 @@ public class NeteaseNewsCrawlStrategy implements CrawlStrategy {
        return path == null || path.isEmpty() || path.equals("/");
    }
-    private Article parseSingle(URL url, Document doc) throws CrawlException {
+    private Article parseSingle(URL url, Document doc) throws CrawlParseException {
        Matcher matcher = idRegex.matcher(url.getPath());
        if (!matcher.find()) {
-            throw new CrawlException(String.format("Cannot determine id for %s", url));
+            throw new CrawlParseException.IdExtractionException(url, idRegex.pattern());
        }
        String id = matcher.group(1);
        Element titleEl = doc.selectFirst("h1.post_title");
-        if (titleEl == null) throw new CrawlException(String.format("Cannot find title for %s", url));
+        if (titleEl == null) throw new CrawlParseException.ElementNotFoundException("h1.post_title", url);
        String title = titleEl.text();
        Element contentEl = doc.selectFirst("div.post_body");
-        if (contentEl == null) throw new CrawlException(String.format("Cannot find content for %s", url));
+        if (contentEl == null) throw new CrawlParseException.ElementNotFoundException("div.post_body", url);
        String content = contentEl.text();
        Article article = new Article();
--- a/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java
@ -31,12 +31,12 @@ public class PeopleCnCrawlStrategy implements CrawlStrategy {
    }
    @Override
-    public List<Article> parse(URL url, Document doc) throws CrawlException {
+    public List<Article> parse(URL url, Document doc) throws CrawlParseException {
        if (isHomepage(url)) {
            return CrawlUtils.parseHomepage(doc, idRegex, (articleUrl, articleDoc) -> {
                try {
                    return parseSingle(articleUrl, articleDoc);
-                } catch (CrawlException e) {
+                } catch (CrawlParseException e) {
                    log.warn("Failed to parse article: {}", articleUrl, e);
                    return null;
                }
@ -55,15 +55,15 @@ public class PeopleCnCrawlStrategy implements CrawlStrategy {
        return path == null || path.isEmpty() || path.equals("/");
    }
-    private Article parseSingle(URL url, Document doc) throws CrawlException {
+    private Article parseSingle(URL url, Document doc) throws CrawlParseException {
        Matcher matcher = idRegex.matcher(url.getPath());
        if (!matcher.find()) {
-            throw new CrawlException(String.format("Cannot determine id for %s", url));
+            throw new CrawlParseException.IdExtractionException(url, idRegex.pattern());
        }
        String id = String.format("%s%s-c%s-%s", matcher.group(1), matcher.group(2), matcher.group(3), matcher.group(4));
        Element titleEl = doc.selectFirst(".layout.rm_txt h1");
-        if (titleEl == null) throw new CrawlException(String.format("Cannot find title for %s", url));
+        if (titleEl == null) throw new CrawlParseException.ElementNotFoundException(".layout.rm_txt h1", url);
        String title = titleEl.text();
        Set<String> authors = new HashSet<>();
@ -73,7 +73,7 @@ public class PeopleCnCrawlStrategy implements CrawlStrategy {
        }
        Element contentEl = doc.selectFirst("div#rm_txt_zw");
-        if (contentEl == null) throw new CrawlException(String.format("Cannot find content for %s", url));
+        if (contentEl == null) throw new CrawlParseException.ElementNotFoundException("div#rm_txt_zw", url);
        String content = contentEl.text();
        Article article = new Article();