diff --git a/src/main/java/internal/hw/crawler/Main.java b/src/main/java/internal/hw/crawler/Main.java index feef9a9..92451d9 100644 --- a/src/main/java/internal/hw/crawler/Main.java +++ b/src/main/java/internal/hw/crawler/Main.java @@ -10,7 +10,7 @@ public class Main { MainController controller = new MainController(view); ArticleRepository repository = new ArticleRepository(); - controller.registerCommand(new ExitCommand()); + controller.registerCommand(new ExitCommand(view)); controller.registerCommand(new CrawlCommand(repository, view)); controller.registerCommand(new ListCommand(repository, view)); controller.registerCommand(new SaveCommand(repository, view)); @@ -18,7 +18,17 @@ public class Main { view.printSuccess("Welcome to crawler. Type `help` for a list of available commands."); while (true) { - controller.handleInput(view.readLine()); + try { + String line = view.readLine(); + // stdin 读取异常时退出 + if (line == null) { + controller.handleInput(new ExitCommand(view).getName()); + break; + } + controller.handleInput(line); + } catch (Exception e) { + view.printError("Unexpected error: " + e.getMessage()); + } } } } \ No newline at end of file diff --git a/src/main/java/internal/hw/crawler/MainController.java b/src/main/java/internal/hw/crawler/MainController.java index fdbf603..e9a6460 100644 --- a/src/main/java/internal/hw/crawler/MainController.java +++ b/src/main/java/internal/hw/crawler/MainController.java @@ -43,7 +43,11 @@ public class MainController { return; } - command.execute(args); + try { + command.execute(args); + } catch (Exception e) { + view.printError("Command failed: " + e.getMessage()); + } } private boolean validateArgs(Command command, String[] args) { diff --git a/src/main/java/internal/hw/crawler/commands/Command.java b/src/main/java/internal/hw/crawler/commands/Command.java index e461cc4..dbfa8fd 100644 --- a/src/main/java/internal/hw/crawler/commands/Command.java +++ b/src/main/java/internal/hw/crawler/commands/Command.java @@ -9,5 +9,5 @@ public interface Command { return List.of(); } - void execute(String[] args); + void execute(String[] args) throws Exception; } diff --git a/src/main/java/internal/hw/crawler/commands/CrawlCommand.java b/src/main/java/internal/hw/crawler/commands/CrawlCommand.java index 821f94e..7e00e53 100644 --- a/src/main/java/internal/hw/crawler/commands/CrawlCommand.java +++ b/src/main/java/internal/hw/crawler/commands/CrawlCommand.java @@ -32,24 +32,22 @@ public class CrawlCommand implements Command { } @Override - public void execute(String[] args) { - try { - String urlRaw = args[1]; - URL url = new URL(urlRaw); - CrawlStrategy strategy = crawlStrategyFactory.getStrategy(url); - if (strategy == null) { - out.error("Unsupported URL: " + urlRaw); - return; - } + public void execute(String[] args) throws Exception { + String urlRaw = args[1]; + URL url = new URL(urlRaw); + CrawlStrategy strategy = crawlStrategyFactory.getStrategy(url); + if (strategy == null) { + out.error("Unsupported URL: " + urlRaw); + return; + } - Document doc = Jsoup.parse(url, 5000); - List
articles = strategy.parse(url, doc); - for (Article article : articles) { + Document doc = Jsoup.parse(url, 5000); + List
articles = strategy.parse(url, doc); + for (Article article : articles) { + if (article != null) { repository.add(article); } - out.success(String.format("Crawled %d articles from %s", articles.size(), urlRaw)); - } catch (Exception e) { - throw new RuntimeException(e); } + out.success(String.format("Crawled %d articles from %s", articles.size(), urlRaw)); } } diff --git a/src/main/java/internal/hw/crawler/commands/ExitCommand.java b/src/main/java/internal/hw/crawler/commands/ExitCommand.java index 5c80548..1464586 100644 --- a/src/main/java/internal/hw/crawler/commands/ExitCommand.java +++ b/src/main/java/internal/hw/crawler/commands/ExitCommand.java @@ -1,6 +1,14 @@ package internal.hw.crawler.commands; +import internal.hw.crawler.views.CommandOutput; + public class ExitCommand implements Command { + private final CommandOutput out; + + public ExitCommand(CommandOutput out) { + this.out = out; + } + @Override public String getName() { return "exit"; @@ -8,6 +16,7 @@ public class ExitCommand implements Command { @Override public void execute(String[] args) { + out.info("Goodbye."); System.exit(0); } } diff --git a/src/main/java/internal/hw/crawler/commands/SaveCommand.java b/src/main/java/internal/hw/crawler/commands/SaveCommand.java index e906395..24b55d1 100644 --- a/src/main/java/internal/hw/crawler/commands/SaveCommand.java +++ b/src/main/java/internal/hw/crawler/commands/SaveCommand.java @@ -5,6 +5,7 @@ import internal.hw.crawler.models.Article; import internal.hw.crawler.repositories.ArticleRepository; import internal.hw.crawler.views.CommandOutput; +import com.google.gson.JsonSyntaxException; import java.io.*; import java.util.*; import java.util.stream.Collectors; @@ -25,7 +26,7 @@ public class SaveCommand implements Command { } @Override - public void execute(String[] args) { + public void execute(String[] args) throws IOException { String filename = "articles.output.json"; List
articles = getExistingArticles(filename); @@ -38,11 +39,9 @@ public class SaveCommand implements Command { Article[] articlesToSave = articleMap.values().toArray(new Article[0]); - try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename));) { + try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename))) { writer.write(gson.toJson(articlesToSave)); out.success(String.format("Wrote %d articles to %s", articlesToSave.length, filename)); - } catch (IOException e) { - throw new RuntimeException(e); } } @@ -54,7 +53,10 @@ public class SaveCommand implements Command { try (BufferedReader reader = new BufferedReader(new FileReader(filename))) { Article[] articles = gson.fromJson(reader, Article[].class); return Arrays.asList(articles); - } catch (IOException e) { + } catch (FileNotFoundException e) { + return List.of(); + } catch (IOException | JsonSyntaxException e) { + out.error("Failed to read existing articles: " + e.getMessage()); return List.of(); } } diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java b/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java index 0ea4f6a..34d33ff 100644 --- a/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java +++ b/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java @@ -2,6 +2,7 @@ package internal.hw.crawler.strategies.crawl; import internal.hw.crawler.models.Article; import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; import java.net.URL; import java.util.HashSet; @@ -25,17 +26,29 @@ public class IthomeCrawlStrategy implements CrawlStrategy { private Article parseSingle(URL url, Document doc) throws CrawlException { Matcher matcher = idRegex.matcher(url.getPath()); - if (!matcher.find()) throw new CrawlException(String.format("Cannot determine id for %s", url)); + if (!matcher.find()) { + throw new CrawlException(String.format("Cannot determine id for %s", url)); + } String id = String.format("%s-%s-%s", matcher.group(1), matcher.group(2), matcher.group(3)); - String title = doc.selectFirst("h1").text(); - String content = doc.selectFirst("#paragraph").text(); - String authorRaw = doc.selectFirst("#author_baidu > strong").text(); - String editorRaw = doc.selectFirst("#editor_baidu > strong").text(); + Element h1 = doc.selectFirst("h1"); + if (h1 == null) { + throw new CrawlException("Missing

element in page: " + url); + } + String title = h1.text(); + + Element paragraph = doc.selectFirst("#paragraph"); + if (paragraph == null) { + throw new CrawlException("Missing #paragraph element in page: " + url); + } + String content = paragraph.text(); + + Element authorEl = doc.selectFirst("#author_baidu > strong"); + Element editorEl = doc.selectFirst("#editor_baidu > strong"); Set authors = new HashSet<>(); - authors.add(authorRaw); - authors.add(editorRaw); + if (authorEl != null) authors.add(authorEl.text()); + if (editorEl != null) authors.add(editorEl.text()); Article article = new Article(); article.setId(id); diff --git a/src/main/java/internal/hw/crawler/views/ConsoleView.java b/src/main/java/internal/hw/crawler/views/ConsoleView.java index a8416c7..6e8f027 100644 --- a/src/main/java/internal/hw/crawler/views/ConsoleView.java +++ b/src/main/java/internal/hw/crawler/views/ConsoleView.java @@ -1,5 +1,6 @@ package internal.hw.crawler.views; +import java.util.NoSuchElementException; import java.util.Scanner; public class ConsoleView implements CommandOutput { @@ -12,7 +13,11 @@ public class ConsoleView implements CommandOutput { public String readLine() { System.out.print("> "); - return scanner.nextLine(); + try { + return scanner.nextLine(); + } catch (NoSuchElementException | IllegalStateException e) { + return null; + } } @Override