From 18eca216fd3223f86eb9432a9b55fc75e37b7396 Mon Sep 17 00:00:00 2001 From: 283375 Date: Sat, 23 May 2026 22:41:20 +0800 Subject: [PATCH] show progress when parsing homepage --- .../hw/crawler/commands/CrawlCommand.java | 13 +++++++-- .../crawler/strategies/crawl/CrawlUtils.java | 28 +++++++++++++++++-- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/src/main/java/internal/hw/crawler/commands/CrawlCommand.java b/src/main/java/internal/hw/crawler/commands/CrawlCommand.java index 6012c03..6df7e8d 100644 --- a/src/main/java/internal/hw/crawler/commands/CrawlCommand.java +++ b/src/main/java/internal/hw/crawler/commands/CrawlCommand.java @@ -4,6 +4,7 @@ import internal.hw.crawler.models.Article; import internal.hw.crawler.repositories.ArticleRepository; import internal.hw.crawler.strategies.crawl.CrawlStrategy; import internal.hw.crawler.strategies.crawl.CrawlStrategyFactory; +import internal.hw.crawler.strategies.crawl.CrawlUtils; import internal.hw.crawler.views.CommandOutput; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; @@ -43,8 +44,14 @@ public class CrawlCommand implements Command { } Document doc = Jsoup.connect(url.toString()).timeout(5000).get(); - List<Article>
articles = strategy.parse(url, doc); - articles.stream().filter(Objects::nonNull).forEach(repository::add); - out.success(String.format("Crawled %d articles from %s", articles.size(), urlRaw)); + CrawlUtils.setProgressCallback(msg -> System.out.print("\r" + msg)); + try { + List<Article>
articles = strategy.parse(url, doc); + System.out.println(); + articles.stream().filter(Objects::nonNull).forEach(repository::add); + out.success(String.format("Crawled %d articles from %s", articles.size(), urlRaw)); + } finally { + CrawlUtils.clearProgressCallback(); + } } } diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java index a3d03cb..7942dbd 100644 --- a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java +++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java @@ -14,25 +14,44 @@ import java.util.Objects; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicInteger; import java.util.function.BiFunction; +import java.util.function.Consumer; import java.util.regex.Pattern; import java.util.stream.Collectors; public class CrawlUtils { private static final Logger log = LoggerFactory.getLogger(CrawlUtils.class); private static final int THREAD_POOL_SIZE = 4; + private static final ThreadLocal<Consumer<String>> progressCallback = new ThreadLocal<>(); + + public static void setProgressCallback(Consumer<String> callback) { + progressCallback.set(callback); + } + + public static void clearProgressCallback() { + progressCallback.remove(); + } public static List<Article>
parseHomepage(Document doc, Pattern idRegex, BiFunction<URL, Document, Article> singleParser) { HttpCrawler crawler = new HttpCrawler(); - ExecutorService executor = Executors.newFixedThreadPool(THREAD_POOL_SIZE); - List<CompletableFuture<Article>> futures = new ArrayList<>(); + Consumer<String> callback = progressCallback.get(); + List<String> hrefs = new ArrayList<>(); for (Element link : doc.getElementsByTag("a")) { String href = link.absUrl("href"); if (href.isEmpty()) continue; if (!idRegex.matcher(href).find()) continue; + hrefs.add(href); + } + + int total = hrefs.size(); + AtomicInteger done = new AtomicInteger(0); + ExecutorService executor = Executors.newFixedThreadPool(THREAD_POOL_SIZE); + List<CompletableFuture<Article>> futures = new ArrayList<>(total); + for (String href : hrefs) { futures.add(CompletableFuture.supplyAsync(() -> { try { URL articleUrl = URI.create(href).toURL(); @@ -41,6 +60,11 @@ public class CrawlUtils { } catch (Exception e) { log.warn("Failed to fetch article: {}", href, e); return null; + } finally { + int completed = done.incrementAndGet(); + if (callback != null) { + callback.accept("Progress: " + completed + "/" + total); + } } }, executor)); }