package internal.hw.crawler.strategies.crawl; import internal.hw.crawler.models.Article; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.net.MalformedURLException; import java.net.URI; import java.net.URL; import java.util.ArrayList; import java.util.List; import java.util.Objects; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.BiFunction; import java.util.function.Consumer; import java.util.regex.Pattern; import java.util.stream.Collectors; public class CrawlUtils { private static final Logger log = LoggerFactory.getLogger(CrawlUtils.class); private static final int THREAD_POOL_SIZE = 4; private static final ThreadLocal> progressCallback = new ThreadLocal<>(); public static void setProgressCallback(Consumer callback) { progressCallback.set(callback); } public static void clearProgressCallback() { progressCallback.remove(); } public static List
parseHomepage(Document doc, Pattern idRegex, BiFunction singleParser) { HttpCrawler crawler = new HttpCrawler(); Consumer callback = progressCallback.get(); List hrefs = new ArrayList<>(); for (Element link : doc.getElementsByTag("a")) { String href = link.absUrl("href"); if (href.isEmpty()) continue; if (!idRegex.matcher(href).find()) continue; hrefs.add(href); } int total = hrefs.size(); AtomicInteger done = new AtomicInteger(0); AtomicInteger errors = new AtomicInteger(0); ExecutorService executor = Executors.newFixedThreadPool(THREAD_POOL_SIZE); List> futures = new ArrayList<>(total); for (String href : hrefs) { futures.add(CompletableFuture.supplyAsync(() -> { try { URL articleUrl = URI.create(href).toURL(); Document articleDoc = crawler.fetch(articleUrl); return singleParser.apply(articleUrl, articleDoc); } catch (CrawlerException e) { int failed = errors.incrementAndGet(); log.warn("Failed [{}/{}]: {} — {}", failed, total, href, e.getMessage()); return null; } catch (MalformedURLException e) { int failed = errors.incrementAndGet(); log.warn("Failed [{}/{}]: {} — malformed URL", failed, total, href); return null; } finally { int completed = done.incrementAndGet(); if (callback != null) { callback.accept("Progress: " + completed + "/" + total); } } }, executor)); } executor.shutdown(); CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join(); if (errors.get() > 0) { log.warn("Crawl completed: {}/{} articles failed", errors.get(), total); } return futures.stream() .map(CompletableFuture::join) .filter(Objects::nonNull) .collect(Collectors.toList()); } }