project/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java


								package internal.hw.crawler.strategies.crawl;


								import internal.hw.crawler.models.Article;

								import org.jsoup.nodes.Document;

								import org.jsoup.nodes.Element;

								import org.slf4j.Logger;

								import org.slf4j.LoggerFactory;


								import java.net.MalformedURLException;

								import java.net.URI;

								import java.net.URL;

								import java.util.ArrayList;

								import java.util.List;

								import java.util.Objects;

								import java.util.concurrent.CompletableFuture;

								import java.util.concurrent.ExecutorService;

								import java.util.concurrent.Executors;

								import java.util.concurrent.atomic.AtomicInteger;

								import java.util.function.BiFunction;

								import java.util.function.Consumer;

								import java.util.regex.Pattern;

								import java.util.stream.Collectors;


								/**
								 * Static helpers for crawling a site's homepage: finds links to articles and
								 * fetches/parses each linked page on a small fixed-size thread pool.
								 *
								 * <p>A progress callback can be registered per thread via
								 * {@link #setProgressCallback(Consumer)}; {@link #parseHomepage} picks up the callback
								 * registered on the thread that calls it.
								 */
								public class CrawlUtils {

								    private static final Logger log = LoggerFactory.getLogger(CrawlUtils.class);

								    /** Number of worker threads used to fetch article pages in parallel. */
								    private static final int THREAD_POOL_SIZE = 4;

								    /** Per-thread progress listener; read by {@link #parseHomepage} on the calling thread. */
								    private static final ThreadLocal<Consumer<String>> progressCallback = new ThreadLocal<>();

								    /**
								     * Registers a progress listener for crawls started from the current thread.
								     *
								     * @param callback receives messages of the form {@code "Progress: <done>/<total>"};
								     *                 may be {@code null}, in which case no progress is reported
								     */
								    public static void setProgressCallback(Consumer<String> callback) {
								        progressCallback.set(callback);
								    }

								    /** Removes the progress listener registered for the current thread, if any. */
								    public static void clearProgressCallback() {
								        progressCallback.remove();
								    }

								    /**
								     * Extracts article links from a homepage and fetches and parses each one in parallel.
								     *
								     * <p>Every {@code <a>} element whose absolute {@code href} is non-empty and matches
								     * {@code idRegex} (via {@code find()}) is treated as an article link. Links that
								     * cannot be turned into a URL, or whose fetch fails with a {@code CrawlerException},
								     * are logged and skipped; they do not abort the crawl.
								     *
								     * <p>The progress callback is invoked from the worker threads, not from the calling
								     * thread, and may be invoked by several workers at the same time.
								     *
								     * @param doc          parsed homepage document
								     * @param idRegex      pattern an absolute link must contain to count as an article link
								     * @param singleParser converts a fetched article page into an {@code Article}; a
								     *                     {@code null} result is dropped from the returned list
								     * @return the successfully parsed articles, in the order their links appear in
								     *         {@code doc}
								     * @throws java.util.concurrent.CompletionException if a task fails with an exception
								     *         other than the skipped cases above (for example one thrown by
								     *         {@code singleParser} or by the progress callback)
								     */
								    public static List<Article> parseHomepage(Document doc, Pattern idRegex,
								                                               BiFunction<URL, Document, Article> singleParser) {
								        HttpCrawler crawler = new HttpCrawler();
								        // Captured here because the worker threads do not see this thread's ThreadLocal value.
								        Consumer<String> callback = progressCallback.get();

								        List<String> hrefs = collectArticleLinks(doc, idRegex);
								        int total = hrefs.size();
								        if (total == 0) {
								            // Nothing to fetch: avoid spinning up a thread pool for no work.
								            return new ArrayList<>();
								        }

								        AtomicInteger done = new AtomicInteger(0);
								        AtomicInteger errors = new AtomicInteger(0);
								        ExecutorService executor = Executors.newFixedThreadPool(THREAD_POOL_SIZE);
								        List<CompletableFuture<Article>> futures = new ArrayList<>(total);

								        try {
								            for (String href : hrefs) {
								                futures.add(CompletableFuture.supplyAsync(() -> {
								                    try {
								                        URL articleUrl = toUrl(href);
								                        Document articleDoc = crawler.fetch(articleUrl);
								                        return singleParser.apply(articleUrl, articleDoc);
								                    } catch (CrawlerException e) {
								                        int failed = errors.incrementAndGet();
								                        log.warn("Failed [{}/{}]: {} — {}", failed, total, href, e.getMessage());
								                        return null;
								                    } catch (MalformedURLException e) {
								                        int failed = errors.incrementAndGet();
								                        log.warn("Failed [{}/{}]: {} — malformed URL", failed, total, href);
								                        return null;
								                    } finally {
								                        // Count the link as processed whether it succeeded or failed.
								                        int completed = done.incrementAndGet();
								                        if (callback != null) {
								                            callback.accept("Progress: " + completed + "/" + total);
								                        }
								                    }
								                }, executor));
								            }

								            CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
								        } finally {
								            // Orderly shutdown: already-submitted tasks still run, but the pool's
								            // threads are released even if submitting or waiting throws.
								            executor.shutdown();
								        }

								        if (errors.get() > 0) {
								            log.warn("Crawl completed: {}/{} articles failed", errors.get(), total);
								        }

								        // Failed links produced null; keep only the parsed articles, in link order.
								        return futures.stream()
								                .map(CompletableFuture::join)
								                .filter(Objects::nonNull)
								                .collect(Collectors.toList());
								    }

								    /** Returns the absolute hrefs of all anchors in {@code doc} that match {@code idRegex}. */
								    private static List<String> collectArticleLinks(Document doc, Pattern idRegex) {
								        List<String> hrefs = new ArrayList<>();
								        for (Element link : doc.getElementsByTag("a")) {
								            String href = link.absUrl("href");
								            if (!href.isEmpty() && idRegex.matcher(href).find()) {
								                hrefs.add(href);
								            }
								        }
								        return hrefs;
								    }

								    /**
								     * Converts a link to a {@link URL}, reporting every syntax problem as
								     * {@link MalformedURLException}.
								     *
								     * <p>{@link URI#create(String)} and {@link URI#toURL()} signal invalid input with the
								     * unchecked {@link IllegalArgumentException}; left unhandled, one bad link in the page
								     * would fail its task and abort the whole crawl.
								     */
								    private static URL toUrl(String href) throws MalformedURLException {
								        try {
								            return URI.create(href).toURL();
								        } catch (IllegalArgumentException e) {
								            MalformedURLException malformed = new MalformedURLException(e.getMessage());
								            malformed.initCause(e);
								            throw malformed;
								        }
								    }
								}