diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java
index ab7e3bc..a3d03cb 100644
--- a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java
@@ -10,34 +10,47 @@ import java.net.URI;
 import java.net.URL;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Objects;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
 import java.util.function.BiFunction;
 import java.util.regex.Pattern;
+import java.util.stream.Collectors;
 
 public class CrawlUtils {
     private static final Logger log = LoggerFactory.getLogger(CrawlUtils.class);
+    private static final int THREAD_POOL_SIZE = 4;
 
     public static List<Article> parseHomepage(Document doc, Pattern idRegex,
                                               BiFunction<URL, Document, Article> singleParser) {
         HttpCrawler crawler = new HttpCrawler();
-        List<Article> articles = new ArrayList<>();
+        ExecutorService executor = Executors.newFixedThreadPool(THREAD_POOL_SIZE);
+        List<CompletableFuture<Article>> futures = new ArrayList<>();
+
         for (Element link : doc.getElementsByTag("a")) {
             String href = link.absUrl("href");
             if (href.isEmpty()) continue;
             if (!idRegex.matcher(href).find()) continue;
-            try {
-                crawler.rateLimit();
-                URL articleUrl = URI.create(href).toURL();
-                Document articleDoc = crawler.fetch(articleUrl);
-                Article article = singleParser.apply(articleUrl, articleDoc);
-                if (article != null) {
-                    articles.add(article);
+
+            futures.add(CompletableFuture.supplyAsync(() -> {
+                try {
+                    URL articleUrl = URI.create(href).toURL();
+                    Document articleDoc = crawler.fetch(articleUrl);
+                    return singleParser.apply(articleUrl, articleDoc);
+                } catch (Exception e) {
+                    log.warn("Failed to fetch article: {}", href, e);
+                    return null;
                 }
-            } catch (CrawlException e) {
-                log.warn("Failed to parse article: {}", href, e);
-            } catch (Exception e) {
-                log.warn("Failed to fetch article: {}", href, e);
-            }
+            }, executor));
         }
-        return articles;
+
+        executor.shutdown();
+        CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
+
+        return futures.stream()
+                .map(CompletableFuture::join)
+                .filter(Objects::nonNull)
+                .collect(Collectors.toList());
     }
 }