Browse Source

concurrency

master
283375 4 weeks ago
parent
commit
7f571273b2
Failed to extract signature
  1. 31
      src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java

31
src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java

@@ -10,34 +10,47 @@ import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.function.BiFunction;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Utility methods for turning the links of a page into parsed {@code Article}s.
 */
public class CrawlUtils {
    private static final Logger log = LoggerFactory.getLogger(CrawlUtils.class);

    /** Number of worker threads used to fetch linked articles concurrently. */
    private static final int THREAD_POOL_SIZE = 4;

    /**
     * Fetches and parses every article linked from {@code doc}.
     *
     * <p>Each {@code <a>} element whose absolute {@code href} is non-empty and contains a
     * match for {@code idRegex} is fetched and parsed on a fixed pool of
     * {@value #THREAD_POOL_SIZE} threads. The method blocks until all of those tasks have
     * finished. Links that fail to fetch or parse are logged and skipped.
     *
     * @param doc          page whose anchors are scanned for article links
     * @param idRegex      pattern an absolute link URL must contain to be treated as an article
     * @param singleParser converts a fetched article page (URL + document) into an
     *                     {@code Article}; a {@code null} result drops the link
     * @return the successfully parsed articles, in the order their links appear in {@code doc}
     */
    public static List<Article> parseHomepage(Document doc, Pattern idRegex,
            BiFunction<URL, Document, Article> singleParser) {
        // NOTE(review): one crawler instance is shared by all worker threads; this assumes
        // HttpCrawler is safe for concurrent use — TODO confirm.
        HttpCrawler crawler = new HttpCrawler();
        ExecutorService executor = Executors.newFixedThreadPool(THREAD_POOL_SIZE);
        try {
            List<CompletableFuture<Article>> futures = new ArrayList<>();
            for (Element link : doc.getElementsByTag("a")) {
                String href = link.absUrl("href");
                if (href.isEmpty() || !idRegex.matcher(href).find()) {
                    continue;
                }
                futures.add(CompletableFuture.supplyAsync(
                        () -> fetchArticle(crawler, href, singleParser), executor));
            }

            // Wait for every task, then gather results from the futures themselves rather
            // than from a shared list, so no worker thread mutates common state.
            CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
            return futures.stream()
                    .map(CompletableFuture::join)
                    .filter(Objects::nonNull)
                    .collect(Collectors.toList());
        } finally {
            // Always release the pool threads, even if waiting on the tasks fails.
            executor.shutdown();
        }
    }

    /**
     * Fetches one article page and parses it; runs on a worker thread.
     *
     * @return the parsed article, or {@code null} if fetching or parsing failed or the
     *         parser itself returned {@code null}
     */
    private static Article fetchArticle(HttpCrawler crawler, String href,
            BiFunction<URL, Document, Article> singleParser) {
        try {
            crawler.rateLimit();
            URL articleUrl = URI.create(href).toURL();
            Document articleDoc = crawler.fetch(articleUrl);
            return singleParser.apply(articleUrl, articleDoc);
        } catch (CrawlException e) {
            log.warn("Failed to parse article: {}", href, e);
            return null;
        } catch (Exception e) {
            // Boundary catch: one bad link (malformed URL, I/O error, parser bug) must not
            // abort the whole crawl.
            log.warn("Failed to fetch article: {}", href, e);
            return null;
        }
    }
}

Loading…
Cancel
Save