Browse Source

concurrency

master
283375 4 weeks ago
parent
commit
7f571273b2
Failed to extract signature
  1. 41
      src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java

41
src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java

@ -10,34 +10,47 @@ import java.net.URI;
import java.net.URL; import java.net.URL;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Objects;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.function.BiFunction; import java.util.function.BiFunction;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Collectors;
public class CrawlUtils { public class CrawlUtils {
private static final Logger log = LoggerFactory.getLogger(CrawlUtils.class); private static final Logger log = LoggerFactory.getLogger(CrawlUtils.class);
private static final int THREAD_POOL_SIZE = 4;
public static List<Article> parseHomepage(Document doc, Pattern idRegex, public static List<Article> parseHomepage(Document doc, Pattern idRegex,
BiFunction<URL, Document, Article> singleParser) { BiFunction<URL, Document, Article> singleParser) {
HttpCrawler crawler = new HttpCrawler(); HttpCrawler crawler = new HttpCrawler();
List<Article> articles = new ArrayList<>(); ExecutorService executor = Executors.newFixedThreadPool(THREAD_POOL_SIZE);
List<CompletableFuture<Article>> futures = new ArrayList<>();
for (Element link : doc.getElementsByTag("a")) { for (Element link : doc.getElementsByTag("a")) {
String href = link.absUrl("href"); String href = link.absUrl("href");
if (href.isEmpty()) continue; if (href.isEmpty()) continue;
if (!idRegex.matcher(href).find()) continue; if (!idRegex.matcher(href).find()) continue;
try {
crawler.rateLimit(); futures.add(CompletableFuture.supplyAsync(() -> {
URL articleUrl = URI.create(href).toURL(); try {
Document articleDoc = crawler.fetch(articleUrl); URL articleUrl = URI.create(href).toURL();
Article article = singleParser.apply(articleUrl, articleDoc); Document articleDoc = crawler.fetch(articleUrl);
if (article != null) { return singleParser.apply(articleUrl, articleDoc);
articles.add(article); } catch (Exception e) {
log.warn("Failed to fetch article: {}", href, e);
return null;
} }
} catch (CrawlException e) { }, executor));
log.warn("Failed to parse article: {}", href, e);
} catch (Exception e) {
log.warn("Failed to fetch article: {}", href, e);
}
} }
return articles;
executor.shutdown();
CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
return futures.stream()
.map(CompletableFuture::join)
.filter(Objects::nonNull)
.collect(Collectors.toList());
} }
} }

Loading…
Cancel
Save