/*
 * You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 *
 * 91 lines
 * 3.4 KiB
 */

package internal.hw.crawler.strategies.crawl;
import internal.hw.crawler.models.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.BiFunction;
import java.util.function.Consumer;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Static helpers for crawl strategies: extracts article links from a homepage and
 * fetches/parses the linked articles concurrently.
 *
 * <p>Progress reporting is opt-in. A callback registered with
 * {@link #setProgressCallback(Consumer)} is stored in a {@link ThreadLocal}, so it is only
 * visible to {@link #parseHomepage} calls made from the thread that registered it.
 */
public class CrawlUtils {
    private static final Logger log = LoggerFactory.getLogger(CrawlUtils.class);

    /** Number of worker threads used to fetch articles for a single homepage. */
    private static final int THREAD_POOL_SIZE = 4;

    /** Progress listener of the current thread; {@code null} when none is registered. */
    private static final ThreadLocal<Consumer<String>> progressCallback = new ThreadLocal<>();

    /**
     * Registers a progress listener for the calling thread.
     *
     * <p>The listener receives messages of the form {@code "Progress: <completed>/<total>"}.
     * It is invoked from the worker threads of {@link #parseHomepage}, possibly concurrently,
     * not from the registering thread.
     *
     * @param callback listener to register; {@code null} disables progress reporting
     */
    public static void setProgressCallback(Consumer<String> callback) {
        progressCallback.set(callback);
    }

    /** Removes the progress listener registered by the calling thread, if any. */
    public static void clearProgressCallback() {
        progressCallback.remove();
    }

    /**
     * Collects every link in {@code doc} whose absolute URL matches {@code idRegex}, fetches the
     * linked pages on a fixed pool of {@value #THREAD_POOL_SIZE} threads and parses each one with
     * {@code singleParser}.
     *
     * <p>The crawl is best-effort: a link that cannot be turned into a URL, fetched, or parsed is
     * logged and skipped; it never aborts the remaining articles. Results from the parser that are
     * {@code null} are skipped as well.
     *
     * @param doc          homepage document to scan for {@code <a>} elements
     * @param idRegex      pattern an absolute link must contain ({@code find()}) to be crawled
     * @param singleParser turns a fetched article page into an {@link Article}
     * @return the successfully parsed articles, in the order their links appear in {@code doc}
     * @throws NullPointerException if any argument is {@code null}
     */
    public static List<Article> parseHomepage(Document doc, Pattern idRegex,
            BiFunction<URL, Document, Article> singleParser) {
        // Fail fast on the calling thread; otherwise a null parser would only surface as one
        // logged failure per article inside the workers.
        Objects.requireNonNull(doc, "doc");
        Objects.requireNonNull(idRegex, "idRegex");
        Objects.requireNonNull(singleParser, "singleParser");

        HttpCrawler crawler = new HttpCrawler();
        // Read the ThreadLocal here: the worker threads would not see the caller's value.
        Consumer<String> callback = progressCallback.get();

        List<String> hrefs = collectArticleLinks(doc, idRegex);
        int total = hrefs.size();
        AtomicInteger done = new AtomicInteger(0);
        AtomicInteger errors = new AtomicInteger(0);

        ExecutorService executor = Executors.newFixedThreadPool(THREAD_POOL_SIZE);
        List<CompletableFuture<Article>> futures = new ArrayList<>(total);
        try {
            for (String href : hrefs) {
                futures.add(CompletableFuture.supplyAsync(() -> {
                    try {
                        return fetchArticle(crawler, href, singleParser, errors, total);
                    } finally {
                        reportProgress(callback, done.incrementAndGet(), total);
                    }
                }, executor));
            }
        } finally {
            // Always release the pool, even if scheduling a task failed; tasks already
            // submitted still run to completion after shutdown().
            executor.shutdown();
        }

        CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
        if (errors.get() > 0) {
            log.warn("Crawl completed: {}/{} articles failed", errors.get(), total);
        }
        return futures.stream()
                .map(CompletableFuture::join)
                .filter(Objects::nonNull)
                .collect(Collectors.toList());
    }

    /** Returns the absolute URLs of all links in {@code doc} that match {@code idRegex}, in document order. */
    private static List<String> collectArticleLinks(Document doc, Pattern idRegex) {
        List<String> hrefs = new ArrayList<>();
        for (Element link : doc.getElementsByTag("a")) {
            String href = link.absUrl("href");
            if (href.isEmpty()) {
                continue;
            }
            if (!idRegex.matcher(href).find()) {
                continue;
            }
            hrefs.add(href);
        }
        return hrefs;
    }

    /** Fetches and parses one article; logs, counts the failure in {@code errors}, and returns {@code null} on any failure. */
    private static Article fetchArticle(HttpCrawler crawler, String href,
            BiFunction<URL, Document, Article> singleParser, AtomicInteger errors, int total) {
        // Stays null until the href has been converted, which lets the generic handler below
        // tell a rejected URL apart from a failure while fetching/parsing.
        URL articleUrl = null;
        try {
            articleUrl = URI.create(href).toURL();
            Document articleDoc = crawler.fetch(articleUrl);
            return singleParser.apply(articleUrl, articleDoc);
        } catch (CrawlerException e) {
            int failed = errors.incrementAndGet();
            log.warn("Failed [{}/{}]: {} — {}", failed, total, href, e.getMessage());
            return null;
        } catch (MalformedURLException e) {
            int failed = errors.incrementAndGet();
            log.warn("Failed [{}/{}]: {} — malformed URL", failed, total, href);
            return null;
        } catch (RuntimeException e) {
            int failed = errors.incrementAndGet();
            if (articleUrl == null) {
                // URI.create and URI.toURL signal invalid syntax / non-absolute URIs with an
                // unchecked IllegalArgumentException rather than MalformedURLException.
                log.warn("Failed [{}/{}]: {} — malformed URL", failed, total, href);
            } else {
                // Keep the crawl best-effort: one misbehaving page must not discard the rest.
                log.warn("Failed [{}/{}]: {} — unexpected error", failed, total, href, e);
            }
            return null;
        }
    }

    /** Sends a progress message to {@code callback} if one is registered; a throwing callback is logged, not propagated. */
    private static void reportProgress(Consumer<String> callback, int completed, int total) {
        if (callback == null) {
            return;
        }
        try {
            callback.accept("Progress: " + completed + "/" + total);
        } catch (RuntimeException e) {
            // Thrown from a finally block this would replace the article result with a failure.
            log.warn("Progress callback failed", e);
        }
    }
}