You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
91 lines
3.4 KiB
91 lines
3.4 KiB
package internal.hw.crawler.strategies.crawl;
|
|
|
|
import internal.hw.crawler.models.Article;
|
|
import org.jsoup.nodes.Document;
|
|
import org.jsoup.nodes.Element;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
import java.net.MalformedURLException;
|
|
import java.net.URI;
|
|
import java.net.URL;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
import java.util.Objects;
|
|
import java.util.concurrent.CompletableFuture;
|
|
import java.util.concurrent.ExecutorService;
|
|
import java.util.concurrent.Executors;
|
|
import java.util.concurrent.atomic.AtomicInteger;
|
|
import java.util.function.BiFunction;
|
|
import java.util.function.Consumer;
|
|
import java.util.regex.Pattern;
|
|
import java.util.stream.Collectors;
|
|
|
|
public class CrawlUtils {
|
|
private static final Logger log = LoggerFactory.getLogger(CrawlUtils.class);
|
|
private static final int THREAD_POOL_SIZE = 4;
|
|
private static final ThreadLocal<Consumer<String>> progressCallback = new ThreadLocal<>();
|
|
|
|
public static void setProgressCallback(Consumer<String> callback) {
|
|
progressCallback.set(callback);
|
|
}
|
|
|
|
public static void clearProgressCallback() {
|
|
progressCallback.remove();
|
|
}
|
|
|
|
public static List<Article> parseHomepage(Document doc, Pattern idRegex,
|
|
BiFunction<URL, Document, Article> singleParser) {
|
|
HttpCrawler crawler = new HttpCrawler();
|
|
Consumer<String> callback = progressCallback.get();
|
|
|
|
List<String> hrefs = new ArrayList<>();
|
|
for (Element link : doc.getElementsByTag("a")) {
|
|
String href = link.absUrl("href");
|
|
if (href.isEmpty()) continue;
|
|
if (!idRegex.matcher(href).find()) continue;
|
|
hrefs.add(href);
|
|
}
|
|
|
|
int total = hrefs.size();
|
|
AtomicInteger done = new AtomicInteger(0);
|
|
AtomicInteger errors = new AtomicInteger(0);
|
|
ExecutorService executor = Executors.newFixedThreadPool(THREAD_POOL_SIZE);
|
|
List<CompletableFuture<Article>> futures = new ArrayList<>(total);
|
|
|
|
for (String href : hrefs) {
|
|
futures.add(CompletableFuture.supplyAsync(() -> {
|
|
try {
|
|
URL articleUrl = URI.create(href).toURL();
|
|
Document articleDoc = crawler.fetch(articleUrl);
|
|
return singleParser.apply(articleUrl, articleDoc);
|
|
} catch (CrawlerException e) {
|
|
int failed = errors.incrementAndGet();
|
|
log.warn("Failed [{}/{}]: {} — {}", failed, total, href, e.getMessage());
|
|
return null;
|
|
} catch (MalformedURLException e) {
|
|
int failed = errors.incrementAndGet();
|
|
log.warn("Failed [{}/{}]: {} — malformed URL", failed, total, href);
|
|
return null;
|
|
} finally {
|
|
int completed = done.incrementAndGet();
|
|
if (callback != null) {
|
|
callback.accept("Progress: " + completed + "/" + total);
|
|
}
|
|
}
|
|
}, executor));
|
|
}
|
|
|
|
executor.shutdown();
|
|
CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
|
|
|
|
if (errors.get() > 0) {
|
|
log.warn("Crawl completed: {}/{} articles failed", errors.get(), total);
|
|
}
|
|
|
|
return futures.stream()
|
|
.map(CompletableFuture::join)
|
|
.filter(Objects::nonNull)
|
|
.collect(Collectors.toList());
|
|
}
|
|
}
|
|
|