Browse Source

show progress when parsing homepage

master
283375 4 weeks ago
parent
commit
18eca216fd
Failed to extract signature
  1. 7
      src/main/java/internal/hw/crawler/commands/CrawlCommand.java
  2. 28
      src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java

7
src/main/java/internal/hw/crawler/commands/CrawlCommand.java

@@ -4,6 +4,7 @@ import internal.hw.crawler.models.Article;
import internal.hw.crawler.repositories.ArticleRepository;
import internal.hw.crawler.strategies.crawl.CrawlStrategy;
import internal.hw.crawler.strategies.crawl.CrawlStrategyFactory;
import internal.hw.crawler.strategies.crawl.CrawlUtils;
import internal.hw.crawler.views.CommandOutput;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
@@ -43,8 +44,14 @@ public class CrawlCommand implements Command {
}
Document doc = Jsoup.connect(url.toString()).timeout(5000).get();
CrawlUtils.setProgressCallback(msg -> System.out.print("\r" + msg));
try {
List<Article> articles = strategy.parse(url, doc);
System.out.println();
articles.stream().filter(Objects::nonNull).forEach(repository::add);
out.success(String.format("Crawled %d articles from %s", articles.size(), urlRaw));
} finally {
CrawlUtils.clearProgressCallback();
}
}
}

28
src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java

@@ -14,25 +14,44 @@ import java.util.Objects;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.BiFunction;
import java.util.function.Consumer;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
public class CrawlUtils {
private static final Logger log = LoggerFactory.getLogger(CrawlUtils.class);
private static final int THREAD_POOL_SIZE = 4;
private static final ThreadLocal<Consumer<String>> progressCallback = new ThreadLocal<>();
/**
 * Registers a progress callback for the calling thread.
 *
 * <p>The callback is stored in a {@link ThreadLocal}, so it is only visible to code that later
 * reads the slot on the same thread that called this method.
 *
 * @param callback consumer that receives progress messages; stored as-is, with no null check
 */
public static void setProgressCallback(Consumer<String> callback) {
progressCallback.set(callback);
}
/**
 * Removes the calling thread's progress callback, if any, from the thread-local slot.
 */
public static void clearProgressCallback() {
progressCallback.remove();
}
public static List<Article> parseHomepage(Document doc, Pattern idRegex,
BiFunction<URL, Document, Article> singleParser) {
HttpCrawler crawler = new HttpCrawler();
ExecutorService executor = Executors.newFixedThreadPool(THREAD_POOL_SIZE);
List<CompletableFuture<Article>> futures = new ArrayList<>();
Consumer<String> callback = progressCallback.get();
List<String> hrefs = new ArrayList<>();
for (Element link : doc.getElementsByTag("a")) {
String href = link.absUrl("href");
if (href.isEmpty()) continue;
if (!idRegex.matcher(href).find()) continue;
hrefs.add(href);
}
int total = hrefs.size();
AtomicInteger done = new AtomicInteger(0);
ExecutorService executor = Executors.newFixedThreadPool(THREAD_POOL_SIZE);
List<CompletableFuture<Article>> futures = new ArrayList<>(total);
for (String href : hrefs) {
futures.add(CompletableFuture.supplyAsync(() -> {
try {
URL articleUrl = URI.create(href).toURL();
@@ -41,6 +60,11 @@ public class CrawlUtils {
} catch (Exception e) {
log.warn("Failed to fetch article: {}", href, e);
return null;
} finally {
int completed = done.incrementAndGet();
if (callback != null) {
callback.accept("Progress: " + completed + "/" + total);
}
}
}, executor));
}

Loading…
Cancel
Save