|
|
|
@ -14,25 +14,44 @@ import java.util.Objects; |
|
|
|
import java.util.concurrent.CompletableFuture; |
|
|
|
import java.util.concurrent.ExecutorService; |
|
|
|
import java.util.concurrent.Executors; |
|
|
|
import java.util.concurrent.atomic.AtomicInteger; |
|
|
|
import java.util.function.BiFunction; |
|
|
|
import java.util.function.Consumer; |
|
|
|
import java.util.regex.Pattern; |
|
|
|
import java.util.stream.Collectors; |
|
|
|
|
|
|
|
public class CrawlUtils { |
|
|
|
private static final Logger log = LoggerFactory.getLogger(CrawlUtils.class); |
|
|
|
private static final int THREAD_POOL_SIZE = 4; |
|
|
|
private static final ThreadLocal<Consumer<String>> progressCallback = new ThreadLocal<>(); |
|
|
|
|
|
|
|
public static void setProgressCallback(Consumer<String> callback) { |
|
|
|
progressCallback.set(callback); |
|
|
|
} |
|
|
|
|
|
|
|
public static void clearProgressCallback() { |
|
|
|
progressCallback.remove(); |
|
|
|
} |
|
|
|
|
|
|
|
public static List<Article> parseHomepage(Document doc, Pattern idRegex, |
|
|
|
BiFunction<URL, Document, Article> singleParser) { |
|
|
|
HttpCrawler crawler = new HttpCrawler(); |
|
|
|
ExecutorService executor = Executors.newFixedThreadPool(THREAD_POOL_SIZE); |
|
|
|
List<CompletableFuture<Article>> futures = new ArrayList<>(); |
|
|
|
Consumer<String> callback = progressCallback.get(); |
|
|
|
|
|
|
|
List<String> hrefs = new ArrayList<>(); |
|
|
|
for (Element link : doc.getElementsByTag("a")) { |
|
|
|
String href = link.absUrl("href"); |
|
|
|
if (href.isEmpty()) continue; |
|
|
|
if (!idRegex.matcher(href).find()) continue; |
|
|
|
hrefs.add(href); |
|
|
|
} |
|
|
|
|
|
|
|
int total = hrefs.size(); |
|
|
|
AtomicInteger done = new AtomicInteger(0); |
|
|
|
ExecutorService executor = Executors.newFixedThreadPool(THREAD_POOL_SIZE); |
|
|
|
List<CompletableFuture<Article>> futures = new ArrayList<>(total); |
|
|
|
|
|
|
|
for (String href : hrefs) { |
|
|
|
futures.add(CompletableFuture.supplyAsync(() -> { |
|
|
|
try { |
|
|
|
URL articleUrl = URI.create(href).toURL(); |
|
|
|
@ -41,6 +60,11 @@ public class CrawlUtils { |
|
|
|
} catch (Exception e) { |
|
|
|
log.warn("Failed to fetch article: {}", href, e); |
|
|
|
return null; |
|
|
|
} finally { |
|
|
|
int completed = done.incrementAndGet(); |
|
|
|
if (callback != null) { |
|
|
|
callback.accept("Progress: " + completed + "/" + total); |
|
|
|
} |
|
|
|
} |
|
|
|
}, executor)); |
|
|
|
} |
|
|
|
|