|
|
|
@ -10,34 +10,47 @@ import java.net.URI; |
|
|
|
import java.net.URL; |
|
|
|
import java.util.ArrayList; |
|
|
|
import java.util.List; |
|
|
|
import java.util.Objects; |
|
|
|
import java.util.concurrent.CompletableFuture; |
|
|
|
import java.util.concurrent.ExecutorService; |
|
|
|
import java.util.concurrent.Executors; |
|
|
|
import java.util.function.BiFunction; |
|
|
|
import java.util.regex.Pattern; |
|
|
|
import java.util.stream.Collectors; |
|
|
|
|
|
|
|
public class CrawlUtils { |
|
|
|
private static final Logger log = LoggerFactory.getLogger(CrawlUtils.class); |
|
|
|
private static final int THREAD_POOL_SIZE = 4; |
|
|
|
|
|
|
|
public static List<Article> parseHomepage(Document doc, Pattern idRegex, |
|
|
|
BiFunction<URL, Document, Article> singleParser) { |
|
|
|
HttpCrawler crawler = new HttpCrawler(); |
|
|
|
List<Article> articles = new ArrayList<>(); |
|
|
|
ExecutorService executor = Executors.newFixedThreadPool(THREAD_POOL_SIZE); |
|
|
|
List<CompletableFuture<Article>> futures = new ArrayList<>(); |
|
|
|
|
|
|
|
for (Element link : doc.getElementsByTag("a")) { |
|
|
|
String href = link.absUrl("href"); |
|
|
|
if (href.isEmpty()) continue; |
|
|
|
if (!idRegex.matcher(href).find()) continue; |
|
|
|
try { |
|
|
|
crawler.rateLimit(); |
|
|
|
URL articleUrl = URI.create(href).toURL(); |
|
|
|
Document articleDoc = crawler.fetch(articleUrl); |
|
|
|
Article article = singleParser.apply(articleUrl, articleDoc); |
|
|
|
if (article != null) { |
|
|
|
articles.add(article); |
|
|
|
|
|
|
|
futures.add(CompletableFuture.supplyAsync(() -> { |
|
|
|
try { |
|
|
|
URL articleUrl = URI.create(href).toURL(); |
|
|
|
Document articleDoc = crawler.fetch(articleUrl); |
|
|
|
return singleParser.apply(articleUrl, articleDoc); |
|
|
|
} catch (Exception e) { |
|
|
|
log.warn("Failed to fetch article: {}", href, e); |
|
|
|
return null; |
|
|
|
} |
|
|
|
} catch (CrawlException e) { |
|
|
|
log.warn("Failed to parse article: {}", href, e); |
|
|
|
} catch (Exception e) { |
|
|
|
log.warn("Failed to fetch article: {}", href, e); |
|
|
|
} |
|
|
|
}, executor)); |
|
|
|
} |
|
|
|
return articles; |
|
|
|
|
|
|
|
executor.shutdown(); |
|
|
|
CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join(); |
|
|
|
|
|
|
|
return futures.stream() |
|
|
|
.map(CompletableFuture::join) |
|
|
|
.filter(Objects::nonNull) |
|
|
|
.collect(Collectors.toList()); |
|
|
|
} |
|
|
|
} |
|
|
|
|