4 changed files with 124 additions and 58 deletions
@ -0,0 +1,43 @@ |
|||||
|
package internal.hw.crawler.strategies.crawl; |
||||
|
|
||||
|
import internal.hw.crawler.models.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.net.URI; |
||||
|
import java.net.URL; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
import java.util.function.BiFunction; |
||||
|
import java.util.regex.Pattern; |
||||
|
|
||||
|
public class CrawlUtils { |
||||
|
private static final Logger log = LoggerFactory.getLogger(CrawlUtils.class); |
||||
|
|
||||
|
public static List<Article> parseHomepage(Document doc, Pattern idRegex, |
||||
|
BiFunction<URL, Document, Article> singleParser) { |
||||
|
HttpCrawler crawler = new HttpCrawler(); |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
for (Element link : doc.getElementsByTag("a")) { |
||||
|
String href = link.absUrl("href"); |
||||
|
if (href.isEmpty()) continue; |
||||
|
if (!idRegex.matcher(href).find()) continue; |
||||
|
try { |
||||
|
crawler.rateLimit(); |
||||
|
URL articleUrl = URI.create(href).toURL(); |
||||
|
Document articleDoc = crawler.fetch(articleUrl); |
||||
|
Article article = singleParser.apply(articleUrl, articleDoc); |
||||
|
if (article != null) { |
||||
|
articles.add(article); |
||||
|
} |
||||
|
} catch (CrawlException e) { |
||||
|
log.warn("Failed to parse article: {}", href, e); |
||||
|
} catch (Exception e) { |
||||
|
log.warn("Failed to fetch article: {}", href, e); |
||||
|
} |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,65 @@ |
|||||
|
package internal.hw.crawler.strategies.crawl; |
||||
|
|
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.net.URL; |
||||
|
import java.util.concurrent.ThreadLocalRandom; |
||||
|
|
||||
|
public class HttpCrawler { |
||||
|
private static final Logger log = LoggerFactory.getLogger(HttpCrawler.class); |
||||
|
|
||||
|
private final int timeoutMillis; |
||||
|
private final int maxRetries; |
||||
|
private final long requestDelayMs; |
||||
|
private final long retryBaseDelayMs; |
||||
|
private final String userAgent; |
||||
|
|
||||
|
public HttpCrawler() { |
||||
|
this(5000, 3, 100, 1000, "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:151.0) Gecko/20100101 Firefox/151.0"); |
||||
|
} |
||||
|
|
||||
|
public HttpCrawler(int timeoutMillis, int maxRetries, long requestDelayMs, long retryBaseDelayMs, String userAgent) { |
||||
|
this.timeoutMillis = timeoutMillis; |
||||
|
this.maxRetries = maxRetries; |
||||
|
this.requestDelayMs = requestDelayMs; |
||||
|
this.retryBaseDelayMs = retryBaseDelayMs; |
||||
|
this.userAgent = userAgent; |
||||
|
} |
||||
|
|
||||
|
public Document fetch(URL url) throws Exception { |
||||
|
for (int attempt = 0; attempt <= maxRetries; attempt++) { |
||||
|
try { |
||||
|
return Jsoup.connect(url.toString()) |
||||
|
.timeout(timeoutMillis) |
||||
|
.userAgent(userAgent) |
||||
|
.get(); |
||||
|
} catch (Exception e) { |
||||
|
if (attempt < maxRetries) { |
||||
|
long delay = computeBackoff(attempt); |
||||
|
log.warn("Failed to fetch {}, attempt {}/{}: {}. Retrying in {}ms", |
||||
|
url, attempt + 1, maxRetries, e.getMessage(), delay); |
||||
|
Thread.sleep(delay); |
||||
|
} else { |
||||
|
log.error("Failed to fetch {} after {} attempts", url, maxRetries + 1); |
||||
|
throw e; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
throw new RuntimeException("Unreachable"); |
||||
|
} |
||||
|
|
||||
|
public void rateLimit() throws InterruptedException { |
||||
|
if (requestDelayMs > 0) { |
||||
|
Thread.sleep(requestDelayMs); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private long computeBackoff(int attempt) { |
||||
|
long base = retryBaseDelayMs * (1L << attempt); |
||||
|
long jitter = ThreadLocalRandom.current().nextLong(-base / 2, base / 2 + 1); |
||||
|
return Math.max(0, base + jitter); |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue