You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
65 lines
2.3 KiB
65 lines
2.3 KiB
package internal.hw.crawler.strategies.crawl;
|
|
|
|
import org.jsoup.Jsoup;
|
|
import org.jsoup.nodes.Document;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
import java.net.URL;
|
|
import java.util.concurrent.ThreadLocalRandom;
|
|
|
|
public class HttpCrawler {
|
|
private static final Logger log = LoggerFactory.getLogger(HttpCrawler.class);
|
|
|
|
private final int timeoutMillis;
|
|
private final int maxRetries;
|
|
private final long requestDelayMs;
|
|
private final long retryBaseDelayMs;
|
|
private final String userAgent;
|
|
|
|
public HttpCrawler() {
|
|
this(5000, 3, 100, 1000, "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:151.0) Gecko/20100101 Firefox/151.0");
|
|
}
|
|
|
|
public HttpCrawler(int timeoutMillis, int maxRetries, long requestDelayMs, long retryBaseDelayMs, String userAgent) {
|
|
this.timeoutMillis = timeoutMillis;
|
|
this.maxRetries = maxRetries;
|
|
this.requestDelayMs = requestDelayMs;
|
|
this.retryBaseDelayMs = retryBaseDelayMs;
|
|
this.userAgent = userAgent;
|
|
}
|
|
|
|
public Document fetch(URL url) throws Exception {
|
|
for (int attempt = 0; attempt <= maxRetries; attempt++) {
|
|
try {
|
|
return Jsoup.connect(url.toString())
|
|
.timeout(timeoutMillis)
|
|
.userAgent(userAgent)
|
|
.get();
|
|
} catch (Exception e) {
|
|
if (attempt < maxRetries) {
|
|
long delay = computeBackoff(attempt);
|
|
log.warn("Failed to fetch {}, attempt {}/{}: {}. Retrying in {}ms",
|
|
url, attempt + 1, maxRetries, e.getMessage(), delay);
|
|
Thread.sleep(delay);
|
|
} else {
|
|
log.error("Failed to fetch {} after {} attempts", url, maxRetries + 1);
|
|
throw e;
|
|
}
|
|
}
|
|
}
|
|
throw new RuntimeException("Unreachable");
|
|
}
|
|
|
|
public void rateLimit() throws InterruptedException {
|
|
if (requestDelayMs > 0) {
|
|
Thread.sleep(requestDelayMs);
|
|
}
|
|
}
|
|
|
|
private long computeBackoff(int attempt) {
|
|
long base = retryBaseDelayMs * (1L << attempt);
|
|
long jitter = ThreadLocalRandom.current().nextLong(-base / 2, base / 2 + 1);
|
|
return Math.max(0, base + jitter);
|
|
}
|
|
}
|
|
|