You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

65 lines
2.3 KiB

package internal.hw.crawler.strategies.crawl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URL;
import java.util.concurrent.ThreadLocalRandom;
/**
 * Fetches HTML documents over HTTP using jsoup, retrying failed requests with
 * exponential backoff and random jitter.
 *
 * <p>All configuration is fixed at construction time; the fields are final.
 */
public class HttpCrawler {
    private static final Logger log = LoggerFactory.getLogger(HttpCrawler.class);

    /**
     * Upper bound for the un-jittered backoff. Chosen so that adding the maximum
     * jitter (half of the base) can never overflow a {@code long}.
     */
    private static final long MAX_BACKOFF_BASE_MS = Long.MAX_VALUE / 2;

    private final int timeoutMillis;
    private final int maxRetries;
    private final long requestDelayMs;
    private final long retryBaseDelayMs;
    private final String userAgent;

    /**
     * Creates a crawler with the defaults: 5000 ms timeout, 3 retries, 100 ms
     * request delay, 1000 ms retry base delay and a desktop Firefox user agent.
     */
    public HttpCrawler() {
        this(5000, 3, 100, 1000, "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:151.0) Gecko/20100101 Firefox/151.0");
    }

    /**
     * Creates a crawler with explicit settings.
     *
     * @param timeoutMillis    timeout passed to the jsoup connection, in milliseconds
     * @param maxRetries       number of retries after the first failed attempt; values
     *                         below zero behave like zero (a single attempt, no retry)
     * @param requestDelayMs   pause applied by {@link #rateLimit()}, in milliseconds;
     *                         non-positive values disable the pause
     * @param retryBaseDelayMs base delay for the exponential backoff, in milliseconds;
     *                         non-positive values mean retries happen without waiting
     * @param userAgent        value sent as the User-Agent of each request
     */
    public HttpCrawler(int timeoutMillis, int maxRetries, long requestDelayMs, long retryBaseDelayMs, String userAgent) {
        this.timeoutMillis = timeoutMillis;
        this.maxRetries = maxRetries;
        this.requestDelayMs = requestDelayMs;
        this.retryBaseDelayMs = retryBaseDelayMs;
        this.userAgent = userAgent;
    }

    /**
     * Fetches the given URL with a GET request and returns the parsed document.
     *
     * <p>At least one attempt is always made. After a failure the call sleeps for
     * a jittered, exponentially growing delay and tries again, up to
     * {@code maxRetries} additional times. When the last attempt fails, its
     * exception is rethrown unchanged.
     *
     * @param url the address to fetch
     * @return the document returned by jsoup
     * @throws Exception the failure of the final attempt, or an
     *                   {@link InterruptedException} if the thread is interrupted
     *                   while waiting between attempts
     */
    public Document fetch(URL url) throws Exception {
        int attempt = 0;
        while (true) {
            try {
                return Jsoup.connect(url.toString())
                        .timeout(timeoutMillis)
                        .userAgent(userAgent)
                        .get();
            } catch (Exception e) {
                if (attempt >= maxRetries) {
                    // Report the number of attempts actually performed.
                    log.error("Failed to fetch {} after {} attempts", url, attempt + 1);
                    throw e;
                }
                long delay = computeBackoff(attempt);
                // Total attempts is maxRetries + 1 (the first try plus the retries).
                log.warn("Failed to fetch {}, attempt {}/{}: {}. Retrying in {}ms",
                        url, attempt + 1, maxRetries + 1, e.getMessage(), delay);
                Thread.sleep(delay);
                attempt++;
            }
        }
    }

    /**
     * Sleeps for the configured request delay; does nothing when the delay is
     * zero or negative.
     *
     * @throws InterruptedException if the thread is interrupted while sleeping
     */
    public void rateLimit() throws InterruptedException {
        if (requestDelayMs > 0) {
            Thread.sleep(requestDelayMs);
        }
    }

    /**
     * Computes the delay before the next retry: {@code retryBaseDelayMs * 2^attempt}
     * plus a uniform random jitter in {@code [-base/2, base/2]}.
     *
     * <p>The base is saturated at {@link #MAX_BACKOFF_BASE_MS} instead of being
     * allowed to overflow, and a non-positive base delay yields no wait at all.
     *
     * @param attempt zero-based index of the attempt that just failed
     * @return the delay in milliseconds, never negative
     */
    private long computeBackoff(int attempt) {
        if (retryBaseDelayMs <= 0) {
            return 0L;
        }
        // Java masks long shift distances to 6 bits, so clamp the exponent explicitly.
        int exponent = Math.max(0, Math.min(attempt, 62));
        long factor = 1L << exponent;
        long base = retryBaseDelayMs > MAX_BACKOFF_BASE_MS / factor
                ? MAX_BACKOFF_BASE_MS
                : retryBaseDelayMs * factor;
        long half = base / 2;
        long jitter = ThreadLocalRandom.current().nextLong(-half, half + 1);
        // base >= 1 and jitter >= -base/2, so the sum is non-negative and cannot overflow.
        return base + jitter;
    }
}