package internal.hw.crawler.strategies.crawl; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.net.URL; import java.util.concurrent.ThreadLocalRandom; public class HttpCrawler { private static final Logger log = LoggerFactory.getLogger(HttpCrawler.class); private final int timeoutMillis; private final int maxRetries; private final long requestDelayMs; private final long retryBaseDelayMs; private final String userAgent; public HttpCrawler() { this(5000, 3, 100, 1000, "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:151.0) Gecko/20100101 Firefox/151.0"); } public HttpCrawler(int timeoutMillis, int maxRetries, long requestDelayMs, long retryBaseDelayMs, String userAgent) { this.timeoutMillis = timeoutMillis; this.maxRetries = maxRetries; this.requestDelayMs = requestDelayMs; this.retryBaseDelayMs = retryBaseDelayMs; this.userAgent = userAgent; } public Document fetch(URL url) throws Exception { for (int attempt = 0; attempt <= maxRetries; attempt++) { try { return Jsoup.connect(url.toString()) .timeout(timeoutMillis) .userAgent(userAgent) .get(); } catch (Exception e) { if (attempt < maxRetries) { long delay = computeBackoff(attempt); log.warn("Failed to fetch {}, attempt {}/{}: {}. Retrying in {}ms", url, attempt + 1, maxRetries, e.getMessage(), delay); Thread.sleep(delay); } else { log.error("Failed to fetch {} after {} attempts", url, maxRetries + 1); throw e; } } } throw new RuntimeException("Unreachable"); } public void rateLimit() throws InterruptedException { if (requestDelayMs > 0) { Thread.sleep(requestDelayMs); } } private long computeBackoff(int attempt) { long base = retryBaseDelayMs * (1L << attempt); long jitter = ThreadLocalRandom.current().nextLong(-base / 2, base / 2 + 1); return Math.max(0, base + jitter); } }