From be63667d983be8521501b544becffddf7f717d95 Mon Sep 17 00:00:00 2001 From: 283375 Date: Sat, 23 May 2026 22:01:09 +0800 Subject: [PATCH] limit and retrying --- .../crawler/strategies/crawl/CrawlUtils.java | 43 ++++++++++++ .../crawler/strategies/crawl/HttpCrawler.java | 65 +++++++++++++++++++ .../strategies/crawl/IthomeCrawlStrategy.java | 37 +++-------- .../crawl/PeopleCnCrawlStrategy.java | 37 +++-------- 4 files changed, 124 insertions(+), 58 deletions(-) create mode 100644 src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java create mode 100644 src/main/java/internal/hw/crawler/strategies/crawl/HttpCrawler.java diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java new file mode 100644 index 0000000..ab7e3bc --- /dev/null +++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java @@ -0,0 +1,43 @@ +package internal.hw.crawler.strategies.crawl; + +import internal.hw.crawler.models.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.net.URI; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.function.BiFunction; +import java.util.regex.Pattern; + +public class CrawlUtils { + private static final Logger log = LoggerFactory.getLogger(CrawlUtils.class); + + public static List
<Article> parseHomepage(Document doc, Pattern idRegex, + BiFunction<URL, Document, Article> singleParser) { + HttpCrawler crawler = new HttpCrawler(); + List<Article>
articles = new ArrayList<>(); + for (Element link : doc.getElementsByTag("a")) { + String href = link.absUrl("href"); + if (href.isEmpty()) continue; + if (!idRegex.matcher(href).find()) continue; + try { + crawler.rateLimit(); + URL articleUrl = URI.create(href).toURL(); + Document articleDoc = crawler.fetch(articleUrl); + Article article = singleParser.apply(articleUrl, articleDoc); + if (article != null) { + articles.add(article); + } + } catch (CrawlException e) { + log.warn("Failed to parse article: {}", href, e); + } catch (Exception e) { + log.warn("Failed to fetch article: {}", href, e); + } + } + return articles; + } +} diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/HttpCrawler.java b/src/main/java/internal/hw/crawler/strategies/crawl/HttpCrawler.java new file mode 100644 index 0000000..5705daa --- /dev/null +++ b/src/main/java/internal/hw/crawler/strategies/crawl/HttpCrawler.java @@ -0,0 +1,65 @@ +package internal.hw.crawler.strategies.crawl; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.net.URL; +import java.util.concurrent.ThreadLocalRandom; + +public class HttpCrawler { + private static final Logger log = LoggerFactory.getLogger(HttpCrawler.class); + + private final int timeoutMillis; + private final int maxRetries; + private final long requestDelayMs; + private final long retryBaseDelayMs; + private final String userAgent; + + public HttpCrawler() { + this(5000, 3, 100, 1000, "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:151.0) Gecko/20100101 Firefox/151.0"); + } + + public HttpCrawler(int timeoutMillis, int maxRetries, long requestDelayMs, long retryBaseDelayMs, String userAgent) { + this.timeoutMillis = timeoutMillis; + this.maxRetries = maxRetries; + this.requestDelayMs = requestDelayMs; + this.retryBaseDelayMs = retryBaseDelayMs; + this.userAgent = userAgent; + } + + public Document fetch(URL url) throws Exception { + for (int attempt = 0; 
attempt <= maxRetries; attempt++) { + try { + return Jsoup.connect(url.toString()) + .timeout(timeoutMillis) + .userAgent(userAgent) + .get(); + } catch (Exception e) { + if (attempt < maxRetries) { + long delay = computeBackoff(attempt); + log.warn("Failed to fetch {}, attempt {}/{}: {}. Retrying in {}ms", + url, attempt + 1, maxRetries, e.getMessage(), delay); + Thread.sleep(delay); + } else { + log.error("Failed to fetch {} after {} attempts", url, maxRetries + 1); + throw e; + } + } + } + throw new RuntimeException("Unreachable"); + } + + public void rateLimit() throws InterruptedException { + if (requestDelayMs > 0) { + Thread.sleep(requestDelayMs); + } + } + + private long computeBackoff(int attempt) { + long base = retryBaseDelayMs * (1L << attempt); + long jitter = ThreadLocalRandom.current().nextLong(-base / 2, base / 2 + 1); + return Math.max(0, base + jitter); + } +} diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java b/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java index 6ad2eb5..021032f 100644 --- a/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java +++ b/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java @@ -1,16 +1,12 @@ package internal.hw.crawler.strategies.crawl; import internal.hw.crawler.models.Article; -import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.net.URI; import java.net.URL; -import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; @@ -30,7 +26,14 @@ public class IthomeCrawlStrategy implements CrawlStrategy { @Override public List
<Article> parse(URL url, Document doc) throws CrawlException { if (isHomepage(url)) { - return parseHomepage(doc); + return CrawlUtils.parseHomepage(doc, idRegex, (articleUrl, articleDoc) -> { + try { + return parseSingle(articleUrl, articleDoc); + } catch (CrawlException e) { + log.warn("Failed to parse article: {}", articleUrl, e); + return null; + } + }); } else { return List.of(parseSingle(url, doc)); } @@ -41,30 +44,6 @@ public class IthomeCrawlStrategy implements CrawlStrategy { return (path == null || path.isEmpty() || path.equals("/")); } - private List<Article>
parseHomepage(Document doc) { - List<Article>
articles = new ArrayList<>(); - Elements links = doc.getElementsByTag("a"); - for (Element link : links) { - String href = link.absUrl("href"); - if (href.isEmpty()) { - continue; - } - Matcher matcher = idRegex.matcher(href); - if (!matcher.find()) { - continue; - } - - try { - URL articleUrl = URI.create(href).toURL(); - Document articleDoc = Jsoup.parse(articleUrl, 5000); - articles.add(parseSingle(articleUrl, articleDoc)); - } catch (Exception e) { - log.warn("Failed to fetch article: {}", href, e); - } - } - return articles; - } - private Article parseSingle(URL url, Document doc) throws CrawlException { Matcher matcher = idRegex.matcher(url.getPath()); if (!matcher.find()) { diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java b/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java index 680e22a..99d481a 100644 --- a/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java +++ b/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java @@ -1,16 +1,12 @@ package internal.hw.crawler.strategies.crawl; import internal.hw.crawler.models.Article; -import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.net.URI; import java.net.URL; -import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; @@ -37,7 +33,14 @@ public class PeopleCnCrawlStrategy implements CrawlStrategy { @Override public List
<Article> parse(URL url, Document doc) throws CrawlException { if (isHomepage(url)) { - return parseHomepage(doc); + return CrawlUtils.parseHomepage(doc, idRegex, (articleUrl, articleDoc) -> { + try { + return parseSingle(articleUrl, articleDoc); + } catch (CrawlException e) { + log.warn("Failed to parse article: {}", articleUrl, e); + return null; + } + }); } else { return List.of(parseSingle(url, doc)); } @@ -52,30 +55,6 @@ public class PeopleCnCrawlStrategy implements CrawlStrategy { return path == null || path.isEmpty() || path.equals("/"); } - private List<Article>
parseHomepage(Document doc) { - List<Article>
articles = new ArrayList<>(); - Elements links = doc.getElementsByTag("a"); - for (Element link : links) { - String href = link.absUrl("href"); - if (href.isEmpty()) { - continue; - } - Matcher matcher = idRegex.matcher(href); - if (!matcher.find()) { - continue; - } - - try { - URL articleUrl = URI.create(href).toURL(); - Document articleDoc = Jsoup.parse(articleUrl, 5000); - articles.add(parseSingle(articleUrl, articleDoc)); - } catch (Exception e) { - log.warn("Failed to fetch article: {}", href, e); - } - } - return articles; - } - private Article parseSingle(URL url, Document doc) throws CrawlException { Matcher matcher = idRegex.matcher(url.getPath()); if (!matcher.find()) {