Browse Source

limit and retrying

master
283375 4 weeks ago
parent
commit
be63667d98
Failed to extract signature
  1. 43
      src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java
  2. 65
      src/main/java/internal/hw/crawler/strategies/crawl/HttpCrawler.java
  3. 37
      src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java
  4. 37
      src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java

43
src/main/java/internal/hw/crawler/strategies/crawl/CrawlUtils.java

@ -0,0 +1,43 @@
package internal.hw.crawler.strategies.crawl;
import internal.hw.crawler.models.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.function.BiFunction;
import java.util.regex.Pattern;
/**
 * Shared helpers for crawl strategies.
 */
public class CrawlUtils {
    private static final Logger log = LoggerFactory.getLogger(CrawlUtils.class);

    /**
     * Follows every anchor on a homepage whose absolute URL matches {@code idRegex},
     * fetches the linked page through a default-configured {@link HttpCrawler}
     * (rate limited, with retries) and hands it to {@code singleParser}.
     *
     * <p>A failure on one link is logged and does not abort the crawl. If the calling
     * thread is interrupted while waiting (rate-limit delay or retry backoff), the
     * interrupt status is restored and the crawl stops early, returning the articles
     * collected so far.
     *
     * @param doc          parsed homepage document whose {@code <a>} elements are scanned
     * @param idRegex      pattern an article URL must contain (matched with {@code find()})
     * @param singleParser converts a fetched article page into an {@link Article};
     *                     a {@code null} result is skipped
     * @return the successfully parsed articles, in link order; never {@code null}
     */
    public static List<Article> parseHomepage(Document doc, Pattern idRegex,
                                              BiFunction<URL, Document, Article> singleParser) {
        HttpCrawler crawler = new HttpCrawler();
        List<Article> articles = new ArrayList<>();
        for (Element link : doc.getElementsByTag("a")) {
            String href = link.absUrl("href");
            if (href.isEmpty() || !idRegex.matcher(href).find()) {
                continue;
            }
            try {
                crawler.rateLimit();
                URL articleUrl = URI.create(href).toURL();
                Document articleDoc = crawler.fetch(articleUrl);
                Article article = singleParser.apply(articleUrl, articleDoc);
                if (article != null) {
                    articles.add(article);
                }
            } catch (InterruptedException e) {
                // Someone asked this thread to stop: keep the interrupt visible to the
                // caller and do not keep sleeping/fetching for the remaining links.
                Thread.currentThread().interrupt();
                log.warn("Interrupted while crawling homepage links, stopping at: {}", href);
                break;
            } catch (CrawlException e) {
                log.warn("Failed to parse article: {}", href, e);
            } catch (Exception e) {
                // Deliberately broad: one bad link (network error, malformed URL, parser
                // bug) must not take down the whole homepage crawl.
                log.warn("Failed to fetch article: {}", href, e);
            }
        }
        return articles;
    }
}

65
src/main/java/internal/hw/crawler/strategies/crawl/HttpCrawler.java

@ -0,0 +1,65 @@
package internal.hw.crawler.strategies.crawl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URL;
import java.util.concurrent.ThreadLocalRandom;
/**
 * Blocking page fetcher built on Jsoup that adds a fixed per-request delay
 * ({@link #rateLimit()}) and retries failed requests with exponential backoff
 * and jitter ({@link #fetch(URL)}).
 */
public class HttpCrawler {
    private static final Logger log = LoggerFactory.getLogger(HttpCrawler.class);

    /** Largest exponent used for backoff; keeps {@code 1L << n} positive. */
    private static final int MAX_BACKOFF_SHIFT = 61;
    /** Cap for the un-jittered delay so that {@code base + jitter} cannot overflow. */
    private static final long MAX_BACKOFF_BASE_MS = Long.MAX_VALUE / 2;

    private final int timeoutMillis;
    private final int maxRetries;
    private final long requestDelayMs;
    private final long retryBaseDelayMs;
    private final String userAgent;

    /**
     * Creates a crawler with a 5000 ms timeout, 3 retries, a 100 ms request delay,
     * a 1000 ms retry base delay and a desktop Firefox user agent.
     */
    public HttpCrawler() {
        this(5000, 3, 100, 1000, "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:151.0) Gecko/20100101 Firefox/151.0");
    }

    /**
     * @param timeoutMillis    timeout passed to the Jsoup connection, in milliseconds
     * @param maxRetries       retries after the first failed attempt; negative values
     *                         are treated as {@code 0} so one attempt is always made
     * @param requestDelayMs   delay applied by {@link #rateLimit()}; {@code <= 0} disables it
     * @param retryBaseDelayMs base delay for exponential backoff; {@code <= 0} means
     *                         retry immediately
     * @param userAgent        value sent as the {@code User-Agent} header
     */
    public HttpCrawler(int timeoutMillis, int maxRetries, long requestDelayMs, long retryBaseDelayMs, String userAgent) {
        this.timeoutMillis = timeoutMillis;
        this.maxRetries = Math.max(0, maxRetries);
        this.requestDelayMs = requestDelayMs;
        this.retryBaseDelayMs = retryBaseDelayMs;
        this.userAgent = userAgent;
    }

    /**
     * Fetches and parses {@code url}, retrying up to {@code maxRetries} times with
     * jittered exponential backoff between attempts.
     *
     * @param url page to download
     * @return the parsed document
     * @throws InterruptedException if the thread is interrupted during a backoff sleep;
     *                              the fetch failure that triggered the sleep is attached
     *                              as a suppressed exception
     * @throws Exception            the failure of the final attempt once retries are exhausted
     */
    public Document fetch(URL url) throws Exception {
        // long arithmetic so maxRetries == Integer.MAX_VALUE does not wrap in the log output
        final long totalAttempts = (long) maxRetries + 1;
        for (int attempt = 0; ; attempt++) {
            try {
                return Jsoup.connect(url.toString())
                        .timeout(timeoutMillis)
                        .userAgent(userAgent)
                        .get();
            } catch (Exception e) {
                if (attempt >= maxRetries) {
                    log.error("Failed to fetch {} after {} attempts", url, totalAttempts);
                    throw e;
                }
                long delay = computeBackoff(attempt);
                log.warn("Failed to fetch {}, attempt {}/{}: {}. Retrying in {}ms",
                        url, attempt + 1, totalAttempts, e.getMessage(), delay);
                try {
                    Thread.sleep(delay);
                } catch (InterruptedException interrupted) {
                    // Keep the root cause discoverable instead of silently dropping it.
                    interrupted.addSuppressed(e);
                    throw interrupted;
                }
            }
        }
    }

    /**
     * Sleeps for the configured request delay; returns immediately when the delay
     * is zero or negative.
     *
     * @throws InterruptedException if the thread is interrupted while sleeping
     */
    public void rateLimit() throws InterruptedException {
        if (requestDelayMs > 0) {
            Thread.sleep(requestDelayMs);
        }
    }

    /**
     * Computes {@code retryBaseDelayMs * 2^attempt} with up to +/-50% random jitter.
     * The exponent and the product are capped so the result never overflows and the
     * jitter range handed to {@link ThreadLocalRandom} is always valid.
     */
    private long computeBackoff(int attempt) {
        if (retryBaseDelayMs <= 0) {
            // A non-positive base would produce an inverted jitter range.
            return 0;
        }
        long multiplier = 1L << Math.min(Math.max(attempt, 0), MAX_BACKOFF_SHIFT);
        long base = retryBaseDelayMs > MAX_BACKOFF_BASE_MS / multiplier
                ? MAX_BACKOFF_BASE_MS
                : retryBaseDelayMs * multiplier;
        long jitter = ThreadLocalRandom.current().nextLong(-base / 2, base / 2 + 1);
        return Math.max(0, base + jitter);
    }
}

37
src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java

@ -1,16 +1,12 @@
package internal.hw.crawler.strategies.crawl;
import internal.hw.crawler.models.Article;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
@ -30,7 +26,14 @@ public class IthomeCrawlStrategy implements CrawlStrategy {
@Override
public List<Article> parse(URL url, Document doc) throws CrawlException {
if (isHomepage(url)) {
return parseHomepage(doc);
return CrawlUtils.parseHomepage(doc, idRegex, (articleUrl, articleDoc) -> {
try {
return parseSingle(articleUrl, articleDoc);
} catch (CrawlException e) {
log.warn("Failed to parse article: {}", articleUrl, e);
return null;
}
});
} else {
return List.of(parseSingle(url, doc));
}
@ -41,30 +44,6 @@ public class IthomeCrawlStrategy implements CrawlStrategy {
return (path == null || path.isEmpty() || path.equals("/"));
}
/**
 * Collects the articles linked from a homepage document.
 *
 * <p>Each anchor whose absolute {@code href} is non-empty and matches
 * {@code idRegex} is downloaded with a 5000 ms timeout and parsed with
 * {@code parseSingle}. Any failure for a link is logged and that link is skipped.
 *
 * @param doc parsed homepage document
 * @return the articles that were fetched and parsed successfully
 */
private List<Article> parseHomepage(Document doc) {
    List<Article> collected = new ArrayList<>();
    for (Element anchor : doc.getElementsByTag("a")) {
        String target = anchor.absUrl("href");
        boolean isArticleLink = !target.isEmpty() && idRegex.matcher(target).find();
        if (isArticleLink) {
            try {
                URL articleUrl = URI.create(target).toURL();
                Document page = Jsoup.parse(articleUrl, 5000);
                Article parsed = parseSingle(articleUrl, page);
                collected.add(parsed);
            } catch (Exception e) {
                log.warn("Failed to fetch article: {}", target, e);
            }
        }
    }
    return collected;
}
private Article parseSingle(URL url, Document doc) throws CrawlException {
Matcher matcher = idRegex.matcher(url.getPath());
if (!matcher.find()) {

37
src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java

@ -1,16 +1,12 @@
package internal.hw.crawler.strategies.crawl;
import internal.hw.crawler.models.Article;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
@ -37,7 +33,14 @@ public class PeopleCnCrawlStrategy implements CrawlStrategy {
@Override
public List<Article> parse(URL url, Document doc) throws CrawlException {
if (isHomepage(url)) {
return parseHomepage(doc);
return CrawlUtils.parseHomepage(doc, idRegex, (articleUrl, articleDoc) -> {
try {
return parseSingle(articleUrl, articleDoc);
} catch (CrawlException e) {
log.warn("Failed to parse article: {}", articleUrl, e);
return null;
}
});
} else {
return List.of(parseSingle(url, doc));
}
@ -52,30 +55,6 @@ public class PeopleCnCrawlStrategy implements CrawlStrategy {
return path == null || path.isEmpty() || path.equals("/");
}
/**
 * Walks the homepage's anchors and parses every linked article.
 *
 * <p>Links with an empty absolute URL, or whose URL does not match
 * {@code idRegex}, are ignored. Matching links are fetched with a 5000 ms
 * timeout and passed to {@code parseSingle}; a failing link is logged and skipped.
 *
 * @param doc parsed homepage document
 * @return articles parsed from the matching links, in document order
 */
private List<Article> parseHomepage(Document doc) {
    List<Article> results = new ArrayList<>();
    for (Element anchor : doc.getElementsByTag("a")) {
        String address = anchor.absUrl("href");
        if (address.isEmpty() || !idRegex.matcher(address).find()) {
            continue;
        }
        try {
            URL articleUrl = URI.create(address).toURL();
            Document fetched = Jsoup.parse(articleUrl, 5000);
            results.add(parseSingle(articleUrl, fetched));
        } catch (Exception e) {
            log.warn("Failed to fetch article: {}", address, e);
        }
    }
    return results;
}
private Article parseSingle(URL url, Document doc) throws CrawlException {
Matcher matcher = idRegex.matcher(url.getPath());
if (!matcher.find()) {

Loading…
Cancel
Save