From a35c8ec85b44952c0941d6fa900589b49c549338 Mon Sep 17 00:00:00 2001
From: 283375
Date: Sun, 17 May 2026 20:14:51 +0800
Subject: [PATCH] ithome homepage parsing

---
Review note: this patch was revised by hand after review. The commit id in the
first line and the post-image blob hash in the "index" line belong to the
original commit and were not regenerated.

 .../strategies/crawl/IthomeCrawlStrategy.java | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java b/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java
index 34d33ff..844cf4a 100644
--- a/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java
@@ -1,10 +1,13 @@
 package internal.hw.crawler.strategies.crawl;
 
 import internal.hw.crawler.models.Article;
+import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
 
 import java.net.URL;
+import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
@@ -21,7 +24,64 @@ public class IthomeCrawlStrategy implements CrawlStrategy {
 
     @Override
     public List<Article> parse(URL url, Document doc) throws CrawlException {
+        if (isHomepage(url)) {
+            // The homepage itself was passed in: crawl every article it links to.
+            return parseHomepage(url, doc);
+        }
         return List.of(parseSingle(url, doc));
     }
 
+    /**
+     * Tells whether {@code url} points at the ITHome homepage rather than at an article.
+     *
+     * <p>Only host and path are compared, so a trailing slash or a query string does not
+     * prevent a match.
+     */
+    private static boolean isHomepage(URL url) {
+        String host = url.getHost();
+        String path = url.getPath();
+        boolean ithomeHost = "www.ithome.com".equalsIgnoreCase(host)
+                || "ithome.com".equalsIgnoreCase(host);
+        return ithomeHost && (path.isEmpty() || "/".equals(path));
+    }
+
+    /** Timeout, in milliseconds, for fetching one article linked from the homepage. */
+    private static final int ARTICLE_FETCH_TIMEOUT_MILLIS = 5000;
+
+    /**
+     * Fetches and parses every article linked from the homepage.
+     *
+     * <p>Best effort: an article that cannot be fetched or parsed is logged and skipped so
+     * that one bad link does not discard the rest of the crawl.
+     *
+     * @param homepageUrl URL of the homepage; relative links are resolved against it
+     * @param doc parsed homepage document
+     * @return the successfully parsed articles, in the order their links appear; may be empty
+     */
+    private List<Article> parseHomepage(URL homepageUrl, Document doc) {
+        List<Article> articles = new ArrayList<>();
+        // The same link can appear more than once on the page; fetch each target only once.
+        Set<String> visited = new HashSet<>();
+        Elements links = doc.getElementsByTag("a");
+        for (Element link : links) {
+            String href = link.attr("href");
+            if (!idRegex.matcher(href).find() || !visited.add(href)) {
+                continue;
+            }
+
+            try {
+                // Resolve against the homepage so relative and protocol-relative hrefs work.
+                URL articleUrl = new URL(homepageUrl, href);
+                Document articleDoc = Jsoup.parse(articleUrl, ARTICLE_FETCH_TIMEOUT_MILLIS);
+                articles.add(parseSingle(articleUrl, articleDoc));
+            } catch (Exception e) {
+                // Deliberately broad: a failure on one article must not abort the others,
+                // but it is logged instead of being silently dropped.
+                System.getLogger(IthomeCrawlStrategy.class.getName())
+                        .log(System.Logger.Level.WARNING, "Skipping ITHome article " + href, e);
+            }
+        }
+        return articles;
+    }
+
     private Article parseSingle(URL url, Document doc) throws CrawlException {