diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java
index fb15b3f..fef8d8e 100644
--- a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java
@@ -10,6 +10,7 @@ public class CrawlStrategyFactory {
     public CrawlStrategyFactory() {
         register(new IthomeCrawlStrategy());
         register(new PeopleCnCrawlStrategy());
+        register(new NeteaseNewsCrawlStrategy());
     }
 
     public CrawlStrategy getStrategy(URL url) {
diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/NeteaseNewsCrawlStrategy.java b/src/main/java/internal/hw/crawler/strategies/crawl/NeteaseNewsCrawlStrategy.java
new file mode 100644
index 0000000..8bc1e28
--- /dev/null
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/NeteaseNewsCrawlStrategy.java
@@ -0,0 +1,116 @@
+package internal.hw.crawler.strategies.crawl;
+
+import internal.hw.crawler.models.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.net.URL;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Crawl strategy for NetEase News pages served from the hosts in SUPPORTED_DOMAINS.
+ *
+ * <p>A URL whose path is empty or {@code /} is treated as a homepage and handed to
+ * {@code CrawlUtils.parseHomepage}; any other URL is parsed as a single article.
+ */
+public class NeteaseNewsCrawlStrategy implements CrawlStrategy {
+    private static final Logger log = LoggerFactory.getLogger(NeteaseNewsCrawlStrategy.class);
+
+    /**
+     * Hosts accepted by {@link #supports}.
+     * NOTE(review): the example article URL documented on ID_PATTERN is on www.163.com, which
+     * is not listed here, so that URL is rejected by supports() - confirm this is intended.
+     */
+    private static final List<String> SUPPORTED_DOMAINS = List.of("money.163.com", "news.163.com");
+
+    /**
+     * Captures the article id from an article URL.
+     * Example URL: https://www.163.com/dy/article/KU6996L4053469LG.html
+     * The id group excludes '/' and '.', so it cannot absorb further path segments or text
+     * around a second ".html".
+     */
+    private static final Pattern ID_PATTERN = Pattern.compile(".*/article/([^/.]+)\\.html");
+
+    /**
+     * Reports whether the host of {@code url} is one of the supported hosts.
+     * The comparison ignores case; a URL without a host is not supported.
+     */
+    @Override
+    public boolean supports(URL url) {
+        String host = url.getHost();
+        return host != null && SUPPORTED_DOMAINS.stream().anyMatch(host::equalsIgnoreCase);
+    }
+
+    /**
+     * Parses a homepage through {@code CrawlUtils.parseHomepage}, or an article page into a
+     * one-element list.
+     *
+     * @param url the URL used to tell homepage from article and to derive the article id
+     * @param doc the parsed page
+     * @return the parsed articles
+     * @throws CrawlException if a single article page cannot be parsed
+     */
+    @Override
+    public List<Article> parse(URL url, Document doc) throws CrawlException {
+        if (!isHomepage(url)) {
+            return List.of(parseSingle(url, doc));
+        }
+        return CrawlUtils.parseHomepage(doc, ID_PATTERN, (articleUrl, articleDoc) -> {
+            try {
+                return parseSingle(articleUrl, articleDoc);
+            } catch (CrawlException e) {
+                // The failure is logged and reported as null instead of being propagated.
+                // NOTE(review): presumably parseHomepage skips null results - confirm.
+                log.warn("Failed to parse article: {}", articleUrl, e);
+                return null;
+            }
+        });
+    }
+
+    /** Returns true when the URL path is null, empty, or exactly "/". */
+    private boolean isHomepage(URL url) {
+        String path = url.getPath();
+        return path == null || path.isEmpty() || path.equals("/");
+    }
+
+    /**
+     * Builds an {@code Article} from a single article page.
+     *
+     * @throws CrawlException if the id cannot be read from the URL path, or if the
+     *                        {@code h1.post_title} or {@code div.post_body} element is missing
+     */
+    private Article parseSingle(URL url, Document doc) throws CrawlException {
+        Matcher matcher = ID_PATTERN.matcher(url.getPath());
+        if (!matcher.find()) {
+            throw new CrawlException(String.format("Cannot determine id for %s", url));
+        }
+        String id = matcher.group(1);
+
+        Element titleEl = doc.selectFirst("h1.post_title");
+        if (titleEl == null) {
+            throw new CrawlException(String.format("Cannot find title for %s", url));
+        }
+
+        Element contentEl = doc.selectFirst("div.post_body");
+        if (contentEl == null) {
+            throw new CrawlException(String.format("Cannot find content for %s", url));
+        }
+
+        Article article = new Article();
+        article.setId(id);
+        article.setSource("netease-news");
+        article.setUrl(url);
+        article.setTitle(titleEl.text());
+        // NetEase News articles mostly credit a website as the source rather than individual
+        // authors, so the author set is left empty.
+        article.setAuthors(Set.of());
+        article.setContent(contentEl.text());
+
+        return article;
+    }
+}