add netease news source

4 weeks ago · 5ed16e83c4
2 changed files with 76 additions and 0 deletions
--- a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java
@@ -10,6 +10,7 @@ public class CrawlStrategyFactory {
    /**
     * Creates the factory and passes one new instance of each crawl strategy
     * named in the body (Ithome, PeopleCn, Netease news) to {@code register}.
     *
     * NOTE(review): {@code register} is defined outside this view; whether the
     * registration order influences which strategy {@code getStrategy} returns
     * is not visible here — confirm before reordering these calls.
     */
    public CrawlStrategyFactory() {
        register(new IthomeCrawlStrategy());
        register(new PeopleCnCrawlStrategy());
+        register(new NeteaseNewsCrawlStrategy());
    }

    public CrawlStrategy getStrategy(URL url) {
--- a/src/main/java/internal/hw/crawler/strategies/crawl/NeteaseNewsCrawlStrategy.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/NeteaseNewsCrawlStrategy.java
@@ -0,0 +1,75 @@
+package internal.hw.crawler.strategies.crawl;
+
+import internal.hw.crawler.models.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.net.URL;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Crawl strategy for NetEase (163.com) news pages.
+ *
+ * <p>A URL whose path is empty or {@code "/"} is treated as a homepage and handed to
+ * {@code CrawlUtils.parseHomepage}; every other URL is parsed as a single article.
+ *
+ * <p>NOTE(review): the example article URL documented on {@code ID_REGEX} lives on
+ * {@code www.163.com}, which is not in {@code SUPPORTED_DOMAINS}. Confirm whether direct
+ * article URLs on that host are expected to be accepted by {@link #supports(URL)}.
+ */
+public class NeteaseNewsCrawlStrategy implements CrawlStrategy {
+    private static final Logger log = LoggerFactory.getLogger(NeteaseNewsCrawlStrategy.class);
+
+    /** Hosts accepted by this strategy; compared to the URL host by exact string match. */
+    private static final List<String> SUPPORTED_DOMAINS = List.of("money.163.com", "news.163.com");
+
+    /**
+     * Captures the article id (group 1) from an article path.
+     * Example URL: https://www.163.com/dy/article/KU6996L4053469LG.html
+     */
+    private static final Pattern ID_REGEX = Pattern.compile(".*/article/(.*)\\.html");
+
+    /** Value written to the {@code source} field of every article built by this strategy. */
+    private static final String SOURCE_NAME = "netease-news";
+
+    /**
+     * Reports whether the host of {@code url} is one of the supported NetEase domains.
+     *
+     * @param url the URL to test
+     * @return {@code true} if the host exactly equals a supported domain
+     */
+    @Override
+    public boolean supports(URL url) {
+        return SUPPORTED_DOMAINS.contains(url.getHost());
+    }
+
+    /**
+     * Parses either a homepage or a single article page.
+     *
+     * @param url the URL the document was loaded from
+     * @param doc the parsed HTML document for {@code url}
+     * @return for an article URL, a one-element list; for a homepage, whatever
+     *     {@code CrawlUtils.parseHomepage} returns
+     * @throws CrawlException if a single-article page cannot be parsed
+     */
+    @Override
+    public List<Article> parse(URL url, Document doc) throws CrawlException {
+        if (!isHomepage(url)) {
+            return List.of(parseSingle(url, doc));
+        }
+
+        // Per-article failures on a homepage crawl are logged and reported to the helper as null
+        // instead of aborting the whole crawl.
+        // NOTE(review): presumably parseHomepage drops null results — verify against CrawlUtils.
+        return CrawlUtils.parseHomepage(doc, ID_REGEX, (articleUrl, articleDoc) -> {
+            try {
+                return parseSingle(articleUrl, articleDoc);
+            } catch (CrawlException e) {
+                log.warn("Failed to parse article: {}", articleUrl, e);
+                return null;
+            }
+        });
+    }
+
+    /** Returns {@code true} when the URL path is null, empty, or exactly {@code "/"}. */
+    private boolean isHomepage(URL url) {
+        String path = url.getPath();
+        return path == null || path.isEmpty() || path.equals("/");
+    }
+
+    /**
+     * Builds an {@link Article} from a single article page.
+     *
+     * @param url the article URL; its path must match {@code ID_REGEX}
+     * @param doc the parsed article document
+     * @return the populated article (id, source, url, title, empty authors, content)
+     * @throws CrawlException if the id cannot be extracted from the path, or if the
+     *     {@code h1.post_title} or {@code div.post_body} element is missing
+     */
+    private Article parseSingle(URL url, Document doc) throws CrawlException {
+        Matcher matcher = ID_REGEX.matcher(url.getPath());
+        if (!matcher.find()) {
+            throw new CrawlException(String.format("Cannot determine id for %s", url));
+        }
+        String id = matcher.group(1);
+
+        Element titleEl = doc.selectFirst("h1.post_title");
+        if (titleEl == null) {
+            throw new CrawlException(String.format("Cannot find title for %s", url));
+        }
+
+        Element contentEl = doc.selectFirst("div.post_body");
+        if (contentEl == null) {
+            throw new CrawlException(String.format("Cannot find content for %s", url));
+        }
+
+        Article article = new Article();
+        article.setId(id);
+        article.setSource(SOURCE_NAME);
+        article.setUrl(url);
+        article.setTitle(titleEl.text());
+        // NetEase news items are mostly credited to a website/outlet rather than to named
+        // authors, so the author set is left empty.
+        article.setAuthors(Set.of());
+        article.setContent(contentEl.text());
+
+        return article;
+    }
+}