add netease news source

4 weeks ago · 5ed16e83c4
2 changed files with 76 additions and 0 deletions
--- a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java
@ -10,6 +10,7 @@ public class CrawlStrategyFactory {
    /**
     * Creates a factory with the built-in crawl strategies registered, in the
     * order Ithome, PeopleCn, NeteaseNews.
     */
    public CrawlStrategyFactory() {
        // Array order is the registration order.
        CrawlStrategy[] builtIns = {
            new IthomeCrawlStrategy(),
            new PeopleCnCrawlStrategy(),
            new NeteaseNewsCrawlStrategy()
        };
        for (CrawlStrategy builtIn : builtIns) {
            register(builtIn);
        }
    }
    public CrawlStrategy getStrategy(URL url) {
--- a/src/main/java/internal/hw/crawler/strategies/crawl/NeteaseNewsCrawlStrategy.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/NeteaseNewsCrawlStrategy.java
@ -0,0 +1,75 @@
 package internal.hw.crawler.strategies.crawl;
 import internal.hw.crawler.models.Article;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import java.net.URL;
 import java.util.List;
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 public class NeteaseNewsCrawlStrategy implements CrawlStrategy {
    private static final Logger log = LoggerFactory.getLogger(NeteaseNewsCrawlStrategy.class);
    private final List<String> supportedDomains = List.of("money.163.com", "news.163.com");
    /* 示例 URL：https://www.163.com/dy/article/KU6996L4053469LG.html */
    private final Pattern idRegex = Pattern.compile(".*/article/(.*)\\.html");
    @Override
    public boolean supports(URL url) {
        String host = url.getHost();
        return supportedDomains.contains(host);
    }
    @Override
    public List<Article> parse(URL url, Document doc) throws CrawlException {
        if (isHomepage(url)) {
            return CrawlUtils.parseHomepage(doc, idRegex, (articleUrl, articleDoc) -> {
                try {
                    return parseSingle(articleUrl, articleDoc);
                } catch (CrawlException e) {
                    log.warn("Failed to parse article: {}", articleUrl, e);
                    return null;
                }
            });
        } else {
            return List.of(parseSingle(url, doc));
        }
    }
    private boolean isHomepage(URL url) {
        String path = url.getPath();
        return path == null || path.isEmpty() || path.equals("/");
    }
    private Article parseSingle(URL url, Document doc) throws CrawlException {
        Matcher matcher = idRegex.matcher(url.getPath());
        if (!matcher.find()) {
            throw new CrawlException(String.format("Cannot determine id for %s", url));
        }
        String id = matcher.group(1);
        Element titleEl = doc.selectFirst("h1.post_title");
        if (titleEl == null) throw new CrawlException(String.format("Cannot find title for %s", url));
        String title = titleEl.text();
        Element contentEl = doc.selectFirst("div.post_body");
        if (contentEl == null) throw new CrawlException(String.format("Cannot find content for %s", url));
        String content = contentEl.text();
        Article article = new Article();
        article.setId(id);
        article.setSource("netease-news");
        article.setUrl(url);
        article.setTitle(title);
        // 网易新闻多为网站来源
        article.setAuthors(Set.of());
        article.setContent(content);
        return article;
    }
 }