people.com.cn parsing

1 month ago · e3a1136c23
3 changed files with 99 additions and 1 deletions
--- a/src/main/java/internal/hw/crawler/models/Article.java
+++ b/src/main/java/internal/hw/crawler/models/Article.java
@ -8,7 +8,7 @@ public class Article {
    private String source;
    private URL url;
    private String title;
-    private Set<String> authors;
+    private Set<String> authors = Set.of();
    private String content;
    public String getId() {
--- a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java
@ -9,6 +9,7 @@ public class CrawlStrategyFactory {
    /**
     * Creates the factory and passes one new instance of each built-in strategy
     * ({@code IthomeCrawlStrategy} and {@code PeopleCnCrawlStrategy}) to {@code register}.
     */
    public CrawlStrategyFactory() {
        register(new IthomeCrawlStrategy());
        register(new PeopleCnCrawlStrategy());
    }
    public CrawlStrategy getStrategy(URL url) {
--- a/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java
@ -0,0 +1,97 @@
 package internal.hw.crawler.strategies.crawl;
 import internal.hw.crawler.models.Article;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
 import java.net.URL;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 public class PeopleCnCrawlStrategy implements CrawlStrategy {
    private final List<String> supportedDomains = List.of("people.cn", "people.com.cn");
    /* 示例 URL：http://env.people.com.cn/n1/2026/0530/c1010-40730688.html */
    private final Pattern idRegex = Pattern.compile("(\\d+)/(\\d+)/c(\\d+)-(\\d+).html");
    @Override
    public boolean supports(URL url) {
        for (String domain : supportedDomains) {
            if (url.getHost().endsWith(domain)) {
                return true;
            }
        }
        return false;
    }
    @Override
    public List<Article> parse(URL url, Document doc) throws CrawlException {
        List<String> homepage = supportedDomains.stream().map(it -> "https://www." + it).collect(Collectors.toList());
        if (homepage.contains(url.toString())) {
            // 传入的是首页，解析所有链接
            return parseHomepage(doc);
        } else {
            return List.of(parseSingle(url, doc));
        }
    }
    private List<Article> parseHomepage(Document doc) {
        List<Article> articles = new ArrayList<>();
        Elements links = doc.getElementsByTag("a");
        for (Element link : links) {
            String href = link.attr("href");
            Matcher matcher = idRegex.matcher(href);
            if (!matcher.find()) {
                continue;
            }
            try {
                URL articleUrl = new URL(href);
                Document articleDoc = Jsoup.parse(articleUrl, 5000);
                articles.add(parseSingle(articleUrl, articleDoc));
            } catch (Exception ignored) {
            }
        }
        return articles;
    }
    private Article parseSingle(URL url, Document doc) throws CrawlException {
        Matcher matcher = idRegex.matcher(url.getPath());
        if (!matcher.find()) {
            throw new CrawlException(String.format("Cannot determine id for %s", url));
        }
        String id = String.format("%s%s-c%s-%s", matcher.group(1), matcher.group(2), matcher.group(3), matcher.group(4));
        Element titleEl = doc.selectFirst(".layout.rm_txt h1");
        if (titleEl == null) throw new CrawlException(String.format("Cannot find title for %s", url));
        String title = titleEl.text();
        Set<String> authors = new HashSet<>();
        Element authorEl = doc.selectFirst("div.author.cf");
        if (authorEl != null) {
            authors.add(authorEl.text());
        }
        Element contentEl = doc.selectFirst("div#rm_txt_zw");
        if (contentEl == null) throw new CrawlException(String.format("Cannot find content for %s", url));
        String content = contentEl.text();
        Article article = new Article();
        article.setId(id);
        article.setSource("people-cn");
        article.setUrl(url);
        article.setTitle(title);
        article.setAuthors(authors);
        article.setContent(content);
        return article;
    }
 }