project/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java


								package internal.hw.crawler.strategies.crawl;


								import internal.hw.crawler.models.Article;

								import org.jsoup.nodes.Document;

								import org.jsoup.nodes.Element;

								import org.slf4j.Logger;

								import org.slf4j.LoggerFactory;


								import java.net.URL;

								import java.util.HashSet;

								import java.util.List;

								import java.util.Set;

								import java.util.regex.Matcher;

								import java.util.regex.Pattern;


								public class PeopleCnCrawlStrategy implements CrawlStrategy {

								    private static final Logger log = LoggerFactory.getLogger(PeopleCnCrawlStrategy.class);

								    private final List<String> supportedDomains = List.of("people.cn", "people.com.cn");

								    /* 示例 URL：http://env.people.com.cn/n1/2026/0530/c1010-40730688.html */

								    private final Pattern idRegex = Pattern.compile("(\\d+)/(\\d+)/c(\\d+)-(\\d+).html");


								    @Override

								    public boolean supports(URL url) {

								        String host = url.getHost();

								        for (String domain : supportedDomains) {

								            if (host.equals(domain) || host.endsWith("." + domain)) {

								                return true;

								            }

								        }

								        return false;

								    }


								    @Override

								    public List<Article> parse(URL url, Document doc) throws CrawlException {

								        if (isHomepage(url)) {

								            return CrawlUtils.parseHomepage(doc, idRegex, (articleUrl, articleDoc) -> {

								                try {

								                    return parseSingle(articleUrl, articleDoc);

								                } catch (CrawlException e) {

								                    log.warn("Failed to parse article: {}", articleUrl, e);

								                    return null;

								                }

								            });

								        } else {

								            return List.of(parseSingle(url, doc));

								        }

								    }


								    private boolean isHomepage(URL url) {

								        String host = url.getHost();

								        boolean matched = supportedDomains.stream()

								            .anyMatch(d -> host.equals(d) || host.endsWith("." + d));

								        if (!matched) return false;

								        String path = url.getPath();

								        return path == null || path.isEmpty() || path.equals("/");

								    }


								    private Article parseSingle(URL url, Document doc) throws CrawlException {

								        Matcher matcher = idRegex.matcher(url.getPath());

								        if (!matcher.find()) {

								            throw new CrawlException(String.format("Cannot determine id for %s", url));

								        }


								        String id = String.format("%s%s-c%s-%s", matcher.group(1), matcher.group(2), matcher.group(3), matcher.group(4));

								        Element titleEl = doc.selectFirst(".layout.rm_txt h1");

								        if (titleEl == null) throw new CrawlException(String.format("Cannot find title for %s", url));

								        String title = titleEl.text();


								        Set<String> authors = new HashSet<>();

								        Element authorEl = doc.selectFirst("div.author.cf");

								        if (authorEl != null) {

								            authors.add(authorEl.text());

								        }


								        Element contentEl = doc.selectFirst("div#rm_txt_zw");

								        if (contentEl == null) throw new CrawlException(String.format("Cannot find content for %s", url));

								        String content = contentEl.text();


								        Article article = new Article();

								        article.setId(id);

								        article.setSource("people-cn");

								        article.setUrl(url);

								        article.setTitle(title);

								        article.setAuthors(authors);

								        article.setContent(content);


								        return article;

								    }

								}