package internal.hw.crawler.strategies.crawl; import internal.hw.crawler.models.Article; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.net.URL; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; public class PeopleCnCrawlStrategy implements CrawlStrategy { private static final Logger log = LoggerFactory.getLogger(PeopleCnCrawlStrategy.class); private final List supportedDomains = List.of("people.cn", "people.com.cn"); /* 示例 URL:http://env.people.com.cn/n1/2026/0530/c1010-40730688.html */ private final Pattern idRegex = Pattern.compile("(\\d+)/(\\d+)/c(\\d+)-(\\d+).html"); @Override public boolean supports(URL url) { String host = url.getHost(); for (String domain : supportedDomains) { if (host.equals(domain) || host.endsWith("." + domain)) { return true; } } return false; } @Override public List
parse(URL url, Document doc) throws CrawlException { if (isHomepage(url)) { return CrawlUtils.parseHomepage(doc, idRegex, (articleUrl, articleDoc) -> { try { return parseSingle(articleUrl, articleDoc); } catch (CrawlException e) { log.warn("Failed to parse article: {}", articleUrl, e); return null; } }); } else { return List.of(parseSingle(url, doc)); } } private boolean isHomepage(URL url) { String host = url.getHost(); boolean matched = supportedDomains.stream() .anyMatch(d -> host.equals(d) || host.endsWith("." + d)); if (!matched) return false; String path = url.getPath(); return path == null || path.isEmpty() || path.equals("/"); } private Article parseSingle(URL url, Document doc) throws CrawlException { Matcher matcher = idRegex.matcher(url.getPath()); if (!matcher.find()) { throw new CrawlException(String.format("Cannot determine id for %s", url)); } String id = String.format("%s%s-c%s-%s", matcher.group(1), matcher.group(2), matcher.group(3), matcher.group(4)); Element titleEl = doc.selectFirst(".layout.rm_txt h1"); if (titleEl == null) throw new CrawlException(String.format("Cannot find title for %s", url)); String title = titleEl.text(); Set authors = new HashSet<>(); Element authorEl = doc.selectFirst("div.author.cf"); if (authorEl != null) { authors.add(authorEl.text()); } Element contentEl = doc.selectFirst("div#rm_txt_zw"); if (contentEl == null) throw new CrawlException(String.format("Cannot find content for %s", url)); String content = contentEl.text(); Article article = new Article(); article.setId(id); article.setSource("people-cn"); article.setUrl(url); article.setTitle(title); article.setAuthors(authors); article.setContent(content); return article; } }