diff --git a/src/main/java/internal/hw/crawler/models/Article.java b/src/main/java/internal/hw/crawler/models/Article.java
index ff5e4f8..124bb31 100644
--- a/src/main/java/internal/hw/crawler/models/Article.java
+++ b/src/main/java/internal/hw/crawler/models/Article.java
@@ -8,7 +8,7 @@ public class Article {
     private String source;
     private URL url;
     private String title;
-    private Set<String> authors;
+    private Set<String> authors = Set.of();
     private String content;
 
     public String getId() {
diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java
index cc80045..fb15b3f 100644
--- a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java
@@ -9,6 +9,7 @@ public class CrawlStrategyFactory {
 
     public CrawlStrategyFactory() {
         register(new IthomeCrawlStrategy());
+        register(new PeopleCnCrawlStrategy());
     }
 
     public CrawlStrategy getStrategy(URL url) {
diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java b/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java
new file mode 100644
index 0000000..f166af5
--- /dev/null
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java
@@ -0,0 +1,170 @@
+package internal.hw.crawler.strategies.crawl;
+
+import internal.hw.crawler.models.Article;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * Crawl strategy for pages hosted on {@code people.cn} and {@code people.com.cn}.
+ *
+ * <p>The site homepage is expanded into one article per article link found on it; any other
+ * supported URL is parsed as a single article page.
+ */
+public class PeopleCnCrawlStrategy implements CrawlStrategy {
+    private static final Logger LOGGER = Logger.getLogger(PeopleCnCrawlStrategy.class.getName());
+
+    /** Domains handled by this strategy; their subdomains are handled as well. */
+    private static final List<String> SUPPORTED_DOMAINS = List.of("people.cn", "people.com.cn");
+
+    /**
+     * Captures the date and id segments of an article path.
+     * Example URL: http://env.people.com.cn/n1/2026/0530/c1010-40730688.html
+     */
+    private static final Pattern ID_PATTERN = Pattern.compile("(\\d+)/(\\d+)/c(\\d+)-(\\d+)\\.html");
+
+    /** Timeout, in milliseconds, for fetching an article linked from the homepage. */
+    private static final int FETCH_TIMEOUT_MILLIS = 5000;
+
+    /**
+     * Tells whether the URL's host is a supported domain or a subdomain of one.
+     *
+     * @param url the URL to test
+     * @return {@code true} if this strategy can crawl {@code url}
+     */
+    @Override
+    public boolean supports(URL url) {
+        String host = url.getHost().toLowerCase(Locale.ROOT);
+        for (String domain : SUPPORTED_DOMAINS) {
+            // Match on a label boundary so look-alike hosts such as "notpeople.cn" are rejected.
+            if (host.equals(domain) || host.endsWith("." + domain)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Parses the homepage into all of its linked articles, or any other page into one article.
+     *
+     * @param url the URL {@code doc} was loaded from
+     * @param doc the parsed page
+     * @return the articles extracted from the page
+     * @throws CrawlException if a single article page lacks an id, title or content
+     */
+    @Override
+    public List<Article> parse(URL url, Document doc) throws CrawlException {
+        if (isHomepage(url)) {
+            return parseHomepage(doc);
+        }
+        return List.of(parseSingle(url, doc));
+    }
+
+    /** Tells whether the URL is the root page of a supported domain, with or without "www.". */
+    private static boolean isHomepage(URL url) {
+        String path = url.getPath();
+        if (!path.isEmpty() && !path.equals("/")) {
+            return false;
+        }
+        // Compare host and path rather than the full string so that the scheme (http/https)
+        // and a trailing slash do not affect the result.
+        String host = url.getHost().toLowerCase(Locale.ROOT);
+        for (String domain : SUPPORTED_DOMAINS) {
+            if (host.equals(domain) || host.equals("www." + domain)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /** Fetches and parses every distinct article link on the homepage, skipping links that fail. */
+    private List<Article> parseHomepage(Document doc) {
+        Elements links = doc.getElementsByTag("a");
+        // LinkedHashSet keeps page order while ensuring each article is fetched only once.
+        Set<String> articleLinks = links.stream()
+                .map(PeopleCnCrawlStrategy::resolveHref)
+                .filter(href -> ID_PATTERN.matcher(href).find())
+                .collect(Collectors.toCollection(LinkedHashSet::new));
+
+        List<Article> articles = new ArrayList<>();
+        for (String href : articleLinks) {
+            try {
+                URL articleUrl = new URL(href);
+                if (!supports(articleUrl)) {
+                    continue;
+                }
+                Document articleDoc = Jsoup.parse(articleUrl, FETCH_TIMEOUT_MILLIS);
+                articles.add(parseSingle(articleUrl, articleDoc));
+            } catch (Exception e) {
+                // Best effort: one malformed, unreachable or unparsable link must not abort the
+                // whole homepage crawl, but the failure is logged instead of silently dropped.
+                LOGGER.log(Level.WARNING, "Skipping article link " + href, e);
+            }
+        }
+        return articles;
+    }
+
+    /** Resolves the link against the page's base URI, falling back to the raw href. */
+    private static String resolveHref(Element link) {
+        String absolute = link.absUrl("href");
+        return absolute.isEmpty() ? link.attr("href") : absolute;
+    }
+
+    /**
+     * Builds an article from a single article page.
+     *
+     * @throws CrawlException if the id cannot be derived from the URL, or title/content is missing
+     */
+    private Article parseSingle(URL url, Document doc) throws CrawlException {
+        Matcher matcher = ID_PATTERN.matcher(url.getPath());
+        if (!matcher.find()) {
+            throw new CrawlException(String.format("Cannot determine id for %s", url));
+        }
+        String id = String.format(
+                "%s%s-c%s-%s", matcher.group(1), matcher.group(2), matcher.group(3), matcher.group(4));
+
+        Element titleEl = doc.selectFirst(".layout.rm_txt h1");
+        if (titleEl == null) {
+            throw new CrawlException(String.format("Cannot find title for %s", url));
+        }
+
+        Element contentEl = doc.selectFirst("div#rm_txt_zw");
+        if (contentEl == null) {
+            throw new CrawlException(String.format("Cannot find content for %s", url));
+        }
+
+        Set<String> authors = new HashSet<>();
+        Element authorEl = doc.selectFirst("div.author.cf");
+        if (authorEl != null) {
+            String author = authorEl.text();
+            // An empty byline element would otherwise add a meaningless "" author.
+            if (!author.isEmpty()) {
+                authors.add(author);
+            }
+        }
+
+        Article article = new Article();
+        article.setId(id);
+        article.setSource("people-cn");
+        article.setUrl(url);
+        article.setTitle(titleEl.text());
+        article.setAuthors(authors);
+        article.setContent(contentEl.text());
+
+        return article;
+    }
+}