Browse Source

people.com.cn parsing

master
283375 1 month ago
parent
commit
e3a1136c23
Failed to extract signature
  1. 2
      src/main/java/internal/hw/crawler/models/Article.java
  2. 1
      src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java
  3. 97
      src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java

2
src/main/java/internal/hw/crawler/models/Article.java

@ -8,7 +8,7 @@ public class Article {
private String source; private String source;
private URL url; private URL url;
private String title; private String title;
private Set<String> authors; private Set<String> authors = Set.of();
private String content; private String content;
public String getId() { public String getId() {

1
src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java

@ -9,6 +9,7 @@ public class CrawlStrategyFactory {
public CrawlStrategyFactory() { public CrawlStrategyFactory() {
register(new IthomeCrawlStrategy()); register(new IthomeCrawlStrategy());
register(new PeopleCnCrawlStrategy());
} }
public CrawlStrategy getStrategy(URL url) { public CrawlStrategy getStrategy(URL url) {

97
src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java

@ -0,0 +1,97 @@
package internal.hw.crawler.strategies.crawl;
import internal.hw.crawler.models.Article;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
public class PeopleCnCrawlStrategy implements CrawlStrategy {
private final List<String> supportedDomains = List.of("people.cn", "people.com.cn");
/* 示例 URL:http://env.people.com.cn/n1/2026/0530/c1010-40730688.html */
private final Pattern idRegex = Pattern.compile("(\\d+)/(\\d+)/c(\\d+)-(\\d+).html");
@Override
public boolean supports(URL url) {
for (String domain : supportedDomains) {
if (url.getHost().endsWith(domain)) {
return true;
}
}
return false;
}
@Override
public List<Article> parse(URL url, Document doc) throws CrawlException {
List<String> homepage = supportedDomains.stream().map(it -> "https://www." + it).collect(Collectors.toList());
if (homepage.contains(url.toString())) {
// 传入的是首页,解析所有链接
return parseHomepage(doc);
} else {
return List.of(parseSingle(url, doc));
}
}
private List<Article> parseHomepage(Document doc) {
List<Article> articles = new ArrayList<>();
Elements links = doc.getElementsByTag("a");
for (Element link : links) {
String href = link.attr("href");
Matcher matcher = idRegex.matcher(href);
if (!matcher.find()) {
continue;
}
try {
URL articleUrl = new URL(href);
Document articleDoc = Jsoup.parse(articleUrl, 5000);
articles.add(parseSingle(articleUrl, articleDoc));
} catch (Exception ignored) {
}
}
return articles;
}
private Article parseSingle(URL url, Document doc) throws CrawlException {
Matcher matcher = idRegex.matcher(url.getPath());
if (!matcher.find()) {
throw new CrawlException(String.format("Cannot determine id for %s", url));
}
String id = String.format("%s%s-c%s-%s", matcher.group(1), matcher.group(2), matcher.group(3), matcher.group(4));
Element titleEl = doc.selectFirst(".layout.rm_txt h1");
if (titleEl == null) throw new CrawlException(String.format("Cannot find title for %s", url));
String title = titleEl.text();
Set<String> authors = new HashSet<>();
Element authorEl = doc.selectFirst("div.author.cf");
if (authorEl != null) {
authors.add(authorEl.text());
}
Element contentEl = doc.selectFirst("div#rm_txt_zw");
if (contentEl == null) throw new CrawlException(String.format("Cannot find content for %s", url));
String content = contentEl.text();
Article article = new Article();
article.setId(id);
article.setSource("people-cn");
article.setUrl(url);
article.setTitle(title);
article.setAuthors(authors);
article.setContent(content);
return article;
}
}
Loading…
Cancel
Save