You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
89 lines
3.2 KiB
89 lines
3.2 KiB
package internal.hw.crawler.strategies.crawl;
|
|
|
|
import internal.hw.crawler.models.Article;
|
|
import org.jsoup.nodes.Document;
|
|
import org.jsoup.nodes.Element;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
import java.net.URL;
|
|
import java.util.HashSet;
|
|
import java.util.List;
|
|
import java.util.Set;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
|
|
public class PeopleCnCrawlStrategy implements CrawlStrategy {
|
|
private static final Logger log = LoggerFactory.getLogger(PeopleCnCrawlStrategy.class);
|
|
private final List<String> supportedDomains = List.of("people.cn", "people.com.cn");
|
|
/* 示例 URL:http://env.people.com.cn/n1/2026/0530/c1010-40730688.html */
|
|
private final Pattern idRegex = Pattern.compile("(\\d+)/(\\d+)/c(\\d+)-(\\d+).html");
|
|
|
|
@Override
|
|
public boolean supports(URL url) {
|
|
String host = url.getHost();
|
|
for (String domain : supportedDomains) {
|
|
if (host.equals(domain) || host.endsWith("." + domain)) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
@Override
|
|
public List<Article> parse(URL url, Document doc) throws CrawlException {
|
|
if (isHomepage(url)) {
|
|
return CrawlUtils.parseHomepage(doc, idRegex, (articleUrl, articleDoc) -> {
|
|
try {
|
|
return parseSingle(articleUrl, articleDoc);
|
|
} catch (CrawlException e) {
|
|
log.warn("Failed to parse article: {}", articleUrl, e);
|
|
return null;
|
|
}
|
|
});
|
|
} else {
|
|
return List.of(parseSingle(url, doc));
|
|
}
|
|
}
|
|
|
|
private boolean isHomepage(URL url) {
|
|
String host = url.getHost();
|
|
boolean matched = supportedDomains.stream()
|
|
.anyMatch(d -> host.equals(d) || host.endsWith("." + d));
|
|
if (!matched) return false;
|
|
String path = url.getPath();
|
|
return path == null || path.isEmpty() || path.equals("/");
|
|
}
|
|
|
|
private Article parseSingle(URL url, Document doc) throws CrawlException {
|
|
Matcher matcher = idRegex.matcher(url.getPath());
|
|
if (!matcher.find()) {
|
|
throw new CrawlException(String.format("Cannot determine id for %s", url));
|
|
}
|
|
|
|
String id = String.format("%s%s-c%s-%s", matcher.group(1), matcher.group(2), matcher.group(3), matcher.group(4));
|
|
Element titleEl = doc.selectFirst(".layout.rm_txt h1");
|
|
if (titleEl == null) throw new CrawlException(String.format("Cannot find title for %s", url));
|
|
String title = titleEl.text();
|
|
|
|
Set<String> authors = new HashSet<>();
|
|
Element authorEl = doc.selectFirst("div.author.cf");
|
|
if (authorEl != null) {
|
|
authors.add(authorEl.text());
|
|
}
|
|
|
|
Element contentEl = doc.selectFirst("div#rm_txt_zw");
|
|
if (contentEl == null) throw new CrawlException(String.format("Cannot find content for %s", url));
|
|
String content = contentEl.text();
|
|
|
|
Article article = new Article();
|
|
article.setId(id);
|
|
article.setSource("people-cn");
|
|
article.setUrl(url);
|
|
article.setTitle(title);
|
|
article.setAuthors(authors);
|
|
article.setContent(content);
|
|
|
|
return article;
|
|
}
|
|
}
|
|
|