3 changed files with 99 additions and 1 deletions
@ -0,0 +1,97 @@ |
|||||
|
package internal.hw.crawler.strategies.crawl; |
||||
|
|
||||
|
import internal.hw.crawler.models.Article; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.net.URL; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.HashSet; |
||||
|
import java.util.List; |
||||
|
import java.util.Set; |
||||
|
import java.util.regex.Matcher; |
||||
|
import java.util.regex.Pattern; |
||||
|
import java.util.stream.Collectors; |
||||
|
|
||||
|
public class PeopleCnCrawlStrategy implements CrawlStrategy { |
||||
|
private final List<String> supportedDomains = List.of("people.cn", "people.com.cn"); |
||||
|
/* 示例 URL:http://env.people.com.cn/n1/2026/0530/c1010-40730688.html */ |
||||
|
private final Pattern idRegex = Pattern.compile("(\\d+)/(\\d+)/c(\\d+)-(\\d+).html"); |
||||
|
|
||||
|
@Override |
||||
|
public boolean supports(URL url) { |
||||
|
for (String domain : supportedDomains) { |
||||
|
if (url.getHost().endsWith(domain)) { |
||||
|
return true; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return false; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(URL url, Document doc) throws CrawlException { |
||||
|
List<String> homepage = supportedDomains.stream().map(it -> "https://www." + it).collect(Collectors.toList()); |
||||
|
|
||||
|
if (homepage.contains(url.toString())) { |
||||
|
// 传入的是首页,解析所有链接
|
||||
|
return parseHomepage(doc); |
||||
|
} else { |
||||
|
return List.of(parseSingle(url, doc)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private List<Article> parseHomepage(Document doc) { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
Elements links = doc.getElementsByTag("a"); |
||||
|
for (Element link : links) { |
||||
|
String href = link.attr("href"); |
||||
|
Matcher matcher = idRegex.matcher(href); |
||||
|
if (!matcher.find()) { |
||||
|
continue; |
||||
|
} |
||||
|
|
||||
|
try { |
||||
|
URL articleUrl = new URL(href); |
||||
|
Document articleDoc = Jsoup.parse(articleUrl, 5000); |
||||
|
articles.add(parseSingle(articleUrl, articleDoc)); |
||||
|
} catch (Exception ignored) { |
||||
|
} |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
private Article parseSingle(URL url, Document doc) throws CrawlException { |
||||
|
Matcher matcher = idRegex.matcher(url.getPath()); |
||||
|
if (!matcher.find()) { |
||||
|
throw new CrawlException(String.format("Cannot determine id for %s", url)); |
||||
|
} |
||||
|
|
||||
|
String id = String.format("%s%s-c%s-%s", matcher.group(1), matcher.group(2), matcher.group(3), matcher.group(4)); |
||||
|
Element titleEl = doc.selectFirst(".layout.rm_txt h1"); |
||||
|
if (titleEl == null) throw new CrawlException(String.format("Cannot find title for %s", url)); |
||||
|
String title = titleEl.text(); |
||||
|
|
||||
|
Set<String> authors = new HashSet<>(); |
||||
|
Element authorEl = doc.selectFirst("div.author.cf"); |
||||
|
if (authorEl != null) { |
||||
|
authors.add(authorEl.text()); |
||||
|
} |
||||
|
|
||||
|
Element contentEl = doc.selectFirst("div#rm_txt_zw"); |
||||
|
if (contentEl == null) throw new CrawlException(String.format("Cannot find content for %s", url)); |
||||
|
String content = contentEl.text(); |
||||
|
|
||||
|
Article article = new Article(); |
||||
|
article.setId(id); |
||||
|
article.setSource("people-cn"); |
||||
|
article.setUrl(url); |
||||
|
article.setTitle(title); |
||||
|
article.setAuthors(authors); |
||||
|
article.setContent(content); |
||||
|
|
||||
|
return article; |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue