2 changed files with 76 additions and 0 deletions
@ -0,0 +1,75 @@ |
|||||
|
package internal.hw.crawler.strategies.crawl; |
||||
|
|
||||
|
import internal.hw.crawler.models.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.net.URL; |
||||
|
import java.util.List; |
||||
|
import java.util.Set; |
||||
|
import java.util.regex.Matcher; |
||||
|
import java.util.regex.Pattern; |
||||
|
|
||||
|
public class NeteaseNewsCrawlStrategy implements CrawlStrategy { |
||||
|
private static final Logger log = LoggerFactory.getLogger(NeteaseNewsCrawlStrategy.class); |
||||
|
private final List<String> supportedDomains = List.of("money.163.com", "news.163.com"); |
||||
|
/* 示例 URL:https://www.163.com/dy/article/KU6996L4053469LG.html */ |
||||
|
private final Pattern idRegex = Pattern.compile(".*/article/(.*)\\.html"); |
||||
|
|
||||
|
@Override |
||||
|
public boolean supports(URL url) { |
||||
|
String host = url.getHost(); |
||||
|
return supportedDomains.contains(host); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(URL url, Document doc) throws CrawlException { |
||||
|
if (isHomepage(url)) { |
||||
|
return CrawlUtils.parseHomepage(doc, idRegex, (articleUrl, articleDoc) -> { |
||||
|
try { |
||||
|
return parseSingle(articleUrl, articleDoc); |
||||
|
} catch (CrawlException e) { |
||||
|
log.warn("Failed to parse article: {}", articleUrl, e); |
||||
|
return null; |
||||
|
} |
||||
|
}); |
||||
|
} else { |
||||
|
return List.of(parseSingle(url, doc)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private boolean isHomepage(URL url) { |
||||
|
String path = url.getPath(); |
||||
|
return path == null || path.isEmpty() || path.equals("/"); |
||||
|
} |
||||
|
|
||||
|
private Article parseSingle(URL url, Document doc) throws CrawlException { |
||||
|
Matcher matcher = idRegex.matcher(url.getPath()); |
||||
|
if (!matcher.find()) { |
||||
|
throw new CrawlException(String.format("Cannot determine id for %s", url)); |
||||
|
} |
||||
|
|
||||
|
String id = matcher.group(1); |
||||
|
Element titleEl = doc.selectFirst("h1.post_title"); |
||||
|
if (titleEl == null) throw new CrawlException(String.format("Cannot find title for %s", url)); |
||||
|
String title = titleEl.text(); |
||||
|
|
||||
|
|
||||
|
Element contentEl = doc.selectFirst("div.post_body"); |
||||
|
if (contentEl == null) throw new CrawlException(String.format("Cannot find content for %s", url)); |
||||
|
String content = contentEl.text(); |
||||
|
|
||||
|
Article article = new Article(); |
||||
|
article.setId(id); |
||||
|
article.setSource("netease-news"); |
||||
|
article.setUrl(url); |
||||
|
article.setTitle(title); |
||||
|
// 网易新闻多为网站来源
|
||||
|
article.setAuthors(Set.of()); |
||||
|
article.setContent(content); |
||||
|
|
||||
|
return article; |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue