Browse Source

add netease news source

master
283375 4 weeks ago
parent
commit
5ed16e83c4
Failed to extract signature
  1. 1
      src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java
  2. 75
      src/main/java/internal/hw/crawler/strategies/crawl/NeteaseNewsCrawlStrategy.java

1
src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java

@ -10,6 +10,7 @@ public class CrawlStrategyFactory {
/**
 * Creates the factory and registers one instance of each built-in crawl strategy:
 * {@code IthomeCrawlStrategy}, {@code PeopleCnCrawlStrategy} and
 * {@code NeteaseNewsCrawlStrategy}, in that order.
 */
public CrawlStrategyFactory() {
register(new IthomeCrawlStrategy());
register(new PeopleCnCrawlStrategy());
// NetEase (163.com) news source.
register(new NeteaseNewsCrawlStrategy());
}
public CrawlStrategy getStrategy(URL url) {

75
src/main/java/internal/hw/crawler/strategies/crawl/NeteaseNewsCrawlStrategy.java

@ -0,0 +1,75 @@
package internal.hw.crawler.strategies.crawl;
import internal.hw.crawler.models.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URL;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class NeteaseNewsCrawlStrategy implements CrawlStrategy {
private static final Logger log = LoggerFactory.getLogger(NeteaseNewsCrawlStrategy.class);
private final List<String> supportedDomains = List.of("money.163.com", "news.163.com");
/* 示例 URL:https://www.163.com/dy/article/KU6996L4053469LG.html */
private final Pattern idRegex = Pattern.compile(".*/article/(.*)\\.html");
@Override
public boolean supports(URL url) {
String host = url.getHost();
return supportedDomains.contains(host);
}
@Override
public List<Article> parse(URL url, Document doc) throws CrawlException {
if (isHomepage(url)) {
return CrawlUtils.parseHomepage(doc, idRegex, (articleUrl, articleDoc) -> {
try {
return parseSingle(articleUrl, articleDoc);
} catch (CrawlException e) {
log.warn("Failed to parse article: {}", articleUrl, e);
return null;
}
});
} else {
return List.of(parseSingle(url, doc));
}
}
private boolean isHomepage(URL url) {
String path = url.getPath();
return path == null || path.isEmpty() || path.equals("/");
}
private Article parseSingle(URL url, Document doc) throws CrawlException {
Matcher matcher = idRegex.matcher(url.getPath());
if (!matcher.find()) {
throw new CrawlException(String.format("Cannot determine id for %s", url));
}
String id = matcher.group(1);
Element titleEl = doc.selectFirst("h1.post_title");
if (titleEl == null) throw new CrawlException(String.format("Cannot find title for %s", url));
String title = titleEl.text();
Element contentEl = doc.selectFirst("div.post_body");
if (contentEl == null) throw new CrawlException(String.format("Cannot find content for %s", url));
String content = contentEl.text();
Article article = new Article();
article.setId(id);
article.setSource("netease-news");
article.setUrl(url);
article.setTitle(title);
// 网易新闻多为网站来源
article.setAuthors(Set.of());
article.setContent(content);
return article;
}
}
Loading…
Cancel
Save