|
|
|
@ -2,6 +2,7 @@ package internal.hw.crawler.strategies.crawl; |
|
|
|
|
|
|
|
import internal.hw.crawler.models.Article; |
|
|
|
import org.jsoup.nodes.Document; |
|
|
|
import org.jsoup.nodes.Element; |
|
|
|
|
|
|
|
import java.net.URL; |
|
|
|
import java.util.HashSet; |
|
|
|
@ -25,17 +26,29 @@ public class IthomeCrawlStrategy implements CrawlStrategy { |
|
|
|
|
|
|
|
private Article parseSingle(URL url, Document doc) throws CrawlException { |
|
|
|
Matcher matcher = idRegex.matcher(url.getPath()); |
|
|
|
if (!matcher.find()) throw new CrawlException(String.format("Cannot determine id for %s", url)); |
|
|
|
if (!matcher.find()) { |
|
|
|
throw new CrawlException(String.format("Cannot determine id for %s", url)); |
|
|
|
} |
|
|
|
|
|
|
|
String id = String.format("%s-%s-%s", matcher.group(1), matcher.group(2), matcher.group(3)); |
|
|
|
String title = doc.selectFirst("h1").text(); |
|
|
|
String content = doc.selectFirst("#paragraph").text(); |
|
|
|
|
|
|
|
String authorRaw = doc.selectFirst("#author_baidu > strong").text(); |
|
|
|
String editorRaw = doc.selectFirst("#editor_baidu > strong").text(); |
|
|
|
Element h1 = doc.selectFirst("h1"); |
|
|
|
if (h1 == null) { |
|
|
|
throw new CrawlException("Missing <h1> element in page: " + url); |
|
|
|
} |
|
|
|
String title = h1.text(); |
|
|
|
|
|
|
|
Element paragraph = doc.selectFirst("#paragraph"); |
|
|
|
if (paragraph == null) { |
|
|
|
throw new CrawlException("Missing #paragraph element in page: " + url); |
|
|
|
} |
|
|
|
String content = paragraph.text(); |
|
|
|
|
|
|
|
Element authorEl = doc.selectFirst("#author_baidu > strong"); |
|
|
|
Element editorEl = doc.selectFirst("#editor_baidu > strong"); |
|
|
|
Set<String> authors = new HashSet<>(); |
|
|
|
authors.add(authorRaw); |
|
|
|
authors.add(editorRaw); |
|
|
|
if (authorEl != null) authors.add(authorEl.text()); |
|
|
|
if (editorEl != null) authors.add(editorEl.text()); |
|
|
|
|
|
|
|
Article article = new Article(); |
|
|
|
article.setId(id); |
|
|
|
|