7 changed files with 154 additions and 10 deletions
@ -0,0 +1,11 @@ |
|||
package internal.hw.crawler.strategies.crawl; |
|||
|
|||
/**
 * Checked exception signalling that a page could not be crawled or parsed.
 *
 * <p>Being an {@link Exception}, this type is serializable; the explicit
 * {@code serialVersionUID} keeps its serialized form stable across recompilations.
 */
public class CrawlException extends Exception {
    private static final long serialVersionUID = 1L;

    /**
     * Creates an exception with a description of the failure.
     *
     * @param message human-readable description of what went wrong
     */
    public CrawlException(String message) {
        super(message);
    }

    /**
     * Creates an exception with a description of the failure and its underlying cause.
     *
     * @param message human-readable description of what went wrong
     * @param cause the lower-level error that triggered this failure
     */
    public CrawlException(String message, Throwable cause) {
        super(message, cause);
    }
}
|||
@ -0,0 +1,13 @@ |
|||
package internal.hw.crawler.strategies.crawl; |
|||
|
|||
import internal.hw.crawler.models.Article; |
|||
import org.jsoup.nodes.Document; |
|||
|
|||
import java.net.URL; |
|||
import java.util.List; |
|||
|
|||
public interface CrawlStrategy { |
|||
List<Article> parse(URL url, Document doc) throws CrawlException; |
|||
|
|||
boolean supports(URL url); |
|||
} |
|||
@ -0,0 +1,27 @@ |
|||
package internal.hw.crawler.strategies.crawl; |
|||
|
|||
import java.net.URL; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class CrawlStrategyFactory { |
|||
private final List<CrawlStrategy> strategies = new ArrayList<>(); |
|||
|
|||
public CrawlStrategyFactory() { |
|||
register(new IthomeCrawlStrategy()); |
|||
} |
|||
|
|||
public CrawlStrategy getStrategy(URL url) { |
|||
for (CrawlStrategy s : strategies) { |
|||
if (s.supports(url)) { |
|||
return s; |
|||
} |
|||
} |
|||
return null; |
|||
} |
|||
|
|||
public void register(CrawlStrategy strategy) { |
|||
strategies.add(strategy); |
|||
} |
|||
} |
|||
|
|||
@ -0,0 +1,48 @@ |
|||
package internal.hw.crawler.strategies.crawl; |
|||
|
|||
import internal.hw.crawler.models.Article; |
|||
import org.jsoup.nodes.Document; |
|||
|
|||
import java.net.URL; |
|||
import java.util.HashSet; |
|||
import java.util.List; |
|||
import java.util.Set; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class IthomeCrawlStrategy implements CrawlStrategy { |
|||
private final Pattern idRegex = Pattern.compile("(\\d+)/(\\d+)/(\\d+)\\.htm"); |
|||
|
|||
@Override |
|||
public boolean supports(URL url) { |
|||
return url.getHost().endsWith("ithome.com"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> parse(URL url, Document doc) throws CrawlException { |
|||
return List.of(parseSingle(url, doc)); |
|||
} |
|||
|
|||
private Article parseSingle(URL url, Document doc) throws CrawlException { |
|||
Matcher matcher = idRegex.matcher(url.getPath()); |
|||
if (!matcher.find()) throw new CrawlException(String.format("Cannot determine id for %s", url)); |
|||
|
|||
String id = String.format("%s-%s-%s", matcher.group(1), matcher.group(2), matcher.group(3)); |
|||
String title = doc.selectFirst("h1").text(); |
|||
String content = doc.selectFirst("#paragraph").text(); |
|||
|
|||
String authorRaw = doc.selectFirst("#author_baidu > strong").text(); |
|||
String editorRaw = doc.selectFirst("#editor_baidu > strong").text(); |
|||
Set<String> authors = new HashSet<>(); |
|||
authors.add(authorRaw); |
|||
authors.add(editorRaw); |
|||
|
|||
Article article = new Article(); |
|||
article.setId(id); |
|||
article.setUrl(url); |
|||
article.setTitle(title); |
|||
article.setAuthors(authors); |
|||
article.setContent(content); |
|||
return article; |
|||
} |
|||
} |
|||
Loading…
Reference in new issue