7 changed files with 154 additions and 10 deletions
@ -0,0 +1,11 @@ |
|||||
|
package internal.hw.crawler.strategies.crawl; |
||||
|
|
||||
|
/**
 * Checked exception used by the crawl strategies in this package to report
 * that a page could not be turned into articles.
 */
public class CrawlException extends Exception {

    /**
     * Creates an exception that wraps the underlying failure.
     *
     * @param message description of what went wrong
     * @param cause   the failure that triggered this exception
     */
    public CrawlException(final String message, final Throwable cause) {
        super(message, cause);
    }

    /**
     * Creates an exception carrying only a description.
     *
     * @param message description of what went wrong
     */
    public CrawlException(final String message) {
        super(message);
    }
}
||||
@ -0,0 +1,13 @@ |
|||||
|
package internal.hw.crawler.strategies.crawl; |
||||
|
|
||||
|
import internal.hw.crawler.models.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
|
||||
|
import java.net.URL; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public interface CrawlStrategy { |
||||
|
List<Article> parse(URL url, Document doc) throws CrawlException; |
||||
|
|
||||
|
boolean supports(URL url); |
||||
|
} |
||||
@ -0,0 +1,27 @@ |
|||||
|
package internal.hw.crawler.strategies.crawl; |
||||
|
|
||||
|
import java.net.URL; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class CrawlStrategyFactory { |
||||
|
private final List<CrawlStrategy> strategies = new ArrayList<>(); |
||||
|
|
||||
|
public CrawlStrategyFactory() { |
||||
|
register(new IthomeCrawlStrategy()); |
||||
|
} |
||||
|
|
||||
|
public CrawlStrategy getStrategy(URL url) { |
||||
|
for (CrawlStrategy s : strategies) { |
||||
|
if (s.supports(url)) { |
||||
|
return s; |
||||
|
} |
||||
|
} |
||||
|
return null; |
||||
|
} |
||||
|
|
||||
|
public void register(CrawlStrategy strategy) { |
||||
|
strategies.add(strategy); |
||||
|
} |
||||
|
} |
||||
|
|
||||
@ -0,0 +1,48 @@ |
|||||
|
package internal.hw.crawler.strategies.crawl; |
||||
|
|
||||
|
import internal.hw.crawler.models.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
|
||||
|
import java.net.URL; |
||||
|
import java.util.HashSet; |
||||
|
import java.util.List; |
||||
|
import java.util.Set; |
||||
|
import java.util.regex.Matcher; |
||||
|
import java.util.regex.Pattern; |
||||
|
|
||||
|
public class IthomeCrawlStrategy implements CrawlStrategy { |
||||
|
private final Pattern idRegex = Pattern.compile("(\\d+)/(\\d+)/(\\d+)\\.htm"); |
||||
|
|
||||
|
@Override |
||||
|
public boolean supports(URL url) { |
||||
|
return url.getHost().endsWith("ithome.com"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(URL url, Document doc) throws CrawlException { |
||||
|
return List.of(parseSingle(url, doc)); |
||||
|
} |
||||
|
|
||||
|
private Article parseSingle(URL url, Document doc) throws CrawlException { |
||||
|
Matcher matcher = idRegex.matcher(url.getPath()); |
||||
|
if (!matcher.find()) throw new CrawlException(String.format("Cannot determine id for %s", url)); |
||||
|
|
||||
|
String id = String.format("%s-%s-%s", matcher.group(1), matcher.group(2), matcher.group(3)); |
||||
|
String title = doc.selectFirst("h1").text(); |
||||
|
String content = doc.selectFirst("#paragraph").text(); |
||||
|
|
||||
|
String authorRaw = doc.selectFirst("#author_baidu > strong").text(); |
||||
|
String editorRaw = doc.selectFirst("#editor_baidu > strong").text(); |
||||
|
Set<String> authors = new HashSet<>(); |
||||
|
authors.add(authorRaw); |
||||
|
authors.add(editorRaw); |
||||
|
|
||||
|
Article article = new Article(); |
||||
|
article.setId(id); |
||||
|
article.setUrl(url); |
||||
|
article.setTitle(title); |
||||
|
article.setAuthors(authors); |
||||
|
article.setContent(content); |
||||
|
return article; |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue