diff --git a/pom.xml b/pom.xml index 42c2374..d657eb4 100644 --- a/pom.xml +++ b/pom.xml @@ -12,4 +12,11 @@ 11 + + + org.jsoup + jsoup + 1.22.2 + + \ No newline at end of file diff --git a/src/main/java/internal/hw/crawler/commands/CrawlCommand.java b/src/main/java/internal/hw/crawler/commands/CrawlCommand.java index d4e0218..21bd8ad 100644 --- a/src/main/java/internal/hw/crawler/commands/CrawlCommand.java +++ b/src/main/java/internal/hw/crawler/commands/CrawlCommand.java @@ -1,11 +1,18 @@ package internal.hw.crawler.commands; +import internal.hw.crawler.models.Article; import internal.hw.crawler.repositories.ArticleRepository; +import internal.hw.crawler.strategies.crawl.CrawlStrategy; +import internal.hw.crawler.strategies.crawl.CrawlStrategyFactory; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import java.net.URL; import java.util.List; public class CrawlCommand implements Command { - private ArticleRepository repository; + private final ArticleRepository repository; + private final CrawlStrategyFactory crawlStrategyFactory = new CrawlStrategyFactory(); public CrawlCommand(ArticleRepository repository) { this.repository = repository; @@ -23,6 +30,22 @@ public class CrawlCommand implements Command { @Override public void execute(String[] args) { - System.out.printf("Will crawl %s%n", args[1]); + try { + String urlRaw = args[1]; + URL url = new URL(urlRaw); + CrawlStrategy strategy = crawlStrategyFactory.getStrategy(url); + if (strategy == null) { + System.err.println("Unsupported URL: " + urlRaw); + return; + } + + Document doc = Jsoup.parse(url, 5000); + List
articles = strategy.parse(url, doc); + for (Article article : articles) { + repository.add(article); + } + } catch (Exception e) { + throw new RuntimeException(e); + } } } diff --git a/src/main/java/internal/hw/crawler/models/Article.java b/src/main/java/internal/hw/crawler/models/Article.java index b37c916..bc9be98 100644 --- a/src/main/java/internal/hw/crawler/models/Article.java +++ b/src/main/java/internal/hw/crawler/models/Article.java @@ -1,21 +1,28 @@ package internal.hw.crawler.models; +import java.net.URL; +import java.util.Set; + public class Article { - private String url; + private String id; + private URL url; private String title; + private Set authors; private String content; - public Article(String url, String title, String content) { - this.url = url; - this.title = title; - this.content = content; + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; } - public String getUrl() { + public URL getUrl() { return url; } - public void setUrl(String url) { + public void setUrl(URL url) { this.url = url; } @@ -27,6 +34,14 @@ public class Article { this.title = title; } + public Set getAuthors() { + return authors; + } + + public void setAuthors(Set authors) { + this.authors = authors; + } + public String getContent() { return content; } @@ -37,6 +52,6 @@ public class Article { @Override public String toString() { - return String.format("Article{%s}", url); + return String.format("Article{%s (%s)}", title, url); } } diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlException.java b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlException.java new file mode 100644 index 0000000..7ff25df --- /dev/null +++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlException.java @@ -0,0 +1,11 @@ +package internal.hw.crawler.strategies.crawl; + +public class CrawlException extends Exception { + public CrawlException(String message) { + super(message); + } + + public CrawlException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategy.java b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategy.java new file mode 100644 index 0000000..81e0484 --- /dev/null +++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategy.java @@ -0,0 +1,13 @@ +package internal.hw.crawler.strategies.crawl; + +import internal.hw.crawler.models.Article; +import org.jsoup.nodes.Document; + +import java.net.URL; +import java.util.List; + +public interface CrawlStrategy { + List
parse(URL url, Document doc) throws CrawlException; + + boolean supports(URL url); +} diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java new file mode 100644 index 0000000..cc80045 --- /dev/null +++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java @@ -0,0 +1,27 @@ +package internal.hw.crawler.strategies.crawl; + +import java.net.URL; +import java.util.ArrayList; +import java.util.List; + +public class CrawlStrategyFactory { + private final List strategies = new ArrayList<>(); + + public CrawlStrategyFactory() { + register(new IthomeCrawlStrategy()); + } + + public CrawlStrategy getStrategy(URL url) { + for (CrawlStrategy s : strategies) { + if (s.supports(url)) { + return s; + } + } + return null; + } + + public void register(CrawlStrategy strategy) { + strategies.add(strategy); + } +} + diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java b/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java new file mode 100644 index 0000000..86ef862 --- /dev/null +++ b/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java @@ -0,0 +1,48 @@ +package internal.hw.crawler.strategies.crawl; + +import internal.hw.crawler.models.Article; +import org.jsoup.nodes.Document; + +import java.net.URL; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class IthomeCrawlStrategy implements CrawlStrategy { + private final Pattern idRegex = Pattern.compile("(\\d+)/(\\d+)/(\\d+)\\.htm"); + + @Override + public boolean supports(URL url) { + return url.getHost().endsWith("ithome.com"); + } + + @Override + public List
parse(URL url, Document doc) throws CrawlException { + return List.of(parseSingle(url, doc)); + } + + private Article parseSingle(URL url, Document doc) throws CrawlException { + Matcher matcher = idRegex.matcher(url.getPath()); + if (!matcher.find()) throw new CrawlException(String.format("Cannot determine id for %s", url)); + + String id = String.format("%s-%s-%s", matcher.group(1), matcher.group(2), matcher.group(3)); + String title = doc.selectFirst("h1").text(); + String content = doc.selectFirst("#paragraph").text(); + + String authorRaw = doc.selectFirst("#author_baidu > strong").text(); + String editorRaw = doc.selectFirst("#editor_baidu > strong").text(); + Set authors = new HashSet<>(); + authors.add(authorRaw); + authors.add(editorRaw); + + Article article = new Article(); + article.setId(id); + article.setUrl(url); + article.setTitle(title); + article.setAuthors(authors); + article.setContent(content); + return article; + } +}