diff --git a/pom.xml b/pom.xml
index 42c2374..d657eb4 100644
--- a/pom.xml
+++ b/pom.xml
@@ -12,4 +12,11 @@
11
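+    <dependencies>
+        <dependency>
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+            <version>1.22.2</version>
+        </dependency>
+    </dependencies>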
\ No newline at end of file
diff --git a/src/main/java/internal/hw/crawler/commands/CrawlCommand.java b/src/main/java/internal/hw/crawler/commands/CrawlCommand.java
index d4e0218..21bd8ad 100644
--- a/src/main/java/internal/hw/crawler/commands/CrawlCommand.java
+++ b/src/main/java/internal/hw/crawler/commands/CrawlCommand.java
@@ -1,11 +1,18 @@
package internal.hw.crawler.commands;
+import internal.hw.crawler.models.Article;
import internal.hw.crawler.repositories.ArticleRepository;
+import internal.hw.crawler.strategies.crawl.CrawlStrategy;
+import internal.hw.crawler.strategies.crawl.CrawlStrategyFactory;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import java.net.URL;
import java.util.List;
public class CrawlCommand implements Command {
- private ArticleRepository repository;
+ private final ArticleRepository repository;
+ private final CrawlStrategyFactory crawlStrategyFactory = new CrawlStrategyFactory();
public CrawlCommand(ArticleRepository repository) {
this.repository = repository;
@@ -23,6 +30,34 @@ public class CrawlCommand implements Command {
 
+    /**
+     * Crawls the URL given in {@code args[1]} and adds every parsed article to
+     * the repository. When no registered strategy supports the URL, a message
+     * is printed to standard error and nothing is crawled.
+     *
+     * @param args command arguments; {@code args[1]} is read as the URL to crawl
+     * @throws RuntimeException wrapping any exception raised while reading the
+     *     argument, building the URL, fetching or parsing the page, or storing
+     *     the articles
+     */
     @Override
     public void execute(String[] args) {
-        System.out.printf("Will crawl %s%n", args[1]);
+        try {
+            String urlRaw = args[1];
+            URL url = new URL(urlRaw);
+            CrawlStrategy strategy = crawlStrategyFactory.getStrategy(url);
+            if (strategy == null) {
+                System.err.println("Unsupported URL: " + urlRaw);
+                return;
+            }
+
+            // Fetches the page over the network; 5000 is the timeout in milliseconds.
+            Document doc = Jsoup.parse(url, 5000);
+            // Must be a typed list: a raw List cannot be iterated as Article.
+            List<Article> articles = strategy.parse(url, doc);
+            for (Article article : articles) {
+                repository.add(article);
+            }
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
     }
 }
diff --git a/src/main/java/internal/hw/crawler/models/Article.java b/src/main/java/internal/hw/crawler/models/Article.java
index b37c916..bc9be98 100644
--- a/src/main/java/internal/hw/crawler/models/Article.java
+++ b/src/main/java/internal/hw/crawler/models/Article.java
@@ -1,21 +1,32 @@
 package internal.hw.crawler.models;
 
+import java.net.URL;
+import java.util.Set;
+
+/**
+ * Mutable holder for one crawled article: an id, the source URL, the title,
+ * the set of author names and the text content.
+ */
 public class Article {
-    private String url;
+    private String id;
+    private URL url;
     private String title;
+    private Set<String> authors;
     private String content;
 
-    public Article(String url, String title, String content) {
-        this.url = url;
-        this.title = title;
-        this.content = content;
+    public String getId() {
+        return id;
+    }
+
+    public void setId(String id) {
+        this.id = id;
     }
 
-    public String getUrl() {
+    public URL getUrl() {
         return url;
     }
 
-    public void setUrl(String url) {
+    public void setUrl(URL url) {
         this.url = url;
     }
 
@@ -27,6 +38,16 @@ public class Article {
         this.title = title;
     }
 
+    /** Returns the author names, or {@code null} if none have been set. */
+    public Set<String> getAuthors() {
+        return authors;
+    }
+
+    /** Replaces the author names; the given set is stored as-is, not copied. */
+    public void setAuthors(Set<String> authors) {
+        this.authors = authors;
+    }
+
     public String getContent() {
         return content;
     }
@@ -37,6 +58,6 @@ public class Article {
 
     @Override
     public String toString() {
-        return String.format("Article{%s}", url);
+        return String.format("Article{%s (%s)}", title, url);
     }
 }
diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlException.java b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlException.java
new file mode 100644
index 0000000..7ff25df
--- /dev/null
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlException.java
@@ -0,0 +1,25 @@
+package internal.hw.crawler.strategies.crawl;
+
+/**
+ * Checked exception declared by crawl strategies for failures while parsing a page.
+ */
+public class CrawlException extends Exception {
+    /**
+     * Creates an exception with a detail message and the underlying cause.
+     *
+     * @param message description of what went wrong
+     * @param cause the throwable that led to this failure
+     */
+    public CrawlException(String message, Throwable cause) {
+        super(message, cause);
+    }
+
+    /**
+     * Creates an exception with a detail message and no cause.
+     *
+     * @param message description of what went wrong
+     */
+    public CrawlException(String message) {
+        super(message);
+    }
+}
diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategy.java b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategy.java
new file mode 100644
index 0000000..81e0484
--- /dev/null
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategy.java
@@ -0,0 +1,30 @@
+package internal.hw.crawler.strategies.crawl;
+
+import internal.hw.crawler.models.Article;
+import org.jsoup.nodes.Document;
+
+import java.net.URL;
+import java.util.List;
+
+/**
+ * Turns a fetched page into {@link Article}s for the URLs it supports.
+ */
+public interface CrawlStrategy {
+    /**
+     * Extracts the articles contained in a fetched page.
+     *
+     * @param url the URL the document was fetched from
+     * @param doc the parsed HTML document
+     * @return the articles found in the document
+     * @throws CrawlException if the page cannot be parsed into articles
+     */
+    List<Article> parse(URL url, Document doc) throws CrawlException;
+
+    /**
+     * Reports whether this strategy can handle the given URL.
+     *
+     * @param url the URL to check
+     * @return {@code true} if this strategy handles the URL
+     */
+    boolean supports(URL url);
+}
diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java
new file mode 100644
index 0000000..cc80045
--- /dev/null
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java
@@ -0,0 +1,44 @@
+package internal.hw.crawler.strategies.crawl;
+
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * Registry of {@link CrawlStrategy} instances that selects one by URL.
+ */
+public class CrawlStrategyFactory {
+    // Consulted in registration order; the first strategy that supports a URL wins.
+    private final List<CrawlStrategy> strategies = new ArrayList<>();
+
+    /** Creates a factory with {@link IthomeCrawlStrategy} already registered. */
+    public CrawlStrategyFactory() {
+        register(new IthomeCrawlStrategy());
+    }
+
+    /**
+     * Finds the first registered strategy that supports the given URL.
+     *
+     * @param url the URL to be crawled
+     * @return the matching strategy, or {@code null} if none supports the URL
+     */
+    public CrawlStrategy getStrategy(URL url) {
+        for (CrawlStrategy candidate : strategies) {
+            if (candidate.supports(url)) {
+                return candidate;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Adds a strategy after all previously registered ones.
+     *
+     * @param strategy the strategy to add; must not be {@code null}
+     * @throws NullPointerException if {@code strategy} is {@code null}
+     */
+    public void register(CrawlStrategy strategy) {
+        strategies.add(Objects.requireNonNull(strategy, "strategy"));
+    }
+}
diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java b/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java
new file mode 100644
index 0000000..86ef862
--- /dev/null
+++ b/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java
@@ -0,0 +1,89 @@
+package internal.hw.crawler.strategies.crawl;
+
+import internal.hw.crawler.models.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+import java.net.URL;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Crawl strategy for article pages on ithome.com: one page yields one article.
+ */
+public class IthomeCrawlStrategy implements CrawlStrategy {
+    private static final String DOMAIN = "ithome.com";
+
+    // Captures three numeric path segments followed by ".htm"; the match is
+    // unanchored (find), so a ".html" suffix is accepted as well.
+    private static final Pattern ID_PATTERN = Pattern.compile("(\\d+)/(\\d+)/(\\d+)\\.htm");
+
+    /**
+     * Accepts ithome.com and its subdomains, compared case-insensitively.
+     *
+     * @param url the URL to check
+     * @return {@code true} if the host is ithome.com or ends with ".ithome.com"
+     */
+    @Override
+    public boolean supports(URL url) {
+        String host = url.getHost().toLowerCase(Locale.ROOT);
+        // A plain endsWith("ithome.com") would also accept look-alike hosts such as
+        // "notithome.com", so require an exact match or a "." label boundary.
+        return host.equals(DOMAIN) || host.endsWith("." + DOMAIN);
+    }
+
+    /**
+     * Parses the page as a single article.
+     *
+     * @param url the URL the document was fetched from
+     * @param doc the parsed HTML document
+     * @return a one-element immutable list holding the parsed article
+     * @throws CrawlException if the id cannot be derived from the URL path or a
+     *     required element is missing from the document
+     */
+    @Override
+    public List<Article> parse(URL url, Document doc) throws CrawlException {
+        return List.of(parseSingle(url, doc));
+    }
+
+    // Builds the article from the URL-derived id and the page's title, body and credits.
+    private Article parseSingle(URL url, Document doc) throws CrawlException {
+        Matcher matcher = ID_PATTERN.matcher(url.getPath());
+        if (!matcher.find()) {
+            throw new CrawlException(String.format("Cannot determine id for %s", url));
+        }
+
+        String id = String.format("%s-%s-%s", matcher.group(1), matcher.group(2), matcher.group(3));
+        String title = requireText(url, doc, "h1");
+        String content = requireText(url, doc, "#paragraph");
+
+        // Author and editor share one set, so identical names are stored once.
+        Set<String> authors = new HashSet<>();
+        authors.add(requireText(url, doc, "#author_baidu > strong"));
+        authors.add(requireText(url, doc, "#editor_baidu > strong"));
+
+        Article article = new Article();
+        article.setId(id);
+        article.setUrl(url);
+        article.setTitle(title);
+        article.setAuthors(authors);
+        article.setContent(content);
+        return article;
+    }
+
+    // Returns the text of the first element matching cssQuery. selectFirst returns null
+    // when nothing matches, so report that as a CrawlException instead of an NPE.
+    private static String requireText(URL url, Document doc, String cssQuery)
+            throws CrawlException {
+        Element element = doc.selectFirst(cssQuery);
+        if (element == null) {
+            throw new CrawlException(
+                    String.format("Cannot find element '%s' in %s", cssQuery, url));
+        }
+        return element.text();
+    }
+}