Browse Source

basic crawling

master
283375 1 month ago
parent
commit
9d07dec0f4
Failed to extract signature
  1. 7
      pom.xml
  2. 27
      src/main/java/internal/hw/crawler/commands/CrawlCommand.java
  3. 31
      src/main/java/internal/hw/crawler/models/Article.java
  4. 11
      src/main/java/internal/hw/crawler/strategies/crawl/CrawlException.java
  5. 13
      src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategy.java
  6. 27
      src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java
  7. 48
      src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java

7
pom.xml

@ -12,4 +12,11 @@
<maven.compiler.target>11</maven.compiler.target> <maven.compiler.target>11</maven.compiler.target>
</properties> </properties>
<dependencies>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.22.2</version>
</dependency>
</dependencies>
</project> </project>

27
src/main/java/internal/hw/crawler/commands/CrawlCommand.java

@ -1,11 +1,18 @@
package internal.hw.crawler.commands; package internal.hw.crawler.commands;
import internal.hw.crawler.models.Article;
import internal.hw.crawler.repositories.ArticleRepository; import internal.hw.crawler.repositories.ArticleRepository;
import internal.hw.crawler.strategies.crawl.CrawlStrategy;
import internal.hw.crawler.strategies.crawl.CrawlStrategyFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.net.URL;
import java.util.List; import java.util.List;
public class CrawlCommand implements Command { public class CrawlCommand implements Command {
private ArticleRepository repository; private final ArticleRepository repository;
private final CrawlStrategyFactory crawlStrategyFactory = new CrawlStrategyFactory();
public CrawlCommand(ArticleRepository repository) { public CrawlCommand(ArticleRepository repository) {
this.repository = repository; this.repository = repository;
@ -23,6 +30,22 @@ public class CrawlCommand implements Command {
@Override @Override
public void execute(String[] args) { public void execute(String[] args) {
System.out.printf("Will crawl %s%n", args[1]); try {
String urlRaw = args[1];
URL url = new URL(urlRaw);
CrawlStrategy strategy = crawlStrategyFactory.getStrategy(url);
if (strategy == null) {
System.err.println("Unsupported URL: " + urlRaw);
return;
}
Document doc = Jsoup.parse(url, 5000);
List<Article> articles = strategy.parse(url, doc);
for (Article article : articles) {
repository.add(article);
}
} catch (Exception e) {
throw new RuntimeException(e);
}
} }
} }

31
src/main/java/internal/hw/crawler/models/Article.java

@ -1,21 +1,28 @@
package internal.hw.crawler.models; package internal.hw.crawler.models;
import java.net.URL;
import java.util.Set;
public class Article { public class Article {
private String url; private String id;
private URL url;
private String title; private String title;
private Set<String> authors;
private String content; private String content;
public Article(String url, String title, String content) { public String getId() {
this.url = url; return id;
this.title = title; }
this.content = content;
public void setId(String id) {
this.id = id;
} }
public String getUrl() { public URL getUrl() {
return url; return url;
} }
public void setUrl(String url) { public void setUrl(URL url) {
this.url = url; this.url = url;
} }
@ -27,6 +34,14 @@ public class Article {
this.title = title; this.title = title;
} }
public Set<String> getAuthors() {
return authors;
}
public void setAuthors(Set<String> authors) {
this.authors = authors;
}
public String getContent() { public String getContent() {
return content; return content;
} }
@ -37,6 +52,6 @@ public class Article {
@Override @Override
public String toString() { public String toString() {
return String.format("Article{%s}", url); return String.format("Article{%s (%s)}", title, url);
} }
} }

11
src/main/java/internal/hw/crawler/strategies/crawl/CrawlException.java

@ -0,0 +1,11 @@
package internal.hw.crawler.strategies.crawl;
/**
 * Checked exception reported by crawl strategies when a page cannot be turned into articles.
 */
public class CrawlException extends Exception {

    /**
     * Creates an exception that carries only a description of the failure.
     *
     * @param message human-readable description of what went wrong
     */
    public CrawlException(String message) {
        super(message);
    }

    /**
     * Creates an exception that wraps the lower-level error that triggered it.
     *
     * @param message human-readable description of what went wrong
     * @param cause the underlying error, kept so the full chain stays visible in stack traces
     */
    public CrawlException(String message, Throwable cause) {
        super(message, cause);
    }
}

13
src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategy.java

@ -0,0 +1,13 @@
package internal.hw.crawler.strategies.crawl;
import internal.hw.crawler.models.Article;
import org.jsoup.nodes.Document;
import java.net.URL;
import java.util.List;
/**
 * Site-specific logic for turning a fetched page into {@link Article} objects.
 *
 * <p>Callers are expected to ask {@link #supports(URL)} first and only hand the page to
 * {@link #parse(URL, Document)} when it answers {@code true}.
 */
public interface CrawlStrategy {

    /**
     * Tells whether this strategy knows how to handle the given address.
     *
     * @param url address of the page that is about to be crawled
     * @return {@code true} if this strategy can parse pages from {@code url}
     */
    boolean supports(URL url);

    /**
     * Extracts the articles contained in an already-loaded page.
     *
     * @param url address the document was loaded from
     * @param doc parsed HTML of the page
     * @return the articles found in the page
     * @throws CrawlException if the page cannot be converted into articles
     */
    List<Article> parse(URL url, Document doc) throws CrawlException;
}

27
src/main/java/internal/hw/crawler/strategies/crawl/CrawlStrategyFactory.java

@ -0,0 +1,27 @@
package internal.hw.crawler.strategies.crawl;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
/**
 * Registry of {@link CrawlStrategy} implementations that selects the one able to handle a URL.
 *
 * <p>Strategies are consulted in registration order; the first one whose
 * {@link CrawlStrategy#supports(URL)} returns {@code true} is chosen.
 */
public class CrawlStrategyFactory {
    private final List<CrawlStrategy> strategies = new ArrayList<>();

    /**
     * Creates a factory that already knows the built-in strategies.
     */
    public CrawlStrategyFactory() {
        // Add directly instead of going through register(): that method is public and
        // overridable, and an override would run before the subclass is fully constructed.
        strategies.add(new IthomeCrawlStrategy());
    }

    /**
     * Finds the first registered strategy that supports the given URL.
     *
     * @param url address of the page to crawl
     * @return the matching strategy, or {@code null} when no registered strategy supports it
     */
    public CrawlStrategy getStrategy(URL url) {
        for (CrawlStrategy candidate : strategies) {
            if (candidate.supports(url)) {
                return candidate;
            }
        }
        // Callers rely on null to detect an unsupported URL.
        return null;
    }

    /**
     * Adds a strategy to the end of the lookup order.
     *
     * @param strategy the strategy to add
     * @throws NullPointerException if {@code strategy} is {@code null}
     */
    public void register(CrawlStrategy strategy) {
        // Fail here rather than with an unrelated NullPointerException on a later lookup.
        if (strategy == null) {
            throw new NullPointerException("strategy must not be null");
        }
        strategies.add(strategy);
    }
}

48
src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java

@ -0,0 +1,48 @@
package internal.hw.crawler.strategies.crawl;
import internal.hw.crawler.models.Article;
import org.jsoup.nodes.Document;
import java.net.URL;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class IthomeCrawlStrategy implements CrawlStrategy {
private final Pattern idRegex = Pattern.compile("(\\d+)/(\\d+)/(\\d+)\\.htm");
@Override
public boolean supports(URL url) {
return url.getHost().endsWith("ithome.com");
}
@Override
public List<Article> parse(URL url, Document doc) throws CrawlException {
return List.of(parseSingle(url, doc));
}
private Article parseSingle(URL url, Document doc) throws CrawlException {
Matcher matcher = idRegex.matcher(url.getPath());
if (!matcher.find()) throw new CrawlException(String.format("Cannot determine id for %s", url));
String id = String.format("%s-%s-%s", matcher.group(1), matcher.group(2), matcher.group(3));
String title = doc.selectFirst("h1").text();
String content = doc.selectFirst("#paragraph").text();
String authorRaw = doc.selectFirst("#author_baidu > strong").text();
String editorRaw = doc.selectFirst("#editor_baidu > strong").text();
Set<String> authors = new HashSet<>();
authors.add(authorRaw);
authors.add(editorRaw);
Article article = new Article();
article.setId(id);
article.setUrl(url);
article.setTitle(title);
article.setAuthors(authors);
article.setContent(content);
return article;
}
}
Loading…
Cancel
Save