Browse Source

quality improvements

* fix domain name matching
* fix URL processing
* deduplicate articles
master
283375 1 month ago
parent
commit
8f8ac344ba
Failed to extract signature
  1. 9
      src/main/java/internal/hw/crawler/commands/CrawlCommand.java
  2. 11
      src/main/java/internal/hw/crawler/commands/SaveCommand.java
  3. 4
      src/main/java/internal/hw/crawler/models/Article.java
  4. 9
      src/main/java/internal/hw/crawler/repositories/ArticleRepository.java
  5. 20
      src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java
  6. 27
      src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java

9
src/main/java/internal/hw/crawler/commands/CrawlCommand.java

@ -10,6 +10,7 @@ import org.jsoup.nodes.Document;
import java.net.URL;
import java.util.List;
import java.util.Objects;
public class CrawlCommand implements Command {
private final ArticleRepository repository;
@ -41,13 +42,9 @@ public class CrawlCommand implements Command {
return;
}
Document doc = Jsoup.parse(url, 5000);
Document doc = Jsoup.connect(url.toString()).timeout(5000).get();
List<Article> articles = strategy.parse(url, doc);
for (Article article : articles) {
if (article != null) {
repository.add(article);
}
}
articles.stream().filter(Objects::nonNull).forEach(repository::add);
out.success(String.format("Crawled %d articles from %s", articles.size(), urlRaw));
}
}

11
src/main/java/internal/hw/crawler/commands/SaveCommand.java

@ -6,8 +6,10 @@ import internal.hw.crawler.repositories.ArticleRepository;
import internal.hw.crawler.views.CommandOutput;
import com.google.gson.JsonSyntaxException;
import java.io.*;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
public class SaveCommand implements Command {
@ -30,11 +32,12 @@ public class SaveCommand implements Command {
String filename = "articles.output.json";
List<Article> articles = getExistingArticles(filename);
Map<String, Article> articleMap = articles.stream().collect(Collectors.toMap(this::articleMapId, it -> it));
Map<String, Article> articleMap = articles.stream()
.collect(Collectors.toMap(Article::computeUniqueKey, Function.identity()));
// Update existing articles with new articles
for (Article article : articleRepository.getAll()) {
articleMap.put(articleMapId(article), article);
articleMap.put(article.computeUniqueKey(), article);
}
Article[] articlesToSave = articleMap.values().toArray(new Article[0]);
@ -45,10 +48,6 @@ public class SaveCommand implements Command {
}
}
/**
 * Builds the map key for an article in the form {@code "<source>-<id>"}.
 *
 * @param article the article to key; must not be null
 * @return the article's source and id joined by a hyphen
 */
private String articleMapId(Article article) {
return String.format("%s-%s", article.getSource(), article.getId());
}
private List<Article> getExistingArticles(String filename) {
try (BufferedReader reader = new BufferedReader(new FileReader(filename))) {
Article[] articles = gson.fromJson(reader, Article[].class);

4
src/main/java/internal/hw/crawler/models/Article.java

@ -59,6 +59,10 @@ public class Article {
this.content = content;
}
/**
 * Builds a string key that identifies this article by its source and id.
 *
 * @return the {@code source} and {@code id} fields joined by a hyphen
 */
public String computeUniqueKey() {
return source + "-" + id;
}
@Override
public String toString() {
return String.format("Article{%s (%s)}", title, url);

9
src/main/java/internal/hw/crawler/repositories/ArticleRepository.java

@ -2,22 +2,21 @@ package internal.hw.crawler.repositories;
import internal.hw.crawler.models.Article;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
public class ArticleRepository {
private final List<Article> articles = new ArrayList<>();
private final LinkedHashMap<String, Article> articles = new LinkedHashMap<>();
public void add(Article article) {
if (article == null) {
throw new IllegalArgumentException("Article cannot be null");
}
articles.add(article);
articles.put(article.computeUniqueKey(), article);
}
public List<Article> getAll() {
return Collections.unmodifiableList(articles);
return List.copyOf(articles.values());
}
public int size() {

20
src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java

@ -8,6 +8,7 @@ import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
@ -22,32 +23,39 @@ public class IthomeCrawlStrategy implements CrawlStrategy {
@Override
public boolean supports(URL url) {
return url.getHost().endsWith("ithome.com");
String host = url.getHost();
return host.equals("ithome.com") || host.endsWith(".ithome.com");
}
@Override
public List<Article> parse(URL url, Document doc) throws CrawlException {
List<String> homepage = List.of("https://www.ithome.com", "https://ithome.com");
if (homepage.contains(url.toString())) {
// 传入的是首页,解析所有链接
if (isHomepage(url)) {
return parseHomepage(doc);
} else {
return List.of(parseSingle(url, doc));
}
}
/**
 * Reports whether the URL addresses a site root rather than a specific page.
 *
 * @param url the URL to inspect
 * @return {@code true} when the URL's path is null, empty, or exactly {@code "/"}
 */
private boolean isHomepage(URL url) {
    final String urlPath = url.getPath();
    if (urlPath == null || urlPath.isEmpty()) {
        return true;
    }
    return "/".equals(urlPath);
}
private List<Article> parseHomepage(Document doc) {
List<Article> articles = new ArrayList<>();
Elements links = doc.getElementsByTag("a");
for (Element link : links) {
String href = link.attr("href");
String href = link.absUrl("href");
if (href.isEmpty()) {
continue;
}
Matcher matcher = idRegex.matcher(href);
if (!matcher.find()) {
continue;
}
try {
URL articleUrl = new URL(href);
URL articleUrl = URI.create(href).toURL();
Document articleDoc = Jsoup.parse(articleUrl, 5000);
articles.add(parseSingle(articleUrl, articleDoc));
} catch (Exception e) {

27
src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java

@ -8,6 +8,7 @@ import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
@ -15,7 +16,6 @@ import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
public class PeopleCnCrawlStrategy implements CrawlStrategy {
private static final Logger log = LoggerFactory.getLogger(PeopleCnCrawlStrategy.class);
@ -25,39 +25,48 @@ public class PeopleCnCrawlStrategy implements CrawlStrategy {
@Override
public boolean supports(URL url) {
String host = url.getHost();
for (String domain : supportedDomains) {
if (url.getHost().endsWith(domain)) {
if (host.equals(domain) || host.endsWith("." + domain)) {
return true;
}
}
return false;
}
@Override
public List<Article> parse(URL url, Document doc) throws CrawlException {
List<String> homepage = supportedDomains.stream().map(it -> "https://www." + it).collect(Collectors.toList());
if (homepage.contains(url.toString())) {
// 传入的是首页,解析所有链接
if (isHomepage(url)) {
return parseHomepage(doc);
} else {
return List.of(parseSingle(url, doc));
}
}
/**
 * Reports whether the URL is the root page of one of the supported domains.
 *
 * @param url the URL to inspect
 * @return {@code true} when the host equals a supported domain or is a subdomain of one,
 *         and the path is null, empty, or exactly {@code "/"}
 */
private boolean isHomepage(URL url) {
    final String hostName = url.getHost();

    // The host must be a supported domain itself or a dot-separated subdomain of one.
    boolean onSupportedDomain = false;
    for (String domain : supportedDomains) {
        if (hostName.equals(domain) || hostName.endsWith("." + domain)) {
            onSupportedDomain = true;
            break;
        }
    }
    if (!onSupportedDomain) {
        return false;
    }

    final String urlPath = url.getPath();
    return urlPath == null || urlPath.isEmpty() || "/".equals(urlPath);
}
private List<Article> parseHomepage(Document doc) {
List<Article> articles = new ArrayList<>();
Elements links = doc.getElementsByTag("a");
for (Element link : links) {
String href = link.attr("href");
String href = link.absUrl("href");
if (href.isEmpty()) {
continue;
}
Matcher matcher = idRegex.matcher(href);
if (!matcher.find()) {
continue;
}
try {
URL articleUrl = new URL(href);
URL articleUrl = URI.create(href).toURL();
Document articleDoc = Jsoup.parse(articleUrl, 5000);
articles.add(parseSingle(articleUrl, articleDoc));
} catch (Exception e) {

Loading…
Cancel
Save