diff --git a/src/main/java/internal/hw/crawler/commands/CrawlCommand.java b/src/main/java/internal/hw/crawler/commands/CrawlCommand.java index 7e00e53..6012c03 100644 --- a/src/main/java/internal/hw/crawler/commands/CrawlCommand.java +++ b/src/main/java/internal/hw/crawler/commands/CrawlCommand.java @@ -10,6 +10,7 @@ import org.jsoup.nodes.Document; import java.net.URL; import java.util.List; +import java.util.Objects; public class CrawlCommand implements Command { private final ArticleRepository repository; @@ -41,13 +42,9 @@ public class CrawlCommand implements Command { return; } - Document doc = Jsoup.parse(url, 5000); + Document doc = Jsoup.connect(url.toString()).timeout(5000).get(); List
articles = strategy.parse(url, doc); - for (Article article : articles) { - if (article != null) { - repository.add(article); - } - } + articles.stream().filter(Objects::nonNull).forEach(repository::add); out.success(String.format("Crawled %d articles from %s", articles.size(), urlRaw)); } } diff --git a/src/main/java/internal/hw/crawler/commands/SaveCommand.java b/src/main/java/internal/hw/crawler/commands/SaveCommand.java index 24b55d1..69f40f3 100644 --- a/src/main/java/internal/hw/crawler/commands/SaveCommand.java +++ b/src/main/java/internal/hw/crawler/commands/SaveCommand.java @@ -6,8 +6,10 @@ import internal.hw.crawler.repositories.ArticleRepository; import internal.hw.crawler.views.CommandOutput; import com.google.gson.JsonSyntaxException; + import java.io.*; import java.util.*; +import java.util.function.Function; import java.util.stream.Collectors; public class SaveCommand implements Command { @@ -30,11 +32,12 @@ public class SaveCommand implements Command { String filename = "articles.output.json"; List
articles = getExistingArticles(filename); - Map articleMap = articles.stream().collect(Collectors.toMap(this::articleMapId, it -> it)); + Map articleMap = articles.stream() + .collect(Collectors.toMap(Article::computeUniqueKey, Function.identity())); // Update existing articles with new articles for (Article article : articleRepository.getAll()) { - articleMap.put(articleMapId(article), article); + articleMap.put(article.computeUniqueKey(), article); } Article[] articlesToSave = articleMap.values().toArray(new Article[0]); @@ -45,10 +48,6 @@ public class SaveCommand implements Command { } } - private String articleMapId(Article article) { - return String.format("%s-%s", article.getSource(), article.getId()); - } - private List
getExistingArticles(String filename) { try (BufferedReader reader = new BufferedReader(new FileReader(filename))) { Article[] articles = gson.fromJson(reader, Article[].class); diff --git a/src/main/java/internal/hw/crawler/models/Article.java b/src/main/java/internal/hw/crawler/models/Article.java index 124bb31..4d87f4a 100644 --- a/src/main/java/internal/hw/crawler/models/Article.java +++ b/src/main/java/internal/hw/crawler/models/Article.java @@ -59,6 +59,10 @@ public class Article { this.content = content; } + public String computeUniqueKey() { + return source + "-" + id; + } + @Override public String toString() { return String.format("Article{%s (%s)}", title, url); diff --git a/src/main/java/internal/hw/crawler/repositories/ArticleRepository.java b/src/main/java/internal/hw/crawler/repositories/ArticleRepository.java index 79931cd..346fa51 100644 --- a/src/main/java/internal/hw/crawler/repositories/ArticleRepository.java +++ b/src/main/java/internal/hw/crawler/repositories/ArticleRepository.java @@ -2,22 +2,21 @@ package internal.hw.crawler.repositories; import internal.hw.crawler.models.Article; -import java.util.ArrayList; -import java.util.Collections; +import java.util.LinkedHashMap; import java.util.List; public class ArticleRepository { - private final List
articles = new ArrayList<>(); + private final LinkedHashMap articles = new LinkedHashMap<>(); public void add(Article article) { if (article == null) { throw new IllegalArgumentException("Article cannot be null"); } - articles.add(article); + articles.put(article.computeUniqueKey(), article); } public List
getAll() { - return Collections.unmodifiableList(articles); + return List.copyOf(articles.values()); } public int size() { diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java b/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java index 2bbc11a..6ad2eb5 100644 --- a/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java +++ b/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java @@ -8,6 +8,7 @@ import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.net.URI; import java.net.URL; import java.util.ArrayList; import java.util.HashSet; @@ -22,32 +23,39 @@ public class IthomeCrawlStrategy implements CrawlStrategy { @Override public boolean supports(URL url) { - return url.getHost().endsWith("ithome.com"); + String host = url.getHost(); + return host.equals("ithome.com") || host.endsWith(".ithome.com"); } @Override public List
parse(URL url, Document doc) throws CrawlException { - List homepage = List.of("https://www.ithome.com", "https://ithome.com"); - if (homepage.contains(url.toString())) { - // 传入的是首页,解析所有链接 + if (isHomepage(url)) { return parseHomepage(doc); } else { return List.of(parseSingle(url, doc)); } } + private boolean isHomepage(URL url) { + String path = url.getPath(); + return (path == null || path.isEmpty() || path.equals("/")); + } + private List
parseHomepage(Document doc) { List
articles = new ArrayList<>(); Elements links = doc.getElementsByTag("a"); for (Element link : links) { - String href = link.attr("href"); + String href = link.absUrl("href"); + if (href.isEmpty()) { + continue; + } Matcher matcher = idRegex.matcher(href); if (!matcher.find()) { continue; } try { - URL articleUrl = new URL(href); + URL articleUrl = URI.create(href).toURL(); Document articleDoc = Jsoup.parse(articleUrl, 5000); articles.add(parseSingle(articleUrl, articleDoc)); } catch (Exception e) { diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java b/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java index 0ce0f93..680e22a 100644 --- a/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java +++ b/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java @@ -8,6 +8,7 @@ import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.net.URI; import java.net.URL; import java.util.ArrayList; import java.util.HashSet; @@ -15,7 +16,6 @@ import java.util.List; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; -import java.util.stream.Collectors; public class PeopleCnCrawlStrategy implements CrawlStrategy { private static final Logger log = LoggerFactory.getLogger(PeopleCnCrawlStrategy.class); @@ -25,39 +25,48 @@ public class PeopleCnCrawlStrategy implements CrawlStrategy { @Override public boolean supports(URL url) { + String host = url.getHost(); for (String domain : supportedDomains) { - if (url.getHost().endsWith(domain)) { + if (host.equals(domain) || host.endsWith("." + domain)) { return true; } } - return false; } @Override public List
parse(URL url, Document doc) throws CrawlException { - List homepage = supportedDomains.stream().map(it -> "https://www." + it).collect(Collectors.toList()); - - if (homepage.contains(url.toString())) { - // 传入的是首页,解析所有链接 + if (isHomepage(url)) { return parseHomepage(doc); } else { return List.of(parseSingle(url, doc)); } } + private boolean isHomepage(URL url) { + String host = url.getHost(); + boolean matched = supportedDomains.stream() + .anyMatch(d -> host.equals(d) || host.endsWith("." + d)); + if (!matched) return false; + String path = url.getPath(); + return path == null || path.isEmpty() || path.equals("/"); + } + private List
parseHomepage(Document doc) { List
articles = new ArrayList<>(); Elements links = doc.getElementsByTag("a"); for (Element link : links) { - String href = link.attr("href"); + String href = link.absUrl("href"); + if (href.isEmpty()) { + continue; + } Matcher matcher = idRegex.matcher(href); if (!matcher.find()) { continue; } try { - URL articleUrl = new URL(href); + URL articleUrl = URI.create(href).toURL(); Document articleDoc = Jsoup.parse(articleUrl, 5000); articles.add(parseSingle(articleUrl, articleDoc)); } catch (Exception e) {