Browse Source

ithome homepage parsing

master
283375 1 month ago
parent
commit
a35c8ec85b
Failed to extract signature
  1. 29
      src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java

29
src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java

@ -1,10 +1,13 @@
package internal.hw.crawler.strategies.crawl;
import internal.hw.crawler.models.Article;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
@ -21,8 +24,34 @@ public class IthomeCrawlStrategy implements CrawlStrategy {
/**
 * Parses an ithome page into a list of articles.
 *
 * <p>If {@code url} is the site homepage, every matching article link in {@code doc} is
 * followed via {@code parseHomepage}. Otherwise {@code doc} is treated as a single article
 * page and handed to {@code parseSingle}.
 *
 * @param url address the document was loaded from
 * @param doc parsed document for {@code url}
 * @return the articles found; a single-element list for an article page
 * @throws CrawlException propagated from {@code parseSingle} for a non-homepage URL
 */
@Override
public List<Article> parse(URL url, Document doc) throws CrawlException {
    List<String> homepage = List.of("https://www.ithome.com", "https://ithome.com");

    // "https://www.ithome.com/" and "https://www.ithome.com" are the same page; drop one
    // trailing slash for the comparison so both forms are recognised as the homepage.
    String address = url.toString();
    if (address.endsWith("/")) {
        address = address.substring(0, address.length() - 1);
    }

    if (homepage.contains(address)) {
        // The homepage was passed in: parse all article links on it.
        return parseHomepage(doc);
    }
    return List.of(parseSingle(url, doc));
}
/**
 * Collects the articles linked from the homepage document.
 *
 * <p>Every {@code <a>} element whose {@code href} matches {@code idRegex} is downloaded
 * (5000 ms timeout) and handed to {@code parseSingle}. Hrefs are resolved against the
 * document's base URI so relative links are followed too, and each distinct address is
 * fetched only once even if the page links to it several times.
 *
 * @param doc homepage document to scan for article links
 * @return the articles that were downloaded and parsed successfully, in document order;
 *     empty if none were
 */
private List<Article> parseHomepage(Document doc) {
    List<Article> articles = new ArrayList<>();
    Set<String> visited = new HashSet<>();

    for (Element link : doc.getElementsByTag("a")) {
        String href = link.attr("href");
        if (!idRegex.matcher(href).find()) {
            continue;
        }

        // absUrl yields "" when the href cannot be resolved (e.g. no base URI on the
        // document); fall back to the raw attribute in that case.
        String resolved = link.absUrl("href");
        String target = resolved.isEmpty() ? href : resolved;

        // Skip addresses that were already attempted.
        if (!visited.add(target)) {
            continue;
        }

        try {
            URL articleUrl = new URL(target);
            Document articleDoc = Jsoup.parse(articleUrl, 5000);
            articles.add(parseSingle(articleUrl, articleDoc));
        } catch (Exception ignored) {
            // Best effort: a link that is malformed, fails to download, or fails to parse
            // is skipped so that one bad article does not abort the whole homepage crawl.
        }
    }
    return articles;
}
private Article parseSingle(URL url, Document doc) throws CrawlException {
Matcher matcher = idRegex.matcher(url.getPath());

Loading…
Cancel
Save