|
|
|
@ -1,10 +1,13 @@ |
|
|
|
package internal.hw.crawler.strategies.crawl; |
|
|
|
|
|
|
|
import internal.hw.crawler.models.Article; |
|
|
|
import org.jsoup.Jsoup; |
|
|
|
import org.jsoup.nodes.Document; |
|
|
|
import org.jsoup.nodes.Element; |
|
|
|
import org.jsoup.select.Elements; |
|
|
|
|
|
|
|
import java.net.URL; |
|
|
|
import java.util.ArrayList; |
|
|
|
import java.util.HashSet; |
|
|
|
import java.util.List; |
|
|
|
import java.util.Set; |
|
|
|
@ -21,7 +24,33 @@ public class IthomeCrawlStrategy implements CrawlStrategy { |
|
|
|
|
|
|
|
@Override |
|
|
|
public List<Article> parse(URL url, Document doc) throws CrawlException { |
|
|
|
return List.of(parseSingle(url, doc)); |
|
|
|
List<String> homepage = List.of("https://www.ithome.com", "https://ithome.com"); |
|
|
|
if (homepage.contains(url.toString())) { |
|
|
|
// 传入的是首页,解析所有链接
|
|
|
|
return parseHomepage(doc); |
|
|
|
} else { |
|
|
|
return List.of(parseSingle(url, doc)); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
private List<Article> parseHomepage(Document doc) { |
|
|
|
List<Article> articles = new ArrayList<>(); |
|
|
|
Elements links = doc.getElementsByTag("a"); |
|
|
|
for (Element link : links) { |
|
|
|
String href = link.attr("href"); |
|
|
|
Matcher matcher = idRegex.matcher(href); |
|
|
|
if (!matcher.find()) { |
|
|
|
continue; |
|
|
|
} |
|
|
|
|
|
|
|
try { |
|
|
|
URL articleUrl = new URL(href); |
|
|
|
Document articleDoc = Jsoup.parse(articleUrl, 5000); |
|
|
|
articles.add(parseSingle(articleUrl, articleDoc)); |
|
|
|
} catch (Exception ignored) { |
|
|
|
} |
|
|
|
} |
|
|
|
return articles; |
|
|
|
} |
|
|
|
|
|
|
|
private Article parseSingle(URL url, Document doc) throws CrawlException { |
|
|
|
|