|
|
@ -8,6 +8,7 @@ import org.jsoup.select.Elements; |
|
|
import org.slf4j.Logger; |
|
|
import org.slf4j.Logger; |
|
|
import org.slf4j.LoggerFactory; |
|
|
import org.slf4j.LoggerFactory; |
|
|
|
|
|
|
|
|
|
|
|
import java.net.URI; |
|
|
import java.net.URL; |
|
|
import java.net.URL; |
|
|
import java.util.ArrayList; |
|
|
import java.util.ArrayList; |
|
|
import java.util.HashSet; |
|
|
import java.util.HashSet; |
|
|
@ -15,7 +16,6 @@ import java.util.List; |
|
|
import java.util.Set; |
|
|
import java.util.Set; |
|
|
import java.util.regex.Matcher; |
|
|
import java.util.regex.Matcher; |
|
|
import java.util.regex.Pattern; |
|
|
import java.util.regex.Pattern; |
|
|
import java.util.stream.Collectors; |
|
|
|
|
|
|
|
|
|
|
|
public class PeopleCnCrawlStrategy implements CrawlStrategy { |
|
|
public class PeopleCnCrawlStrategy implements CrawlStrategy { |
|
|
private static final Logger log = LoggerFactory.getLogger(PeopleCnCrawlStrategy.class); |
|
|
private static final Logger log = LoggerFactory.getLogger(PeopleCnCrawlStrategy.class); |
|
|
@ -25,39 +25,48 @@ public class PeopleCnCrawlStrategy implements CrawlStrategy { |
|
|
|
|
|
|
|
|
@Override |
|
|
@Override |
|
|
public boolean supports(URL url) { |
|
|
public boolean supports(URL url) { |
|
|
|
|
|
String host = url.getHost(); |
|
|
for (String domain : supportedDomains) { |
|
|
for (String domain : supportedDomains) { |
|
|
if (url.getHost().endsWith(domain)) { |
|
|
if (host.equals(domain) || host.endsWith("." + domain)) { |
|
|
return true; |
|
|
return true; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
return false; |
|
|
return false; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
@Override |
|
|
@Override |
|
|
public List<Article> parse(URL url, Document doc) throws CrawlException { |
|
|
public List<Article> parse(URL url, Document doc) throws CrawlException { |
|
|
List<String> homepage = supportedDomains.stream().map(it -> "https://www." + it).collect(Collectors.toList()); |
|
|
if (isHomepage(url)) { |
|
|
|
|
|
|
|
|
if (homepage.contains(url.toString())) { |
|
|
|
|
|
// 传入的是首页,解析所有链接
|
|
|
|
|
|
return parseHomepage(doc); |
|
|
return parseHomepage(doc); |
|
|
} else { |
|
|
} else { |
|
|
return List.of(parseSingle(url, doc)); |
|
|
return List.of(parseSingle(url, doc)); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
private boolean isHomepage(URL url) { |
|
|
|
|
|
String host = url.getHost(); |
|
|
|
|
|
boolean matched = supportedDomains.stream() |
|
|
|
|
|
.anyMatch(d -> host.equals(d) || host.endsWith("." + d)); |
|
|
|
|
|
if (!matched) return false; |
|
|
|
|
|
String path = url.getPath(); |
|
|
|
|
|
return path == null || path.isEmpty() || path.equals("/"); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
private List<Article> parseHomepage(Document doc) { |
|
|
private List<Article> parseHomepage(Document doc) { |
|
|
List<Article> articles = new ArrayList<>(); |
|
|
List<Article> articles = new ArrayList<>(); |
|
|
Elements links = doc.getElementsByTag("a"); |
|
|
Elements links = doc.getElementsByTag("a"); |
|
|
for (Element link : links) { |
|
|
for (Element link : links) { |
|
|
String href = link.attr("href"); |
|
|
String href = link.absUrl("href"); |
|
|
|
|
|
if (href.isEmpty()) { |
|
|
|
|
|
continue; |
|
|
|
|
|
} |
|
|
Matcher matcher = idRegex.matcher(href); |
|
|
Matcher matcher = idRegex.matcher(href); |
|
|
if (!matcher.find()) { |
|
|
if (!matcher.find()) { |
|
|
continue; |
|
|
continue; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
try { |
|
|
try { |
|
|
URL articleUrl = new URL(href); |
|
|
URL articleUrl = URI.create(href).toURL(); |
|
|
Document articleDoc = Jsoup.parse(articleUrl, 5000); |
|
|
Document articleDoc = Jsoup.parse(articleUrl, 5000); |
|
|
articles.add(parseSingle(articleUrl, articleDoc)); |
|
|
articles.add(parseSingle(articleUrl, articleDoc)); |
|
|
} catch (Exception e) { |
|
|
} catch (Exception e) { |
|
|
|