5 changed files with 387 additions and 0 deletions
@ -0,0 +1,84 @@ |
|||||
|
package com.crawler.crawler; |
||||
|
|
||||
|
import com.crawler.exception.HttpRequestException; |
||||
|
import com.crawler.exception.TimeoutException; |
||||
|
import com.crawler.model.CrawlerConfig; |
||||
|
import com.crawler.model.CrawlerData; |
||||
|
|
||||
|
import java.net.URI; |
||||
|
import java.net.http.HttpClient; |
||||
|
import java.net.http.HttpRequest; |
||||
|
import java.net.http.HttpResponse; |
||||
|
import java.time.Duration; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public abstract class BaseCrawler implements Crawler { |
||||
|
protected CrawlerConfig config; |
||||
|
|
||||
|
@Override |
||||
|
public void setConfig(CrawlerConfig config) { |
||||
|
this.config = config; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public CrawlerConfig getConfig() { |
||||
|
return config; |
||||
|
} |
||||
|
|
||||
|
protected String fetchHtml(String urlStr) throws HttpRequestException, TimeoutException { |
||||
|
HttpClient client = HttpClient.newBuilder() |
||||
|
.connectTimeout(Duration.ofMillis(config.getTimeout())) |
||||
|
.followRedirects(HttpClient.Redirect.NORMAL) |
||||
|
.build(); |
||||
|
|
||||
|
HttpRequest request = HttpRequest.newBuilder() |
||||
|
.uri(URI.create(urlStr)) |
||||
|
.header("User-Agent", config.getUserAgent()) |
||||
|
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") |
||||
|
.GET() |
||||
|
.build(); |
||||
|
|
||||
|
try { |
||||
|
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString()); |
||||
|
|
||||
|
if (response.statusCode() < 200 || response.statusCode() >= 300) { |
||||
|
throw new HttpRequestException("HTTP请求失败", response.statusCode()); |
||||
|
} |
||||
|
|
||||
|
return response.body(); |
||||
|
} catch (java.net.http.HttpTimeoutException e) { |
||||
|
throw new TimeoutException("连接超时", e); |
||||
|
} catch (HttpRequestException e) { |
||||
|
throw e; |
||||
|
} catch (Exception e) { |
||||
|
throw new HttpRequestException("HTTP请求异常: " + e.getMessage(), 0, e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
protected abstract List<CrawlerData> parseHtml(String html); |
||||
|
|
||||
|
@Override |
||||
|
public List<CrawlerData> crawl() { |
||||
|
List<CrawlerData> results = new ArrayList<>(); |
||||
|
|
||||
|
if (config == null || config.getTargetUrl() == null) { |
||||
|
return results; |
||||
|
} |
||||
|
|
||||
|
try { |
||||
|
String html = fetchHtml(config.getTargetUrl()); |
||||
|
results = parseHtml(html); |
||||
|
|
||||
|
for (CrawlerData data : results) { |
||||
|
if (data.getSource() == null) { |
||||
|
data.setSource(getCrawlerName()); |
||||
|
} |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
throw new RuntimeException(e); |
||||
|
} |
||||
|
|
||||
|
return results; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,145 @@ |
|||||
|
package com.crawler.crawler.impl; |
||||
|
|
||||
|
import com.crawler.crawler.BaseCrawler; |
||||
|
import com.crawler.exception.HttpRequestException; |
||||
|
import com.crawler.exception.TimeoutException; |
||||
|
import com.crawler.model.CrawlerData; |
||||
|
|
||||
|
import java.net.URI; |
||||
|
import java.net.http.HttpClient; |
||||
|
import java.net.http.HttpRequest; |
||||
|
import java.net.http.HttpResponse; |
||||
|
import java.time.Duration; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
import java.util.regex.Matcher; |
||||
|
import java.util.regex.Pattern; |
||||
|
|
||||
|
public class ChinaWeatherCrawler extends BaseCrawler { |
||||
|
private static final String BASE_URL = "https://www.weather.com.cn"; |
||||
|
|
||||
|
@Override |
||||
|
public String getCrawlerName() { |
||||
|
return "ChinaWeatherCrawler"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected String fetchHtml(String urlStr) throws HttpRequestException, TimeoutException { |
||||
|
if (!urlStr.startsWith("https")) { |
||||
|
urlStr = urlStr.replace("http://", "https://"); |
||||
|
} |
||||
|
|
||||
|
HttpClient client = HttpClient.newBuilder() |
||||
|
.connectTimeout(Duration.ofMillis(30000)) |
||||
|
.followRedirects(HttpClient.Redirect.NORMAL) |
||||
|
.build(); |
||||
|
|
||||
|
HttpRequest request = HttpRequest.newBuilder() |
||||
|
.uri(URI.create(urlStr)) |
||||
|
.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") |
||||
|
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") |
||||
|
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8") |
||||
|
.GET() |
||||
|
.build(); |
||||
|
|
||||
|
try { |
||||
|
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString()); |
||||
|
|
||||
|
if (response.statusCode() < 200 || response.statusCode() >= 300) { |
||||
|
throw new HttpRequestException("HTTP请求失败", response.statusCode()); |
||||
|
} |
||||
|
|
||||
|
return response.body(); |
||||
|
} catch (java.net.http.HttpTimeoutException e) { |
||||
|
throw new TimeoutException("连接超时", e); |
||||
|
} catch (HttpRequestException e) { |
||||
|
throw e; |
||||
|
} catch (Exception e) { |
||||
|
throw new HttpRequestException("HTTP请求异常: " + e.getMessage(), 0, e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<CrawlerData> parseHtml(String html) { |
||||
|
List<CrawlerData> results = new ArrayList<>(); |
||||
|
|
||||
|
if (html == null || html.isEmpty()) { |
||||
|
return results; |
||||
|
} |
||||
|
|
||||
|
String cleanHtml = html.replaceAll("\\s+", " "); |
||||
|
|
||||
|
Pattern linkPattern = Pattern.compile( |
||||
|
"<a\\s+[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]{2,80})</a>", |
||||
|
Pattern.CASE_INSENSITIVE |
||||
|
); |
||||
|
|
||||
|
Matcher matcher = linkPattern.matcher(cleanHtml); |
||||
|
|
||||
|
int count = 0; |
||||
|
while (matcher.find() && count < 30) { |
||||
|
String url = matcher.group(1); |
||||
|
String title = matcher.group(2).trim(); |
||||
|
|
||||
|
if (isValidUrl(url) && isValidTitle(title)) { |
||||
|
url = normalizeUrl(url); |
||||
|
|
||||
|
CrawlerData data = new CrawlerData(); |
||||
|
data.setTitle(cleanText(title)); |
||||
|
data.setUrl(url); |
||||
|
data.setSource(getCrawlerName()); |
||||
|
results.add(data); |
||||
|
count++; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return results; |
||||
|
} |
||||
|
|
||||
|
private String normalizeUrl(String url) { |
||||
|
if (url == null) return null; |
||||
|
if (url.startsWith("//")) { |
||||
|
return "https:" + url; |
||||
|
} |
||||
|
if (url.startsWith("/")) { |
||||
|
return BASE_URL + url; |
||||
|
} |
||||
|
if (!url.startsWith("http")) { |
||||
|
return BASE_URL + "/" + url; |
||||
|
} |
||||
|
if (url.startsWith("http://")) { |
||||
|
return url.replace("http://", "https://"); |
||||
|
} |
||||
|
return url; |
||||
|
} |
||||
|
|
||||
|
private boolean isValidUrl(String url) { |
||||
|
if (url == null || url.isEmpty()) { |
||||
|
return false; |
||||
|
} |
||||
|
|
||||
|
if (url.contains("mailto:") || url.contains("javascript:")) { |
||||
|
return false; |
||||
|
} |
||||
|
|
||||
|
return url.contains("weather.com.cn") || url.startsWith("/"); |
||||
|
} |
||||
|
|
||||
|
private boolean isValidTitle(String title) { |
||||
|
if (title == null || title.isEmpty()) { |
||||
|
return false; |
||||
|
} |
||||
|
String cleaned = cleanText(title); |
||||
|
return cleaned != null && cleaned.length() >= 2 && cleaned.length() <= 80; |
||||
|
} |
||||
|
|
||||
|
private String cleanText(String text) { |
||||
|
if (text == null) return null; |
||||
|
return text.replaceAll("<[^>]+>", "") |
||||
|
.replaceAll(" ", " ") |
||||
|
.replaceAll("&#[0-9]+;", "") |
||||
|
.replaceAll("&[a-zA-Z]+;", " ") |
||||
|
.replaceAll("\\s+", " ") |
||||
|
.trim(); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,13 @@ |
|||||
|
package com.crawler.crawler; |
||||
|
|
||||
|
import com.crawler.model.CrawlerConfig; |
||||
|
import com.crawler.model.CrawlerData; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
public interface Crawler { |
||||
|
void setConfig(CrawlerConfig config); |
||||
|
CrawlerConfig getConfig(); |
||||
|
String getCrawlerName(); |
||||
|
List<CrawlerData> crawl(); |
||||
|
} |
||||
@ -0,0 +1,103 @@ |
|||||
|
package com.crawler.crawler; |
||||
|
|
||||
|
import com.crawler.crawler.impl.*; |
||||
|
import com.crawler.exception.InvalidUrlException; |
||||
|
import java.util.LinkedHashMap; |
||||
|
import java.util.Map; |
||||
|
import java.util.regex.Pattern; |
||||
|
|
||||
|
public class CrawlerFactory { |
||||
|
private static CrawlerFactory instance; |
||||
|
private Map<String, Pattern> crawlerPatterns; |
||||
|
|
||||
|
private CrawlerFactory() { |
||||
|
crawlerPatterns = new LinkedHashMap<>(); |
||||
|
initPatterns(); |
||||
|
} |
||||
|
|
||||
|
public static CrawlerFactory getInstance() { |
||||
|
if (instance == null) { |
||||
|
instance = new CrawlerFactory(); |
||||
|
} |
||||
|
return instance; |
||||
|
} |
||||
|
|
||||
|
private void initPatterns() { |
||||
|
crawlerPatterns.put("MountBladeCrawler", |
||||
|
Pattern.compile(".*mountblade\\.com\\.cn.*", Pattern.CASE_INSENSITIVE)); |
||||
|
crawlerPatterns.put("HunanUniversityNewsCrawler", |
||||
|
Pattern.compile(".*news\\.hnu\\.edu\\.cn.*", Pattern.CASE_INSENSITIVE)); |
||||
|
crawlerPatterns.put("HunanUniversityCrawler", |
||||
|
Pattern.compile(".*hnu\\.edu\\.cn.*", Pattern.CASE_INSENSITIVE)); |
||||
|
crawlerPatterns.put("ChinaWeatherCrawler", |
||||
|
Pattern.compile(".*weather\\.com\\.cn.*", Pattern.CASE_INSENSITIVE)); |
||||
|
crawlerPatterns.put("ExampleCrawler", |
||||
|
Pattern.compile(".*", Pattern.CASE_INSENSITIVE)); |
||||
|
} |
||||
|
|
||||
|
public Crawler createCrawler(String url) { |
||||
|
validateUrl(url); |
||||
|
|
||||
|
for (Map.Entry<String, Pattern> entry : crawlerPatterns.entrySet()) { |
||||
|
if (entry.getValue().matcher(url).matches()) { |
||||
|
return createCrawlerByName(entry.getKey()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return new ExampleCrawler(); |
||||
|
} |
||||
|
|
||||
|
private void validateUrl(String url) { |
||||
|
if (url == null || url.isEmpty()) { |
||||
|
throw new InvalidUrlException("URL不能为空", url); |
||||
|
} |
||||
|
|
||||
|
if (!url.startsWith("http://") && !url.startsWith("https://")) { |
||||
|
throw new InvalidUrlException("URL格式无效,必须以http://或https://开头", url); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private Crawler createCrawlerByName(String crawlerName) { |
||||
|
switch (crawlerName) { |
||||
|
case "MountBladeCrawler": |
||||
|
return new MountBladeCrawler(); |
||||
|
case "HunanUniversityNewsCrawler": |
||||
|
return new HunanUniversityNewsCrawler(); |
||||
|
case "HunanUniversityCrawler": |
||||
|
return new HunanUniversityCrawler(); |
||||
|
case "ChinaWeatherCrawler": |
||||
|
return new ChinaWeatherCrawler(); |
||||
|
case "ExampleCrawler": |
||||
|
default: |
||||
|
return new ExampleCrawler(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public String getCrawlerName(String url) { |
||||
|
if (url == null || url.isEmpty()) { |
||||
|
return "ExampleCrawler"; |
||||
|
} |
||||
|
|
||||
|
for (Map.Entry<String, Pattern> entry : crawlerPatterns.entrySet()) { |
||||
|
if (entry.getValue().matcher(url).matches()) { |
||||
|
return entry.getKey(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return "ExampleCrawler"; |
||||
|
} |
||||
|
|
||||
|
public boolean isUrlSupported(String url) { |
||||
|
if (url == null || url.isEmpty()) { |
||||
|
return false; |
||||
|
} |
||||
|
|
||||
|
for (Pattern pattern : crawlerPatterns.values()) { |
||||
|
if (pattern.matcher(url).matches()) { |
||||
|
return true; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return true; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,42 @@ |
|||||
|
package com.crawler.crawler.impl; |
||||
|
|
||||
|
import com.crawler.crawler.BaseCrawler; |
||||
|
import com.crawler.model.CrawlerData; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
import java.util.regex.Matcher; |
||||
|
import java.util.regex.Pattern; |
||||
|
|
||||
|
public class ExampleCrawler extends BaseCrawler { |
||||
|
@Override |
||||
|
public String getCrawlerName() { |
||||
|
return "ExampleCrawler"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<CrawlerData> parseHtml(String html) { |
||||
|
List<CrawlerData> results = new ArrayList<>(); |
||||
|
|
||||
|
String cleanHtml = html.replaceAll("\\s+", " ").replaceAll(">\\s*<", "><"); |
||||
|
|
||||
|
Pattern linkPattern = Pattern.compile("<a\\s+[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]+)</a>", Pattern.CASE_INSENSITIVE); |
||||
|
Matcher matcher = linkPattern.matcher(cleanHtml); |
||||
|
|
||||
|
int count = 0; |
||||
|
while (matcher.find() && count < 10) { |
||||
|
CrawlerData data = new CrawlerData(); |
||||
|
data.setTitle(cleanText(matcher.group(2))); |
||||
|
data.setUrl(matcher.group(1)); |
||||
|
results.add(data); |
||||
|
count++; |
||||
|
} |
||||
|
|
||||
|
return results; |
||||
|
} |
||||
|
|
||||
|
private String cleanText(String text) { |
||||
|
if (text == null) return null; |
||||
|
return text.replaceAll("<[^>]+>", "").trim(); |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue