5 changed files with 387 additions and 0 deletions
@ -0,0 +1,84 @@ |
|||
package com.crawler.crawler; |
|||
|
|||
import com.crawler.exception.HttpRequestException; |
|||
import com.crawler.exception.TimeoutException; |
|||
import com.crawler.model.CrawlerConfig; |
|||
import com.crawler.model.CrawlerData; |
|||
|
|||
import java.net.URI; |
|||
import java.net.http.HttpClient; |
|||
import java.net.http.HttpRequest; |
|||
import java.net.http.HttpResponse; |
|||
import java.time.Duration; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public abstract class BaseCrawler implements Crawler { |
|||
protected CrawlerConfig config; |
|||
|
|||
@Override |
|||
public void setConfig(CrawlerConfig config) { |
|||
this.config = config; |
|||
} |
|||
|
|||
@Override |
|||
public CrawlerConfig getConfig() { |
|||
return config; |
|||
} |
|||
|
|||
protected String fetchHtml(String urlStr) throws HttpRequestException, TimeoutException { |
|||
HttpClient client = HttpClient.newBuilder() |
|||
.connectTimeout(Duration.ofMillis(config.getTimeout())) |
|||
.followRedirects(HttpClient.Redirect.NORMAL) |
|||
.build(); |
|||
|
|||
HttpRequest request = HttpRequest.newBuilder() |
|||
.uri(URI.create(urlStr)) |
|||
.header("User-Agent", config.getUserAgent()) |
|||
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") |
|||
.GET() |
|||
.build(); |
|||
|
|||
try { |
|||
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString()); |
|||
|
|||
if (response.statusCode() < 200 || response.statusCode() >= 300) { |
|||
throw new HttpRequestException("HTTP请求失败", response.statusCode()); |
|||
} |
|||
|
|||
return response.body(); |
|||
} catch (java.net.http.HttpTimeoutException e) { |
|||
throw new TimeoutException("连接超时", e); |
|||
} catch (HttpRequestException e) { |
|||
throw e; |
|||
} catch (Exception e) { |
|||
throw new HttpRequestException("HTTP请求异常: " + e.getMessage(), 0, e); |
|||
} |
|||
} |
|||
|
|||
protected abstract List<CrawlerData> parseHtml(String html); |
|||
|
|||
@Override |
|||
public List<CrawlerData> crawl() { |
|||
List<CrawlerData> results = new ArrayList<>(); |
|||
|
|||
if (config == null || config.getTargetUrl() == null) { |
|||
return results; |
|||
} |
|||
|
|||
try { |
|||
String html = fetchHtml(config.getTargetUrl()); |
|||
results = parseHtml(html); |
|||
|
|||
for (CrawlerData data : results) { |
|||
if (data.getSource() == null) { |
|||
data.setSource(getCrawlerName()); |
|||
} |
|||
} |
|||
} catch (Exception e) { |
|||
throw new RuntimeException(e); |
|||
} |
|||
|
|||
return results; |
|||
} |
|||
} |
|||
@ -0,0 +1,145 @@ |
|||
package com.crawler.crawler.impl; |
|||
|
|||
import com.crawler.crawler.BaseCrawler; |
|||
import com.crawler.exception.HttpRequestException; |
|||
import com.crawler.exception.TimeoutException; |
|||
import com.crawler.model.CrawlerData; |
|||
|
|||
import java.net.URI; |
|||
import java.net.http.HttpClient; |
|||
import java.net.http.HttpRequest; |
|||
import java.net.http.HttpResponse; |
|||
import java.time.Duration; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class ChinaWeatherCrawler extends BaseCrawler { |
|||
private static final String BASE_URL = "https://www.weather.com.cn"; |
|||
|
|||
@Override |
|||
public String getCrawlerName() { |
|||
return "ChinaWeatherCrawler"; |
|||
} |
|||
|
|||
@Override |
|||
protected String fetchHtml(String urlStr) throws HttpRequestException, TimeoutException { |
|||
if (!urlStr.startsWith("https")) { |
|||
urlStr = urlStr.replace("http://", "https://"); |
|||
} |
|||
|
|||
HttpClient client = HttpClient.newBuilder() |
|||
.connectTimeout(Duration.ofMillis(30000)) |
|||
.followRedirects(HttpClient.Redirect.NORMAL) |
|||
.build(); |
|||
|
|||
HttpRequest request = HttpRequest.newBuilder() |
|||
.uri(URI.create(urlStr)) |
|||
.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") |
|||
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") |
|||
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8") |
|||
.GET() |
|||
.build(); |
|||
|
|||
try { |
|||
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString()); |
|||
|
|||
if (response.statusCode() < 200 || response.statusCode() >= 300) { |
|||
throw new HttpRequestException("HTTP请求失败", response.statusCode()); |
|||
} |
|||
|
|||
return response.body(); |
|||
} catch (java.net.http.HttpTimeoutException e) { |
|||
throw new TimeoutException("连接超时", e); |
|||
} catch (HttpRequestException e) { |
|||
throw e; |
|||
} catch (Exception e) { |
|||
throw new HttpRequestException("HTTP请求异常: " + e.getMessage(), 0, e); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
protected List<CrawlerData> parseHtml(String html) { |
|||
List<CrawlerData> results = new ArrayList<>(); |
|||
|
|||
if (html == null || html.isEmpty()) { |
|||
return results; |
|||
} |
|||
|
|||
String cleanHtml = html.replaceAll("\\s+", " "); |
|||
|
|||
Pattern linkPattern = Pattern.compile( |
|||
"<a\\s+[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]{2,80})</a>", |
|||
Pattern.CASE_INSENSITIVE |
|||
); |
|||
|
|||
Matcher matcher = linkPattern.matcher(cleanHtml); |
|||
|
|||
int count = 0; |
|||
while (matcher.find() && count < 30) { |
|||
String url = matcher.group(1); |
|||
String title = matcher.group(2).trim(); |
|||
|
|||
if (isValidUrl(url) && isValidTitle(title)) { |
|||
url = normalizeUrl(url); |
|||
|
|||
CrawlerData data = new CrawlerData(); |
|||
data.setTitle(cleanText(title)); |
|||
data.setUrl(url); |
|||
data.setSource(getCrawlerName()); |
|||
results.add(data); |
|||
count++; |
|||
} |
|||
} |
|||
|
|||
return results; |
|||
} |
|||
|
|||
private String normalizeUrl(String url) { |
|||
if (url == null) return null; |
|||
if (url.startsWith("//")) { |
|||
return "https:" + url; |
|||
} |
|||
if (url.startsWith("/")) { |
|||
return BASE_URL + url; |
|||
} |
|||
if (!url.startsWith("http")) { |
|||
return BASE_URL + "/" + url; |
|||
} |
|||
if (url.startsWith("http://")) { |
|||
return url.replace("http://", "https://"); |
|||
} |
|||
return url; |
|||
} |
|||
|
|||
private boolean isValidUrl(String url) { |
|||
if (url == null || url.isEmpty()) { |
|||
return false; |
|||
} |
|||
|
|||
if (url.contains("mailto:") || url.contains("javascript:")) { |
|||
return false; |
|||
} |
|||
|
|||
return url.contains("weather.com.cn") || url.startsWith("/"); |
|||
} |
|||
|
|||
private boolean isValidTitle(String title) { |
|||
if (title == null || title.isEmpty()) { |
|||
return false; |
|||
} |
|||
String cleaned = cleanText(title); |
|||
return cleaned != null && cleaned.length() >= 2 && cleaned.length() <= 80; |
|||
} |
|||
|
|||
private String cleanText(String text) { |
|||
if (text == null) return null; |
|||
return text.replaceAll("<[^>]+>", "") |
|||
.replaceAll(" ", " ") |
|||
.replaceAll("&#[0-9]+;", "") |
|||
.replaceAll("&[a-zA-Z]+;", " ") |
|||
.replaceAll("\\s+", " ") |
|||
.trim(); |
|||
} |
|||
} |
|||
@ -0,0 +1,13 @@ |
|||
package com.crawler.crawler; |
|||
|
|||
import com.crawler.model.CrawlerConfig; |
|||
import com.crawler.model.CrawlerData; |
|||
|
|||
import java.util.List; |
|||
|
|||
public interface Crawler { |
|||
void setConfig(CrawlerConfig config); |
|||
CrawlerConfig getConfig(); |
|||
String getCrawlerName(); |
|||
List<CrawlerData> crawl(); |
|||
} |
|||
@ -0,0 +1,103 @@ |
|||
package com.crawler.crawler; |
|||
|
|||
import com.crawler.crawler.impl.*; |
|||
import com.crawler.exception.InvalidUrlException; |
|||
import java.util.LinkedHashMap; |
|||
import java.util.Map; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class CrawlerFactory { |
|||
private static CrawlerFactory instance; |
|||
private Map<String, Pattern> crawlerPatterns; |
|||
|
|||
private CrawlerFactory() { |
|||
crawlerPatterns = new LinkedHashMap<>(); |
|||
initPatterns(); |
|||
} |
|||
|
|||
public static CrawlerFactory getInstance() { |
|||
if (instance == null) { |
|||
instance = new CrawlerFactory(); |
|||
} |
|||
return instance; |
|||
} |
|||
|
|||
private void initPatterns() { |
|||
crawlerPatterns.put("MountBladeCrawler", |
|||
Pattern.compile(".*mountblade\\.com\\.cn.*", Pattern.CASE_INSENSITIVE)); |
|||
crawlerPatterns.put("HunanUniversityNewsCrawler", |
|||
Pattern.compile(".*news\\.hnu\\.edu\\.cn.*", Pattern.CASE_INSENSITIVE)); |
|||
crawlerPatterns.put("HunanUniversityCrawler", |
|||
Pattern.compile(".*hnu\\.edu\\.cn.*", Pattern.CASE_INSENSITIVE)); |
|||
crawlerPatterns.put("ChinaWeatherCrawler", |
|||
Pattern.compile(".*weather\\.com\\.cn.*", Pattern.CASE_INSENSITIVE)); |
|||
crawlerPatterns.put("ExampleCrawler", |
|||
Pattern.compile(".*", Pattern.CASE_INSENSITIVE)); |
|||
} |
|||
|
|||
public Crawler createCrawler(String url) { |
|||
validateUrl(url); |
|||
|
|||
for (Map.Entry<String, Pattern> entry : crawlerPatterns.entrySet()) { |
|||
if (entry.getValue().matcher(url).matches()) { |
|||
return createCrawlerByName(entry.getKey()); |
|||
} |
|||
} |
|||
|
|||
return new ExampleCrawler(); |
|||
} |
|||
|
|||
private void validateUrl(String url) { |
|||
if (url == null || url.isEmpty()) { |
|||
throw new InvalidUrlException("URL不能为空", url); |
|||
} |
|||
|
|||
if (!url.startsWith("http://") && !url.startsWith("https://")) { |
|||
throw new InvalidUrlException("URL格式无效,必须以http://或https://开头", url); |
|||
} |
|||
} |
|||
|
|||
private Crawler createCrawlerByName(String crawlerName) { |
|||
switch (crawlerName) { |
|||
case "MountBladeCrawler": |
|||
return new MountBladeCrawler(); |
|||
case "HunanUniversityNewsCrawler": |
|||
return new HunanUniversityNewsCrawler(); |
|||
case "HunanUniversityCrawler": |
|||
return new HunanUniversityCrawler(); |
|||
case "ChinaWeatherCrawler": |
|||
return new ChinaWeatherCrawler(); |
|||
case "ExampleCrawler": |
|||
default: |
|||
return new ExampleCrawler(); |
|||
} |
|||
} |
|||
|
|||
public String getCrawlerName(String url) { |
|||
if (url == null || url.isEmpty()) { |
|||
return "ExampleCrawler"; |
|||
} |
|||
|
|||
for (Map.Entry<String, Pattern> entry : crawlerPatterns.entrySet()) { |
|||
if (entry.getValue().matcher(url).matches()) { |
|||
return entry.getKey(); |
|||
} |
|||
} |
|||
|
|||
return "ExampleCrawler"; |
|||
} |
|||
|
|||
public boolean isUrlSupported(String url) { |
|||
if (url == null || url.isEmpty()) { |
|||
return false; |
|||
} |
|||
|
|||
for (Pattern pattern : crawlerPatterns.values()) { |
|||
if (pattern.matcher(url).matches()) { |
|||
return true; |
|||
} |
|||
} |
|||
|
|||
return true; |
|||
} |
|||
} |
|||
@ -0,0 +1,42 @@ |
|||
package com.crawler.crawler.impl; |
|||
|
|||
import com.crawler.crawler.BaseCrawler; |
|||
import com.crawler.model.CrawlerData; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class ExampleCrawler extends BaseCrawler { |
|||
@Override |
|||
public String getCrawlerName() { |
|||
return "ExampleCrawler"; |
|||
} |
|||
|
|||
@Override |
|||
protected List<CrawlerData> parseHtml(String html) { |
|||
List<CrawlerData> results = new ArrayList<>(); |
|||
|
|||
String cleanHtml = html.replaceAll("\\s+", " ").replaceAll(">\\s*<", "><"); |
|||
|
|||
Pattern linkPattern = Pattern.compile("<a\\s+[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]+)</a>", Pattern.CASE_INSENSITIVE); |
|||
Matcher matcher = linkPattern.matcher(cleanHtml); |
|||
|
|||
int count = 0; |
|||
while (matcher.find() && count < 10) { |
|||
CrawlerData data = new CrawlerData(); |
|||
data.setTitle(cleanText(matcher.group(2))); |
|||
data.setUrl(matcher.group(1)); |
|||
results.add(data); |
|||
count++; |
|||
} |
|||
|
|||
return results; |
|||
} |
|||
|
|||
private String cleanText(String text) { |
|||
if (text == null) return null; |
|||
return text.replaceAll("<[^>]+>", "").trim(); |
|||
} |
|||
} |
|||
Loading…
Reference in new issue