Browse Source

上传文件至 'project/crawler'

main
HuangZhikai 3 weeks ago
parent
commit
8cad5bdf2b
  1. 84
      project/crawler/BaseCrawler.java
  2. 145
      project/crawler/ChinaWeatherCrawler.java
  3. 13
      project/crawler/Crawler.java
  4. 103
      project/crawler/CrawlerFactory.java
  5. 42
      project/crawler/ExampleCrawler.java

84
project/crawler/BaseCrawler.java

@ -0,0 +1,84 @@
package com.crawler.crawler;
import com.crawler.exception.HttpRequestException;
import com.crawler.exception.TimeoutException;
import com.crawler.model.CrawlerConfig;
import com.crawler.model.CrawlerData;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
/**
 * Skeleton {@link Crawler} implementation: stores the configuration, downloads the
 * configured target URL and delegates HTML extraction to {@link #parseHtml(String)}.
 *
 * <p>Subclasses supply {@link #parseHtml(String)} and {@code getCrawlerName()}, and may
 * override {@link #fetchHtml(String)} to customise how the page is downloaded.
 */
public abstract class BaseCrawler implements Crawler {

    /** Configuration supplied through {@link #setConfig(CrawlerConfig)}; {@code null} until set. */
    protected CrawlerConfig config;

    @Override
    public void setConfig(CrawlerConfig config) {
        this.config = config;
    }

    @Override
    public CrawlerConfig getConfig() {
        return config;
    }

    /**
     * Downloads {@code urlStr} with a GET request and returns the response body as text.
     *
     * <p>The configured timeout is applied both to establishing the connection and to the
     * request as a whole, so a server that accepts the connection but never answers cannot
     * block the caller indefinitely. Requires {@link #config} to have been set.
     *
     * @param urlStr absolute URL to download
     * @return the response body of a 2xx response
     * @throws HttpRequestException if the status code is outside 200-299 (the status code is
     *         passed to the exception) or the request fails for any non-timeout reason
     *         (status {@code 0}, original exception attached as cause)
     * @throws TimeoutException if the HTTP client reports a timeout
     */
    protected String fetchHtml(String urlStr) throws HttpRequestException, TimeoutException {
        Duration timeout = Duration.ofMillis(config.getTimeout());
        HttpClient client = HttpClient.newBuilder()
                .connectTimeout(timeout)
                .followRedirects(HttpClient.Redirect.NORMAL)
                .build();
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create(urlStr))
                // Without a request timeout only the connect phase is bounded.
                .timeout(timeout)
                .header("User-Agent", config.getUserAgent())
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
                .GET()
                .build();
        try {
            HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
            int status = response.statusCode();
            if (status < 200 || status >= 300) {
                throw new HttpRequestException("HTTP请求失败", status);
            }
            return response.body();
        } catch (java.net.http.HttpTimeoutException e) {
            throw new TimeoutException("连接超时", e);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers further up can still observe the interruption.
            Thread.currentThread().interrupt();
            throw new HttpRequestException("HTTP请求异常: " + e.getMessage(), 0, e);
        } catch (HttpRequestException e) {
            // Already the right type (non-2xx status above); do not re-wrap it.
            throw e;
        } catch (Exception e) {
            throw new HttpRequestException("HTTP请求异常: " + e.getMessage(), 0, e);
        }
    }

    /**
     * Extracts records from a downloaded page.
     *
     * @param html page content as returned by {@link #fetchHtml(String)}
     * @return the extracted records
     */
    protected abstract List<CrawlerData> parseHtml(String html);

    /**
     * Downloads the configured target URL, parses it and fills in the crawler name as the
     * source of every record that does not already have one.
     *
     * @return the parsed records; an empty list when no configuration or target URL is set,
     *         or when {@link #parseHtml(String)} yields {@code null}
     * @throws RuntimeException wrapping any exception raised while fetching or parsing
     */
    @Override
    public List<CrawlerData> crawl() {
        List<CrawlerData> results = new ArrayList<>();
        if (config == null || config.getTargetUrl() == null) {
            return results;
        }
        try {
            String html = fetchHtml(config.getTargetUrl());
            List<CrawlerData> parsed = parseHtml(html);
            if (parsed == null) {
                // Treat "nothing parsed" as an empty result instead of failing on the loop below.
                return results;
            }
            results = parsed;
            for (CrawlerData data : results) {
                if (data.getSource() == null) {
                    data.setSource(getCrawlerName());
                }
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        return results;
    }
}

145
project/crawler/ChinaWeatherCrawler.java

@ -0,0 +1,145 @@
package com.crawler.crawler.impl;
import com.crawler.crawler.BaseCrawler;
import com.crawler.exception.HttpRequestException;
import com.crawler.exception.TimeoutException;
import com.crawler.model.CrawlerData;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Crawler for weather.com.cn: downloads a page over HTTPS with browser-like headers and
 * extracts up to {@value #MAX_RESULTS} on-site links (title + absolute URL).
 */
public class ChinaWeatherCrawler extends BaseCrawler {

    private static final String BASE_URL = "https://www.weather.com.cn";
    private static final String SITE_DOMAIN = "weather.com.cn";
    private static final String HTTP_PREFIX = "http://";
    private static final String HTTPS_PREFIX = "https://";

    /** Fixed timeout used for this site instead of the configured one. */
    private static final Duration TIMEOUT = Duration.ofMillis(30000);

    /** Maximum number of links returned by {@link #parseHtml(String)}. */
    private static final int MAX_RESULTS = 30;

    /** Runs of whitespace, collapsed to a single space before matching links. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Anchor tag with a quoted href (group 1) and 2-80 characters of plain link text (group 2). */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a\\s+[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]{2,80})</a>",
            Pattern.CASE_INSENSITIVE
    );

    @Override
    public String getCrawlerName() {
        return "ChinaWeatherCrawler";
    }

    /**
     * Downloads {@code urlStr} with a GET request, upgrading an {@code http://} URL to
     * {@code https://} first, and returns the response body as text.
     *
     * <p>Uses a fixed 30 second timeout and a fixed browser User-Agent rather than the
     * values from the crawler configuration. The timeout bounds both connection setup and
     * the request as a whole, so a stalled server cannot block the caller indefinitely.
     *
     * @param urlStr absolute URL to download
     * @return the response body of a 2xx response
     * @throws HttpRequestException if the status code is outside 200-299 or the request
     *         fails for a non-timeout reason (status {@code 0}, cause attached)
     * @throws TimeoutException if the HTTP client reports a timeout
     */
    @Override
    protected String fetchHtml(String urlStr) throws HttpRequestException, TimeoutException {
        String target = upgradeToHttps(urlStr);
        HttpClient client = HttpClient.newBuilder()
                .connectTimeout(TIMEOUT)
                .followRedirects(HttpClient.Redirect.NORMAL)
                .build();
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create(target))
                // Without a request timeout only the connect phase is bounded.
                .timeout(TIMEOUT)
                .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
                .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
                .GET()
                .build();
        try {
            HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
            int status = response.statusCode();
            if (status < 200 || status >= 300) {
                throw new HttpRequestException("HTTP请求失败", status);
            }
            return response.body();
        } catch (java.net.http.HttpTimeoutException e) {
            throw new TimeoutException("连接超时", e);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers further up can still observe the interruption.
            Thread.currentThread().interrupt();
            throw new HttpRequestException("HTTP请求异常: " + e.getMessage(), 0, e);
        } catch (HttpRequestException e) {
            // Already the right type (non-2xx status above); do not re-wrap it.
            throw e;
        } catch (Exception e) {
            throw new HttpRequestException("HTTP请求异常: " + e.getMessage(), 0, e);
        }
    }

    /**
     * Extracts on-site links from the page.
     *
     * @param html page content; {@code null} or empty yields an empty list
     * @return at most {@value #MAX_RESULTS} records, each with cleaned title, absolute
     *         https URL and this crawler's name as source
     */
    @Override
    protected List<CrawlerData> parseHtml(String html) {
        List<CrawlerData> results = new ArrayList<>();
        if (html == null || html.isEmpty()) {
            return results;
        }
        String cleanHtml = WHITESPACE.matcher(html).replaceAll(" ");
        Matcher matcher = LINK_PATTERN.matcher(cleanHtml);
        while (results.size() < MAX_RESULTS && matcher.find()) {
            String url = matcher.group(1);
            String title = matcher.group(2).trim();
            if (!isValidUrl(url) || !isValidTitle(title)) {
                continue;
            }
            CrawlerData data = new CrawlerData();
            data.setTitle(cleanText(title));
            data.setUrl(normalizeUrl(url));
            data.setSource(getCrawlerName());
            results.add(data);
        }
        return results;
    }

    /**
     * Rewrites a leading {@code http://} scheme to {@code https://}. Only the prefix is
     * touched, so URLs embedded later in the string (for example in a query parameter)
     * are left as they are.
     */
    private String upgradeToHttps(String url) {
        if (url.startsWith(HTTP_PREFIX)) {
            return HTTPS_PREFIX + url.substring(HTTP_PREFIX.length());
        }
        return url;
    }

    /**
     * Turns an href into an absolute https URL: protocol-relative links get the
     * {@code https:} scheme, root-relative and scheme-less links are resolved against
     * {@link #BASE_URL}, and {@code http://} links are upgraded to {@code https://}.
     */
    private String normalizeUrl(String url) {
        if (url == null) {
            return null;
        }
        if (url.startsWith("//")) {
            return "https:" + url;
        }
        if (url.startsWith("/")) {
            return BASE_URL + url;
        }
        if (!url.startsWith("http")) {
            return BASE_URL + "/" + url;
        }
        return upgradeToHttps(url);
    }

    /**
     * Accepts links that mention the site domain, plus root-relative links. A
     * protocol-relative link ({@code //host/...}) names its own host, so it is only
     * accepted when it mentions the site domain as well.
     */
    private boolean isValidUrl(String url) {
        if (url == null || url.isEmpty()) {
            return false;
        }
        if (url.contains("mailto:") || url.contains("javascript:")) {
            return false;
        }
        if (url.contains(SITE_DOMAIN)) {
            return true;
        }
        return url.startsWith("/") && !url.startsWith("//");
    }

    /** A title is usable when, after cleaning, it is between 2 and 80 characters long. */
    private boolean isValidTitle(String title) {
        if (title == null || title.isEmpty()) {
            return false;
        }
        String cleaned = cleanText(title);
        return cleaned != null && cleaned.length() >= 2 && cleaned.length() <= 80;
    }

    /**
     * Strips tags, replaces {@code &nbsp;} and named entities with a space, drops numeric
     * entities, collapses whitespace and trims.
     */
    private String cleanText(String text) {
        if (text == null) {
            return null;
        }
        return text.replaceAll("<[^>]+>", "")
                .replaceAll("&nbsp;", " ")
                .replaceAll("&#[0-9]+;", "")
                .replaceAll("&[a-zA-Z]+;", " ")
                .replaceAll("\\s+", " ")
                .trim();
    }
}

13
project/crawler/Crawler.java

@ -0,0 +1,13 @@
package com.crawler.crawler;
import com.crawler.model.CrawlerConfig;
import com.crawler.model.CrawlerData;
import java.util.List;
/**
 * Contract for a crawler: it is given a {@link CrawlerConfig}, identifies itself by
 * name, and produces a list of {@link CrawlerData} records when run.
 */
public interface Crawler {
/**
 * Supplies the configuration this crawler should use.
 *
 * @param config the configuration to use
 */
void setConfig(CrawlerConfig config);
/**
 * Returns the configuration previously supplied through {@link #setConfig(CrawlerConfig)}.
 *
 * @return the current configuration
 */
CrawlerConfig getConfig();
/**
 * Returns the name that identifies this crawler implementation.
 *
 * @return the crawler name
 */
String getCrawlerName();
/**
 * Runs the crawl.
 *
 * @return the records produced by the crawl
 */
List<CrawlerData> crawl();
}

103
project/crawler/CrawlerFactory.java

@ -0,0 +1,103 @@
package com.crawler.crawler;
import com.crawler.crawler.impl.*;
import com.crawler.exception.InvalidUrlException;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Pattern;
/**
 * Singleton factory that picks a {@link Crawler} implementation for a URL by matching
 * the URL against an ordered list of regular expressions.
 */
public class CrawlerFactory {

    /** Name of the catch-all crawler used when no site-specific pattern matches. */
    private static final String DEFAULT_CRAWLER = "ExampleCrawler";

    /**
     * Crawler name to URL pattern, in match priority order. A {@link LinkedHashMap} is used
     * because iteration order decides which crawler wins when several patterns match.
     */
    private final Map<String, Pattern> crawlerPatterns;

    /** Holds the singleton; the JVM initialises it once, on first access, in a thread-safe way. */
    private static final class Holder {
        private static final CrawlerFactory INSTANCE = new CrawlerFactory();
    }

    private CrawlerFactory() {
        crawlerPatterns = new LinkedHashMap<>();
        initPatterns();
    }

    /**
     * Returns the shared factory instance, creating it on first use.
     *
     * @return the singleton factory
     */
    public static CrawlerFactory getInstance() {
        return Holder.INSTANCE;
    }

    /**
     * Registers the URL patterns. Order matters: the more specific
     * {@code news.hnu.edu.cn} pattern must precede the general {@code hnu.edu.cn} one,
     * and the match-everything pattern must come last.
     */
    private void initPatterns() {
        crawlerPatterns.put("MountBladeCrawler",
                Pattern.compile(".*mountblade\\.com\\.cn.*", Pattern.CASE_INSENSITIVE));
        crawlerPatterns.put("HunanUniversityNewsCrawler",
                Pattern.compile(".*news\\.hnu\\.edu\\.cn.*", Pattern.CASE_INSENSITIVE));
        crawlerPatterns.put("HunanUniversityCrawler",
                Pattern.compile(".*hnu\\.edu\\.cn.*", Pattern.CASE_INSENSITIVE));
        crawlerPatterns.put("ChinaWeatherCrawler",
                Pattern.compile(".*weather\\.com\\.cn.*", Pattern.CASE_INSENSITIVE));
        crawlerPatterns.put(DEFAULT_CRAWLER,
                Pattern.compile(".*", Pattern.CASE_INSENSITIVE));
    }

    /**
     * Creates a new crawler for the given URL.
     *
     * @param url absolute URL starting with {@code http://} or {@code https://}
     * @return a new instance of the first crawler whose pattern matches, or the default
     *         crawler when none does
     * @throws InvalidUrlException if {@code url} is null, empty or lacks an http(s) scheme
     */
    public Crawler createCrawler(String url) {
        validateUrl(url);
        return createCrawlerByName(getCrawlerName(url));
    }

    /** Rejects null/empty URLs and URLs that do not start with {@code http://} or {@code https://}. */
    private void validateUrl(String url) {
        if (url == null || url.isEmpty()) {
            throw new InvalidUrlException("URL不能为空", url);
        }
        if (!hasHttpScheme(url)) {
            throw new InvalidUrlException("URL格式无效,必须以http://或https://开头", url);
        }
    }

    /** Returns whether {@code url} (non-null) starts with {@code http://} or {@code https://}. */
    private static boolean hasHttpScheme(String url) {
        return url.startsWith("http://") || url.startsWith("https://");
    }

    /** Instantiates the crawler registered under {@code crawlerName}; unknown names yield the default crawler. */
    private Crawler createCrawlerByName(String crawlerName) {
        switch (crawlerName) {
            case "MountBladeCrawler":
                return new MountBladeCrawler();
            case "HunanUniversityNewsCrawler":
                return new HunanUniversityNewsCrawler();
            case "HunanUniversityCrawler":
                return new HunanUniversityCrawler();
            case "ChinaWeatherCrawler":
                return new ChinaWeatherCrawler();
            case DEFAULT_CRAWLER:
            default:
                return new ExampleCrawler();
        }
    }

    /**
     * Returns the name of the crawler that would handle the given URL. Unlike
     * {@link #createCrawler(String)} this never throws.
     *
     * @param url URL to look up; may be null or empty
     * @return the name of the first matching crawler, or {@code "ExampleCrawler"} when the
     *         URL is null, empty or matches nothing
     */
    public String getCrawlerName(String url) {
        if (url == null || url.isEmpty()) {
            return DEFAULT_CRAWLER;
        }
        for (Map.Entry<String, Pattern> entry : crawlerPatterns.entrySet()) {
            if (entry.getValue().matcher(url).matches()) {
                return entry.getKey();
            }
        }
        return DEFAULT_CRAWLER;
    }

    /**
     * Reports whether {@link #createCrawler(String)} would accept the given URL.
     *
     * <p>Every well-formed URL has a crawler because the default crawler is the fallback,
     * so support reduces to the same check {@code createCrawler} performs: non-empty and
     * starting with {@code http://} or {@code https://}.
     *
     * @param url URL to check; may be null
     * @return {@code true} if a crawler can be created for {@code url}
     */
    public boolean isUrlSupported(String url) {
        return url != null && !url.isEmpty() && hasHttpScheme(url);
    }
}

42
project/crawler/ExampleCrawler.java

@ -0,0 +1,42 @@
package com.crawler.crawler.impl;
import com.crawler.crawler.BaseCrawler;
import com.crawler.model.CrawlerData;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Generic fallback crawler: returns the first {@value #MAX_RESULTS} anchor links found
 * on a page, with the link text as title and the raw href as URL.
 */
public class ExampleCrawler extends BaseCrawler {

    /** Maximum number of links returned by {@link #parseHtml(String)}. */
    private static final int MAX_RESULTS = 10;

    /** Runs of whitespace, collapsed to a single space before matching. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Whitespace between adjacent tags, removed before matching. */
    private static final Pattern INTER_TAG_GAP = Pattern.compile(">\\s*<");

    /** Anchor tag with a quoted href (group 1) and non-empty plain link text (group 2). */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a\\s+[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]+)</a>",
            Pattern.CASE_INSENSITIVE);

    @Override
    public String getCrawlerName() {
        return "ExampleCrawler";
    }

    /**
     * Extracts anchor links from the page.
     *
     * @param html page content; {@code null} or empty yields an empty list
     * @return at most {@value #MAX_RESULTS} records with title and URL set; the href is
     *         stored as found, without being resolved to an absolute URL
     */
    @Override
    protected List<CrawlerData> parseHtml(String html) {
        List<CrawlerData> results = new ArrayList<>();
        if (html == null || html.isEmpty()) {
            return results;
        }
        String collapsed = WHITESPACE.matcher(html).replaceAll(" ");
        String cleanHtml = INTER_TAG_GAP.matcher(collapsed).replaceAll("><");
        Matcher matcher = LINK_PATTERN.matcher(cleanHtml);
        while (results.size() < MAX_RESULTS && matcher.find()) {
            CrawlerData data = new CrawlerData();
            data.setTitle(cleanText(matcher.group(2)));
            data.setUrl(matcher.group(1));
            results.add(data);
        }
        return results;
    }

    /** Removes any tags from {@code text} and trims it; returns {@code null} for {@code null}. */
    private String cleanText(String text) {
        if (text == null) {
            return null;
        }
        return text.replaceAll("<[^>]+>", "").trim();
    }
}
Loading…
Cancel
Save