You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
145 lines
4.8 KiB
145 lines
4.8 KiB
package com.crawler.crawler.impl;
|
|
|
|
import com.crawler.crawler.BaseCrawler;
|
|
import com.crawler.exception.HttpRequestException;
|
|
import com.crawler.exception.TimeoutException;
|
|
import com.crawler.model.CrawlerData;
|
|
|
|
import java.net.URI;
|
|
import java.net.http.HttpClient;
|
|
import java.net.http.HttpRequest;
|
|
import java.net.http.HttpResponse;
|
|
import java.time.Duration;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
|
|
public class ChinaWeatherCrawler extends BaseCrawler {
|
|
private static final String BASE_URL = "https://www.weather.com.cn";
|
|
|
|
@Override
|
|
public String getCrawlerName() {
|
|
return "ChinaWeatherCrawler";
|
|
}
|
|
|
|
@Override
|
|
protected String fetchHtml(String urlStr) throws HttpRequestException, TimeoutException {
|
|
if (!urlStr.startsWith("https")) {
|
|
urlStr = urlStr.replace("http://", "https://");
|
|
}
|
|
|
|
HttpClient client = HttpClient.newBuilder()
|
|
.connectTimeout(Duration.ofMillis(30000))
|
|
.followRedirects(HttpClient.Redirect.NORMAL)
|
|
.build();
|
|
|
|
HttpRequest request = HttpRequest.newBuilder()
|
|
.uri(URI.create(urlStr))
|
|
.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
|
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
|
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
|
|
.GET()
|
|
.build();
|
|
|
|
try {
|
|
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
|
|
|
|
if (response.statusCode() < 200 || response.statusCode() >= 300) {
|
|
throw new HttpRequestException("HTTP请求失败", response.statusCode());
|
|
}
|
|
|
|
return response.body();
|
|
} catch (java.net.http.HttpTimeoutException e) {
|
|
throw new TimeoutException("连接超时", e);
|
|
} catch (HttpRequestException e) {
|
|
throw e;
|
|
} catch (Exception e) {
|
|
throw new HttpRequestException("HTTP请求异常: " + e.getMessage(), 0, e);
|
|
}
|
|
}
|
|
|
|
@Override
|
|
protected List<CrawlerData> parseHtml(String html) {
|
|
List<CrawlerData> results = new ArrayList<>();
|
|
|
|
if (html == null || html.isEmpty()) {
|
|
return results;
|
|
}
|
|
|
|
String cleanHtml = html.replaceAll("\\s+", " ");
|
|
|
|
Pattern linkPattern = Pattern.compile(
|
|
"<a\\s+[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]{2,80})</a>",
|
|
Pattern.CASE_INSENSITIVE
|
|
);
|
|
|
|
Matcher matcher = linkPattern.matcher(cleanHtml);
|
|
|
|
int count = 0;
|
|
while (matcher.find() && count < 30) {
|
|
String url = matcher.group(1);
|
|
String title = matcher.group(2).trim();
|
|
|
|
if (isValidUrl(url) && isValidTitle(title)) {
|
|
url = normalizeUrl(url);
|
|
|
|
CrawlerData data = new CrawlerData();
|
|
data.setTitle(cleanText(title));
|
|
data.setUrl(url);
|
|
data.setSource(getCrawlerName());
|
|
results.add(data);
|
|
count++;
|
|
}
|
|
}
|
|
|
|
return results;
|
|
}
|
|
|
|
private String normalizeUrl(String url) {
|
|
if (url == null) return null;
|
|
if (url.startsWith("//")) {
|
|
return "https:" + url;
|
|
}
|
|
if (url.startsWith("/")) {
|
|
return BASE_URL + url;
|
|
}
|
|
if (!url.startsWith("http")) {
|
|
return BASE_URL + "/" + url;
|
|
}
|
|
if (url.startsWith("http://")) {
|
|
return url.replace("http://", "https://");
|
|
}
|
|
return url;
|
|
}
|
|
|
|
private boolean isValidUrl(String url) {
|
|
if (url == null || url.isEmpty()) {
|
|
return false;
|
|
}
|
|
|
|
if (url.contains("mailto:") || url.contains("javascript:")) {
|
|
return false;
|
|
}
|
|
|
|
return url.contains("weather.com.cn") || url.startsWith("/");
|
|
}
|
|
|
|
private boolean isValidTitle(String title) {
|
|
if (title == null || title.isEmpty()) {
|
|
return false;
|
|
}
|
|
String cleaned = cleanText(title);
|
|
return cleaned != null && cleaned.length() >= 2 && cleaned.length() <= 80;
|
|
}
|
|
|
|
private String cleanText(String text) {
|
|
if (text == null) return null;
|
|
return text.replaceAll("<[^>]+>", "")
|
|
.replaceAll(" ", " ")
|
|
.replaceAll("&#[0-9]+;", "")
|
|
.replaceAll("&[a-zA-Z]+;", " ")
|
|
.replaceAll("\\s+", " ")
|
|
.trim();
|
|
}
|
|
}
|