package com.crawler.crawler.impl;

import com.crawler.crawler.BaseCrawler;
import com.crawler.exception.HttpRequestException;
import com.crawler.exception.TimeoutException;
import com.crawler.model.CrawlerData;

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawler that downloads a page over HTTPS and extracts up to {@value #MAX_RESULTS}
 * anchor links whose target is site-relative or mentions {@code weather.com.cn}.
 *
 * <p>Each accepted link becomes a {@link CrawlerData} carrying the cleaned link text,
 * the normalised absolute HTTPS URL and this crawler's name as the source.
 */
public class ChinaWeatherCrawler extends BaseCrawler {

    private static final String BASE_URL = "https://www.weather.com.cn";
    private static final String HTTP_SCHEME = "http://";
    private static final String HTTPS_SCHEME = "https://";

    /** Applied both to connection establishment and to the request as a whole. */
    private static final Duration TIMEOUT = Duration.ofMillis(30000);

    /** Upper bound on the number of links returned by {@link #parseHtml(String)}. */
    private static final int MAX_RESULTS = 30;

    /** Built once and shared by every request instead of being rebuilt per fetch. */
    private static final HttpClient HTTP_CLIENT = HttpClient.newBuilder()
            .connectTimeout(TIMEOUT)
            .followRedirects(HttpClient.Redirect.NORMAL)
            .build();

    /**
     * Matches an opening {@code <a ...>} tag with a quoted {@code href} attribute.
     * Group 1 is the href value; group 2 is the 2-80 characters of text that follow
     * the tag, up to the next {@code <}.
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a\\b[^>]*?\\shref\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]{2,80})",
            Pattern.CASE_INSENSITIVE
    );

    private static final Pattern WHITESPACE = Pattern.compile("\\s+");
    private static final Pattern TAG = Pattern.compile("<[^>]+>");
    private static final Pattern NUMERIC_ENTITY = Pattern.compile("&#[0-9]+;");
    private static final Pattern NAMED_ENTITY = Pattern.compile("&[a-zA-Z]+;");

    /**
     * Returns the fixed name of this crawler, which is also used as the source of every result.
     *
     * @return the literal {@code "ChinaWeatherCrawler"}
     */
    @Override
    public String getCrawlerName() {
        return "ChinaWeatherCrawler";
    }

    /**
     * Downloads the page at {@code urlStr} with a GET request and returns the response body.
     *
     * <p>A leading {@code http://} scheme is upgraded to {@code https://} before the request
     * is sent. Redirects are followed according to {@link HttpClient.Redirect#NORMAL}.
     *
     * @param urlStr the address to fetch
     * @return the response body as text
     * @throws HttpRequestException if the status code is outside 200-299, or if sending the
     *     request fails for any reason other than a timeout (status code {@code 0} in that case)
     * @throws TimeoutException if the HTTP client reports a timeout
     */
    @Override
    protected String fetchHtml(String urlStr) throws HttpRequestException, TimeoutException {
        String target = upgradeToHttps(urlStr);

        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create(target))
                // Without a request timeout a stalled response would block indefinitely.
                .timeout(TIMEOUT)
                .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
                .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
                .GET()
                .build();

        try {
            HttpResponse<String> response =
                    HTTP_CLIENT.send(request, HttpResponse.BodyHandlers.ofString());
            int status = response.statusCode();
            if (status < 200 || status >= 300) {
                throw new HttpRequestException("HTTP请求失败", status);
            }
            return response.body();
        } catch (java.net.http.HttpTimeoutException e) {
            throw new TimeoutException("连接超时", e);
        } catch (HttpRequestException e) {
            // Rethrown unchanged so the catch-all below does not wrap it a second time.
            throw e;
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            throw new HttpRequestException("HTTP请求异常: " + e.getMessage(), 0, e);
        } catch (Exception e) {
            throw new HttpRequestException("HTTP请求异常: " + e.getMessage(), 0, e);
        }
    }

    /**
     * Extracts anchor links from {@code html}.
     *
     * <p>Whitespace runs are collapsed first, then each {@code <a ... href="...">text} match is
     * kept only if both its URL and its text pass validation. Scanning stops once
     * {@value #MAX_RESULTS} links have been accepted.
     *
     * @param html the page markup; may be {@code null} or empty
     * @return the accepted links in document order; empty (never {@code null}) when there is no input
     */
    @Override
    protected List<CrawlerData> parseHtml(String html) {
        List<CrawlerData> results = new ArrayList<>();
        if (html == null || html.isEmpty()) {
            return results;
        }

        String cleanHtml = WHITESPACE.matcher(html).replaceAll(" ");
        Matcher matcher = LINK_PATTERN.matcher(cleanHtml);

        while (results.size() < MAX_RESULTS && matcher.find()) {
            String url = matcher.group(1);
            String title = matcher.group(2).trim();
            if (!isValidUrl(url) || !isValidTitle(title)) {
                continue;
            }

            CrawlerData data = new CrawlerData();
            data.setTitle(cleanText(title));
            data.setUrl(normalizeUrl(url));
            data.setSource(getCrawlerName());
            results.add(data);
        }
        return results;
    }

    /**
     * Replaces a leading {@code http://} with {@code https://}.
     *
     * <p>Only the scheme prefix is rewritten; any {@code http://} appearing later in the
     * string (for example inside a query parameter) is left untouched.
     */
    private static String upgradeToHttps(String url) {
        if (url.startsWith(HTTP_SCHEME)) {
            return HTTPS_SCHEME + url.substring(HTTP_SCHEME.length());
        }
        return url;
    }

    /**
     * Turns an href value into an absolute HTTPS URL.
     *
     * <p>Protocol-relative URLs get an {@code https:} prefix, root-relative and other
     * non-{@code http} values are resolved against {@link #BASE_URL}, and a leading
     * {@code http://} is upgraded to {@code https://}.
     *
     * @param url the raw href value; may be {@code null}
     * @return the normalised URL, or {@code null} when {@code url} is {@code null}
     */
    private String normalizeUrl(String url) {
        if (url == null) {
            return null;
        }
        if (url.startsWith("//")) {
            return "https:" + url;
        }
        if (url.startsWith("/")) {
            return BASE_URL + url;
        }
        if (!url.startsWith("http")) {
            return BASE_URL + "/" + url;
        }
        return upgradeToHttps(url);
    }

    /**
     * Decides whether an href value should be kept.
     *
     * <p>Rejects {@code null}/empty values and anything containing {@code mailto:} or
     * {@code javascript:}. Accepts values that start with {@code /} or contain
     * {@code weather.com.cn}.
     * NOTE(review): the domain test is a substring match on the whole value, not a host
     * comparison — confirm that is sufficient for the intended use.
     */
    private boolean isValidUrl(String url) {
        if (url == null || url.isEmpty()) {
            return false;
        }
        if (url.contains("mailto:") || url.contains("javascript:")) {
            return false;
        }
        return url.contains("weather.com.cn") || url.startsWith("/");
    }

    /**
     * Decides whether link text is usable as a title: after {@link #cleanText(String)} it
     * must be between 2 and 80 characters long, inclusive.
     */
    private boolean isValidTitle(String title) {
        if (title == null || title.isEmpty()) {
            return false;
        }
        String cleaned = cleanText(title);
        return cleaned != null && cleaned.length() >= 2 && cleaned.length() <= 80;
    }

    /**
     * Strips markup remnants from link text.
     *
     * <p>In order: removes tags, turns non-breaking spaces into ordinary spaces, deletes
     * decimal numeric entities, replaces named entities with a space, collapses whitespace
     * runs and trims the ends.
     *
     * @param text the raw text; may be {@code null}
     * @return the cleaned text, or {@code null} when {@code text} is {@code null}
     */
    private String cleanText(String text) {
        if (text == null) {
            return null;
        }
        String cleaned = TAG.matcher(text).replaceAll("");
        // "\\s" and trim() do not treat U+00A0 as whitespace, so convert it explicitly.
        cleaned = cleaned.replace('\u00A0', ' ');
        cleaned = NUMERIC_ENTITY.matcher(cleaned).replaceAll("");
        cleaned = NAMED_ENTITY.matcher(cleaned).replaceAll(" ");
        cleaned = WHITESPACE.matcher(cleaned).replaceAll(" ");
        return cleaned.trim();
    }
}