java/project/crawler/ChinaWeatherCrawler.java


								package com.crawler.crawler.impl;


								import com.crawler.crawler.BaseCrawler;

								import com.crawler.exception.HttpRequestException;

								import com.crawler.exception.TimeoutException;

								import com.crawler.model.CrawlerData;


								import java.net.URI;

								import java.net.http.HttpClient;

								import java.net.http.HttpRequest;

								import java.net.http.HttpResponse;

								import java.time.Duration;

								import java.util.ArrayList;

								import java.util.List;

								import java.util.regex.Matcher;

								import java.util.regex.Pattern;


								/**
								 * Crawler for the China Weather site ({@code https://www.weather.com.cn}).
								 *
								 * <p>{@link #fetchHtml(String)} downloads a page over HTTPS and
								 * {@link #parseHtml(String)} extracts at most 30 anchor links whose target
								 * mentions {@code weather.com.cn} or is site-relative, turning each into a
								 * {@link CrawlerData} with a cleaned title and an absolute HTTPS URL.
								 */
								public class ChinaWeatherCrawler extends BaseCrawler {

								    private static final String BASE_URL = "https://www.weather.com.cn";

								    private static final String HTTP_PREFIX = "http://";

								    private static final String HTTPS_PREFIX = "https://";

								    /** Upper bound on the number of links returned by one {@link #parseHtml(String)} call. */
								    private static final int MAX_LINKS = 30;

								    /** Used both as the connect timeout and as the per-request timeout. */
								    private static final Duration TIMEOUT = Duration.ofMillis(30000);

								    /** Built once and reused for every request instead of being rebuilt per call. */
								    private static final HttpClient HTTP_CLIENT = HttpClient.newBuilder()
								            .connectTimeout(TIMEOUT)
								            .followRedirects(HttpClient.Redirect.NORMAL)
								            .build();

								    // Regular expressions are compiled once; parseHtml/cleanText run them for every page and link.
								    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

								    private static final Pattern LINK = Pattern.compile(
								            "<a\\s+[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]{2,80})</a>",
								            Pattern.CASE_INSENSITIVE);

								    private static final Pattern TAG = Pattern.compile("<[^>]+>");

								    private static final Pattern NBSP = Pattern.compile("&nbsp;");

								    private static final Pattern NUMERIC_ENTITY = Pattern.compile("&#[0-9]+;");

								    private static final Pattern NAMED_ENTITY = Pattern.compile("&[a-zA-Z]+;");

								    /**
								     * Returns the name of this crawler; the same value is stored as the source
								     * of every {@link CrawlerData} produced by {@link #parseHtml(String)}.
								     */
								    @Override
								    public String getCrawlerName() {
								        return "ChinaWeatherCrawler";
								    }

								    /**
								     * Downloads the body of {@code urlStr} with a GET request, following redirects.
								     * A leading {@code http://} scheme is upgraded to {@code https://} first.
								     *
								     * @param urlStr the page URL; must not be {@code null}
								     * @return the response body of a 2xx response
								     * @throws TimeoutException if connecting or receiving the response exceeds the timeout
								     * @throws HttpRequestException if the status is not 2xx (carrying that status),
								     *         or if the URL is malformed, the request fails with an I/O error,
								     *         or the thread is interrupted (carrying status 0)
								     */
								    @Override
								    protected String fetchHtml(String urlStr) throws HttpRequestException, TimeoutException {
								        String target = upgradeToHttps(urlStr);

								        HttpResponse<String> response;
								        try {
								            // Built inside the try so a malformed URL is reported as HttpRequestException
								            // rather than escaping as an unchecked IllegalArgumentException.
								            HttpRequest request = HttpRequest.newBuilder()
								                    .uri(URI.create(target))
								                    // Without a request timeout a stalled response would block send() forever.
								                    .timeout(TIMEOUT)
								                    .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
								                    .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
								                    .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
								                    .GET()
								                    .build();
								            response = HTTP_CLIENT.send(request, HttpResponse.BodyHandlers.ofString());
								        } catch (java.net.http.HttpTimeoutException e) {
								            throw new TimeoutException("连接超时", e);
								        } catch (InterruptedException e) {
								            // Restore the interrupt flag so callers up the stack can still observe the interruption.
								            Thread.currentThread().interrupt();
								            throw new HttpRequestException("HTTP请求异常: " + e.getMessage(), 0, e);
								        } catch (java.io.IOException | RuntimeException e) {
								            throw new HttpRequestException("HTTP请求异常: " + e.getMessage(), 0, e);
								        }

								        int status = response.statusCode();
								        if (status < 200 || status >= 300) {
								            throw new HttpRequestException("HTTP请求失败", status);
								        }
								        return response.body();
								    }

								    /**
								     * Extracts links from a page.
								     *
								     * <p>Whitespace runs are collapsed, then every {@code <a href="...">text</a>}
								     * whose text is 2-80 characters without nested tags is considered. A link is
								     * kept when its URL passes {@link #isValidUrl(String)} and its cleaned title
								     * is 2-80 characters long. Scanning stops after 30 accepted links.
								     *
								     * @param html the page markup; {@code null} or empty yields an empty list
								     * @return a new mutable list of accepted links in document order, never {@code null}
								     */
								    @Override
								    protected List<CrawlerData> parseHtml(String html) {
								        List<CrawlerData> results = new ArrayList<>();
								        if (html == null || html.isEmpty()) {
								            return results;
								        }

								        String cleanHtml = WHITESPACE.matcher(html).replaceAll(" ");
								        Matcher matcher = LINK.matcher(cleanHtml);

								        while (results.size() < MAX_LINKS && matcher.find()) {
								            String url = matcher.group(1);
								            // Clean once and reuse the result for both validation and storage.
								            String title = cleanText(matcher.group(2).trim());

								            if (!isValidUrl(url) || !isValidTitle(title)) {
								                continue;
								            }

								            CrawlerData data = new CrawlerData();
								            data.setTitle(title);
								            data.setUrl(normalizeUrl(url));
								            data.setSource(getCrawlerName());
								            results.add(data);
								        }

								        return results;
								    }

								    /**
								     * Replaces a leading {@code http://} with {@code https://}. Only the scheme
								     * prefix is touched, so URLs embedded later in the string (for example in a
								     * query parameter) are left intact.
								     */
								    private static String upgradeToHttps(String url) {
								        if (url.startsWith(HTTP_PREFIX)) {
								            return HTTPS_PREFIX + url.substring(HTTP_PREFIX.length());
								        }
								        return url;
								    }

								    /**
								     * Turns a link target into an absolute HTTPS URL: protocol-relative URLs get
								     * the {@code https:} scheme, other targets not starting with {@code http}
								     * are resolved against {@link #BASE_URL}, and {@code http://} is upgraded.
								     *
								     * @param url the raw href value, may be {@code null}
								     * @return the normalized URL, or {@code null} when {@code url} is {@code null}
								     */
								    private String normalizeUrl(String url) {
								        if (url == null) {
								            return null;
								        }
								        if (url.startsWith("//")) {
								            return "https:" + url;
								        }
								        if (url.startsWith("/")) {
								            return BASE_URL + url;
								        }
								        if (!url.startsWith("http")) {
								            return BASE_URL + "/" + url;
								        }
								        return upgradeToHttps(url);
								    }

								    /**
								     * Accepts non-empty URLs that are not {@code mailto:}/{@code javascript:}
								     * links and that either contain {@code weather.com.cn} or start with {@code /}.
								     */
								    private boolean isValidUrl(String url) {
								        if (url == null || url.isEmpty()) {
								            return false;
								        }

								        if (url.contains("mailto:") || url.contains("javascript:")) {
								            return false;
								        }

								        // NOTE(review): this is a substring test, so the domain is also matched when it
								        // appears in a path or query of another host - confirm whether that is intended.
								        return url.contains("weather.com.cn") || url.startsWith("/");
								    }

								    /**
								     * Checks that an already cleaned title is between 2 and 80 characters long.
								     *
								     * @param title the output of {@link #cleanText(String)}, may be {@code null}
								     */
								    private boolean isValidTitle(String title) {
								        return title != null && title.length() >= 2 && title.length() <= 80;
								    }

								    /**
								     * Strips markup from a text fragment: tags are removed, {@code &nbsp;} and
								     * other named entities become a space, numeric entities are dropped, and
								     * whitespace runs are collapsed before trimming.
								     *
								     * @param text the raw text, may be {@code null}
								     * @return the cleaned text, or {@code null} when {@code text} is {@code null}
								     */
								    private String cleanText(String text) {
								        if (text == null) {
								            return null;
								        }
								        String cleaned = TAG.matcher(text).replaceAll("");
								        cleaned = NBSP.matcher(cleaned).replaceAll(" ");
								        cleaned = NUMERIC_ENTITY.matcher(cleaned).replaceAll("");
								        cleaned = NAMED_ENTITY.matcher(cleaned).replaceAll(" ");
								        cleaned = WHITESPACE.matcher(cleaned).replaceAll(" ");
								        return cleaned.trim();
								    }
								}