package com.crawler.crawler;

import com.crawler.exception.HttpRequestException;
import com.crawler.exception.TimeoutException;
import com.crawler.model.CrawlerConfig;
import com.crawler.model.CrawlerData;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;

/**
 * Skeleton crawler: downloads the configured target URL over HTTP and hands the
 * HTML to a subclass-supplied parser.
 *
 * <p>Subclasses implement {@link #parseHtml(String)}; {@link #crawl()} drives the
 * fetch/parse sequence and fills in a missing source on each parsed record.
 */
public abstract class BaseCrawler implements Crawler {

    /** Active configuration; {@code null} until {@link #setConfig(CrawlerConfig)} is called. */
    protected CrawlerConfig config;

    @Override
    public void setConfig(CrawlerConfig config) {
        this.config = config;
    }

    @Override
    public CrawlerConfig getConfig() {
        return config;
    }

    /**
     * Performs an HTTP GET on {@code urlStr} and returns the response body as text.
     *
     * <p>Redirects are followed with {@link HttpClient.Redirect#NORMAL}. The configured
     * timeout (milliseconds) is applied both to connection establishment and to the
     * request as a whole, so an unresponsive server cannot block the caller forever.
     *
     * @param urlStr absolute URL to fetch
     * @return the response body
     * @throws HttpRequestException if the status code is outside 200-299 (the code is
     *     carried in the exception), or if the request fails for any non-timeout reason
     *     (reported with status code 0)
     * @throws TimeoutException if connecting or waiting for the response times out
     * @throws IllegalArgumentException if {@code urlStr} is not a valid URI
     */
    protected String fetchHtml(String urlStr) throws HttpRequestException, TimeoutException {
        Duration timeout = Duration.ofMillis(config.getTimeout());

        HttpClient client = HttpClient.newBuilder()
                .connectTimeout(timeout)
                .followRedirects(HttpClient.Redirect.NORMAL)
                .build();

        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create(urlStr))
                // connectTimeout alone only bounds the TCP/TLS handshake; this bounds
                // the wait for the response as well.
                .timeout(timeout)
                .header("User-Agent", config.getUserAgent())
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
                .GET()
                .build();

        HttpResponse<String> response;
        try {
            response = client.send(request, HttpResponse.BodyHandlers.ofString());
        } catch (java.net.http.HttpTimeoutException e) {
            throw new TimeoutException("连接超时", e);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can observe it.
            Thread.currentThread().interrupt();
            throw new HttpRequestException("HTTP请求异常: " + e.getMessage(), 0, e);
        } catch (Exception e) {
            throw new HttpRequestException("HTTP请求异常: " + e.getMessage(), 0, e);
        }

        int status = response.statusCode();
        if (status < 200 || status >= 300) {
            throw new HttpRequestException("HTTP请求失败", status);
        }
        return response.body();
    }

    /**
     * Extracts crawler records from a fetched HTML document.
     *
     * @param html the document body returned by {@link #fetchHtml(String)}
     * @return the records found in the document
     */
    protected abstract List<CrawlerData> parseHtml(String html);

    /**
     * Fetches the configured target URL, parses it, and returns the parsed records.
     *
     * <p>Any record whose source is {@code null} gets the value of
     * {@code getCrawlerName()} as its source.
     *
     * @return the parsed records, or an empty list when no configuration or no target
     *     URL has been set
     * @throws RuntimeException wrapping any exception raised while fetching or parsing
     */
    @Override
    public List<CrawlerData> crawl() {
        List<CrawlerData> results = new ArrayList<>();
        if (config == null || config.getTargetUrl() == null) {
            return results;
        }
        try {
            String html = fetchHtml(config.getTargetUrl());
            results = parseHtml(html);
            for (CrawlerData data : results) {
                if (data.getSource() == null) {
                    data.setSource(getCrawlerName());
                }
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        return results;
    }
}