You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
84 lines
2.6 KiB
84 lines
2.6 KiB
package com.crawler.crawler;
|
|
|
|
import com.crawler.exception.HttpRequestException;
|
|
import com.crawler.exception.TimeoutException;
|
|
import com.crawler.model.CrawlerConfig;
|
|
import com.crawler.model.CrawlerData;
|
|
|
|
import java.net.URI;
|
|
import java.net.http.HttpClient;
|
|
import java.net.http.HttpRequest;
|
|
import java.net.http.HttpResponse;
|
|
import java.time.Duration;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
|
|
public abstract class BaseCrawler implements Crawler {
|
|
protected CrawlerConfig config;
|
|
|
|
@Override
|
|
public void setConfig(CrawlerConfig config) {
|
|
this.config = config;
|
|
}
|
|
|
|
@Override
|
|
public CrawlerConfig getConfig() {
|
|
return config;
|
|
}
|
|
|
|
protected String fetchHtml(String urlStr) throws HttpRequestException, TimeoutException {
|
|
HttpClient client = HttpClient.newBuilder()
|
|
.connectTimeout(Duration.ofMillis(config.getTimeout()))
|
|
.followRedirects(HttpClient.Redirect.NORMAL)
|
|
.build();
|
|
|
|
HttpRequest request = HttpRequest.newBuilder()
|
|
.uri(URI.create(urlStr))
|
|
.header("User-Agent", config.getUserAgent())
|
|
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
|
.GET()
|
|
.build();
|
|
|
|
try {
|
|
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
|
|
|
|
if (response.statusCode() < 200 || response.statusCode() >= 300) {
|
|
throw new HttpRequestException("HTTP请求失败", response.statusCode());
|
|
}
|
|
|
|
return response.body();
|
|
} catch (java.net.http.HttpTimeoutException e) {
|
|
throw new TimeoutException("连接超时", e);
|
|
} catch (HttpRequestException e) {
|
|
throw e;
|
|
} catch (Exception e) {
|
|
throw new HttpRequestException("HTTP请求异常: " + e.getMessage(), 0, e);
|
|
}
|
|
}
|
|
|
|
protected abstract List<CrawlerData> parseHtml(String html);
|
|
|
|
@Override
|
|
public List<CrawlerData> crawl() {
|
|
List<CrawlerData> results = new ArrayList<>();
|
|
|
|
if (config == null || config.getTargetUrl() == null) {
|
|
return results;
|
|
}
|
|
|
|
try {
|
|
String html = fetchHtml(config.getTargetUrl());
|
|
results = parseHtml(html);
|
|
|
|
for (CrawlerData data : results) {
|
|
if (data.getSource() == null) {
|
|
data.setSource(getCrawlerName());
|
|
}
|
|
}
|
|
} catch (Exception e) {
|
|
throw new RuntimeException(e);
|
|
}
|
|
|
|
return results;
|
|
}
|
|
}
|