You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
174 lines
8.0 KiB
174 lines
8.0 KiB
package strategy;
|
|
|
|
import model.CrawlResult;
|
|
import org.jsoup.Jsoup;
|
|
import org.jsoup.nodes.Document;
|
|
import org.jsoup.nodes.Element;
|
|
import org.jsoup.select.Elements;
|
|
import exception.ParseException;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
import java.io.IOException;
|
|
import java.util.Random;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
|
|
public abstract class AbstractCrawlStrategy implements CrawlStrategy {
|
|
protected static final Logger logger = LoggerFactory.getLogger(AbstractCrawlStrategy.class);
|
|
|
|
private static final Random random = new Random();
|
|
|
|
protected static final String[] USER_AGENTS = {
|
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0",
|
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
|
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
|
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
|
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
|
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
|
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0"
|
|
};
|
|
|
|
protected int baseDelay = 1000;
|
|
protected int maxDelay = 2000;
|
|
|
|
public Document fetchDocument(String url) throws IOException {
|
|
int maxRetries = 3;
|
|
logger.debug("开始获取页面: {}", url);
|
|
|
|
for (int retry = 0; retry < maxRetries; retry++) {
|
|
try {
|
|
String userAgent = USER_AGENTS[random.nextInt(USER_AGENTS.length)];
|
|
int delay = baseDelay + random.nextInt(maxDelay - baseDelay);
|
|
logger.debug("随机延迟: {}ms", delay);
|
|
Thread.sleep(delay);
|
|
|
|
Document doc = Jsoup.connect(url)
|
|
.timeout(20000)
|
|
.userAgent(userAgent)
|
|
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
|
|
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
|
|
.header("Accept-Encoding", "gzip, deflate, br")
|
|
.header("Connection", "keep-alive")
|
|
.header("Sec-Fetch-Dest", "document")
|
|
.header("Sec-Fetch-Mode", "navigate")
|
|
.header("Sec-Fetch-Site", "none")
|
|
.header("Sec-Fetch-User", "?1")
|
|
.header("Upgrade-Insecure-Requests", "1")
|
|
.header("Cache-Control", "max-age=0")
|
|
.header("Referer", getReferer(url))
|
|
.followRedirects(true)
|
|
.get();
|
|
|
|
logger.debug("页面获取成功: {} (尝试 {}/{})", url, retry + 1, maxRetries);
|
|
return doc;
|
|
} catch (java.net.ConnectException e) {
|
|
logger.error("【断网异常】网络连接失败 (尝试 {}/{}): {}", retry + 1, maxRetries, e.getMessage());
|
|
if (retry == maxRetries - 1) {
|
|
throw new IOException("【网络连接失败】无法连接到服务器,请检查网络连接状态: " + url, e);
|
|
}
|
|
} catch (java.net.UnknownHostException e) {
|
|
logger.error("【断网异常】无法解析域名 (尝试 {}/{}): {}", retry + 1, maxRetries, e.getMessage());
|
|
throw new IOException("【网络连接失败】无法解析域名,请检查网络连接或DNS设置: " + url, e);
|
|
} catch (java.net.SocketTimeoutException e) {
|
|
logger.error("【网络超时】请求超时 (尝试 {}/{}): {}", retry + 1, maxRetries, e.getMessage());
|
|
if (retry == maxRetries - 1) {
|
|
throw new IOException("【网络超时】连接超时,请检查网络稳定性: " + url, e);
|
|
}
|
|
} catch (java.net.NoRouteToHostException e) {
|
|
logger.error("【断网异常】无法到达目标主机 (尝试 {}/{}): {}", retry + 1, maxRetries, e.getMessage());
|
|
throw new IOException("【网络连接失败】无法到达目标主机,请检查网络连接: " + url, e);
|
|
} catch (java.net.SocketException e) {
|
|
logger.error("【断网异常】Socket异常 (尝试 {}/{}): {}", retry + 1, maxRetries, e.getMessage());
|
|
throw new IOException("【网络连接失败】Socket异常,请检查网络连接: " + url, e);
|
|
} catch (InterruptedException e) {
|
|
Thread.currentThread().interrupt();
|
|
logger.warn("页面获取过程被中断");
|
|
throw new IOException("获取被中断: " + url, e);
|
|
} catch (IOException e) {
|
|
logger.error("获取页面失败 (尝试 {}/{}): {}", retry + 1, maxRetries, e.getMessage());
|
|
if (retry == maxRetries - 1) {
|
|
throw e;
|
|
}
|
|
}
|
|
|
|
if (retry < maxRetries - 1) {
|
|
try {
|
|
int retryDelay = baseDelay * (int) Math.pow(2, retry) + random.nextInt(500);
|
|
logger.debug("重试前等待: {}ms", retryDelay);
|
|
Thread.sleep(retryDelay);
|
|
} catch (InterruptedException ie) {
|
|
Thread.currentThread().interrupt();
|
|
logger.warn("重试等待被中断");
|
|
throw new IOException("重试等待被中断", ie);
|
|
}
|
|
}
|
|
}
|
|
|
|
throw new IOException("【网络连接失败】多次尝试后仍无法获取页面: " + url);
|
|
}
|
|
|
|
protected String getReferer(String url) {
|
|
if (url.contains("douban")) {
|
|
return "https://movie.douban.com/";
|
|
} else if (url.contains("dangdang")) {
|
|
return "http://www.dangdang.com/";
|
|
} else if (url.contains("weather")) {
|
|
return "http://www.weather.com.cn/";
|
|
} else if (url.contains("12306")) {
|
|
return "https://www.12306.cn/";
|
|
}
|
|
return "";
|
|
}
|
|
|
|
public CrawlResult parseItem(Element element) throws ParseException {
|
|
throw new ParseException("parseItem method must be implemented by subclass");
|
|
}
|
|
|
|
protected double parsePrice(String priceText) {
|
|
if (priceText == null || priceText.isEmpty()) {
|
|
return 0;
|
|
}
|
|
String cleaned = priceText.replaceAll("[^0-9.]", "");
|
|
try {
|
|
return Double.parseDouble(cleaned);
|
|
} catch (NumberFormatException e) {
|
|
logger.warn("价格解析失败: '{}' -> {}", priceText, cleaned);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
protected double parseDiscount(double price, double originalPrice) {
|
|
if (originalPrice <= 0) {
|
|
return 10.0;
|
|
}
|
|
double discount = (price / originalPrice) * 10;
|
|
return Math.round(discount * 10) / 10.0;
|
|
}
|
|
|
|
protected List<String> extractTextList(Elements elements) {
|
|
List<String> result = new ArrayList<>();
|
|
if (elements != null) {
|
|
for (Element el : elements) {
|
|
String text = el.text().trim();
|
|
if (!text.isEmpty()) {
|
|
result.add(text);
|
|
}
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
protected String cleanText(String text) {
|
|
if (text == null) return "";
|
|
return text.replaceAll("\\s+", " ").trim();
|
|
}
|
|
|
|
@Override
|
|
public int getPageSize() {
|
|
return 20;
|
|
}
|
|
}
|