You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

174 lines
8.0 KiB

package strategy;
import model.CrawlResult;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import exception.ParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Random;
import java.util.ArrayList;
import java.util.List;
public abstract class AbstractCrawlStrategy implements CrawlStrategy {
protected static final Logger logger = LoggerFactory.getLogger(AbstractCrawlStrategy.class);
private static final Random random = new Random();
protected static final String[] USER_AGENTS = {
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0"
};
protected int baseDelay = 1000;
protected int maxDelay = 2000;
public Document fetchDocument(String url) throws IOException {
int maxRetries = 3;
logger.debug("开始获取页面: {}", url);
for (int retry = 0; retry < maxRetries; retry++) {
try {
String userAgent = USER_AGENTS[random.nextInt(USER_AGENTS.length)];
int delay = baseDelay + random.nextInt(maxDelay - baseDelay);
logger.debug("随机延迟: {}ms", delay);
Thread.sleep(delay);
Document doc = Jsoup.connect(url)
.timeout(20000)
.userAgent(userAgent)
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
.header("Accept-Encoding", "gzip, deflate, br")
.header("Connection", "keep-alive")
.header("Sec-Fetch-Dest", "document")
.header("Sec-Fetch-Mode", "navigate")
.header("Sec-Fetch-Site", "none")
.header("Sec-Fetch-User", "?1")
.header("Upgrade-Insecure-Requests", "1")
.header("Cache-Control", "max-age=0")
.header("Referer", getReferer(url))
.followRedirects(true)
.get();
logger.debug("页面获取成功: {} (尝试 {}/{})", url, retry + 1, maxRetries);
return doc;
} catch (java.net.ConnectException e) {
logger.error("【断网异常】网络连接失败 (尝试 {}/{}): {}", retry + 1, maxRetries, e.getMessage());
if (retry == maxRetries - 1) {
throw new IOException("【网络连接失败】无法连接到服务器,请检查网络连接状态: " + url, e);
}
} catch (java.net.UnknownHostException e) {
logger.error("【断网异常】无法解析域名 (尝试 {}/{}): {}", retry + 1, maxRetries, e.getMessage());
throw new IOException("【网络连接失败】无法解析域名,请检查网络连接或DNS设置: " + url, e);
} catch (java.net.SocketTimeoutException e) {
logger.error("【网络超时】请求超时 (尝试 {}/{}): {}", retry + 1, maxRetries, e.getMessage());
if (retry == maxRetries - 1) {
throw new IOException("【网络超时】连接超时,请检查网络稳定性: " + url, e);
}
} catch (java.net.NoRouteToHostException e) {
logger.error("【断网异常】无法到达目标主机 (尝试 {}/{}): {}", retry + 1, maxRetries, e.getMessage());
throw new IOException("【网络连接失败】无法到达目标主机,请检查网络连接: " + url, e);
} catch (java.net.SocketException e) {
logger.error("【断网异常】Socket异常 (尝试 {}/{}): {}", retry + 1, maxRetries, e.getMessage());
throw new IOException("【网络连接失败】Socket异常,请检查网络连接: " + url, e);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
logger.warn("页面获取过程被中断");
throw new IOException("获取被中断: " + url, e);
} catch (IOException e) {
logger.error("获取页面失败 (尝试 {}/{}): {}", retry + 1, maxRetries, e.getMessage());
if (retry == maxRetries - 1) {
throw e;
}
}
if (retry < maxRetries - 1) {
try {
int retryDelay = baseDelay * (int) Math.pow(2, retry) + random.nextInt(500);
logger.debug("重试前等待: {}ms", retryDelay);
Thread.sleep(retryDelay);
} catch (InterruptedException ie) {
Thread.currentThread().interrupt();
logger.warn("重试等待被中断");
throw new IOException("重试等待被中断", ie);
}
}
}
throw new IOException("【网络连接失败】多次尝试后仍无法获取页面: " + url);
}
protected String getReferer(String url) {
if (url.contains("douban")) {
return "https://movie.douban.com/";
} else if (url.contains("dangdang")) {
return "http://www.dangdang.com/";
} else if (url.contains("weather")) {
return "http://www.weather.com.cn/";
} else if (url.contains("12306")) {
return "https://www.12306.cn/";
}
return "";
}
public CrawlResult parseItem(Element element) throws ParseException {
throw new ParseException("parseItem method must be implemented by subclass");
}
protected double parsePrice(String priceText) {
if (priceText == null || priceText.isEmpty()) {
return 0;
}
String cleaned = priceText.replaceAll("[^0-9.]", "");
try {
return Double.parseDouble(cleaned);
} catch (NumberFormatException e) {
logger.warn("价格解析失败: '{}' -> {}", priceText, cleaned);
return 0;
}
}
protected double parseDiscount(double price, double originalPrice) {
if (originalPrice <= 0) {
return 10.0;
}
double discount = (price / originalPrice) * 10;
return Math.round(discount * 10) / 10.0;
}
protected List<String> extractTextList(Elements elements) {
List<String> result = new ArrayList<>();
if (elements != null) {
for (Element el : elements) {
String text = el.text().trim();
if (!text.isEmpty()) {
result.add(text);
}
}
}
return result;
}
protected String cleanText(String text) {
if (text == null) return "";
return text.replaceAll("\\s+", " ").trim();
}
@Override
public int getPageSize() {
return 20;
}
}