package strategy;

import model.CrawlResult;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import exception.ParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Random;
import java.util.ArrayList;
import java.util.List;

/**
 * Base class for crawl strategies: provides a retrying page fetcher with
 * randomized delays and User-Agent selection, plus small text/price helpers
 * for subclasses.
 */
public abstract class AbstractCrawlStrategy implements CrawlStrategy {

    protected static final Logger logger = LoggerFactory.getLogger(AbstractCrawlStrategy.class);

    private static final Random random = new Random();

    /** Maximum number of fetch attempts made by {@link #fetchDocument(String)}. */
    private static final int MAX_RETRIES = 3;

    /** Pool of User-Agent strings; one is picked at random for each attempt. */
    protected static final String[] USER_AGENTS = {
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0"
    };

    /** Lower bound (ms) of the pre-request delay; also the base of the retry back-off. */
    protected int baseDelay = 1000;

    /** Exclusive upper bound (ms) of the pre-request delay. */
    protected int maxDelay = 2000;

    /**
     * Fetches and parses the page at {@code url}, making up to three attempts.
     *
     * <p>Each attempt sleeps for a random delay first, then issues a GET with a
     * randomly chosen User-Agent. Connection failures, timeouts and other
     * {@link IOException}s are retried after an exponentially growing wait;
     * unknown-host, no-route and other socket errors fail immediately.
     *
     * @param url the page address to fetch
     * @return the parsed document
     * @throws IOException if the page cannot be fetched, or if the calling thread
     *         is interrupted while waiting (the interrupt flag is restored)
     */
    public Document fetchDocument(String url) throws IOException {
        logger.debug("开始获取页面: {}", url);

        for (int retry = 0; retry < MAX_RETRIES; retry++) {
            boolean lastAttempt = retry == MAX_RETRIES - 1;
            try {
                String userAgent = USER_AGENTS[random.nextInt(USER_AGENTS.length)];
                int delay = nextRequestDelay();
                logger.debug("随机延迟: {}ms", delay);
                Thread.sleep(delay);

                Document doc = Jsoup.connect(url)
                        .timeout(20000)
                        .userAgent(userAgent)
                        .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                        .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
                        // "br" is intentionally not advertised: jsoup decodes gzip/deflate
                        // responses but not brotli, so a brotli body would parse as garbage.
                        .header("Accept-Encoding", "gzip, deflate")
                        .header("Connection", "keep-alive")
                        .header("Sec-Fetch-Dest", "document")
                        .header("Sec-Fetch-Mode", "navigate")
                        .header("Sec-Fetch-Site", "none")
                        .header("Sec-Fetch-User", "?1")
                        .header("Upgrade-Insecure-Requests", "1")
                        .header("Cache-Control", "max-age=0")
                        .header("Referer", getReferer(url))
                        .followRedirects(true)
                        .get();

                logger.debug("页面获取成功: {} (尝试 {}/{})", url, retry + 1, MAX_RETRIES);
                return doc;
            } catch (java.net.ConnectException e) {
                // Must precede SocketException (its superclass) so it stays retryable.
                logger.error("【断网异常】网络连接失败 (尝试 {}/{}): {}", retry + 1, MAX_RETRIES, e.getMessage());
                if (lastAttempt) {
                    throw new IOException("【网络连接失败】无法连接到服务器,请检查网络连接状态: " + url, e);
                }
            } catch (java.net.UnknownHostException e) {
                logger.error("【断网异常】无法解析域名 (尝试 {}/{}): {}", retry + 1, MAX_RETRIES, e.getMessage());
                throw new IOException("【网络连接失败】无法解析域名,请检查网络连接或DNS设置: " + url, e);
            } catch (java.net.SocketTimeoutException e) {
                logger.error("【网络超时】请求超时 (尝试 {}/{}): {}", retry + 1, MAX_RETRIES, e.getMessage());
                if (lastAttempt) {
                    throw new IOException("【网络超时】连接超时,请检查网络稳定性: " + url, e);
                }
            } catch (java.net.NoRouteToHostException e) {
                logger.error("【断网异常】无法到达目标主机 (尝试 {}/{}): {}", retry + 1, MAX_RETRIES, e.getMessage());
                throw new IOException("【网络连接失败】无法到达目标主机,请检查网络连接: " + url, e);
            } catch (java.net.SocketException e) {
                logger.error("【断网异常】Socket异常 (尝试 {}/{}): {}", retry + 1, MAX_RETRIES, e.getMessage());
                throw new IOException("【网络连接失败】Socket异常,请检查网络连接: " + url, e);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                logger.warn("页面获取过程被中断");
                throw new IOException("获取被中断: " + url, e);
            } catch (IOException e) {
                logger.error("获取页面失败 (尝试 {}/{}): {}", retry + 1, MAX_RETRIES, e.getMessage());
                if (lastAttempt) {
                    throw e;
                }
            }

            if (!lastAttempt) {
                try {
                    // Exponential back-off (baseDelay * 2^retry) plus up to 500 ms of jitter;
                    // clamped because Thread.sleep rejects negative values.
                    int retryDelay = Math.max(0, baseDelay * (int) Math.pow(2, retry) + random.nextInt(500));
                    logger.debug("重试前等待: {}ms", retryDelay);
                    Thread.sleep(retryDelay);
                } catch (InterruptedException ie) {
                    Thread.currentThread().interrupt();
                    logger.warn("重试等待被中断");
                    throw new IOException("重试等待被中断", ie);
                }
            }
        }

        // Not reached in practice (the last attempt always returns or throws),
        // but required so every code path ends in a return or throw.
        throw new IOException("【网络连接失败】多次尝试后仍无法获取页面: " + url);
    }

    /**
     * Picks the pre-request delay in milliseconds.
     *
     * <p>Returns a value in {@code [baseDelay, maxDelay)} when that range is
     * non-empty; otherwise falls back to {@code baseDelay}, because
     * {@link Random#nextInt(int)} rejects a non-positive bound. The result is
     * never negative.
     */
    private int nextRequestDelay() {
        int jitterRange = maxDelay - baseDelay;
        int delay = jitterRange > 0 ? baseDelay + random.nextInt(jitterRange) : baseDelay;
        return Math.max(0, delay);
    }

    /**
     * Chooses the Referer header value for a request based on substrings of the URL.
     *
     * @param url the URL about to be requested
     * @return the site home page for recognised URLs, or an empty string otherwise
     */
    protected String getReferer(String url) {
        if (url.contains("douban")) {
            return "https://movie.douban.com/";
        } else if (url.contains("dangdang")) {
            return "http://www.dangdang.com/";
        } else if (url.contains("weather")) {
            return "http://www.weather.com.cn/";
        } else if (url.contains("12306")) {
            return "https://www.12306.cn/";
        }
        return "";
    }

    /**
     * Default implementation that always fails; subclasses that parse individual
     * elements are expected to override it.
     *
     * @param element the element to parse
     * @return never returns normally in this base implementation
     * @throws ParseException always
     */
    public CrawlResult parseItem(Element element) throws ParseException {
        throw new ParseException("parseItem method must be implemented by subclass");
    }

    /**
     * Extracts a numeric price from free-form text by dropping every character
     * other than digits and dots.
     *
     * @param priceText raw price text, may be {@code null}
     * @return the parsed value, or {@code 0} when the text is null, empty or
     *         does not yield a valid number
     */
    protected double parsePrice(String priceText) {
        if (priceText == null || priceText.isEmpty()) {
            return 0;
        }
        String cleaned = priceText.replaceAll("[^0-9.]", "");
        try {
            return Double.parseDouble(cleaned);
        } catch (NumberFormatException e) {
            logger.warn("价格解析失败: '{}' -> {}", priceText, cleaned);
            return 0;
        }
    }

    /**
     * Computes {@code price / originalPrice * 10}, rounded to one decimal place.
     *
     * @param price current price
     * @param originalPrice list price
     * @return the rounded ratio, or {@code 10.0} when {@code originalPrice} is not positive
     */
    protected double parseDiscount(double price, double originalPrice) {
        if (originalPrice <= 0) {
            return 10.0;
        }
        double discount = (price / originalPrice) * 10;
        return Math.round(discount * 10) / 10.0;
    }

    /**
     * Collects the trimmed, non-empty text of each element.
     *
     * @param elements elements to read, may be {@code null}
     * @return a new mutable list of texts in document order; empty when
     *         {@code elements} is null or contains no text
     */
    protected List<String> extractTextList(Elements elements) {
        List<String> result = new ArrayList<>();
        if (elements != null) {
            for (Element el : elements) {
                String text = el.text().trim();
                if (!text.isEmpty()) {
                    result.add(text);
                }
            }
        }
        return result;
    }

    /**
     * Collapses every run of whitespace to a single space and trims the ends.
     *
     * @param text input text, may be {@code null}
     * @return the normalised text, or an empty string for {@code null}
     */
    protected String cleanText(String text) {
        if (text == null) {
            return "";
        }
        return text.replaceAll("\\s+", " ").trim();
    }

    /** Returns the default page size of 20. */
    @Override
    public int getPageSize() {
        return 20;
    }
}