package utils;

import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.protocol.BasicHttpContext;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.regex.Pattern;

/**
 * Static helpers for fetching pages over HTTP, parsing HTML and building
 * file-system-safe file names.
 */
public class Utils {

    /** Pool of User-Agent values; one is picked at random for every request. */
    private static final List<String> USER_AGENTS = Collections.unmodifiableList(Arrays.asList(
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/124.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/123.0.0.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/123.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15",
            "Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Mobile Safari/537.36",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Mobile/15E148 Safari/604.1"));

    /** Pool of Referer values; one is picked at random for every request. */
    private static final List<String> REFERERS = Collections.unmodifiableList(Arrays.asList(
            "https://www.google.com/",
            "https://www.bing.com/",
            "https://www.baidu.com/",
            "https://scholar.google.com/",
            "https://www.sciencedirect.com/",
            "https://link.springer.com/",
            "https://ieeexplore.ieee.org/",
            "https://dl.acm.org/",
            "https://kns.cnki.net/",
            "https://www.google.com/search"));

    /** Substrings (case-sensitive) whose presence marks a response as an anti-crawler page. */
    private static final String[] ANTI_CRAWL_KEYWORDS = {
        "captcha", "verify", "robot", "Robot", "reCAPTCHA", "blocked", "Blocked"
    };

    /** Characters that are replaced with '_' in file names: \ / : * ? " < > | */
    private static final Pattern ILLEGAL_FILE_NAME_CHARS = Pattern.compile("[\\\\/:*?\"<>|]");

    /** Upper bound, in milliseconds, on waiting for the response to be opened. */
    private static final int REQUEST_TIMEOUT_MS = 15000;

    /** Responses longer than this many characters are truncated. */
    private static final int MAX_HTML_CHARS = 100000;

    /** Longest file-name stem produced by {@link #cleanTitleForFileName(String)}. */
    private static final int MAX_TITLE_CHARS = 100;

    private static final Random RANDOM = new Random();

    /**
     * Fetches {@code urlString} with an HTTP GET, making up to two attempts with a
     * freshly randomized User-Agent and Referer each time.
     *
     * @param urlString the URL to fetch
     * @return the response body, or an empty string if every attempt failed
     * @throws Exception if the thread is interrupted while waiting between attempts
     */
    public static String sendGetRequest(String urlString) throws Exception {
        System.out.println("正在发送HTTP请求: " + urlString);
        // Only two attempts, so a persistently failing site cannot stall the caller for long.
        for (int i = 0; i < 2; i++) {
            String html = sendHttpGetRequest(urlString);
            if (!html.isEmpty()) {
                return html;
            }
            // Back off a little longer after each failure.
            int delay = 2000 + i * 1000;
            System.out.println("第 " + (i + 1) + " 次请求失败,添加延迟: " + delay + "ms");
            Thread.sleep(delay);
        }
        // The Selenium fallback is disabled for now because its initialization may hang.
        System.out.println("所有HTTP请求都失败,暂时跳过Selenium...");
        return "";
    }

    /**
     * Performs a single GET attempt.
     *
     * @param urlString the URL to fetch
     * @return the (possibly truncated) body, or an empty string on timeout, non-200
     *         status, empty body, detected anti-crawler page, or any error
     */
    private static String sendHttpGetRequest(String urlString) throws Exception {
        long startTime = System.currentTimeMillis();
        String html;
        try {
            final CloseableHttpClient httpClient = HttpClients.createDefault();
            final ExecutorService executor = Executors.newSingleThreadExecutor();
            CloseableHttpResponse response = null;
            try {
                // Created inside the try so that an invalid URL still releases the client.
                final HttpGet httpGet = new HttpGet(urlString);
                String userAgent = pickRandom(USER_AGENTS);
                String referer = pickRandom(REFERERS);
                applyBrowserHeaders(httpGet, userAgent, referer);

                System.out.println("开始执行HTTP请求...");
                System.out.println("请求超时设置: " + REQUEST_TIMEOUT_MS + "ms");

                // Run the request on a worker thread so the wait can be bounded by a hard timeout.
                Callable<CloseableHttpResponse> task = () ->
                        (CloseableHttpResponse) httpClient.executeOpen(null, httpGet, new BasicHttpContext());
                Future<CloseableHttpResponse> future = executor.submit(task);
                try {
                    response = future.get(REQUEST_TIMEOUT_MS, TimeUnit.MILLISECONDS);
                } catch (TimeoutException e) {
                    System.out.println("HTTP请求超时: " + e.getMessage());
                    future.cancel(true);
                    return "";
                }

                int statusCode = response.getCode();
                System.out.println("HTTP响应状态码: " + statusCode);
                System.out.println("使用的User-Agent: " + userAgent);
                System.out.println("使用的Referer: " + referer);
                if (statusCode != 200) {
                    System.out.println("HTTP请求失败,状态码: " + statusCode);
                    return "";
                }

                System.out.println("正在读取响应内容...");
                if (response.getEntity() == null) {
                    System.out.println("响应内容为空");
                    return "";
                }
                String body = EntityUtils.toString(response.getEntity(), "UTF-8");
                if (body == null) {
                    System.out.println("响应内容为空");
                    return "";
                }
                // Cap the amount of text handed to downstream parsing.
                if (body.length() > MAX_HTML_CHARS) {
                    body = body.substring(0, MAX_HTML_CHARS);
                    System.out.println("响应内容过长,已截断为" + MAX_HTML_CHARS + "字符");
                }

                long endTime = System.currentTimeMillis();
                System.out.println("HTTP请求完成,耗时: " + (endTime - startTime) + "ms");
                System.out.println("响应内容长度: " + body.length() + " 字符");

                if (body.isEmpty()) {
                    System.out.println("响应内容为空");
                    return "";
                }
                if (looksLikeAntiCrawlPage(body, urlString)) {
                    System.out.println("检测到反爬页面");
                    return "";
                }
                html = body;
            } finally {
                closeQuietly(response, "关闭响应时出错: ");
                // Closing the client also aborts a request that is still in flight after a timeout.
                closeQuietly(httpClient, "关闭HTTP客户端时出错: ");
                // Without this, every call would leave a non-daemon worker thread behind.
                executor.shutdownNow();
            }
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller instead of silently swallowing it.
            Thread.currentThread().interrupt();
            System.out.println("发送HTTP请求时出错: " + e.getMessage());
            return "";
        } catch (ExecutionException e) {
            // The worker's failure is wrapped; report the real cause.
            logRequestFailure(e.getCause() != null ? e.getCause() : e);
            return "";
        } catch (Exception e) {
            logRequestFailure(e);
            return "";
        }

        // Pause only after the connection has been released.
        pauseAfterSuccess();
        return html;
    }

    /** Returns a random element of {@code values}. */
    private static String pickRandom(List<String> values) {
        return values.get(RANDOM.nextInt(values.size()));
    }

    /** Sets browser-like request headers on {@code httpGet}. */
    private static void applyBrowserHeaders(HttpGet httpGet, String userAgent, String referer) {
        httpGet.setHeader("User-Agent", userAgent);
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8");
        httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
        // "br" is deliberately not advertised: Brotli decoding is only available when an
        // optional decoder library is on the classpath, and an undecoded body is unusable.
        httpGet.setHeader("Accept-Encoding", "gzip, deflate");
        httpGet.setHeader("Connection", "keep-alive");
        httpGet.setHeader("Referer", referer);
        httpGet.setHeader("Upgrade-Insecure-Requests", "1");
        httpGet.setHeader("Sec-Fetch-Dest", "document");
        httpGet.setHeader("Sec-Fetch-Mode", "navigate");
        httpGet.setHeader("Sec-Fetch-Site", "cross-site");
        httpGet.setHeader("Sec-Fetch-User", "?1");
        httpGet.setHeader("Cache-Control", "max-age=0");
        httpGet.setHeader("DNT", "1");
        httpGet.setHeader("TE", "trailers");
    }

    /**
     * Reports whether {@code html} contains one of the anti-crawler keywords.
     * arXiv URLs are always trusted because their pages may contain such words.
     */
    private static boolean looksLikeAntiCrawlPage(String html, String urlString) {
        if (urlString.contains("arxiv.org")) {
            return false;
        }
        for (String keyword : ANTI_CRAWL_KEYWORDS) {
            if (html.contains(keyword)) {
                return true;
            }
        }
        return false;
    }

    /** Sleeps for a random 800-2299 ms; an interrupt ends the pause and is re-flagged. */
    private static void pauseAfterSuccess() {
        int delay = RANDOM.nextInt(1500) + 800;
        System.out.println("添加随机延迟: " + delay + "ms");
        try {
            Thread.sleep(delay);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /** Closes {@code resource} if non-null, printing (not propagating) any failure. */
    private static void closeQuietly(AutoCloseable resource, String errorPrefix) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (Exception e) {
            System.out.println(errorPrefix + e.getMessage());
        }
    }

    /** Prints a message matching the kind of failure; unexpected errors also get a stack trace. */
    private static void logRequestFailure(Throwable error) {
        if (error instanceof java.net.SocketTimeoutException) {
            System.out.println("HTTP请求超时: " + error.getMessage());
        } else if (error instanceof java.io.IOException) {
            System.out.println("HTTP请求IO错误: " + error.getMessage());
        } else {
            System.out.println("发送HTTP请求时出错: " + error.getMessage());
            error.printStackTrace();
        }
    }

    /**
     * Parses an HTML string with jsoup.
     *
     * @param html the markup to parse
     * @return the parsed document
     */
    public static Document parseHtml(String html) {
        return Jsoup.parse(html);
    }

    /**
     * URL-encodes {@code value} using UTF-8.
     *
     * @param value the text to encode
     * @return the encoded text
     * @throws Exception if the encoding is not supported
     */
    public static String urlEncode(String value) throws Exception {
        return URLEncoder.encode(value, "UTF-8");
    }

    /**
     * Builds a file name of the form {@code <keyword>_<current millis>.json}.
     * The keyword is used as-is and is not sanitized.
     *
     * @param keyword the prefix of the file name
     * @return the generated file name
     */
    public static String generateFileName(String keyword) {
        return keyword + "_" + System.currentTimeMillis() + ".json";
    }

    /**
     * Replaces each of the characters {@code \ / : * ? " < > |} with an underscore.
     *
     * @param fileName the name to sanitize; must not be null
     * @return the sanitized name
     */
    public static String cleanFileName(String fileName) {
        return ILLEGAL_FILE_NAME_CHARS.matcher(fileName).replaceAll("_");
    }

    /**
     * Turns a paper title into a file-name stem: trims it, replaces illegal characters
     * and whitespace runs with a single underscore, and caps the length at 100 characters.
     *
     * @param title the title; may be null
     * @return the sanitized stem, or {@code "untitled"} if the title is null or blank
     */
    public static String cleanTitleForFileName(String title) {
        if (title == null) {
            return "untitled";
        }
        String trimmed = title.trim();
        // A whitespace-only title would otherwise yield an empty file name.
        if (trimmed.isEmpty()) {
            return "untitled";
        }
        String cleaned = ILLEGAL_FILE_NAME_CHARS.matcher(trimmed).replaceAll("_")
                .replaceAll("\\s+", "_")
                .replaceAll("_+", "_");
        if (cleaned.length() > MAX_TITLE_CHARS) {
            cleaned = cleaned.substring(0, MAX_TITLE_CHARS);
        }
        return cleaned;
    }
}