You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

252 lines
11 KiB

package utils;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.protocol.BasicHttpContext;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
public class Utils {
// 随机User-Agent列表
private static final List<String> USER_AGENTS = new ArrayList<>();
static {
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36");
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36");
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/124.0");
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/123.0.0.0");
USER_AGENTS.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15");
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36");
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/123.0");
USER_AGENTS.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15");
USER_AGENTS.add("Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Mobile Safari/537.36");
USER_AGENTS.add("Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Mobile/15E148 Safari/604.1");
}
// 随机Referer列表
private static final List<String> REFERERS = new ArrayList<>();
static {
REFERERS.add("https://www.google.com/");
REFERERS.add("https://www.bing.com/");
REFERERS.add("https://www.baidu.com/");
REFERERS.add("https://scholar.google.com/");
REFERERS.add("https://www.sciencedirect.com/");
REFERERS.add("https://link.springer.com/");
REFERERS.add("https://ieeexplore.ieee.org/");
REFERERS.add("https://dl.acm.org/");
REFERERS.add("https://kns.cnki.net/");
REFERERS.add("https://www.google.com/search");
}
private static final Random RANDOM = new Random();
/**
 * Fetches a page with HTTP GET, making up to two attempts.
 *
 * <p>Each attempt is delegated to {@code sendHttpGetRequest}; an empty result counts as a
 * failure. After every failed attempt (including the last one) the method sleeps for
 * 2000 ms plus 1000 ms per earlier attempt.
 *
 * @param urlString the URL to request
 * @return the body returned by the first successful attempt, or an empty string if every
 *         attempt failed
 * @throws Exception if the delegate throws or the back-off sleep is interrupted
 */
public static String sendGetRequest(String urlString) throws Exception {
    System.out.println("正在发送HTTP请求: " + urlString);

    // The attempt count is kept low on purpose so a blocked site cannot stall the caller.
    final int maxAttempts = 2;
    int attempt = 0;
    while (attempt < maxAttempts) {
        String body = sendHttpGetRequest(urlString);
        boolean succeeded = !body.isEmpty();
        if (succeeded) {
            return body;
        }

        // Back off a little longer after each failure.
        int backoffMillis = 2000 + 1000 * attempt;
        attempt++;
        System.out.println("第 " + attempt + " 次请求失败,添加延迟: " + backoffMillis + "ms");
        Thread.sleep(backoffMillis);
    }

    // The Selenium fallback is disabled for now because its initialisation may hang.
    System.out.println("所有HTTP请求都失败,暂时跳过Selenium...");
    return "";
}
/**
 * Performs a single HTTP GET attempt with randomized, browser-like headers.
 *
 * <p>The request is executed on a short-lived worker thread so that waiting for the response
 * can be bounded by a 15 second timeout. A body is returned only when the status code is 200,
 * the body is non-empty and it does not look like an anti-crawler page. Bodies longer than
 * 100000 characters are truncated. After a successful fetch the method sleeps for
 * 800-2299 ms to pace subsequent requests.
 *
 * @param urlString the URL to request
 * @return the (possibly truncated) response body, or an empty string on any failure
 * @throws Exception declared for compatibility with existing callers; exceptions raised while
 *         performing the request are logged and reported as an empty string
 */
private static String sendHttpGetRequest(String urlString) throws Exception {
    long startTime = System.currentTimeMillis();
    // Maximum time to wait for the request to produce a response, in milliseconds (15 s).
    final int TIMEOUT = 15000;
    try {
        final CloseableHttpClient httpClient = HttpClients.createDefault();
        java.util.concurrent.ExecutorService executor = null;
        CloseableHttpResponse response = null;
        try {
            final HttpGet httpGet = new HttpGet(urlString);
            String userAgent = USER_AGENTS.get(RANDOM.nextInt(USER_AGENTS.size()));
            String referer = REFERERS.get(RANDOM.nextInt(REFERERS.size()));
            applyBrowserHeaders(httpGet, userAgent, referer);

            System.out.println("开始执行HTTP请求...");
            System.out.println("请求超时设置: " + TIMEOUT + "ms");

            // Run the request on a worker thread so the wait can be bounded with Future.get.
            // The executor is shut down in the finally block; otherwise every call would
            // leave an idle worker thread behind.
            executor = java.util.concurrent.Executors.newSingleThreadExecutor();
            java.util.concurrent.Future<CloseableHttpResponse> future = executor.submit(
                    new java.util.concurrent.Callable<CloseableHttpResponse>() {
                        @Override
                        public CloseableHttpResponse call() throws Exception {
                            // Let failures propagate unwrapped; they surface as the cause of
                            // the ExecutionException handled below.
                            return (CloseableHttpResponse) httpClient.executeOpen(null, httpGet, new BasicHttpContext());
                        }
                    });
            try {
                response = future.get(TIMEOUT, java.util.concurrent.TimeUnit.MILLISECONDS);
            } catch (java.util.concurrent.TimeoutException e) {
                System.out.println("HTTP请求超时: " + e.getMessage());
                future.cancel(true);
                return "";
            }

            int statusCode = response.getCode();
            System.out.println("HTTP响应状态码: " + statusCode);
            System.out.println("使用的User-Agent: " + userAgent);
            System.out.println("使用的Referer: " + referer);
            if (statusCode != 200) {
                System.out.println("HTTP请求失败,状态码: " + statusCode);
                return "";
            }

            // Note: the timeout above only covers the executeOpen call; reading the body
            // here is not bounded by it.
            System.out.println("正在读取响应内容...");
            String html = response.getEntity() == null
                    ? null
                    : EntityUtils.toString(response.getEntity(), "UTF-8");
            // Guard against a missing body before touching the string.
            if (html == null) {
                System.out.println("响应内容为空");
                return "";
            }
            // Keep at most the first 100000 characters of very large pages.
            if (html.length() > 100000) {
                html = html.substring(0, 100000);
                System.out.println("响应内容过长,已截断为100000字符");
            }
            long endTime = System.currentTimeMillis();
            System.out.println("HTTP请求完成,耗时: " + (endTime - startTime) + "ms");
            System.out.println("响应内容长度: " + html.length() + " 字符");
            if (html.isEmpty()) {
                System.out.println("响应内容为空");
                return "";
            }

            if (isAntiCrawlPage(html, urlString)) {
                System.out.println("检测到反爬页面");
                return "";
            }

            // Random pause (800-2299 ms) so consecutive requests are not evenly spaced.
            int delay = RANDOM.nextInt(1500) + 800;
            System.out.println("添加随机延迟: " + delay + "ms");
            Thread.sleep(delay);
            return html;
        } finally {
            if (executor != null) {
                // Stops the worker thread; also interrupts it if the request is still running.
                executor.shutdownNow();
            }
            closeQuietly(response, "关闭响应时出错: ");
            closeQuietly(httpClient, "关闭HTTP客户端时出错: ");
        }
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers can observe the interruption.
        Thread.currentThread().interrupt();
        System.out.println("发送HTTP请求时出错: " + e.getMessage());
        return "";
    } catch (java.util.concurrent.ExecutionException e) {
        // The request failed on the worker thread; report the underlying cause.
        Throwable cause = e.getCause() != null ? e.getCause() : e;
        if (cause instanceof java.net.SocketTimeoutException) {
            System.out.println("HTTP请求超时: " + cause.getMessage());
        } else if (cause instanceof java.io.IOException) {
            System.out.println("HTTP请求IO错误: " + cause.getMessage());
        } else {
            System.out.println("发送HTTP请求时出错: " + cause.getMessage());
            cause.printStackTrace();
        }
        return "";
    } catch (java.net.SocketTimeoutException e) {
        System.out.println("HTTP请求超时: " + e.getMessage());
        return "";
    } catch (java.io.IOException e) {
        System.out.println("HTTP请求IO错误: " + e.getMessage());
        return "";
    } catch (Exception e) {
        System.out.println("发送HTTP请求时出错: " + e.getMessage());
        e.printStackTrace();
        return "";
    }
}

// Sets the request headers that make the GET look like a browser page navigation.
private static void applyBrowserHeaders(HttpGet httpGet, String userAgent, String referer) {
    httpGet.setHeader("User-Agent", userAgent);
    httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8");
    httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
    // NOTE(review): "br" is advertised here; confirm the client can actually decode
    // Brotli responses in this deployment, otherwise the body may come back undecoded.
    httpGet.setHeader("Accept-Encoding", "gzip, deflate, br");
    httpGet.setHeader("Connection", "keep-alive");
    httpGet.setHeader("Referer", referer);
    httpGet.setHeader("Upgrade-Insecure-Requests", "1");
    httpGet.setHeader("Sec-Fetch-Dest", "document");
    httpGet.setHeader("Sec-Fetch-Mode", "navigate");
    httpGet.setHeader("Sec-Fetch-Site", "cross-site");
    httpGet.setHeader("Sec-Fetch-User", "?1");
    httpGet.setHeader("Cache-Control", "max-age=0");
    httpGet.setHeader("DNT", "1");
    httpGet.setHeader("TE", "trailers");
}

// Returns true when the page contains an anti-crawler keyword; arXiv URLs are always trusted.
private static boolean isAntiCrawlPage(String html, String urlString) {
    // arXiv pages may legitimately contain these keywords, so they are never flagged.
    if (urlString.contains("arxiv.org")) {
        return false;
    }
    String[] antiCrawlKeywords = {"captcha", "verify", "robot", "Robot", "reCAPTCHA", "blocked", "Blocked"};
    for (String keyword : antiCrawlKeywords) {
        if (html.contains(keyword)) {
            return true;
        }
    }
    return false;
}

// Closes the resource if it is non-null, logging (not propagating) any failure.
private static void closeQuietly(AutoCloseable resource, String errorPrefix) {
    if (resource == null) {
        return;
    }
    try {
        resource.close();
    } catch (Exception e) {
        System.out.println(errorPrefix + e.getMessage());
    }
}
/**
 * Parses an HTML string into a jsoup document.
 *
 * @param html the HTML markup to parse
 * @return the document produced by {@code Jsoup.parse}
 */
public static Document parseHtml(String html) {
    final Document parsed = Jsoup.parse(html);
    return parsed;
}
/**
 * URL-encodes a value using UTF-8.
 *
 * @param value the text to encode
 * @return the encoded form of {@code value}
 * @throws Exception if the encoder rejects the charset name
 */
public static String urlEncode(String value) throws Exception {
    final String charsetName = java.nio.charset.StandardCharsets.UTF_8.name();
    return URLEncoder.encode(value, charsetName);
}
/**
 * Builds a JSON file name of the form {@code <keyword>_<current time in millis>.json}.
 *
 * @param keyword the prefix of the file name; used as-is, without sanitizing
 * @return the generated file name
 */
public static String generateFileName(String keyword) {
    StringBuilder name = new StringBuilder();
    name.append(keyword)
            .append("_")
            .append(System.currentTimeMillis())
            .append(".json");
    return name.toString();
}
/**
 * Replaces characters that are illegal in file names with underscores.
 *
 * <p>The characters replaced are backslash, forward slash, colon, asterisk, question mark,
 * double quote, less-than, greater-than and the vertical bar. Each occurrence becomes one
 * underscore; all other characters are kept unchanged.
 *
 * @param fileName the file name to sanitize; must not be null
 * @return the sanitized file name
 */
public static String cleanFileName(String fileName) {
    // Four backslashes in the Java literal are required to put a literal backslash into the
    // character class; with only two, the regex reads "\/" (an escaped slash) and
    // backslashes in the input are left untouched.
    return fileName.replaceAll("[\\\\/:*?\"<>|]", "_");
}
/**
 * Converts a paper title into a string that is safe to use as a file name.
 *
 * <p>The title is trimmed, illegal file-name characters (backslash, slash, colon, asterisk,
 * question mark, double quote, angle brackets, vertical bar) and runs of whitespace are
 * replaced by underscores, consecutive underscores are collapsed, and the result is limited
 * to 100 characters.
 *
 * @param title the title to convert; may be null
 * @return the sanitized title, or {@code "untitled"} when the title is null, empty or
 *         consists only of whitespace
 */
public static String cleanTitleForFileName(String title) {
    if (title == null) {
        return "untitled";
    }
    // Check emptiness after trimming so a whitespace-only title does not yield "".
    String cleaned = title.trim();
    if (cleaned.isEmpty()) {
        return "untitled";
    }
    cleaned = cleaned
            .replaceAll("[\\\\/:*?\"<>|]", "_")
            .replaceAll("\\s+", "_")
            .replaceAll("_+", "_");

    final int maxLength = 100;
    if (cleaned.length() > maxLength) {
        int end = maxLength;
        // Do not cut between the two halves of a surrogate pair.
        if (Character.isHighSurrogate(cleaned.charAt(end - 1))) {
            end--;
        }
        cleaned = cleaned.substring(0, end);
    }
    return cleaned;
}
}