You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
252 lines
11 KiB
252 lines
11 KiB
package utils;
|
|
|
|
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
|
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
|
|
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
|
|
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
|
import org.apache.hc.core5.http.protocol.BasicHttpContext;
|
|
|
|
import org.apache.hc.core5.http.io.entity.EntityUtils;
|
|
|
|
import org.jsoup.Jsoup;
|
|
import org.jsoup.nodes.Document;
|
|
import java.net.URLEncoder;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
import java.util.Random;
|
|
|
|
|
|
public class Utils {
|
|
// 随机User-Agent列表
|
|
private static final List<String> USER_AGENTS = new ArrayList<>();
|
|
static {
|
|
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36");
|
|
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36");
|
|
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/124.0");
|
|
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/123.0.0.0");
|
|
USER_AGENTS.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15");
|
|
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36");
|
|
USER_AGENTS.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/123.0");
|
|
USER_AGENTS.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15");
|
|
USER_AGENTS.add("Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Mobile Safari/537.36");
|
|
USER_AGENTS.add("Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Mobile/15E148 Safari/604.1");
|
|
}
|
|
|
|
// 随机Referer列表
|
|
private static final List<String> REFERERS = new ArrayList<>();
|
|
static {
|
|
REFERERS.add("https://www.google.com/");
|
|
REFERERS.add("https://www.bing.com/");
|
|
REFERERS.add("https://www.baidu.com/");
|
|
REFERERS.add("https://scholar.google.com/");
|
|
REFERERS.add("https://www.sciencedirect.com/");
|
|
REFERERS.add("https://link.springer.com/");
|
|
REFERERS.add("https://ieeexplore.ieee.org/");
|
|
REFERERS.add("https://dl.acm.org/");
|
|
REFERERS.add("https://kns.cnki.net/");
|
|
REFERERS.add("https://www.google.com/search");
|
|
}
|
|
|
|
private static final Random RANDOM = new Random();
|
|
|
|
// 发送 HTTP GET 请求
|
|
public static String sendGetRequest(String urlString) throws Exception {
|
|
System.out.println("正在发送HTTP请求: " + urlString);
|
|
|
|
// 尝试多次普通HTTP请求,使用不同的User-Agent和Referer
|
|
for (int i = 0; i < 2; i++) { // 减少重试次数,避免卡住
|
|
String html = sendHttpGetRequest(urlString);
|
|
if (!html.isEmpty()) {
|
|
return html;
|
|
}
|
|
// 每次失败后添加更长的延迟
|
|
int delay = 2000 + i * 1000;
|
|
System.out.println("第 " + (i + 1) + " 次请求失败,添加延迟: " + delay + "ms");
|
|
Thread.sleep(delay);
|
|
}
|
|
|
|
// 暂时禁用Selenium,因为初始化可能会卡住
|
|
System.out.println("所有HTTP请求都失败,暂时跳过Selenium...");
|
|
return "";
|
|
}
|
|
|
|
// 使用普通HTTP请求
|
|
private static String sendHttpGetRequest(String urlString) throws Exception {
|
|
long startTime = System.currentTimeMillis();
|
|
|
|
// 设置请求超时时间
|
|
final int TIMEOUT = 15000; // 15秒
|
|
|
|
try {
|
|
// 使用默认的HttpClient
|
|
CloseableHttpClient httpClient = HttpClients.createDefault();
|
|
|
|
HttpGet httpGet = new HttpGet(urlString);
|
|
|
|
// 随机选择User-Agent
|
|
String userAgent = USER_AGENTS.get(RANDOM.nextInt(USER_AGENTS.size()));
|
|
// 随机选择Referer
|
|
String referer = REFERERS.get(RANDOM.nextInt(REFERERS.size()));
|
|
|
|
// 添加更完整的HTTP头信息,模拟真实浏览器
|
|
httpGet.setHeader("User-Agent", userAgent);
|
|
httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8");
|
|
httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
|
|
httpGet.setHeader("Accept-Encoding", "gzip, deflate, br");
|
|
httpGet.setHeader("Connection", "keep-alive");
|
|
httpGet.setHeader("Referer", referer);
|
|
httpGet.setHeader("Upgrade-Insecure-Requests", "1");
|
|
httpGet.setHeader("Sec-Fetch-Dest", "document");
|
|
httpGet.setHeader("Sec-Fetch-Mode", "navigate");
|
|
httpGet.setHeader("Sec-Fetch-Site", "cross-site");
|
|
httpGet.setHeader("Sec-Fetch-User", "?1");
|
|
httpGet.setHeader("Cache-Control", "max-age=0");
|
|
httpGet.setHeader("DNT", "1");
|
|
httpGet.setHeader("TE", "trailers");
|
|
|
|
// 执行请求
|
|
System.out.println("开始执行HTTP请求...");
|
|
System.out.println("请求超时设置: " + TIMEOUT + "ms");
|
|
|
|
CloseableHttpResponse response = null;
|
|
try {
|
|
// 使用Future来处理超时
|
|
java.util.concurrent.Future<CloseableHttpResponse> future = java.util.concurrent.Executors.newSingleThreadExecutor().submit(new java.util.concurrent.Callable<CloseableHttpResponse>() {
|
|
@Override
|
|
public CloseableHttpResponse call() throws Exception {
|
|
try {
|
|
return (CloseableHttpResponse) httpClient.executeOpen(null, httpGet, new BasicHttpContext());
|
|
} catch (Exception e) {
|
|
throw new RuntimeException(e);
|
|
}
|
|
}
|
|
});
|
|
|
|
try {
|
|
response = future.get(TIMEOUT, java.util.concurrent.TimeUnit.MILLISECONDS);
|
|
} catch (java.util.concurrent.TimeoutException e) {
|
|
System.out.println("HTTP请求超时: " + e.getMessage());
|
|
future.cancel(true);
|
|
return "";
|
|
}
|
|
|
|
// 获取响应状态码
|
|
int statusCode = response.getCode();
|
|
System.out.println("HTTP响应状态码: " + statusCode);
|
|
System.out.println("使用的User-Agent: " + userAgent);
|
|
System.out.println("使用的Referer: " + referer);
|
|
|
|
if (statusCode != 200) {
|
|
System.out.println("HTTP请求失败,状态码: " + statusCode);
|
|
return "";
|
|
}
|
|
|
|
// 读取响应内容
|
|
System.out.println("正在读取响应内容...");
|
|
// 限制读取的内容长度,避免程序卡住
|
|
String html = EntityUtils.toString(response.getEntity(), "UTF-8");
|
|
// 如果内容长度超过100000字符,只保留前100000字符
|
|
if (html.length() > 100000) {
|
|
html = html.substring(0, 100000);
|
|
System.out.println("响应内容过长,已截断为100000字符");
|
|
}
|
|
|
|
long endTime = System.currentTimeMillis();
|
|
System.out.println("HTTP请求完成,耗时: " + (endTime - startTime) + "ms");
|
|
System.out.println("响应内容长度: " + html.length() + " 字符");
|
|
|
|
// 检查响应内容是否为空或包含反爬信息
|
|
if (html == null || html.isEmpty()) {
|
|
System.out.println("响应内容为空");
|
|
return "";
|
|
}
|
|
|
|
// 检查是否是反爬页面
|
|
boolean isAntiCrawl = false;
|
|
String[] antiCrawlKeywords = {"captcha", "verify", "robot", "Robot", "reCAPTCHA", "blocked", "Blocked"};
|
|
for (String keyword : antiCrawlKeywords) {
|
|
if (html.contains(keyword)) {
|
|
isAntiCrawl = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// 特殊处理arXiv,因为它的页面可能包含一些被误判为反爬的关键词
|
|
if (urlString.contains("arxiv.org")) {
|
|
isAntiCrawl = false; // 对于arXiv,我们信任它返回的内容
|
|
}
|
|
|
|
if (isAntiCrawl) {
|
|
System.out.println("检测到反爬页面");
|
|
return "";
|
|
}
|
|
|
|
// 智能延迟,模拟真实用户行为,使用更随机的延迟时间
|
|
int delay = RANDOM.nextInt(1500) + 800; // 800-2300ms
|
|
System.out.println("添加随机延迟: " + delay + "ms");
|
|
Thread.sleep(delay);
|
|
|
|
return html;
|
|
} finally {
|
|
// 确保响应和客户端被关闭
|
|
if (response != null) {
|
|
try {
|
|
response.close();
|
|
} catch (Exception e) {
|
|
System.out.println("关闭响应时出错: " + e.getMessage());
|
|
}
|
|
}
|
|
try {
|
|
httpClient.close();
|
|
} catch (Exception e) {
|
|
System.out.println("关闭HTTP客户端时出错: " + e.getMessage());
|
|
}
|
|
}
|
|
} catch (java.net.SocketTimeoutException e) {
|
|
System.out.println("HTTP请求超时: " + e.getMessage());
|
|
return "";
|
|
} catch (java.io.IOException e) {
|
|
System.out.println("HTTP请求IO错误: " + e.getMessage());
|
|
return "";
|
|
} catch (Exception e) {
|
|
System.out.println("发送HTTP请求时出错: " + e.getMessage());
|
|
e.printStackTrace();
|
|
return "";
|
|
}
|
|
}
|
|
|
|
// 解析 HTML
|
|
public static Document parseHtml(String html) {
|
|
return Jsoup.parse(html);
|
|
}
|
|
|
|
// URL 编码
|
|
public static String urlEncode(String value) throws Exception {
|
|
return URLEncoder.encode(value, "UTF-8");
|
|
}
|
|
|
|
// 生成唯一文件名
|
|
public static String generateFileName(String keyword) {
|
|
return keyword + "_" + System.currentTimeMillis() + ".json";
|
|
}
|
|
|
|
// 清理文件名中的非法字符
|
|
public static String cleanFileName(String fileName) {
|
|
return fileName.replaceAll("[\\/:*?\"<>|]", "_");
|
|
}
|
|
|
|
// 清理论文标题用于文件名
|
|
public static String cleanTitleForFileName(String title) {
|
|
if (title == null || title.isEmpty()) {
|
|
return "untitled";
|
|
}
|
|
String cleaned = title.trim()
|
|
.replaceAll("[\\\\/:*?\"<>|]", "_")
|
|
.replaceAll("\\s+", "_")
|
|
.replaceAll("_+", "_");
|
|
if (cleaned.length() > 100) {
|
|
cleaned = cleaned.substring(0, 100);
|
|
}
|
|
return cleaned;
|
|
}
|
|
}
|
|
|