You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
93 lines
3.6 KiB
93 lines
3.6 KiB
package util;
|
|
|
|
import java.io.*;
import java.net.*;
import java.util.Locale;
import java.util.zip.GZIPInputStream;

import exception.*;
|
|
|
|
public class HttpUtil {
|
|
private static final int TIMEOUT = 10000;
|
|
private static final String USER_AGENT =
|
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";
|
|
|
|
public static String get(String urlStr, String encoding) throws SpiderException {
|
|
HttpURLConnection connection = null;
|
|
BufferedReader reader = null;
|
|
|
|
try {
|
|
URL url = new URL(urlStr);
|
|
connection = (HttpURLConnection) url.openConnection();
|
|
|
|
connection.setRequestMethod("GET");
|
|
connection.setConnectTimeout(TIMEOUT);
|
|
connection.setReadTimeout(TIMEOUT);
|
|
connection.setRequestProperty("User-Agent", USER_AGENT);
|
|
connection.setRequestProperty("Accept-Encoding", "gzip, deflate");
|
|
|
|
int responseCode = connection.getResponseCode();
|
|
if (responseCode != HttpURLConnection.HTTP_OK) {
|
|
throw new NetworkException("HTTP响应错误: " + responseCode,
|
|
NetworkException.ErrorType.RESPONSE_ERROR);
|
|
}
|
|
|
|
String contentEncoding = connection.getContentEncoding();
|
|
InputStream inputStream = connection.getInputStream();
|
|
|
|
if (contentEncoding != null && contentEncoding.toLowerCase().contains("gzip")) {
|
|
inputStream = new GZIPInputStream(inputStream);
|
|
}
|
|
|
|
reader = new BufferedReader(new InputStreamReader(inputStream, encoding));
|
|
StringBuilder result = new StringBuilder();
|
|
String line;
|
|
|
|
while ((line = reader.readLine()) != null) {
|
|
result.append(line).append("\n");
|
|
}
|
|
|
|
return result.toString();
|
|
|
|
} catch (MalformedURLException e) {
|
|
throw new NetworkException("URL格式错误: " + urlStr,
|
|
NetworkException.ErrorType.HOST_NOT_FOUND, e);
|
|
} catch (SocketTimeoutException e) {
|
|
throw new NetworkException("连接超时: " + urlStr,
|
|
NetworkException.ErrorType.CONNECTION_TIMEOUT, e);
|
|
} catch (IOException e) {
|
|
throw new NetworkException("网络IO错误: " + e.getMessage(),
|
|
NetworkException.ErrorType.CONNECTION_REFUSED, e);
|
|
} finally {
|
|
if (reader != null) {
|
|
try { reader.close(); } catch (IOException e) {}
|
|
}
|
|
if (connection != null) {
|
|
connection.disconnect();
|
|
}
|
|
}
|
|
}
|
|
|
|
public static String extractTag(String html, String startTag, String endTag)
|
|
throws ParseException {
|
|
int startIndex = html.indexOf(startTag);
|
|
if (startIndex == -1) {
|
|
throw new ParseException("未找到开始标签: " + startTag,
|
|
ParseException.ErrorType.TAG_NOT_FOUND);
|
|
}
|
|
|
|
int endIndex = html.indexOf(endTag, startIndex + startTag.length());
|
|
if (endIndex == -1) {
|
|
throw new ParseException("未找到结束标签: " + endTag,
|
|
ParseException.ErrorType.TAG_NOT_FOUND);
|
|
}
|
|
|
|
return html.substring(startIndex + startTag.length(), endIndex).trim();
|
|
}
|
|
|
|
public static String extractTagSafe(String html, String startTag, String endTag) {
|
|
try {
|
|
return extractTag(html, startTag, endTag);
|
|
} catch (ParseException e) {
|
|
return "未找到";
|
|
}
|
|
}
|
|
}
|
|
|