You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

93 lines
3.6 KiB

package util;
import java.io.*;
import java.net.*;
import java.util.Locale;
import java.util.zip.GZIPInputStream;
import exception.*;
/**
 * Static helpers for downloading a page over HTTP and for cutting a substring
 * out of the returned markup by a pair of literal delimiters.
 */
public class HttpUtil {
    /** Connect timeout and read timeout, in milliseconds. */
    private static final int TIMEOUT = 10000;

    /** User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

    /** Charset used when the caller passes a null or blank encoding. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /** Placeholder returned by {@link #extractTagSafe} when nothing can be extracted. */
    private static final String NOT_FOUND_PLACEHOLDER = "未找到";

    /**
     * Performs an HTTP GET and returns the response body as text.
     *
     * <p>The body is read line by line and every line is terminated with
     * {@code "\n"}, so the original line separators are normalised and the result
     * ends with a newline whenever the body is non-empty.
     *
     * @param urlStr   absolute URL to fetch
     * @param encoding charset name used to decode the body; a null or blank value
     *                 falls back to UTF-8
     * @return the decoded response body
     * @throws SpiderException reported as a {@code NetworkException} when the URL
     *         is malformed, the host cannot be resolved, the request times out,
     *         the status is not 200, or any other I/O error occurs
     */
    public static String get(String urlStr, String encoding) throws SpiderException {
        String charsetName = (encoding == null || encoding.trim().isEmpty())
                ? DEFAULT_ENCODING
                : encoding;

        HttpURLConnection connection = null;
        InputStream body = null;
        BufferedReader reader = null;
        try {
            URL url = new URL(urlStr);
            connection = (HttpURLConnection) url.openConnection();
            connection.setRequestMethod("GET");
            connection.setConnectTimeout(TIMEOUT);
            connection.setReadTimeout(TIMEOUT);
            connection.setRequestProperty("User-Agent", USER_AGENT);
            // Advertise only the encoding that is actually decoded below; offering
            // "deflate" as well would let the server send a body this method
            // cannot decompress.
            connection.setRequestProperty("Accept-Encoding", "gzip");

            int responseCode = connection.getResponseCode();
            if (responseCode != HttpURLConnection.HTTP_OK) {
                throw new NetworkException("HTTP响应错误: " + responseCode,
                        NetworkException.ErrorType.RESPONSE_ERROR);
            }

            // Keep the raw stream in 'body' before wrapping it, so that it is still
            // closed in the finally block if a wrapper constructor throws.
            body = connection.getInputStream();
            if (isGzip(connection.getContentEncoding())) {
                body = new GZIPInputStream(body);
            }
            reader = new BufferedReader(new InputStreamReader(body, charsetName));

            StringBuilder result = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                result.append(line).append("\n");
            }
            return result.toString();
        } catch (MalformedURLException e) {
            throw new NetworkException("URL格式错误: " + urlStr,
                    NetworkException.ErrorType.HOST_NOT_FOUND, e);
        } catch (SocketTimeoutException e) {
            throw new NetworkException("连接超时: " + urlStr,
                    NetworkException.ErrorType.CONNECTION_TIMEOUT, e);
        } catch (UnknownHostException e) {
            // DNS failure: report it as an unknown host rather than letting it fall
            // into the generic I/O branch, which labels it as a refused connection.
            throw new NetworkException("无法解析主机: " + urlStr,
                    NetworkException.ErrorType.HOST_NOT_FOUND, e);
        } catch (IOException e) {
            throw new NetworkException("网络IO错误: " + e.getMessage(),
                    NetworkException.ErrorType.CONNECTION_REFUSED, e);
        } finally {
            closeQuietly(reader);
            closeQuietly(body);
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Tells whether a Content-Encoding header value names gzip.
     * Lower-casing uses {@link Locale#ROOT} so the check does not depend on the
     * default locale.
     */
    private static boolean isGzip(String contentEncoding) {
        return contentEncoding != null
                && contentEncoding.toLowerCase(Locale.ROOT).contains("gzip");
    }

    /** Closes the resource if it is non-null, ignoring any failure. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Best effort: by now the body has either been fully read or another
            // exception is already propagating, so a close failure is not reported.
        }
    }

    /**
     * Returns the trimmed text between the first occurrence of {@code startTag}
     * and the first occurrence of {@code endTag} that follows it.
     *
     * @param html     text to search; null is treated as containing no tags
     * @param startTag literal opening delimiter
     * @param endTag   literal closing delimiter
     * @return the text between the two delimiters, with surrounding whitespace removed
     * @throws ParseException with {@code TAG_NOT_FOUND} when either delimiter is missing
     */
    public static String extractTag(String html, String startTag, String endTag)
            throws ParseException {
        int startIndex = (html == null) ? -1 : html.indexOf(startTag);
        if (startIndex == -1) {
            throw new ParseException("未找到开始标签: " + startTag,
                    ParseException.ErrorType.TAG_NOT_FOUND);
        }
        int contentStart = startIndex + startTag.length();
        int endIndex = html.indexOf(endTag, contentStart);
        if (endIndex == -1) {
            throw new ParseException("未找到结束标签: " + endTag,
                    ParseException.ErrorType.TAG_NOT_FOUND);
        }
        return html.substring(contentStart, endIndex).trim();
    }

    /**
     * Variant of {@link #extractTag} that never throws: when any argument is null
     * or a delimiter is missing it returns the placeholder {@code "未找到"}.
     *
     * @param html     text to search
     * @param startTag literal opening delimiter
     * @param endTag   literal closing delimiter
     * @return the extracted text, or the placeholder when extraction fails
     */
    public static String extractTagSafe(String html, String startTag, String endTag) {
        if (html == null || startTag == null || endTag == null) {
            return NOT_FOUND_PLACEHOLDER;
        }
        try {
            return extractTag(html, startTag, endTag);
        } catch (ParseException e) {
            return NOT_FOUND_PLACEHOLDER;
        }
    }
}