package util; import java.io.*; import java.net.*; import java.util.zip.GZIPInputStream; import exception.*; public class HttpUtil { private static final int TIMEOUT = 10000; private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"; public static String get(String urlStr, String encoding) throws SpiderException { HttpURLConnection connection = null; BufferedReader reader = null; try { URL url = new URL(urlStr); connection = (HttpURLConnection) url.openConnection(); connection.setRequestMethod("GET"); connection.setConnectTimeout(TIMEOUT); connection.setReadTimeout(TIMEOUT); connection.setRequestProperty("User-Agent", USER_AGENT); connection.setRequestProperty("Accept-Encoding", "gzip, deflate"); int responseCode = connection.getResponseCode(); if (responseCode != HttpURLConnection.HTTP_OK) { throw new NetworkException("HTTP响应错误: " + responseCode, NetworkException.ErrorType.RESPONSE_ERROR); } String contentEncoding = connection.getContentEncoding(); InputStream inputStream = connection.getInputStream(); if (contentEncoding != null && contentEncoding.toLowerCase().contains("gzip")) { inputStream = new GZIPInputStream(inputStream); } reader = new BufferedReader(new InputStreamReader(inputStream, encoding)); StringBuilder result = new StringBuilder(); String line; while ((line = reader.readLine()) != null) { result.append(line).append("\n"); } return result.toString(); } catch (MalformedURLException e) { throw new NetworkException("URL格式错误: " + urlStr, NetworkException.ErrorType.HOST_NOT_FOUND, e); } catch (SocketTimeoutException e) { throw new NetworkException("连接超时: " + urlStr, NetworkException.ErrorType.CONNECTION_TIMEOUT, e); } catch (IOException e) { throw new NetworkException("网络IO错误: " + e.getMessage(), NetworkException.ErrorType.CONNECTION_REFUSED, e); } finally { if (reader != null) { try { reader.close(); } catch (IOException e) {} } if (connection != null) { connection.disconnect(); } } } public static String extractTag(String html, String startTag, String endTag) throws ParseException { int startIndex = html.indexOf(startTag); if (startIndex == -1) { throw new ParseException("未找到开始标签: " + startTag, ParseException.ErrorType.TAG_NOT_FOUND); } int endIndex = html.indexOf(endTag, startIndex + startTag.length()); if (endIndex == -1) { throw new ParseException("未找到结束标签: " + endTag, ParseException.ErrorType.TAG_NOT_FOUND); } return html.substring(startIndex + startTag.length(), endIndex).trim(); } public static String extractTagSafe(String html, String startTag, String endTag) { try { return extractTag(html, startTag, endTag); } catch (ParseException e) { return "未找到"; } } }