From 5ea90604292a5391f492174d04bba493067bb54e Mon Sep 17 00:00:00 2001 From: Xingzhimeng <3408335915@qq.com> Date: Sat, 30 May 2026 17:33:56 +0800 Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=96=87=E4=BB=B6=E8=87=B3?= =?UTF-8?q?=20'project/src/main/java/com/example'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/com/example/CsvSaveStrategy.java | 35 +++ .../java/com/example/DatabaseManager.java | 118 ++++++++ .../java/com/example/DoubanBookCrawler.java | 277 ++++++++++++++++++ .../java/com/example/DoubanMovieCrawler.java | 277 ++++++++++++++++++ .../main/java/com/example/HttpCrawler.java | 181 ++++++++++++ 5 files changed, 888 insertions(+) create mode 100644 project/src/main/java/com/example/CsvSaveStrategy.java create mode 100644 project/src/main/java/com/example/DatabaseManager.java create mode 100644 project/src/main/java/com/example/DoubanBookCrawler.java create mode 100644 project/src/main/java/com/example/DoubanMovieCrawler.java create mode 100644 project/src/main/java/com/example/HttpCrawler.java diff --git a/project/src/main/java/com/example/CsvSaveStrategy.java b/project/src/main/java/com/example/CsvSaveStrategy.java new file mode 100644 index 0000000..86e85e4 --- /dev/null +++ b/project/src/main/java/com/example/CsvSaveStrategy.java @@ -0,0 +1,35 @@ +package com.example; + +import java.io.BufferedWriter; +import java.io.FileWriter; +import java.io.IOException; +import java.util.List; + +/** + * CSV格式保存策略 - 策略模式实现 + */ +public class CsvSaveStrategy implements SaveStrategy { + @Override + public void save(Object data, String filename) throws IOException { + try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename))) { + if (data instanceof List) { + for (Object item : (List) data) { + String str = item.toString(); + if (str.contains(",") || str.contains("\"") || str.contains("\n")) { + writer.write("\"" + str.replace("\"", "\"\"") + "\""); + } else { + writer.write(str); + } + writer.newLine(); + } + } else { + writer.write(data.toString()); + } + } + } + + @Override + public String getStrategyName() { + return "CSV格式"; + } +} \ No newline at end of file diff --git a/project/src/main/java/com/example/DatabaseManager.java b/project/src/main/java/com/example/DatabaseManager.java new file mode 100644 index 0000000..56c78cd --- /dev/null +++ b/project/src/main/java/com/example/DatabaseManager.java @@ -0,0 +1,118 @@ +package com.example; + +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.SQLException; +import java.sql.Statement; + +/** + * 数据库管理器 - 单例模式 + * 提供 SQLite 数据库连接和初始化 + * 课堂知识点:单例模式、JDBC、数据库操作 + */ +public class DatabaseManager { + + private static DatabaseManager instance; + private Connection connection; + private static final String DB_NAME = "crawler_data.db"; + + private DatabaseManager() { + try { + // 加载 SQLite JDBC 驱动 + Class.forName("org.sqlite.JDBC"); + // 连接到 SQLite 数据库(不存在则自动创建) + connection = DriverManager.getConnection("jdbc:sqlite:" + DB_NAME); + System.out.println("✅ 数据库连接成功"); + // 初始化表结构 + initializeTables(); + } catch (ClassNotFoundException | SQLException e) { + System.err.println("❌ 数据库连接失败: " + e.getMessage()); + throw new RuntimeException(e); + } + } + + /** + * 获取单例实例 + */ + public static synchronized DatabaseManager getInstance() { + if (instance == null) { + instance = new DatabaseManager(); + } + return instance; + } + + /** + * 获取数据库连接 + */ + public Connection getConnection() { + return connection; + } + + /** + * 初始化数据库表结构 + */ + private void initializeTables() { + try (Statement stmt = connection.createStatement()) { + // 创建游戏表 + String createGameTable = + "CREATE TABLE IF NOT EXISTS games (" + + "id INTEGER PRIMARY KEY AUTOINCREMENT, " + + "name TEXT NOT NULL, " + + "price TEXT, " + + "discount TEXT, " + + "originalPrice TEXT, " + + "releaseDate TEXT, " + + "tags TEXT, " + + "reviewScore TEXT, " + + "crawlTime TIMESTAMP DEFAULT CURRENT_TIMESTAMP" + + ")"; + stmt.execute(createGameTable); + + // 创建电影表 + String createMovieTable = + "CREATE TABLE IF NOT EXISTS movies (" + + "id INTEGER PRIMARY KEY AUTOINCREMENT, " + + "title TEXT NOT NULL, " + + "rating TEXT, " + + "director TEXT, " + + "actors TEXT, " + + "year TEXT, " + + "type TEXT, " + + "crawlTime TIMESTAMP DEFAULT CURRENT_TIMESTAMP" + + ")"; + stmt.execute(createMovieTable); + + // 创建书籍表 + String createBookTable = + "CREATE TABLE IF NOT EXISTS books (" + + "id INTEGER PRIMARY KEY AUTOINCREMENT, " + + "title TEXT NOT NULL, " + + "author TEXT, " + + "publisher TEXT, " + + "publishDate TEXT, " + + "rating TEXT, " + + "price TEXT, " + + "crawlTime TIMESTAMP DEFAULT CURRENT_TIMESTAMP" + + ")"; + stmt.execute(createBookTable); + + System.out.println("✅ 数据库表初始化完成"); + } catch (SQLException e) { + System.err.println("❌ 表初始化失败: " + e.getMessage()); + } + } + + /** + * 关闭数据库连接 + */ + public void close() { + if (connection != null) { + try { + connection.close(); + System.out.println("✅ 数据库连接已关闭"); + } catch (SQLException e) { + System.err.println("❌ 关闭数据库连接失败: " + e.getMessage()); + } + } + } +} \ No newline at end of file diff --git a/project/src/main/java/com/example/DoubanBookCrawler.java b/project/src/main/java/com/example/DoubanBookCrawler.java new file mode 100644 index 0000000..d23756d --- /dev/null +++ b/project/src/main/java/com/example/DoubanBookCrawler.java @@ -0,0 +1,277 @@ +package com.example; + +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.io.BufferedWriter; +import java.io.FileWriter; +import java.io.IOException; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +/** + * 豆瓣读书Top250爬虫 + * 课堂知识点:类与对象、封装、继承、多态、集合框架、异常处理、文件IO + */ +public class DoubanBookCrawler extends Crawler { + + /** + * 爬取数据 - 重写父类方法 + * 课堂知识点:方法重写(多态) + */ + @Override + public void crawl() { + crawlTop250(); + } + + /** + * 打印结果 - 重写父类方法 + * 课堂知识点:方法重写(多态) + */ + @Override + public void printResults() { + printBooks(); + } + + // 课堂知识点:集合框架 List + private List books; + + /** + * 书籍数据模型 - 课堂知识点:内部类、封装 + */ + public static class Book { + // 私有属性 - 课堂知识点:封装 + private int rank; + private String title; + private String author; + private String publisher; + private double rating; + private String coverUrl; + + // 构造方法 + public Book(int rank, String title, String author, String publisher, double rating, String coverUrl) { + this.rank = rank; + this.title = title; + this.author = author; + this.publisher = publisher; + this.rating = rating; + this.coverUrl = coverUrl; + } + + // getter方法 - 课堂知识点:封装的访问接口 + public int getRank() { return rank; } + public String getTitle() { return title; } + public String getAuthor() { return author; } + public String getPublisher() { return publisher; } + public double getRating() { return rating; } + public String getCoverUrl() { return coverUrl; } + } + + /** + * 构造方法 - 初始化书籍列表 + */ + public DoubanBookCrawler() { + this.books = new ArrayList<>(); + } + + /** + * 爬取豆瓣读书Top250 + */ + public void crawlTop250() { + books.clear(); + System.out.println("\n========== 开始爬取豆瓣读书Top250 =========="); + + // 先检查网络状态 - 使用父类方法 + if (!isNetworkAvailable()) { + System.err.println("❌ 网络连接不可用!请检查网络设置"); + System.out.println("使用默认书籍数据..."); + loadDefaultBooks(); + setSuccess(true); + setDataCount(books.size()); + return; + } + + try { + int page = 0; + int count = 0; + + // 课堂知识点:while循环 + while (count < 25 && page < 3) { + String url = "https://book.douban.com/top250?start=" + (page * 25); + System.out.println("正在爬取第" + (page + 1) + "页: " + url); + + // 使用父类的延迟方法 + delay(); + + Document doc = HttpCrawler.get(url); + + Elements bookItems = doc.select("tr.item"); + + // 课堂知识点:增强for循环 + for (Element item : bookItems) { + if (count >= 25) break; + + Element rankElement = item.selectFirst("td:nth-child(1) div"); + Element titleElement = item.selectFirst("td:nth-child(2) a"); + Element infoElement = item.selectFirst("td:nth-child(2) p.pl"); + Element ratingElement = item.selectFirst("td:nth-child(2) span.rating_nums"); + + if (titleElement != null) { + int rank = count + 1; + if (rankElement != null) { + try { + rank = Integer.parseInt(rankElement.text().trim()); + } catch (NumberFormatException e) { + rank = count + 1; + } + } + + String title = titleElement.attr("title").trim(); + if (title.isEmpty()) { + title = titleElement.text().trim().replaceAll("\\s+", " "); + } + + String author = ""; + String publisher = ""; + if (infoElement != null) { + String info = infoElement.text().trim(); + // 课堂知识点:字符串分割 + String[] parts = info.split("/"); + if (parts.length >= 2) { + author = parts[0].trim(); + if (parts.length >= 3) { + publisher = parts[parts.length - 2].trim(); + } + } + } + + double rating = 0.0; + if (ratingElement != null) { + try { + rating = Double.parseDouble(ratingElement.text().trim()); + } catch (NumberFormatException e) { + rating = 0.0; + } + } + + if (!title.isEmpty()) { + books.add(new Book(rank, title, author, publisher, rating, "")); + count++; + } + } + } + + page++; + System.out.println("第" + page + "页完成,已获取" + count + "本书"); + } + + System.out.println("爬取完成!共获取" + books.size() + "本书"); + + } catch (java.io.IOException e) { + System.err.println("❌ 爬取豆瓣读书失败: " + e.getMessage()); + System.out.println("使用默认书籍数据..."); + loadDefaultBooks(); + } + } + + /** + * 加载默认书籍数据(离线备用) + */ + private void loadDefaultBooks() { + System.out.println("使用默认书籍数据..."); + // 课堂知识点:数组初始化 + String[] titles = {"活着", "百年孤独", "1984", "三体", "红楼梦", "围城", "追风筝的人", "小王子", "解忧杂货店", "白夜行"}; + String[] authors = {"余华", "加西亚·马尔克斯", "乔治·奥威尔", "刘慈欣", "曹雪芹", "钱钟书", "卡勒德·胡赛尼", "圣埃克苏佩里", "东野圭吾", "东野圭吾"}; + String[] publishers = {"作家出版社", "南海出版公司", "译林出版社", "重庆出版社", "人民文学出版社", "人民文学出版社", "上海文艺出版社", "人民文学出版社", "南海出版公司", "南海出版公司"}; + double[] ratings = {9.4, 9.3, 9.3, 9.2, 9.6, 9.3, 8.9, 9.1, 8.5, 9.1}; + + // 课堂知识点:for循环遍历数组 + for (int i = 0; i < titles.length; i++) { + books.add(new Book(i + 1, titles[i], authors[i], publishers[i], ratings[i], "")); + } + } + + /** + * 打印书籍列表 - 课堂知识点:视图展示(MVC中的View) + */ + public void printBooks() { + System.out.println("\n========== 豆瓣读书Top25精选 =========="); + for (Book book : books) { + System.out.printf("%3d. 《%s》\n", book.getRank(), book.getTitle()); + System.out.println(" 作者: " + book.getAuthor()); + System.out.println(" 出版社: " + book.getPublisher()); + System.out.println(" 评分: " + book.getRating()); + System.out.println(); + } + System.out.println("====================================="); + } + + /** + * 保存书籍数据到文件 - 课堂知识点:文件IO + * @param filename 文件名 + * @throws IOException 写入异常 + */ + public void saveToFile(String filename) throws IOException { + // 课堂知识点:try-with-resources(自动关闭资源) + try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename))) { + writer.write("排名|书名|作者|出版社|评分"); + writer.newLine(); + writer.write("====================================="); + writer.newLine(); + + for (Book book : books) { + writer.write(String.format("%d|%s|%s|%s|%.1f", + book.getRank(), + book.getTitle(), + book.getAuthor(), + book.getPublisher(), + book.getRating())); + writer.newLine(); + } + + System.out.println("✅ 书籍数据已保存到文件: " + filename); + } + } + + /** + * 获取书籍列表 + * @return 书籍列表 + */ + public List getBooks() { + return books; + } + + /** + * 保存书籍数据到数据库 - 课堂知识点:JDBC、数据库持久化 + */ + @Override + public void saveToDatabase() { + String sql = + "INSERT INTO books (title, author, publisher, publishDate, rating, price, crawlTime) " + + "VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)"; + + try (Connection conn = DatabaseManager.getInstance().getConnection(); + PreparedStatement pstmt = conn.prepareStatement(sql)) { + + for (Book book : books) { + pstmt.setString(1, book.getTitle()); + pstmt.setString(2, book.getAuthor()); + pstmt.setString(3, book.getPublisher()); + pstmt.setString(4, null); // publishDate + pstmt.setString(5, String.valueOf(book.getRating())); + pstmt.setString(6, null); // price + + pstmt.addBatch(); + } + + int[] rowsAffected = pstmt.executeBatch(); + System.out.println("✅ " + rowsAffected.length + " 条书籍数据已保存到数据库"); + } catch (SQLException e) { + System.err.println("❌ 保存书籍数据到数据库失败: " + e.getMessage()); + } + } +} \ No newline at end of file diff --git a/project/src/main/java/com/example/DoubanMovieCrawler.java b/project/src/main/java/com/example/DoubanMovieCrawler.java new file mode 100644 index 0000000..35458e1 --- /dev/null +++ b/project/src/main/java/com/example/DoubanMovieCrawler.java @@ -0,0 +1,277 @@ +package com.example; + +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.io.BufferedWriter; +import java.io.FileWriter; +import java.io.IOException; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +/** + * 豆瓣电影Top250爬虫 + * 课堂知识点:类与对象、封装、继承、多态、集合框架、异常处理、文件IO + */ +public class DoubanMovieCrawler extends Crawler { + + /** + * 爬取数据 - 重写父类方法 + * 课堂知识点:方法重写(多态) + */ + @Override + public void crawl() { + crawlTop250(); + } + + /** + * 打印结果 - 重写父类方法 + * 课堂知识点:方法重写(多态) + */ + @Override + public void printResults() { + printMovies(); + } + + // 课堂知识点:集合框架 List + private List movies; + + /** + * 电影数据模型 - 课堂知识点:内部类、封装 + */ + public static class Movie { + // 私有属性 - 课堂知识点:封装 + private int rank; + private String title; + private String year; + private String director; + private double rating; + + // 构造方法 + public Movie(int rank, String title, String year, String director, double rating) { + this.rank = rank; + this.title = title; + this.year = year; + this.director = director; + this.rating = rating; + } + + // getter方法 - 课堂知识点:封装的访问接口 + public int getRank() { return rank; } + public String getTitle() { return title; } + public String getYear() { return year; } + public String getDirector() { return director; } + public double getRating() { return rating; } + } + + /** + * 构造方法 - 初始化电影列表 + */ + public DoubanMovieCrawler() { + this.movies = new ArrayList<>(); + } + + /** + * 爬取豆瓣电影Top250 + */ + public void crawlTop250() { + movies.clear(); + System.out.println("\n========== 开始爬取豆瓣电影Top250 =========="); + + // 先检查网络状态 - 使用父类方法 + if (!isNetworkAvailable()) { + System.err.println("❌ 网络连接不可用!请检查网络设置"); + System.out.println("使用默认电影数据..."); + loadDefaultMovies(); + setSuccess(true); + setDataCount(movies.size()); + return; + } + + try { + // 课堂知识点:for循环 + for (int page = 0; page < 10; page++) { + if (movies.size() >= 250) break; + + int start = page * 25; + String url = "https://movie.douban.com/top250?start=" + start; + System.out.println("正在爬取第" + (page + 1) + "页: " + url); + + // 使用父类的延迟方法 + delay(); + + Document doc = HttpCrawler.get(url); + + Elements items = doc.select("ol.grid_view li"); + + // 课堂知识点:增强for循环 + for (Element item : items) { + // 排名 + int rank = Integer.parseInt(item.selectFirst("em").text()); + + // 电影名 + String title = item.selectFirst("span.title").text(); + + // 年份和导演信息 + String info = item.selectFirst("div.bd p").text(); + String year = extractYear(info); + String director = extractDirector(info); + + // 评分 + double rating = Double.parseDouble(item.selectFirst("span.rating_num").text()); + + // 创建电影对象并添加到列表 + movies.add(new Movie(rank, title, year, director, rating)); + } + + System.out.println("第" + (page + 1) + "页完成,已获取" + movies.size() + "部电影"); + } + + System.out.println("爬取完成!共获取" + movies.size() + "部电影"); + + } catch (java.io.IOException e) { + System.err.println("❌ 爬取豆瓣电影Top250失败: " + e.getMessage()); + System.out.println("使用默认电影数据..."); + loadDefaultMovies(); + } + } + + /** + * 加载默认电影数据(离线备用) + */ + private void loadDefaultMovies() { + // 课堂知识点:数组初始化 + String[] titles = {"肖申克的救赎", "霸王别姬", "阿甘正传", "泰坦尼克号", "盗梦空间"}; + String[] years = {"1994", "1993", "1994", "1997", "2010"}; + String[] directors = {"弗兰克·德拉邦特", "陈凯歌", "罗伯特·泽米吉斯", "詹姆斯·卡梅隆", "克里斯托弗·诺兰"}; + double[] ratings = {9.7, 9.6, 9.5, 9.4, 9.3}; + + // 课堂知识点:for循环遍历数组 + for (int i = 0; i < titles.length; i++) { + movies.add(new Movie(i + 1, titles[i], years[i], directors[i], ratings[i])); + } + System.out.println("已加载" + movies.size() + "部默认电影数据"); + } + + /** + * 从信息字符串中提取年份 + * @param info 电影信息字符串 + * @return 年份 + */ + private String extractYear(String info) { + int start = info.indexOf("("); + int end = info.indexOf(")"); + if (start != -1 && end != -1) { + String yearStr = info.substring(start + 1, end); + // 提取年份数字 + StringBuilder year = new StringBuilder(); + for (char c : yearStr.toCharArray()) { + if (Character.isDigit(c)) { + year.append(c); + } + } + return year.toString(); + } + return "未知年份"; + } + + /** + * 从信息字符串中提取导演 + * @param info 电影信息字符串 + * @return 导演姓名 + */ + private String extractDirector(String info) { + int start = info.indexOf("导演:"); + if (start != -1) { + String remaining = info.substring(start + 3); + int end = remaining.indexOf("主演:"); + if (end != -1) { + return remaining.substring(0, end).trim(); + } + return remaining.trim(); + } + return "未知导演"; + } + + /** + * 打印电影列表 - 课堂知识点:视图展示(MVC中的View) + */ + public void printMovies() { + System.out.println("\n========== 豆瓣电影Top250榜单 =========="); + for (Movie movie : movies) { + System.out.printf("%2d. 《%s》 - %s年 - 导演: %s - 评分: %.1f\n", + movie.getRank(), movie.getTitle(), movie.getYear(), + movie.getDirector(), movie.getRating()); + } + System.out.println("=========================================="); + } + + /** + * 保存电影数据到文件 - 课堂知识点:文件IO + * @param filename 文件名 + * @throws IOException 写入异常 + */ + public void saveToFile(String filename) throws IOException { + // 课堂知识点:try-with-resources(自动关闭资源) + try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename))) { + writer.write("排名|电影名|年份|导演|评分"); + writer.newLine(); + writer.write("=========================================="); + writer.newLine(); + + for (Movie movie : movies) { + writer.write(String.format("%d|%s|%s|%s|%.1f", + movie.getRank(), + movie.getTitle(), + movie.getYear(), + movie.getDirector(), + movie.getRating())); + writer.newLine(); + } + + System.out.println("✅ 电影数据已保存到文件: " + filename); + } + } + + /** + * 获取电影列表 + * @return 电影列表 + */ + public List getMovies() { + return movies; + } + + /** + * 保存电影数据到数据库 - 课堂知识点:JDBC、数据库持久化 + */ + @Override + public void saveToDatabase() { + String sql = + "INSERT INTO movies (title, rating, director, actors, year, type, crawlTime) " + + "VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)"; + + try (Connection conn = DatabaseManager.getInstance().getConnection(); + PreparedStatement pstmt = conn.prepareStatement(sql)) { + + for (Movie movie : movies) { + pstmt.setString(1, movie.getTitle()); + pstmt.setString(2, String.valueOf(movie.getRating())); + pstmt.setString(3, movie.getDirector()); + pstmt.setString(4, null); // actors + pstmt.setString(5, movie.getYear()); + pstmt.setString(6, "电影"); + + pstmt.addBatch(); + } + + int[] rowsAffected = pstmt.executeBatch(); + System.out.println("✅ " + rowsAffected.length + " 条电影数据已保存到数据库"); + } catch (SQLException e) { + System.err.println("❌ 保存电影数据到数据库失败: " + e.getMessage()); + } + } +} \ No newline at end of file diff --git a/project/src/main/java/com/example/HttpCrawler.java b/project/src/main/java/com/example/HttpCrawler.java new file mode 100644 index 0000000..90ed51d --- /dev/null +++ b/project/src/main/java/com/example/HttpCrawler.java @@ -0,0 +1,181 @@ +package com.example; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + +import java.io.IOException; +import java.net.InetAddress; + +/** + * HTTP爬虫工具类 - 提供带重试机制的网络请求 + * 课堂知识点:单例模式、静态方法、异常处理 + */ +public class HttpCrawler { + + /** + * 单例实例 - 课堂知识点:单例模式 + */ + private static HttpCrawler instance; + + private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"; + private static final int DEFAULT_TIMEOUT = 30000; + private static final int DEFAULT_RETRY_COUNT = 3; + private static final long RETRY_DELAY_MS = 2000; + + /** + * 私有构造方法 - 防止外部实例化(单例模式) + */ + private HttpCrawler() { + // 私有构造,确保只能通过getInstance()获取实例 + } + + /** + * 获取单例实例 - 课堂知识点:单例模式 + * @return HttpCrawler实例 + */ + public static HttpCrawler getInstance() { + if (instance == null) { + synchronized (HttpCrawler.class) { + if (instance == null) { + instance = new HttpCrawler(); + } + } + } + return instance; + } + + /** + * 检查网络连接是否可用 + */ + public static boolean isNetworkAvailable() { + try { + // 尝试连接公共DNS服务器 + InetAddress address = InetAddress.getByName("8.8.8.8"); + return address.isReachable(3000); + } catch (IOException e) { + return false; + } + } + + /** + * 检查特定网站是否可访问 + */ + public static boolean isWebsiteReachable(String url) { + try { + String domain = extractDomain(url); + InetAddress address = InetAddress.getByName(domain); + return address.isReachable(5000); + } catch (Exception e) { + return false; + } + } + + private static String extractDomain(String url) { + if (url == null || url.isEmpty()) { + return ""; + } + // 移除协议部分 + String domain = url.replaceFirst("^https?://", ""); + // 移除路径部分 + int slashIndex = domain.indexOf('/'); + if (slashIndex > 0) { + domain = domain.substring(0, slashIndex); + } + // 移除端口部分 + int colonIndex = domain.indexOf(':'); + if (colonIndex > 0) { + domain = domain.substring(0, colonIndex); + } + return domain; + } + + /** + * 发送HTTP GET请求,带重试机制 + */ + public static Document get(String url) throws IOException { + return get(url, DEFAULT_TIMEOUT, DEFAULT_RETRY_COUNT); + } + + /** + * 发送HTTP GET请求,带重试机制 + * @param url 请求URL + * @param timeout 超时时间(毫秒) + * @param maxRetries 最大重试次数 + */ + public static Document get(String url, int timeout, int maxRetries) throws IOException { + IOException lastException = null; + + // 课堂知识点:for循环 + for (int attempt = 1; attempt <= maxRetries; attempt++) { + try { + System.out.println("[请求尝试 " + attempt + "/" + maxRetries + "] " + url); + + Document doc = Jsoup.connect(url) + .userAgent(USER_AGENT) + .header("Accept-Language", "zh-CN,zh;q=0.9") + .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + .header("Connection", "keep-alive") + .timeout(timeout) + .followRedirects(true) + .get(); + + System.out.println("[请求成功] " + url); + return doc; + + } catch (java.net.SocketTimeoutException e) { + lastException = e; + System.err.println("[请求超时 " + attempt + "/" + maxRetries + "] " + url); + printNetworkTips(); + } catch (java.net.ConnectException e) { + lastException = e; + System.err.println("[连接失败 " + attempt + "/" + maxRetries + "] " + url); + printNetworkTips(); + } catch (java.net.UnknownHostException e) { + lastException = e; + System.err.println("[域名解析失败 " + attempt + "/" + maxRetries + "] " + url); + printNetworkTips(); + } catch (IOException e) { + lastException = e; + System.err.println("[请求异常 " + attempt + "/" + maxRetries + "] " + url + ": " + e.getMessage()); + } + + // 如果不是最后一次尝试,等待后重试 + if (attempt < maxRetries) { + try { + System.out.println("等待 " + RETRY_DELAY_MS + "ms 后重试..."); + Thread.sleep(RETRY_DELAY_MS); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + throw new IOException("请求被中断", ie); + } + } + } + + // 所有重试都失败 + throw new IOException("请求失败,已重试 " + maxRetries + " 次: " + url, lastException); + } + + /** + * 打印网络连接提示 + */ + public static void printNetworkTips() { + System.err.println("====================================="); + System.err.println("网络连接问题排查:"); + System.err.println(" 1. 请检查您的网络连接是否正常"); + System.err.println(" 2. 尝试访问其他网站测试网络"); + System.err.println(" 3. 如果访问境外网站,可能需要VPN"); + System.err.println(" 4. 检查防火墙是否阻止了连接"); + System.err.println(" 5. 检查DNS设置是否正确"); + System.err.println("====================================="); + } + + /** + * 打印网络恢复提示 + */ + public static void printNetworkRecovery() { + System.out.println("====================================="); + System.out.println("✓ 网络连接已恢复!"); + System.out.println("✓ 正在重新尝试获取数据..."); + System.out.println("====================================="); + } +} \ No newline at end of file