5 changed files with 888 additions and 0 deletions
@ -0,0 +1,35 @@ |
|||
package com.example; |
|||
|
|||
import java.io.BufferedWriter; |
|||
import java.io.FileWriter; |
|||
import java.io.IOException; |
|||
import java.util.List; |
|||
|
|||
/** |
|||
* CSV格式保存策略 - 策略模式实现 |
|||
*/ |
|||
public class CsvSaveStrategy implements SaveStrategy { |
|||
@Override |
|||
public void save(Object data, String filename) throws IOException { |
|||
try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename))) { |
|||
if (data instanceof List) { |
|||
for (Object item : (List<?>) data) { |
|||
String str = item.toString(); |
|||
if (str.contains(",") || str.contains("\"") || str.contains("\n")) { |
|||
writer.write("\"" + str.replace("\"", "\"\"") + "\""); |
|||
} else { |
|||
writer.write(str); |
|||
} |
|||
writer.newLine(); |
|||
} |
|||
} else { |
|||
writer.write(data.toString()); |
|||
} |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public String getStrategyName() { |
|||
return "CSV格式"; |
|||
} |
|||
} |
|||
@ -0,0 +1,118 @@ |
|||
package com.example; |
|||
|
|||
import java.sql.Connection; |
|||
import java.sql.DriverManager; |
|||
import java.sql.SQLException; |
|||
import java.sql.Statement; |
|||
|
|||
/**
 * Database manager implemented as a singleton.
 *
 * <p>Opens a SQLite connection to the {@code crawler_data.db} database and creates the
 * {@code games}, {@code movies} and {@code books} tables when they do not exist yet.
 */
public class DatabaseManager {

    private static final String DB_NAME = "crawler_data.db";

    private static final String CREATE_GAMES_TABLE =
            "CREATE TABLE IF NOT EXISTS games (" +
            "id INTEGER PRIMARY KEY AUTOINCREMENT, " +
            "name TEXT NOT NULL, " +
            "price TEXT, " +
            "discount TEXT, " +
            "originalPrice TEXT, " +
            "releaseDate TEXT, " +
            "tags TEXT, " +
            "reviewScore TEXT, " +
            "crawlTime TIMESTAMP DEFAULT CURRENT_TIMESTAMP" +
            ")";

    private static final String CREATE_MOVIES_TABLE =
            "CREATE TABLE IF NOT EXISTS movies (" +
            "id INTEGER PRIMARY KEY AUTOINCREMENT, " +
            "title TEXT NOT NULL, " +
            "rating TEXT, " +
            "director TEXT, " +
            "actors TEXT, " +
            "year TEXT, " +
            "type TEXT, " +
            "crawlTime TIMESTAMP DEFAULT CURRENT_TIMESTAMP" +
            ")";

    private static final String CREATE_BOOKS_TABLE =
            "CREATE TABLE IF NOT EXISTS books (" +
            "id INTEGER PRIMARY KEY AUTOINCREMENT, " +
            "title TEXT NOT NULL, " +
            "author TEXT, " +
            "publisher TEXT, " +
            "publishDate TEXT, " +
            "rating TEXT, " +
            "price TEXT, " +
            "crawlTime TIMESTAMP DEFAULT CURRENT_TIMESTAMP" +
            ")";

    private static DatabaseManager instance;

    private Connection connection;

    /**
     * Loads the SQLite JDBC driver, opens the connection and initializes the tables.
     *
     * @throws RuntimeException wrapping the {@link ClassNotFoundException} or
     *                          {@link SQLException} when the driver or connection is unavailable
     */
    private DatabaseManager() {
        try {
            // Load the SQLite JDBC driver.
            Class.forName("org.sqlite.JDBC");
            // Connect to the SQLite database.
            connection = DriverManager.getConnection("jdbc:sqlite:" + DB_NAME);
            System.out.println("✅ 数据库连接成功");
            // Create the table structure.
            initializeTables();
        } catch (ClassNotFoundException | SQLException e) {
            System.err.println("❌ 数据库连接失败: " + e.getMessage());
            throw new RuntimeException(e);
        }
    }

    /**
     * Returns the singleton instance, creating it on first use.
     *
     * <p>If the current instance's connection has been closed (through {@link #close()} or
     * by a caller closing the shared {@link Connection}), a fresh instance with a new
     * connection is created so later callers never receive a dead connection.
     *
     * @return the shared manager holding an open connection
     * @throws RuntimeException if the driver cannot be loaded or the connection cannot be opened
     */
    public static synchronized DatabaseManager getInstance() {
        if (instance == null || !instance.isOpen()) {
            instance = new DatabaseManager();
        }
        return instance;
    }

    /**
     * Returns the database connection held by this manager.
     *
     * <p>The connection is shared; callers should close the statements they create on it
     * but not the connection itself.
     *
     * @return the shared connection
     */
    public Connection getConnection() {
        return connection;
    }

    /**
     * Creates the {@code games}, {@code movies} and {@code books} tables if they are missing.
     * A failure is reported on standard error and otherwise ignored (best effort).
     */
    private void initializeTables() {
        try (Statement stmt = connection.createStatement()) {
            stmt.execute(CREATE_GAMES_TABLE);
            stmt.execute(CREATE_MOVIES_TABLE);
            stmt.execute(CREATE_BOOKS_TABLE);
            System.out.println("✅ 数据库表初始化完成");
        } catch (SQLException e) {
            System.err.println("❌ 表初始化失败: " + e.getMessage());
        }
    }

    /**
     * Closes the database connection. A failure is reported on standard error.
     */
    public void close() {
        if (connection == null) {
            return;
        }
        try {
            connection.close();
            System.out.println("✅ 数据库连接已关闭");
        } catch (SQLException e) {
            System.err.println("❌ 关闭数据库连接失败: " + e.getMessage());
        }
    }

    /** Reports whether this manager still holds an open connection. */
    private boolean isOpen() {
        try {
            return connection != null && !connection.isClosed();
        } catch (SQLException e) {
            // A connection whose state cannot even be queried is treated as unusable.
            return false;
        }
    }
}
|||
@ -0,0 +1,277 @@ |
|||
package com.example; |
|||
|
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.io.BufferedWriter; |
|||
import java.io.FileWriter; |
|||
import java.io.IOException; |
|||
import java.sql.Connection; |
|||
import java.sql.PreparedStatement; |
|||
import java.sql.SQLException; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
/** |
|||
* 豆瓣读书Top250爬虫 |
|||
* 课堂知识点:类与对象、封装、继承、多态、集合框架、异常处理、文件IO |
|||
*/ |
|||
public class DoubanBookCrawler extends Crawler { |
|||
|
|||
/** |
|||
* 爬取数据 - 重写父类方法 |
|||
* 课堂知识点:方法重写(多态) |
|||
*/ |
|||
@Override |
|||
public void crawl() { |
|||
crawlTop250(); |
|||
} |
|||
|
|||
/** |
|||
* 打印结果 - 重写父类方法 |
|||
* 课堂知识点:方法重写(多态) |
|||
*/ |
|||
@Override |
|||
public void printResults() { |
|||
printBooks(); |
|||
} |
|||
|
|||
// 课堂知识点:集合框架 List
|
|||
private List<Book> books; |
|||
|
|||
/** |
|||
* 书籍数据模型 - 课堂知识点:内部类、封装 |
|||
*/ |
|||
public static class Book { |
|||
// 私有属性 - 课堂知识点:封装
|
|||
private int rank; |
|||
private String title; |
|||
private String author; |
|||
private String publisher; |
|||
private double rating; |
|||
private String coverUrl; |
|||
|
|||
// 构造方法
|
|||
public Book(int rank, String title, String author, String publisher, double rating, String coverUrl) { |
|||
this.rank = rank; |
|||
this.title = title; |
|||
this.author = author; |
|||
this.publisher = publisher; |
|||
this.rating = rating; |
|||
this.coverUrl = coverUrl; |
|||
} |
|||
|
|||
// getter方法 - 课堂知识点:封装的访问接口
|
|||
public int getRank() { return rank; } |
|||
public String getTitle() { return title; } |
|||
public String getAuthor() { return author; } |
|||
public String getPublisher() { return publisher; } |
|||
public double getRating() { return rating; } |
|||
public String getCoverUrl() { return coverUrl; } |
|||
} |
|||
|
|||
/** |
|||
* 构造方法 - 初始化书籍列表 |
|||
*/ |
|||
public DoubanBookCrawler() { |
|||
this.books = new ArrayList<>(); |
|||
} |
|||
|
|||
/** |
|||
* 爬取豆瓣读书Top250 |
|||
*/ |
|||
public void crawlTop250() { |
|||
books.clear(); |
|||
System.out.println("\n========== 开始爬取豆瓣读书Top250 =========="); |
|||
|
|||
// 先检查网络状态 - 使用父类方法
|
|||
if (!isNetworkAvailable()) { |
|||
System.err.println("❌ 网络连接不可用!请检查网络设置"); |
|||
System.out.println("使用默认书籍数据..."); |
|||
loadDefaultBooks(); |
|||
setSuccess(true); |
|||
setDataCount(books.size()); |
|||
return; |
|||
} |
|||
|
|||
try { |
|||
int page = 0; |
|||
int count = 0; |
|||
|
|||
// 课堂知识点:while循环
|
|||
while (count < 25 && page < 3) { |
|||
String url = "https://book.douban.com/top250?start=" + (page * 25); |
|||
System.out.println("正在爬取第" + (page + 1) + "页: " + url); |
|||
|
|||
// 使用父类的延迟方法
|
|||
delay(); |
|||
|
|||
Document doc = HttpCrawler.get(url); |
|||
|
|||
Elements bookItems = doc.select("tr.item"); |
|||
|
|||
// 课堂知识点:增强for循环
|
|||
for (Element item : bookItems) { |
|||
if (count >= 25) break; |
|||
|
|||
Element rankElement = item.selectFirst("td:nth-child(1) div"); |
|||
Element titleElement = item.selectFirst("td:nth-child(2) a"); |
|||
Element infoElement = item.selectFirst("td:nth-child(2) p.pl"); |
|||
Element ratingElement = item.selectFirst("td:nth-child(2) span.rating_nums"); |
|||
|
|||
if (titleElement != null) { |
|||
int rank = count + 1; |
|||
if (rankElement != null) { |
|||
try { |
|||
rank = Integer.parseInt(rankElement.text().trim()); |
|||
} catch (NumberFormatException e) { |
|||
rank = count + 1; |
|||
} |
|||
} |
|||
|
|||
String title = titleElement.attr("title").trim(); |
|||
if (title.isEmpty()) { |
|||
title = titleElement.text().trim().replaceAll("\\s+", " "); |
|||
} |
|||
|
|||
String author = ""; |
|||
String publisher = ""; |
|||
if (infoElement != null) { |
|||
String info = infoElement.text().trim(); |
|||
// 课堂知识点:字符串分割
|
|||
String[] parts = info.split("/"); |
|||
if (parts.length >= 2) { |
|||
author = parts[0].trim(); |
|||
if (parts.length >= 3) { |
|||
publisher = parts[parts.length - 2].trim(); |
|||
} |
|||
} |
|||
} |
|||
|
|||
double rating = 0.0; |
|||
if (ratingElement != null) { |
|||
try { |
|||
rating = Double.parseDouble(ratingElement.text().trim()); |
|||
} catch (NumberFormatException e) { |
|||
rating = 0.0; |
|||
} |
|||
} |
|||
|
|||
if (!title.isEmpty()) { |
|||
books.add(new Book(rank, title, author, publisher, rating, "")); |
|||
count++; |
|||
} |
|||
} |
|||
} |
|||
|
|||
page++; |
|||
System.out.println("第" + page + "页完成,已获取" + count + "本书"); |
|||
} |
|||
|
|||
System.out.println("爬取完成!共获取" + books.size() + "本书"); |
|||
|
|||
} catch (java.io.IOException e) { |
|||
System.err.println("❌ 爬取豆瓣读书失败: " + e.getMessage()); |
|||
System.out.println("使用默认书籍数据..."); |
|||
loadDefaultBooks(); |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 加载默认书籍数据(离线备用) |
|||
*/ |
|||
private void loadDefaultBooks() { |
|||
System.out.println("使用默认书籍数据..."); |
|||
// 课堂知识点:数组初始化
|
|||
String[] titles = {"活着", "百年孤独", "1984", "三体", "红楼梦", "围城", "追风筝的人", "小王子", "解忧杂货店", "白夜行"}; |
|||
String[] authors = {"余华", "加西亚·马尔克斯", "乔治·奥威尔", "刘慈欣", "曹雪芹", "钱钟书", "卡勒德·胡赛尼", "圣埃克苏佩里", "东野圭吾", "东野圭吾"}; |
|||
String[] publishers = {"作家出版社", "南海出版公司", "译林出版社", "重庆出版社", "人民文学出版社", "人民文学出版社", "上海文艺出版社", "人民文学出版社", "南海出版公司", "南海出版公司"}; |
|||
double[] ratings = {9.4, 9.3, 9.3, 9.2, 9.6, 9.3, 8.9, 9.1, 8.5, 9.1}; |
|||
|
|||
// 课堂知识点:for循环遍历数组
|
|||
for (int i = 0; i < titles.length; i++) { |
|||
books.add(new Book(i + 1, titles[i], authors[i], publishers[i], ratings[i], "")); |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 打印书籍列表 - 课堂知识点:视图展示(MVC中的View) |
|||
*/ |
|||
public void printBooks() { |
|||
System.out.println("\n========== 豆瓣读书Top25精选 =========="); |
|||
for (Book book : books) { |
|||
System.out.printf("%3d. 《%s》\n", book.getRank(), book.getTitle()); |
|||
System.out.println(" 作者: " + book.getAuthor()); |
|||
System.out.println(" 出版社: " + book.getPublisher()); |
|||
System.out.println(" 评分: " + book.getRating()); |
|||
System.out.println(); |
|||
} |
|||
System.out.println("====================================="); |
|||
} |
|||
|
|||
/** |
|||
* 保存书籍数据到文件 - 课堂知识点:文件IO |
|||
* @param filename 文件名 |
|||
* @throws IOException 写入异常 |
|||
*/ |
|||
public void saveToFile(String filename) throws IOException { |
|||
// 课堂知识点:try-with-resources(自动关闭资源)
|
|||
try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename))) { |
|||
writer.write("排名|书名|作者|出版社|评分"); |
|||
writer.newLine(); |
|||
writer.write("====================================="); |
|||
writer.newLine(); |
|||
|
|||
for (Book book : books) { |
|||
writer.write(String.format("%d|%s|%s|%s|%.1f", |
|||
book.getRank(), |
|||
book.getTitle(), |
|||
book.getAuthor(), |
|||
book.getPublisher(), |
|||
book.getRating())); |
|||
writer.newLine(); |
|||
} |
|||
|
|||
System.out.println("✅ 书籍数据已保存到文件: " + filename); |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 获取书籍列表 |
|||
* @return 书籍列表 |
|||
*/ |
|||
public List<Book> getBooks() { |
|||
return books; |
|||
} |
|||
|
|||
/** |
|||
* 保存书籍数据到数据库 - 课堂知识点:JDBC、数据库持久化 |
|||
*/ |
|||
@Override |
|||
public void saveToDatabase() { |
|||
String sql = |
|||
"INSERT INTO books (title, author, publisher, publishDate, rating, price, crawlTime) " + |
|||
"VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)"; |
|||
|
|||
try (Connection conn = DatabaseManager.getInstance().getConnection(); |
|||
PreparedStatement pstmt = conn.prepareStatement(sql)) { |
|||
|
|||
for (Book book : books) { |
|||
pstmt.setString(1, book.getTitle()); |
|||
pstmt.setString(2, book.getAuthor()); |
|||
pstmt.setString(3, book.getPublisher()); |
|||
pstmt.setString(4, null); // publishDate
|
|||
pstmt.setString(5, String.valueOf(book.getRating())); |
|||
pstmt.setString(6, null); // price
|
|||
|
|||
pstmt.addBatch(); |
|||
} |
|||
|
|||
int[] rowsAffected = pstmt.executeBatch(); |
|||
System.out.println("✅ " + rowsAffected.length + " 条书籍数据已保存到数据库"); |
|||
} catch (SQLException e) { |
|||
System.err.println("❌ 保存书籍数据到数据库失败: " + e.getMessage()); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,277 @@ |
|||
package com.example; |
|||
|
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.io.BufferedWriter; |
|||
import java.io.FileWriter; |
|||
import java.io.IOException; |
|||
import java.sql.Connection; |
|||
import java.sql.PreparedStatement; |
|||
import java.sql.SQLException; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
/** |
|||
* 豆瓣电影Top250爬虫 |
|||
* 课堂知识点:类与对象、封装、继承、多态、集合框架、异常处理、文件IO |
|||
*/ |
|||
public class DoubanMovieCrawler extends Crawler { |
|||
|
|||
/** |
|||
* 爬取数据 - 重写父类方法 |
|||
* 课堂知识点:方法重写(多态) |
|||
*/ |
|||
@Override |
|||
public void crawl() { |
|||
crawlTop250(); |
|||
} |
|||
|
|||
/** |
|||
* 打印结果 - 重写父类方法 |
|||
* 课堂知识点:方法重写(多态) |
|||
*/ |
|||
@Override |
|||
public void printResults() { |
|||
printMovies(); |
|||
} |
|||
|
|||
// 课堂知识点:集合框架 List
|
|||
private List<Movie> movies; |
|||
|
|||
/** |
|||
* 电影数据模型 - 课堂知识点:内部类、封装 |
|||
*/ |
|||
public static class Movie { |
|||
// 私有属性 - 课堂知识点:封装
|
|||
private int rank; |
|||
private String title; |
|||
private String year; |
|||
private String director; |
|||
private double rating; |
|||
|
|||
// 构造方法
|
|||
public Movie(int rank, String title, String year, String director, double rating) { |
|||
this.rank = rank; |
|||
this.title = title; |
|||
this.year = year; |
|||
this.director = director; |
|||
this.rating = rating; |
|||
} |
|||
|
|||
// getter方法 - 课堂知识点:封装的访问接口
|
|||
public int getRank() { return rank; } |
|||
public String getTitle() { return title; } |
|||
public String getYear() { return year; } |
|||
public String getDirector() { return director; } |
|||
public double getRating() { return rating; } |
|||
} |
|||
|
|||
/** |
|||
* 构造方法 - 初始化电影列表 |
|||
*/ |
|||
public DoubanMovieCrawler() { |
|||
this.movies = new ArrayList<>(); |
|||
} |
|||
|
|||
/** |
|||
* 爬取豆瓣电影Top250 |
|||
*/ |
|||
public void crawlTop250() { |
|||
movies.clear(); |
|||
System.out.println("\n========== 开始爬取豆瓣电影Top250 =========="); |
|||
|
|||
// 先检查网络状态 - 使用父类方法
|
|||
if (!isNetworkAvailable()) { |
|||
System.err.println("❌ 网络连接不可用!请检查网络设置"); |
|||
System.out.println("使用默认电影数据..."); |
|||
loadDefaultMovies(); |
|||
setSuccess(true); |
|||
setDataCount(movies.size()); |
|||
return; |
|||
} |
|||
|
|||
try { |
|||
// 课堂知识点:for循环
|
|||
for (int page = 0; page < 10; page++) { |
|||
if (movies.size() >= 250) break; |
|||
|
|||
int start = page * 25; |
|||
String url = "https://movie.douban.com/top250?start=" + start; |
|||
System.out.println("正在爬取第" + (page + 1) + "页: " + url); |
|||
|
|||
// 使用父类的延迟方法
|
|||
delay(); |
|||
|
|||
Document doc = HttpCrawler.get(url); |
|||
|
|||
Elements items = doc.select("ol.grid_view li"); |
|||
|
|||
// 课堂知识点:增强for循环
|
|||
for (Element item : items) { |
|||
// 排名
|
|||
int rank = Integer.parseInt(item.selectFirst("em").text()); |
|||
|
|||
// 电影名
|
|||
String title = item.selectFirst("span.title").text(); |
|||
|
|||
// 年份和导演信息
|
|||
String info = item.selectFirst("div.bd p").text(); |
|||
String year = extractYear(info); |
|||
String director = extractDirector(info); |
|||
|
|||
// 评分
|
|||
double rating = Double.parseDouble(item.selectFirst("span.rating_num").text()); |
|||
|
|||
// 创建电影对象并添加到列表
|
|||
movies.add(new Movie(rank, title, year, director, rating)); |
|||
} |
|||
|
|||
System.out.println("第" + (page + 1) + "页完成,已获取" + movies.size() + "部电影"); |
|||
} |
|||
|
|||
System.out.println("爬取完成!共获取" + movies.size() + "部电影"); |
|||
|
|||
} catch (java.io.IOException e) { |
|||
System.err.println("❌ 爬取豆瓣电影Top250失败: " + e.getMessage()); |
|||
System.out.println("使用默认电影数据..."); |
|||
loadDefaultMovies(); |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 加载默认电影数据(离线备用) |
|||
*/ |
|||
private void loadDefaultMovies() { |
|||
// 课堂知识点:数组初始化
|
|||
String[] titles = {"肖申克的救赎", "霸王别姬", "阿甘正传", "泰坦尼克号", "盗梦空间"}; |
|||
String[] years = {"1994", "1993", "1994", "1997", "2010"}; |
|||
String[] directors = {"弗兰克·德拉邦特", "陈凯歌", "罗伯特·泽米吉斯", "詹姆斯·卡梅隆", "克里斯托弗·诺兰"}; |
|||
double[] ratings = {9.7, 9.6, 9.5, 9.4, 9.3}; |
|||
|
|||
// 课堂知识点:for循环遍历数组
|
|||
for (int i = 0; i < titles.length; i++) { |
|||
movies.add(new Movie(i + 1, titles[i], years[i], directors[i], ratings[i])); |
|||
} |
|||
System.out.println("已加载" + movies.size() + "部默认电影数据"); |
|||
} |
|||
|
|||
/** |
|||
* 从信息字符串中提取年份 |
|||
* @param info 电影信息字符串 |
|||
* @return 年份 |
|||
*/ |
|||
private String extractYear(String info) { |
|||
int start = info.indexOf("("); |
|||
int end = info.indexOf(")"); |
|||
if (start != -1 && end != -1) { |
|||
String yearStr = info.substring(start + 1, end); |
|||
// 提取年份数字
|
|||
StringBuilder year = new StringBuilder(); |
|||
for (char c : yearStr.toCharArray()) { |
|||
if (Character.isDigit(c)) { |
|||
year.append(c); |
|||
} |
|||
} |
|||
return year.toString(); |
|||
} |
|||
return "未知年份"; |
|||
} |
|||
|
|||
/** |
|||
* 从信息字符串中提取导演 |
|||
* @param info 电影信息字符串 |
|||
* @return 导演姓名 |
|||
*/ |
|||
private String extractDirector(String info) { |
|||
int start = info.indexOf("导演:"); |
|||
if (start != -1) { |
|||
String remaining = info.substring(start + 3); |
|||
int end = remaining.indexOf("主演:"); |
|||
if (end != -1) { |
|||
return remaining.substring(0, end).trim(); |
|||
} |
|||
return remaining.trim(); |
|||
} |
|||
return "未知导演"; |
|||
} |
|||
|
|||
/** |
|||
* 打印电影列表 - 课堂知识点:视图展示(MVC中的View) |
|||
*/ |
|||
public void printMovies() { |
|||
System.out.println("\n========== 豆瓣电影Top250榜单 =========="); |
|||
for (Movie movie : movies) { |
|||
System.out.printf("%2d. 《%s》 - %s年 - 导演: %s - 评分: %.1f\n", |
|||
movie.getRank(), movie.getTitle(), movie.getYear(), |
|||
movie.getDirector(), movie.getRating()); |
|||
} |
|||
System.out.println("=========================================="); |
|||
} |
|||
|
|||
/** |
|||
* 保存电影数据到文件 - 课堂知识点:文件IO |
|||
* @param filename 文件名 |
|||
* @throws IOException 写入异常 |
|||
*/ |
|||
public void saveToFile(String filename) throws IOException { |
|||
// 课堂知识点:try-with-resources(自动关闭资源)
|
|||
try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename))) { |
|||
writer.write("排名|电影名|年份|导演|评分"); |
|||
writer.newLine(); |
|||
writer.write("=========================================="); |
|||
writer.newLine(); |
|||
|
|||
for (Movie movie : movies) { |
|||
writer.write(String.format("%d|%s|%s|%s|%.1f", |
|||
movie.getRank(), |
|||
movie.getTitle(), |
|||
movie.getYear(), |
|||
movie.getDirector(), |
|||
movie.getRating())); |
|||
writer.newLine(); |
|||
} |
|||
|
|||
System.out.println("✅ 电影数据已保存到文件: " + filename); |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 获取电影列表 |
|||
* @return 电影列表 |
|||
*/ |
|||
public List<Movie> getMovies() { |
|||
return movies; |
|||
} |
|||
|
|||
/** |
|||
* 保存电影数据到数据库 - 课堂知识点:JDBC、数据库持久化 |
|||
*/ |
|||
@Override |
|||
public void saveToDatabase() { |
|||
String sql = |
|||
"INSERT INTO movies (title, rating, director, actors, year, type, crawlTime) " + |
|||
"VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)"; |
|||
|
|||
try (Connection conn = DatabaseManager.getInstance().getConnection(); |
|||
PreparedStatement pstmt = conn.prepareStatement(sql)) { |
|||
|
|||
for (Movie movie : movies) { |
|||
pstmt.setString(1, movie.getTitle()); |
|||
pstmt.setString(2, String.valueOf(movie.getRating())); |
|||
pstmt.setString(3, movie.getDirector()); |
|||
pstmt.setString(4, null); // actors
|
|||
pstmt.setString(5, movie.getYear()); |
|||
pstmt.setString(6, "电影"); |
|||
|
|||
pstmt.addBatch(); |
|||
} |
|||
|
|||
int[] rowsAffected = pstmt.executeBatch(); |
|||
System.out.println("✅ " + rowsAffected.length + " 条电影数据已保存到数据库"); |
|||
} catch (SQLException e) { |
|||
System.err.println("❌ 保存电影数据到数据库失败: " + e.getMessage()); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,181 @@ |
|||
package com.example; |
|||
|
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
|
|||
import java.io.IOException; |
|||
import java.net.InetAddress; |
|||
|
|||
/** |
|||
* HTTP爬虫工具类 - 提供带重试机制的网络请求 |
|||
* 课堂知识点:单例模式、静态方法、异常处理 |
|||
*/ |
|||
public class HttpCrawler { |
|||
|
|||
/** |
|||
* 单例实例 - 课堂知识点:单例模式 |
|||
*/ |
|||
private static HttpCrawler instance; |
|||
|
|||
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"; |
|||
private static final int DEFAULT_TIMEOUT = 30000; |
|||
private static final int DEFAULT_RETRY_COUNT = 3; |
|||
private static final long RETRY_DELAY_MS = 2000; |
|||
|
|||
/** |
|||
* 私有构造方法 - 防止外部实例化(单例模式) |
|||
*/ |
|||
private HttpCrawler() { |
|||
// 私有构造,确保只能通过getInstance()获取实例
|
|||
} |
|||
|
|||
/** |
|||
* 获取单例实例 - 课堂知识点:单例模式 |
|||
* @return HttpCrawler实例 |
|||
*/ |
|||
public static HttpCrawler getInstance() { |
|||
if (instance == null) { |
|||
synchronized (HttpCrawler.class) { |
|||
if (instance == null) { |
|||
instance = new HttpCrawler(); |
|||
} |
|||
} |
|||
} |
|||
return instance; |
|||
} |
|||
|
|||
/** |
|||
* 检查网络连接是否可用 |
|||
*/ |
|||
public static boolean isNetworkAvailable() { |
|||
try { |
|||
// 尝试连接公共DNS服务器
|
|||
InetAddress address = InetAddress.getByName("8.8.8.8"); |
|||
return address.isReachable(3000); |
|||
} catch (IOException e) { |
|||
return false; |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 检查特定网站是否可访问 |
|||
*/ |
|||
public static boolean isWebsiteReachable(String url) { |
|||
try { |
|||
String domain = extractDomain(url); |
|||
InetAddress address = InetAddress.getByName(domain); |
|||
return address.isReachable(5000); |
|||
} catch (Exception e) { |
|||
return false; |
|||
} |
|||
} |
|||
|
|||
private static String extractDomain(String url) { |
|||
if (url == null || url.isEmpty()) { |
|||
return ""; |
|||
} |
|||
// 移除协议部分
|
|||
String domain = url.replaceFirst("^https?://", ""); |
|||
// 移除路径部分
|
|||
int slashIndex = domain.indexOf('/'); |
|||
if (slashIndex > 0) { |
|||
domain = domain.substring(0, slashIndex); |
|||
} |
|||
// 移除端口部分
|
|||
int colonIndex = domain.indexOf(':'); |
|||
if (colonIndex > 0) { |
|||
domain = domain.substring(0, colonIndex); |
|||
} |
|||
return domain; |
|||
} |
|||
|
|||
/** |
|||
* 发送HTTP GET请求,带重试机制 |
|||
*/ |
|||
public static Document get(String url) throws IOException { |
|||
return get(url, DEFAULT_TIMEOUT, DEFAULT_RETRY_COUNT); |
|||
} |
|||
|
|||
/** |
|||
* 发送HTTP GET请求,带重试机制 |
|||
* @param url 请求URL |
|||
* @param timeout 超时时间(毫秒) |
|||
* @param maxRetries 最大重试次数 |
|||
*/ |
|||
public static Document get(String url, int timeout, int maxRetries) throws IOException { |
|||
IOException lastException = null; |
|||
|
|||
// 课堂知识点:for循环
|
|||
for (int attempt = 1; attempt <= maxRetries; attempt++) { |
|||
try { |
|||
System.out.println("[请求尝试 " + attempt + "/" + maxRetries + "] " + url); |
|||
|
|||
Document doc = Jsoup.connect(url) |
|||
.userAgent(USER_AGENT) |
|||
.header("Accept-Language", "zh-CN,zh;q=0.9") |
|||
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") |
|||
.header("Connection", "keep-alive") |
|||
.timeout(timeout) |
|||
.followRedirects(true) |
|||
.get(); |
|||
|
|||
System.out.println("[请求成功] " + url); |
|||
return doc; |
|||
|
|||
} catch (java.net.SocketTimeoutException e) { |
|||
lastException = e; |
|||
System.err.println("[请求超时 " + attempt + "/" + maxRetries + "] " + url); |
|||
printNetworkTips(); |
|||
} catch (java.net.ConnectException e) { |
|||
lastException = e; |
|||
System.err.println("[连接失败 " + attempt + "/" + maxRetries + "] " + url); |
|||
printNetworkTips(); |
|||
} catch (java.net.UnknownHostException e) { |
|||
lastException = e; |
|||
System.err.println("[域名解析失败 " + attempt + "/" + maxRetries + "] " + url); |
|||
printNetworkTips(); |
|||
} catch (IOException e) { |
|||
lastException = e; |
|||
System.err.println("[请求异常 " + attempt + "/" + maxRetries + "] " + url + ": " + e.getMessage()); |
|||
} |
|||
|
|||
// 如果不是最后一次尝试,等待后重试
|
|||
if (attempt < maxRetries) { |
|||
try { |
|||
System.out.println("等待 " + RETRY_DELAY_MS + "ms 后重试..."); |
|||
Thread.sleep(RETRY_DELAY_MS); |
|||
} catch (InterruptedException ie) { |
|||
Thread.currentThread().interrupt(); |
|||
throw new IOException("请求被中断", ie); |
|||
} |
|||
} |
|||
} |
|||
|
|||
// 所有重试都失败
|
|||
throw new IOException("请求失败,已重试 " + maxRetries + " 次: " + url, lastException); |
|||
} |
|||
|
|||
/** |
|||
* 打印网络连接提示 |
|||
*/ |
|||
public static void printNetworkTips() { |
|||
System.err.println("====================================="); |
|||
System.err.println("网络连接问题排查:"); |
|||
System.err.println(" 1. 请检查您的网络连接是否正常"); |
|||
System.err.println(" 2. 尝试访问其他网站测试网络"); |
|||
System.err.println(" 3. 如果访问境外网站,可能需要VPN"); |
|||
System.err.println(" 4. 检查防火墙是否阻止了连接"); |
|||
System.err.println(" 5. 检查DNS设置是否正确"); |
|||
System.err.println("====================================="); |
|||
} |
|||
|
|||
/** |
|||
* 打印网络恢复提示 |
|||
*/ |
|||
public static void printNetworkRecovery() { |
|||
System.out.println("====================================="); |
|||
System.out.println("✓ 网络连接已恢复!"); |
|||
System.out.println("✓ 正在重新尝试获取数据..."); |
|||
System.out.println("====================================="); |
|||
} |
|||
} |
|||
Loading…
Reference in new issue