5 changed files with 888 additions and 0 deletions
@ -0,0 +1,35 @@ |
|||||
|
package com.example; |
||||
|
|
||||
|
import java.io.BufferedWriter; |
||||
|
import java.io.FileWriter; |
||||
|
import java.io.IOException; |
||||
|
import java.util.List; |
||||
|
|
||||
|
/** |
||||
|
* CSV格式保存策略 - 策略模式实现 |
||||
|
*/ |
||||
|
public class CsvSaveStrategy implements SaveStrategy { |
||||
|
@Override |
||||
|
public void save(Object data, String filename) throws IOException { |
||||
|
try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename))) { |
||||
|
if (data instanceof List) { |
||||
|
for (Object item : (List<?>) data) { |
||||
|
String str = item.toString(); |
||||
|
if (str.contains(",") || str.contains("\"") || str.contains("\n")) { |
||||
|
writer.write("\"" + str.replace("\"", "\"\"") + "\""); |
||||
|
} else { |
||||
|
writer.write(str); |
||||
|
} |
||||
|
writer.newLine(); |
||||
|
} |
||||
|
} else { |
||||
|
writer.write(data.toString()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getStrategyName() { |
||||
|
return "CSV格式"; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,118 @@ |
|||||
|
package com.example; |
||||
|
|
||||
|
import java.sql.Connection; |
||||
|
import java.sql.DriverManager; |
||||
|
import java.sql.SQLException; |
||||
|
import java.sql.Statement; |
||||
|
|
||||
|
/**
 * Database manager (singleton).
 *
 * <p>Owns one shared SQLite connection to {@code crawler_data.db} and creates the
 * {@code games}, {@code movies} and {@code books} tables on first use.
 *
 * <p>The connection returned by {@link #getConnection()} is shared. If a caller closes
 * it (for example by declaring it in a try-with-resources header), the next call to
 * {@link #getConnection()} transparently opens a new one.
 */
public class DatabaseManager {

    private static final String DB_NAME = "crawler_data.db";
    private static final String JDBC_URL = "jdbc:sqlite:" + DB_NAME;

    private static final String CREATE_GAME_TABLE =
            "CREATE TABLE IF NOT EXISTS games (" +
            "id INTEGER PRIMARY KEY AUTOINCREMENT, " +
            "name TEXT NOT NULL, " +
            "price TEXT, " +
            "discount TEXT, " +
            "originalPrice TEXT, " +
            "releaseDate TEXT, " +
            "tags TEXT, " +
            "reviewScore TEXT, " +
            "crawlTime TIMESTAMP DEFAULT CURRENT_TIMESTAMP" +
            ")";

    private static final String CREATE_MOVIE_TABLE =
            "CREATE TABLE IF NOT EXISTS movies (" +
            "id INTEGER PRIMARY KEY AUTOINCREMENT, " +
            "title TEXT NOT NULL, " +
            "rating TEXT, " +
            "director TEXT, " +
            "actors TEXT, " +
            "year TEXT, " +
            "type TEXT, " +
            "crawlTime TIMESTAMP DEFAULT CURRENT_TIMESTAMP" +
            ")";

    private static final String CREATE_BOOK_TABLE =
            "CREATE TABLE IF NOT EXISTS books (" +
            "id INTEGER PRIMARY KEY AUTOINCREMENT, " +
            "title TEXT NOT NULL, " +
            "author TEXT, " +
            "publisher TEXT, " +
            "publishDate TEXT, " +
            "rating TEXT, " +
            "price TEXT, " +
            "crawlTime TIMESTAMP DEFAULT CURRENT_TIMESTAMP" +
            ")";

    private static DatabaseManager instance;

    private Connection connection;

    /**
     * Loads the SQLite JDBC driver, opens the connection (the database file is created
     * by the driver if it does not exist) and initializes the table structure.
     *
     * @throws RuntimeException wrapping the {@link ClassNotFoundException} or
     *                          {@link SQLException} if the driver or connection fails
     */
    private DatabaseManager() {
        try {
            Class.forName("org.sqlite.JDBC");
            connection = DriverManager.getConnection(JDBC_URL);
            System.out.println("✅ 数据库连接成功");
            initializeTables();
        } catch (ClassNotFoundException | SQLException e) {
            System.err.println("❌ 数据库连接失败: " + e.getMessage());
            throw new RuntimeException(e);
        }
    }

    /**
     * Returns the singleton instance, creating it on first call.
     *
     * @return the shared manager
     * @throws RuntimeException if the first-time connection setup fails
     */
    public static synchronized DatabaseManager getInstance() {
        if (instance == null) {
            instance = new DatabaseManager();
        }
        return instance;
    }

    /**
     * Returns the shared database connection, reopening it if it has been closed.
     *
     * <p>If reopening fails, the failure is logged and the previous (closed) connection
     * is returned, so callers see the usual {@link SQLException} on first use.
     *
     * @return the shared connection
     */
    public synchronized Connection getConnection() {
        try {
            if (connection == null || connection.isClosed()) {
                connection = DriverManager.getConnection(JDBC_URL);
                System.out.println("✅ 数据库连接成功");
            }
        } catch (SQLException e) {
            System.err.println("❌ 数据库连接失败: " + e.getMessage());
        }
        return connection;
    }

    /**
     * Creates the games, movies and books tables if they do not exist yet.
     * Failures are logged and otherwise ignored (best effort).
     */
    private void initializeTables() {
        try (Statement stmt = connection.createStatement()) {
            stmt.execute(CREATE_GAME_TABLE);
            stmt.execute(CREATE_MOVIE_TABLE);
            stmt.execute(CREATE_BOOK_TABLE);
            System.out.println("✅ 数据库表初始化完成");
        } catch (SQLException e) {
            System.err.println("❌ 表初始化失败: " + e.getMessage());
        }
    }

    /**
     * Closes the shared connection if one is open. Failures are logged, not thrown.
     */
    public synchronized void close() {
        if (connection == null) {
            return;
        }
        try {
            connection.close();
            System.out.println("✅ 数据库连接已关闭");
        } catch (SQLException e) {
            System.err.println("❌ 关闭数据库连接失败: " + e.getMessage());
        }
    }
}
||||
@ -0,0 +1,277 @@ |
|||||
|
package com.example; |
||||
|
|
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.io.BufferedWriter; |
||||
|
import java.io.FileWriter; |
||||
|
import java.io.IOException; |
||||
|
import java.sql.Connection; |
||||
|
import java.sql.PreparedStatement; |
||||
|
import java.sql.SQLException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
/** |
||||
|
* 豆瓣读书Top250爬虫 |
||||
|
* 课堂知识点:类与对象、封装、继承、多态、集合框架、异常处理、文件IO |
||||
|
*/ |
||||
|
public class DoubanBookCrawler extends Crawler { |
||||
|
|
||||
|
/** |
||||
|
* 爬取数据 - 重写父类方法 |
||||
|
* 课堂知识点:方法重写(多态) |
||||
|
*/ |
||||
|
@Override |
||||
|
public void crawl() { |
||||
|
crawlTop250(); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 打印结果 - 重写父类方法 |
||||
|
* 课堂知识点:方法重写(多态) |
||||
|
*/ |
||||
|
@Override |
||||
|
public void printResults() { |
||||
|
printBooks(); |
||||
|
} |
||||
|
|
||||
|
// 课堂知识点:集合框架 List
|
||||
|
private List<Book> books; |
||||
|
|
||||
|
/** |
||||
|
* 书籍数据模型 - 课堂知识点:内部类、封装 |
||||
|
*/ |
||||
|
public static class Book { |
||||
|
// 私有属性 - 课堂知识点:封装
|
||||
|
private int rank; |
||||
|
private String title; |
||||
|
private String author; |
||||
|
private String publisher; |
||||
|
private double rating; |
||||
|
private String coverUrl; |
||||
|
|
||||
|
// 构造方法
|
||||
|
public Book(int rank, String title, String author, String publisher, double rating, String coverUrl) { |
||||
|
this.rank = rank; |
||||
|
this.title = title; |
||||
|
this.author = author; |
||||
|
this.publisher = publisher; |
||||
|
this.rating = rating; |
||||
|
this.coverUrl = coverUrl; |
||||
|
} |
||||
|
|
||||
|
// getter方法 - 课堂知识点:封装的访问接口
|
||||
|
public int getRank() { return rank; } |
||||
|
public String getTitle() { return title; } |
||||
|
public String getAuthor() { return author; } |
||||
|
public String getPublisher() { return publisher; } |
||||
|
public double getRating() { return rating; } |
||||
|
public String getCoverUrl() { return coverUrl; } |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 构造方法 - 初始化书籍列表 |
||||
|
*/ |
||||
|
public DoubanBookCrawler() { |
||||
|
this.books = new ArrayList<>(); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 爬取豆瓣读书Top250 |
||||
|
*/ |
||||
|
public void crawlTop250() { |
||||
|
books.clear(); |
||||
|
System.out.println("\n========== 开始爬取豆瓣读书Top250 =========="); |
||||
|
|
||||
|
// 先检查网络状态 - 使用父类方法
|
||||
|
if (!isNetworkAvailable()) { |
||||
|
System.err.println("❌ 网络连接不可用!请检查网络设置"); |
||||
|
System.out.println("使用默认书籍数据..."); |
||||
|
loadDefaultBooks(); |
||||
|
setSuccess(true); |
||||
|
setDataCount(books.size()); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
try { |
||||
|
int page = 0; |
||||
|
int count = 0; |
||||
|
|
||||
|
// 课堂知识点:while循环
|
||||
|
while (count < 25 && page < 3) { |
||||
|
String url = "https://book.douban.com/top250?start=" + (page * 25); |
||||
|
System.out.println("正在爬取第" + (page + 1) + "页: " + url); |
||||
|
|
||||
|
// 使用父类的延迟方法
|
||||
|
delay(); |
||||
|
|
||||
|
Document doc = HttpCrawler.get(url); |
||||
|
|
||||
|
Elements bookItems = doc.select("tr.item"); |
||||
|
|
||||
|
// 课堂知识点:增强for循环
|
||||
|
for (Element item : bookItems) { |
||||
|
if (count >= 25) break; |
||||
|
|
||||
|
Element rankElement = item.selectFirst("td:nth-child(1) div"); |
||||
|
Element titleElement = item.selectFirst("td:nth-child(2) a"); |
||||
|
Element infoElement = item.selectFirst("td:nth-child(2) p.pl"); |
||||
|
Element ratingElement = item.selectFirst("td:nth-child(2) span.rating_nums"); |
||||
|
|
||||
|
if (titleElement != null) { |
||||
|
int rank = count + 1; |
||||
|
if (rankElement != null) { |
||||
|
try { |
||||
|
rank = Integer.parseInt(rankElement.text().trim()); |
||||
|
} catch (NumberFormatException e) { |
||||
|
rank = count + 1; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
String title = titleElement.attr("title").trim(); |
||||
|
if (title.isEmpty()) { |
||||
|
title = titleElement.text().trim().replaceAll("\\s+", " "); |
||||
|
} |
||||
|
|
||||
|
String author = ""; |
||||
|
String publisher = ""; |
||||
|
if (infoElement != null) { |
||||
|
String info = infoElement.text().trim(); |
||||
|
// 课堂知识点:字符串分割
|
||||
|
String[] parts = info.split("/"); |
||||
|
if (parts.length >= 2) { |
||||
|
author = parts[0].trim(); |
||||
|
if (parts.length >= 3) { |
||||
|
publisher = parts[parts.length - 2].trim(); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
double rating = 0.0; |
||||
|
if (ratingElement != null) { |
||||
|
try { |
||||
|
rating = Double.parseDouble(ratingElement.text().trim()); |
||||
|
} catch (NumberFormatException e) { |
||||
|
rating = 0.0; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (!title.isEmpty()) { |
||||
|
books.add(new Book(rank, title, author, publisher, rating, "")); |
||||
|
count++; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
page++; |
||||
|
System.out.println("第" + page + "页完成,已获取" + count + "本书"); |
||||
|
} |
||||
|
|
||||
|
System.out.println("爬取完成!共获取" + books.size() + "本书"); |
||||
|
|
||||
|
} catch (java.io.IOException e) { |
||||
|
System.err.println("❌ 爬取豆瓣读书失败: " + e.getMessage()); |
||||
|
System.out.println("使用默认书籍数据..."); |
||||
|
loadDefaultBooks(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 加载默认书籍数据(离线备用) |
||||
|
*/ |
||||
|
private void loadDefaultBooks() { |
||||
|
System.out.println("使用默认书籍数据..."); |
||||
|
// 课堂知识点:数组初始化
|
||||
|
String[] titles = {"活着", "百年孤独", "1984", "三体", "红楼梦", "围城", "追风筝的人", "小王子", "解忧杂货店", "白夜行"}; |
||||
|
String[] authors = {"余华", "加西亚·马尔克斯", "乔治·奥威尔", "刘慈欣", "曹雪芹", "钱钟书", "卡勒德·胡赛尼", "圣埃克苏佩里", "东野圭吾", "东野圭吾"}; |
||||
|
String[] publishers = {"作家出版社", "南海出版公司", "译林出版社", "重庆出版社", "人民文学出版社", "人民文学出版社", "上海文艺出版社", "人民文学出版社", "南海出版公司", "南海出版公司"}; |
||||
|
double[] ratings = {9.4, 9.3, 9.3, 9.2, 9.6, 9.3, 8.9, 9.1, 8.5, 9.1}; |
||||
|
|
||||
|
// 课堂知识点:for循环遍历数组
|
||||
|
for (int i = 0; i < titles.length; i++) { |
||||
|
books.add(new Book(i + 1, titles[i], authors[i], publishers[i], ratings[i], "")); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 打印书籍列表 - 课堂知识点:视图展示(MVC中的View) |
||||
|
*/ |
||||
|
public void printBooks() { |
||||
|
System.out.println("\n========== 豆瓣读书Top25精选 =========="); |
||||
|
for (Book book : books) { |
||||
|
System.out.printf("%3d. 《%s》\n", book.getRank(), book.getTitle()); |
||||
|
System.out.println(" 作者: " + book.getAuthor()); |
||||
|
System.out.println(" 出版社: " + book.getPublisher()); |
||||
|
System.out.println(" 评分: " + book.getRating()); |
||||
|
System.out.println(); |
||||
|
} |
||||
|
System.out.println("====================================="); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 保存书籍数据到文件 - 课堂知识点:文件IO |
||||
|
* @param filename 文件名 |
||||
|
* @throws IOException 写入异常 |
||||
|
*/ |
||||
|
public void saveToFile(String filename) throws IOException { |
||||
|
// 课堂知识点:try-with-resources(自动关闭资源)
|
||||
|
try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename))) { |
||||
|
writer.write("排名|书名|作者|出版社|评分"); |
||||
|
writer.newLine(); |
||||
|
writer.write("====================================="); |
||||
|
writer.newLine(); |
||||
|
|
||||
|
for (Book book : books) { |
||||
|
writer.write(String.format("%d|%s|%s|%s|%.1f", |
||||
|
book.getRank(), |
||||
|
book.getTitle(), |
||||
|
book.getAuthor(), |
||||
|
book.getPublisher(), |
||||
|
book.getRating())); |
||||
|
writer.newLine(); |
||||
|
} |
||||
|
|
||||
|
System.out.println("✅ 书籍数据已保存到文件: " + filename); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 获取书籍列表 |
||||
|
* @return 书籍列表 |
||||
|
*/ |
||||
|
public List<Book> getBooks() { |
||||
|
return books; |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 保存书籍数据到数据库 - 课堂知识点:JDBC、数据库持久化 |
||||
|
*/ |
||||
|
@Override |
||||
|
public void saveToDatabase() { |
||||
|
String sql = |
||||
|
"INSERT INTO books (title, author, publisher, publishDate, rating, price, crawlTime) " + |
||||
|
"VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)"; |
||||
|
|
||||
|
try (Connection conn = DatabaseManager.getInstance().getConnection(); |
||||
|
PreparedStatement pstmt = conn.prepareStatement(sql)) { |
||||
|
|
||||
|
for (Book book : books) { |
||||
|
pstmt.setString(1, book.getTitle()); |
||||
|
pstmt.setString(2, book.getAuthor()); |
||||
|
pstmt.setString(3, book.getPublisher()); |
||||
|
pstmt.setString(4, null); // publishDate
|
||||
|
pstmt.setString(5, String.valueOf(book.getRating())); |
||||
|
pstmt.setString(6, null); // price
|
||||
|
|
||||
|
pstmt.addBatch(); |
||||
|
} |
||||
|
|
||||
|
int[] rowsAffected = pstmt.executeBatch(); |
||||
|
System.out.println("✅ " + rowsAffected.length + " 条书籍数据已保存到数据库"); |
||||
|
} catch (SQLException e) { |
||||
|
System.err.println("❌ 保存书籍数据到数据库失败: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,277 @@ |
|||||
|
package com.example; |
||||
|
|
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.io.BufferedWriter; |
||||
|
import java.io.FileWriter; |
||||
|
import java.io.IOException; |
||||
|
import java.sql.Connection; |
||||
|
import java.sql.PreparedStatement; |
||||
|
import java.sql.SQLException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
/** |
||||
|
* 豆瓣电影Top250爬虫 |
||||
|
* 课堂知识点:类与对象、封装、继承、多态、集合框架、异常处理、文件IO |
||||
|
*/ |
||||
|
public class DoubanMovieCrawler extends Crawler { |
||||
|
|
||||
|
/** |
||||
|
* 爬取数据 - 重写父类方法 |
||||
|
* 课堂知识点:方法重写(多态) |
||||
|
*/ |
||||
|
@Override |
||||
|
public void crawl() { |
||||
|
crawlTop250(); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 打印结果 - 重写父类方法 |
||||
|
* 课堂知识点:方法重写(多态) |
||||
|
*/ |
||||
|
@Override |
||||
|
public void printResults() { |
||||
|
printMovies(); |
||||
|
} |
||||
|
|
||||
|
// 课堂知识点:集合框架 List
|
||||
|
private List<Movie> movies; |
||||
|
|
||||
|
/** |
||||
|
* 电影数据模型 - 课堂知识点:内部类、封装 |
||||
|
*/ |
||||
|
public static class Movie { |
||||
|
// 私有属性 - 课堂知识点:封装
|
||||
|
private int rank; |
||||
|
private String title; |
||||
|
private String year; |
||||
|
private String director; |
||||
|
private double rating; |
||||
|
|
||||
|
// 构造方法
|
||||
|
public Movie(int rank, String title, String year, String director, double rating) { |
||||
|
this.rank = rank; |
||||
|
this.title = title; |
||||
|
this.year = year; |
||||
|
this.director = director; |
||||
|
this.rating = rating; |
||||
|
} |
||||
|
|
||||
|
// getter方法 - 课堂知识点:封装的访问接口
|
||||
|
public int getRank() { return rank; } |
||||
|
public String getTitle() { return title; } |
||||
|
public String getYear() { return year; } |
||||
|
public String getDirector() { return director; } |
||||
|
public double getRating() { return rating; } |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 构造方法 - 初始化电影列表 |
||||
|
*/ |
||||
|
public DoubanMovieCrawler() { |
||||
|
this.movies = new ArrayList<>(); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 爬取豆瓣电影Top250 |
||||
|
*/ |
||||
|
public void crawlTop250() { |
||||
|
movies.clear(); |
||||
|
System.out.println("\n========== 开始爬取豆瓣电影Top250 =========="); |
||||
|
|
||||
|
// 先检查网络状态 - 使用父类方法
|
||||
|
if (!isNetworkAvailable()) { |
||||
|
System.err.println("❌ 网络连接不可用!请检查网络设置"); |
||||
|
System.out.println("使用默认电影数据..."); |
||||
|
loadDefaultMovies(); |
||||
|
setSuccess(true); |
||||
|
setDataCount(movies.size()); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
try { |
||||
|
// 课堂知识点:for循环
|
||||
|
for (int page = 0; page < 10; page++) { |
||||
|
if (movies.size() >= 250) break; |
||||
|
|
||||
|
int start = page * 25; |
||||
|
String url = "https://movie.douban.com/top250?start=" + start; |
||||
|
System.out.println("正在爬取第" + (page + 1) + "页: " + url); |
||||
|
|
||||
|
// 使用父类的延迟方法
|
||||
|
delay(); |
||||
|
|
||||
|
Document doc = HttpCrawler.get(url); |
||||
|
|
||||
|
Elements items = doc.select("ol.grid_view li"); |
||||
|
|
||||
|
// 课堂知识点:增强for循环
|
||||
|
for (Element item : items) { |
||||
|
// 排名
|
||||
|
int rank = Integer.parseInt(item.selectFirst("em").text()); |
||||
|
|
||||
|
// 电影名
|
||||
|
String title = item.selectFirst("span.title").text(); |
||||
|
|
||||
|
// 年份和导演信息
|
||||
|
String info = item.selectFirst("div.bd p").text(); |
||||
|
String year = extractYear(info); |
||||
|
String director = extractDirector(info); |
||||
|
|
||||
|
// 评分
|
||||
|
double rating = Double.parseDouble(item.selectFirst("span.rating_num").text()); |
||||
|
|
||||
|
// 创建电影对象并添加到列表
|
||||
|
movies.add(new Movie(rank, title, year, director, rating)); |
||||
|
} |
||||
|
|
||||
|
System.out.println("第" + (page + 1) + "页完成,已获取" + movies.size() + "部电影"); |
||||
|
} |
||||
|
|
||||
|
System.out.println("爬取完成!共获取" + movies.size() + "部电影"); |
||||
|
|
||||
|
} catch (java.io.IOException e) { |
||||
|
System.err.println("❌ 爬取豆瓣电影Top250失败: " + e.getMessage()); |
||||
|
System.out.println("使用默认电影数据..."); |
||||
|
loadDefaultMovies(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 加载默认电影数据(离线备用) |
||||
|
*/ |
||||
|
private void loadDefaultMovies() { |
||||
|
// 课堂知识点:数组初始化
|
||||
|
String[] titles = {"肖申克的救赎", "霸王别姬", "阿甘正传", "泰坦尼克号", "盗梦空间"}; |
||||
|
String[] years = {"1994", "1993", "1994", "1997", "2010"}; |
||||
|
String[] directors = {"弗兰克·德拉邦特", "陈凯歌", "罗伯特·泽米吉斯", "詹姆斯·卡梅隆", "克里斯托弗·诺兰"}; |
||||
|
double[] ratings = {9.7, 9.6, 9.5, 9.4, 9.3}; |
||||
|
|
||||
|
// 课堂知识点:for循环遍历数组
|
||||
|
for (int i = 0; i < titles.length; i++) { |
||||
|
movies.add(new Movie(i + 1, titles[i], years[i], directors[i], ratings[i])); |
||||
|
} |
||||
|
System.out.println("已加载" + movies.size() + "部默认电影数据"); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 从信息字符串中提取年份 |
||||
|
* @param info 电影信息字符串 |
||||
|
* @return 年份 |
||||
|
*/ |
||||
|
private String extractYear(String info) { |
||||
|
int start = info.indexOf("("); |
||||
|
int end = info.indexOf(")"); |
||||
|
if (start != -1 && end != -1) { |
||||
|
String yearStr = info.substring(start + 1, end); |
||||
|
// 提取年份数字
|
||||
|
StringBuilder year = new StringBuilder(); |
||||
|
for (char c : yearStr.toCharArray()) { |
||||
|
if (Character.isDigit(c)) { |
||||
|
year.append(c); |
||||
|
} |
||||
|
} |
||||
|
return year.toString(); |
||||
|
} |
||||
|
return "未知年份"; |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 从信息字符串中提取导演 |
||||
|
* @param info 电影信息字符串 |
||||
|
* @return 导演姓名 |
||||
|
*/ |
||||
|
private String extractDirector(String info) { |
||||
|
int start = info.indexOf("导演:"); |
||||
|
if (start != -1) { |
||||
|
String remaining = info.substring(start + 3); |
||||
|
int end = remaining.indexOf("主演:"); |
||||
|
if (end != -1) { |
||||
|
return remaining.substring(0, end).trim(); |
||||
|
} |
||||
|
return remaining.trim(); |
||||
|
} |
||||
|
return "未知导演"; |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 打印电影列表 - 课堂知识点:视图展示(MVC中的View) |
||||
|
*/ |
||||
|
public void printMovies() { |
||||
|
System.out.println("\n========== 豆瓣电影Top250榜单 =========="); |
||||
|
for (Movie movie : movies) { |
||||
|
System.out.printf("%2d. 《%s》 - %s年 - 导演: %s - 评分: %.1f\n", |
||||
|
movie.getRank(), movie.getTitle(), movie.getYear(), |
||||
|
movie.getDirector(), movie.getRating()); |
||||
|
} |
||||
|
System.out.println("=========================================="); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 保存电影数据到文件 - 课堂知识点:文件IO |
||||
|
* @param filename 文件名 |
||||
|
* @throws IOException 写入异常 |
||||
|
*/ |
||||
|
public void saveToFile(String filename) throws IOException { |
||||
|
// 课堂知识点:try-with-resources(自动关闭资源)
|
||||
|
try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename))) { |
||||
|
writer.write("排名|电影名|年份|导演|评分"); |
||||
|
writer.newLine(); |
||||
|
writer.write("=========================================="); |
||||
|
writer.newLine(); |
||||
|
|
||||
|
for (Movie movie : movies) { |
||||
|
writer.write(String.format("%d|%s|%s|%s|%.1f", |
||||
|
movie.getRank(), |
||||
|
movie.getTitle(), |
||||
|
movie.getYear(), |
||||
|
movie.getDirector(), |
||||
|
movie.getRating())); |
||||
|
writer.newLine(); |
||||
|
} |
||||
|
|
||||
|
System.out.println("✅ 电影数据已保存到文件: " + filename); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 获取电影列表 |
||||
|
* @return 电影列表 |
||||
|
*/ |
||||
|
public List<Movie> getMovies() { |
||||
|
return movies; |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 保存电影数据到数据库 - 课堂知识点:JDBC、数据库持久化 |
||||
|
*/ |
||||
|
@Override |
||||
|
public void saveToDatabase() { |
||||
|
String sql = |
||||
|
"INSERT INTO movies (title, rating, director, actors, year, type, crawlTime) " + |
||||
|
"VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)"; |
||||
|
|
||||
|
try (Connection conn = DatabaseManager.getInstance().getConnection(); |
||||
|
PreparedStatement pstmt = conn.prepareStatement(sql)) { |
||||
|
|
||||
|
for (Movie movie : movies) { |
||||
|
pstmt.setString(1, movie.getTitle()); |
||||
|
pstmt.setString(2, String.valueOf(movie.getRating())); |
||||
|
pstmt.setString(3, movie.getDirector()); |
||||
|
pstmt.setString(4, null); // actors
|
||||
|
pstmt.setString(5, movie.getYear()); |
||||
|
pstmt.setString(6, "电影"); |
||||
|
|
||||
|
pstmt.addBatch(); |
||||
|
} |
||||
|
|
||||
|
int[] rowsAffected = pstmt.executeBatch(); |
||||
|
System.out.println("✅ " + rowsAffected.length + " 条电影数据已保存到数据库"); |
||||
|
} catch (SQLException e) { |
||||
|
System.err.println("❌ 保存电影数据到数据库失败: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,181 @@ |
|||||
|
package com.example; |
||||
|
|
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.net.InetAddress; |
||||
|
|
||||
|
/** |
||||
|
* HTTP爬虫工具类 - 提供带重试机制的网络请求 |
||||
|
* 课堂知识点:单例模式、静态方法、异常处理 |
||||
|
*/ |
||||
|
public class HttpCrawler { |
||||
|
|
||||
|
/** |
||||
|
* 单例实例 - 课堂知识点:单例模式 |
||||
|
*/ |
||||
|
private static HttpCrawler instance; |
||||
|
|
||||
|
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"; |
||||
|
private static final int DEFAULT_TIMEOUT = 30000; |
||||
|
private static final int DEFAULT_RETRY_COUNT = 3; |
||||
|
private static final long RETRY_DELAY_MS = 2000; |
||||
|
|
||||
|
/** |
||||
|
* 私有构造方法 - 防止外部实例化(单例模式) |
||||
|
*/ |
||||
|
private HttpCrawler() { |
||||
|
// 私有构造,确保只能通过getInstance()获取实例
|
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 获取单例实例 - 课堂知识点:单例模式 |
||||
|
* @return HttpCrawler实例 |
||||
|
*/ |
||||
|
public static HttpCrawler getInstance() { |
||||
|
if (instance == null) { |
||||
|
synchronized (HttpCrawler.class) { |
||||
|
if (instance == null) { |
||||
|
instance = new HttpCrawler(); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
return instance; |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 检查网络连接是否可用 |
||||
|
*/ |
||||
|
public static boolean isNetworkAvailable() { |
||||
|
try { |
||||
|
// 尝试连接公共DNS服务器
|
||||
|
InetAddress address = InetAddress.getByName("8.8.8.8"); |
||||
|
return address.isReachable(3000); |
||||
|
} catch (IOException e) { |
||||
|
return false; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 检查特定网站是否可访问 |
||||
|
*/ |
||||
|
public static boolean isWebsiteReachable(String url) { |
||||
|
try { |
||||
|
String domain = extractDomain(url); |
||||
|
InetAddress address = InetAddress.getByName(domain); |
||||
|
return address.isReachable(5000); |
||||
|
} catch (Exception e) { |
||||
|
return false; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private static String extractDomain(String url) { |
||||
|
if (url == null || url.isEmpty()) { |
||||
|
return ""; |
||||
|
} |
||||
|
// 移除协议部分
|
||||
|
String domain = url.replaceFirst("^https?://", ""); |
||||
|
// 移除路径部分
|
||||
|
int slashIndex = domain.indexOf('/'); |
||||
|
if (slashIndex > 0) { |
||||
|
domain = domain.substring(0, slashIndex); |
||||
|
} |
||||
|
// 移除端口部分
|
||||
|
int colonIndex = domain.indexOf(':'); |
||||
|
if (colonIndex > 0) { |
||||
|
domain = domain.substring(0, colonIndex); |
||||
|
} |
||||
|
return domain; |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 发送HTTP GET请求,带重试机制 |
||||
|
*/ |
||||
|
public static Document get(String url) throws IOException { |
||||
|
return get(url, DEFAULT_TIMEOUT, DEFAULT_RETRY_COUNT); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 发送HTTP GET请求,带重试机制 |
||||
|
* @param url 请求URL |
||||
|
* @param timeout 超时时间(毫秒) |
||||
|
* @param maxRetries 最大重试次数 |
||||
|
*/ |
||||
|
public static Document get(String url, int timeout, int maxRetries) throws IOException { |
||||
|
IOException lastException = null; |
||||
|
|
||||
|
// 课堂知识点:for循环
|
||||
|
for (int attempt = 1; attempt <= maxRetries; attempt++) { |
||||
|
try { |
||||
|
System.out.println("[请求尝试 " + attempt + "/" + maxRetries + "] " + url); |
||||
|
|
||||
|
Document doc = Jsoup.connect(url) |
||||
|
.userAgent(USER_AGENT) |
||||
|
.header("Accept-Language", "zh-CN,zh;q=0.9") |
||||
|
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") |
||||
|
.header("Connection", "keep-alive") |
||||
|
.timeout(timeout) |
||||
|
.followRedirects(true) |
||||
|
.get(); |
||||
|
|
||||
|
System.out.println("[请求成功] " + url); |
||||
|
return doc; |
||||
|
|
||||
|
} catch (java.net.SocketTimeoutException e) { |
||||
|
lastException = e; |
||||
|
System.err.println("[请求超时 " + attempt + "/" + maxRetries + "] " + url); |
||||
|
printNetworkTips(); |
||||
|
} catch (java.net.ConnectException e) { |
||||
|
lastException = e; |
||||
|
System.err.println("[连接失败 " + attempt + "/" + maxRetries + "] " + url); |
||||
|
printNetworkTips(); |
||||
|
} catch (java.net.UnknownHostException e) { |
||||
|
lastException = e; |
||||
|
System.err.println("[域名解析失败 " + attempt + "/" + maxRetries + "] " + url); |
||||
|
printNetworkTips(); |
||||
|
} catch (IOException e) { |
||||
|
lastException = e; |
||||
|
System.err.println("[请求异常 " + attempt + "/" + maxRetries + "] " + url + ": " + e.getMessage()); |
||||
|
} |
||||
|
|
||||
|
// 如果不是最后一次尝试,等待后重试
|
||||
|
if (attempt < maxRetries) { |
||||
|
try { |
||||
|
System.out.println("等待 " + RETRY_DELAY_MS + "ms 后重试..."); |
||||
|
Thread.sleep(RETRY_DELAY_MS); |
||||
|
} catch (InterruptedException ie) { |
||||
|
Thread.currentThread().interrupt(); |
||||
|
throw new IOException("请求被中断", ie); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 所有重试都失败
|
||||
|
throw new IOException("请求失败,已重试 " + maxRetries + " 次: " + url, lastException); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 打印网络连接提示 |
||||
|
*/ |
||||
|
public static void printNetworkTips() { |
||||
|
System.err.println("====================================="); |
||||
|
System.err.println("网络连接问题排查:"); |
||||
|
System.err.println(" 1. 请检查您的网络连接是否正常"); |
||||
|
System.err.println(" 2. 尝试访问其他网站测试网络"); |
||||
|
System.err.println(" 3. 如果访问境外网站,可能需要VPN"); |
||||
|
System.err.println(" 4. 检查防火墙是否阻止了连接"); |
||||
|
System.err.println(" 5. 检查DNS设置是否正确"); |
||||
|
System.err.println("====================================="); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 打印网络恢复提示 |
||||
|
*/ |
||||
|
public static void printNetworkRecovery() { |
||||
|
System.out.println("====================================="); |
||||
|
System.out.println("✓ 网络连接已恢复!"); |
||||
|
System.out.println("✓ 正在重新尝试获取数据..."); |
||||
|
System.out.println("====================================="); |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue