Browse Source

爬虫项目报告

main
Luojiale 3 weeks ago
parent
commit
a830272dba
  1. 39
      project/.gitignore
  2. 10
      project/.idea/.gitignore
  3. 7
      project/.idea/encodings.xml
  4. 14
      project/.idea/misc.xml
  5. 87
      project/README.md
  6. 96
      project/build.log
  7. 92
      project/build_log.txt
  8. BIN
      project/build_output.txt
  9. 22
      project/crawler.properties
  10. 162
      project/data/books_to_scrape.json
  11. 82
      project/data/douban_books.json
  12. 62
      project/data/douban_movies.json
  13. 250
      project/douban_top250.txt
  14. 9002
      project/output/BooksToScrape_20260531_233112.json
  15. 5006
      project/output/BooksToScrape_20260531_233112.txt
  16. 74
      project/pom.xml
  17. 24
      project/run.bat
  18. 1
      project/src/main/java/CrawlerManager.java
  19. 133
      project/src/main/java/cli/CrawlerCLI.java
  20. 34
      project/src/main/java/com/crawler/Main.java
  21. 8
      project/src/main/java/com/crawler/command/Command.java
  22. 37
      project/src/main/java/com/crawler/command/CrawlAllCommand.java
  23. 28
      project/src/main/java/com/crawler/command/CrawlCommand.java
  24. 119
      project/src/main/java/com/crawler/controller/CrawlerController.java
  25. 11
      project/src/main/java/com/crawler/exception/CrawlerException.java
  26. 11
      project/src/main/java/com/crawler/exception/FileException.java
  27. 11
      project/src/main/java/com/crawler/exception/NetworkException.java
  28. 11
      project/src/main/java/com/crawler/exception/ParseException.java
  29. 105
      project/src/main/java/com/crawler/model/Book.java
  30. 96
      project/src/main/java/com/crawler/model/Movie.java
  31. 69
      project/src/main/java/com/crawler/model/ScrapeBook.java
  32. 72
      project/src/main/java/com/crawler/strategy/BooksToScrapeStrategy.java
  33. 9
      project/src/main/java/com/crawler/strategy/CrawlerStrategy.java
  34. 69
      project/src/main/java/com/crawler/strategy/DoubanBookStrategy.java
  35. 74
      project/src/main/java/com/crawler/strategy/DoubanMovieStrategy.java
  36. 60
      project/src/main/java/com/crawler/util/FileUtil.java
  37. 24
      project/src/main/java/com/crawler/util/HttpUtil.java
  38. 59
      project/src/main/java/com/crawler/view/ConsoleView.java
  39. 38
      project/src/main/java/command/ClearCommand.java
  40. 11
      project/src/main/java/command/Command.java
  41. 41
      project/src/main/java/command/CommandRegistry.java
  42. 37
      project/src/main/java/command/ListCrawlersCommand.java
  43. 59
      project/src/main/java/command/RunAllCommand.java
  44. 37
      project/src/main/java/command/RunSingleCommand.java
  45. 38
      project/src/main/java/command/StatsCommand.java
  46. 73
      project/src/main/java/config/CrawlerConfig.java
  47. 177
      project/src/main/java/controller/CrawlerController.java
  48. 139
      project/src/main/java/crawler/BaseCrawler.java
  49. 113
      project/src/main/java/crawler/DoubanCrawler.java
  50. 100
      project/src/main/java/crawler/ImdbCrawler.java
  51. 92
      project/src/main/java/crawler/MaoyanCrawler.java
  52. 102
      project/src/main/java/crawler/RottenTomatoesCrawler.java
  53. 55
      project/src/main/java/exception/CrawlerException.java
  54. 103
      project/src/main/java/exception/CrawlerResult.java
  55. 20
      project/src/main/java/exception/NetworkException.java
  56. 20
      project/src/main/java/exception/ParseException.java
  57. 20
      project/src/main/java/exception/StorageException.java
  58. 20
      project/src/main/java/exception/ValidationException.java
  59. 236
      project/src/main/java/main/CrawlerManager.java
  60. 86
      project/src/main/java/model/Book.java
  61. 78
      project/src/main/java/model/Movie.java
  62. 34
      project/src/main/java/storage/DataStorage.java
  63. 237
      project/src/main/java/storage/FileStorage.java
  64. 25
      project/src/main/java/storage/LocalDateTimeAdapter.java
  65. 414
      project/src/main/java/storage/SQLiteStorage.java
  66. 23
      project/src/main/java/storage/StorageStats.java
  67. 115
      project/src/main/java/strategy/AbstractBookCrawlerStrategy.java
  68. 114
      project/src/main/java/strategy/AbstractCrawlerStrategy.java
  69. 15
      project/src/main/java/strategy/BookCrawlerStrategy.java
  70. 16
      project/src/main/java/strategy/CrawlerStrategy.java
  71. 116
      project/src/main/java/strategy/impl/BooksToScrapeStrategy.java
  72. 159
      project/src/main/java/strategy/impl/DoubanBookStrategy.java
  73. 111
      project/src/main/java/strategy/impl/DoubanStrategy.java
  74. 94
      project/src/main/java/strategy/impl/MaoyanStrategy.java
  75. 105
      project/src/main/java/strategy/impl/RottenTomatoesStrategy.java
  76. 54
      project/src/main/java/util/Logger.java
  77. 109
      project/src/main/java/view/CrawlerView.java
  78. 88
      project/src/test/TestMain.java/TestMain.java
  79. BIN
      project/项目报告v1(1).docx
  80. 27
      w1/BMICalculator.java
  81. 0
      w1/Student.java
  82. 4
      w5/ShapeTest.java
  83. 4
      w6/AnimalTest.java
  84. 4
      爬虫/DoubanMovieCrawler.java

39
project/.gitignore

@ -0,0 +1,39 @@
target/
!.mvn/wrapper/maven-wrapper.jar
!**/src/main/**/target/
!**/src/test/**/target/
.kotlin
### IntelliJ IDEA ###
.idea/modules.xml
.idea/jarRepositories.xml
.idea/compiler.xml
.idea/libraries/
*.iws
*.iml
*.ipr
### Eclipse ###
.apt_generated
.classpath
.factorypath
.project
.settings
.springBeans
.sts4-cache
### NetBeans ###
/nbproject/private/
/nbbuild/
/dist/
/nbdist/
/.nb-gradle/
build/
!**/src/main/**/build/
!**/src/test/**/build/
### VS Code ###
.vscode/
### Mac OS ###
.DS_Store

10
project/.idea/.gitignore

@ -0,0 +1,10 @@
# 默认忽略的文件
/shelf/
/workspace.xml
# 已忽略包含查询文件的默认文件夹
/queries/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# 基于编辑器的 HTTP 客户端请求
/httpRequests/

7
project/.idea/encodings.xml

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding">
<file url="file://$PROJECT_DIR$/src/main/Douban.java" charset="UTF-8" />
<file url="file://$PROJECT_DIR$/src/main/resources" charset="UTF-8" />
</component>
</project>

14
project/.idea/misc.xml

@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ExternalStorageConfigurationManager" enabled="true" />
<component name="MavenProjectsManager">
<option name="originalFiles">
<list>
<option value="$PROJECT_DIR$/pom.xml" />
</list>
</option>
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="25" project-jdk-type="JavaSDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
</project>

87
project/README.md

@ -0,0 +1,87 @@
# Web Crawler Application
基于 Java 的多网站爬虫应用,采用 MVC、Command 模式、策略模式和完整的异常体系设计。
## 功能特性
- 爬取豆瓣电影排行榜
- 爬取豆瓣读书排行榜
- 爬取 Books to Scrape 网站
- 数据保存为 JSON 格式文件
- 支持交互式和命令行模式
## 项目架构
### 设计模式
1. **MVC 模式**
- Model: `Movie`, `Book`, `ScrapeBook`
- View: `ConsoleView`
- Controller: `CrawlerController`
2. **策略模式 (Strategy Pattern)**
- `CrawlerStrategy` 接口
- `DoubanMovieStrategy` - 豆瓣电影策略
- `DoubanBookStrategy` - 豆瓣读书策略
- `BooksToScrapeStrategy` - Books to Scrape 策略
3. **命令模式 (Command Pattern)**
- `Command` 接口
- `CrawlCommand` - 单个爬虫命令
- `CrawlAllCommand` - 组合命令,执行所有爬虫
4. **异常体系**
- `CrawlerException` - 基类异常
- `NetworkException` - 网络异常
- `ParseException` - 解析异常
- `FileException` - 文件操作异常
## 使用方法
### 编译项目
```bash
mvn clean package
```
### 运行方式
#### 1. 交互式模式
```bash
java -jar target/web-crawler-1.0-SNAPSHOT.jar -i
```
#### 2. 命令行模式
爬取所有网站:
```bash
java -jar target/web-crawler-1.0-SNAPSHOT.jar
```
爬取指定网站:
```bash
java -jar target/web-crawler-1.0-SNAPSHOT.jar -s douban-movie
java -jar target/web-crawler-1.0-SNAPSHOT.jar -s douban-book
java -jar target/web-crawler-1.0-SNAPSHOT.jar -s books-to-scrape
```
查看帮助:
```bash
java -jar target/web-crawler-1.0-SNAPSHOT.jar --help
```
## 输出文件
爬取的数据将保存到 `data/` 目录下:
- `douban_movies.json` - 豆瓣电影数据
- `douban_books.json` - 豆瓣读书数据
- `books_to_scrape.json` - Books to Scrape 数据
## 依赖项
- Jsoup - HTML 解析
- Gson - JSON 处理
- Picocli - 命令行解析
- SLF4J - 日志框架

96
project/build.log

@ -0,0 +1,96 @@
mvn : WARNING: A restricted method in java.lang.System has been called
所在位置 行:1 字符: 72
+ ... 嘻哈哈\Git\java爬虫\TestMaven"; mvn clean package -DskipTests 2>&1 | Out-F ...
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ CategoryInfo : NotSpecified: (WARNING: A rest...has been called
:String) [], RemoteException
+ FullyQualifiedErrorId : NativeCommandError
WARNING: java.lang.System::load has been called by org.fusesource.jansi.interna
l.JansiLoader in an unnamed module (file:/D:/嘻嘻哈哈/Git/java/apache-maven-3.9.6/l
ib/jansi-2.4.0.jar)
WARNING: Use --enable-native-access=ALL-UNNAMED to avoid a warning for callers
in this module
WARNING: Restricted methods will be blocked in a future release unless native a
ccess is enabled
WARNING: A terminally deprecated method in sun.misc.Unsafe has been called
WARNING: sun.misc.Unsafe::objectFieldOffset has been called by com.google.commo
n.util.concurrent.AbstractFuture$UnsafeAtomicHelper (file:/D:/嘻嘻哈哈/Git/java/apa
che-maven-3.9.6/lib/guava-32.0.1-jre.jar)
WARNING: Please consider reporting this to the maintainers of class com.google.
common.util.concurrent.AbstractFuture$UnsafeAtomicHelper
WARNING: sun.misc.Unsafe::objectFieldOffset will be removed in a future release
[INFO] Scanning for projects...
[INFO]
[INFO] -----------------------< com.example:TestMaven >------------------------
[INFO] Building TestMaven 1.0-SNAPSHOT
[INFO] from pom.xml
[INFO] --------------------------------[ jar ]---------------------------------
[INFO]
[INFO] --- clean:3.2.0:clean (default-clean) @ TestMaven ---
[INFO] Deleting D:\嘻嘻哈哈\Git\java爬虫\TestMaven\target
[INFO]
[INFO] --- resources:3.3.1:resources (default-resources) @ TestMaven ---
[INFO] Copying 0 resource from src\main\resources to target\classes
[INFO]
[INFO] --- compiler:3.11.0:compile (default-compile) @ TestMaven ---
[INFO] Changes detected - recompiling the module! :source
[INFO] Compiling 41 source files with javac [debug target 8] to target\classes
[INFO] -------------------------------------------------------------
[WARNING] COMPILATION WARNING :
[INFO] -------------------------------------------------------------
[WARNING] 鏈笌 -source 8 涓€璧疯缃紩瀵肩被璺緞
涓嶈缃紩瀵肩被璺緞鍙兘浼氬鑷寸被鏂囦欢鏃犳硶鍦?JDK 8 涓婅繍琛? 寤鸿浣跨敤 --release 8 鑰屼笉鏄?-source 8 -target 8锛屽洜涓哄畠浼氳嚜鍔ㄨ缃紩瀵肩被璺緞
[WARNING] 婧愬€?8 宸茶繃鏃讹紝灏嗗湪鏈潵鍙戣鐗堜腑鍒犻櫎
[WARNING] 鐩爣鍊?8 宸茶繃鏃讹紝灏嗗湪鏈潵鍙戣鐗堜腑鍒犻櫎
[WARNING] 瑕侀殣钘忔湁鍏冲凡杩囨椂閫夐」鐨勮鍛? 璇蜂娇鐢?-Xlint:-options銆?
[WARNING] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[106,13] 浠庡彂琛岀増 10 寮€濮嬶紝'var' 鏄彈闄愮被鍨嬪悕绉帮紝涓嶈兘鐢ㄤ簬绫诲瀷澹版槑锛屼篃涓嶈兘鐢ㄤ綔鏁扮粍鐨勫厓绱犵被鍨?
[WARNING] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[112,13] 浠庡彂琛岀増 10 寮€濮嬶紝'var' 鏄彈闄愮被鍨嬪悕绉帮紝涓嶈兘鐢ㄤ簬绫诲瀷澹版槑锛屼篃涓嶈兘鐢ㄤ綔鏁扮粍鐨勫厓绱犵被鍨?
[WARNING] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[118,13] 浠庡彂琛岀増 10 寮€濮嬶紝'var' 鏄彈闄愮被鍨嬪悕绉帮紝涓嶈兘鐢ㄤ簬绫诲瀷澹版槑锛屼篃涓嶈兘鐢ㄤ綔鏁扮粍鐨勫厓绱犵被鍨?
[WARNING] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[124,13] 浠庡彂琛岀増 10 寮€濮嬶紝'var' 鏄彈闄愮被鍨嬪悕绉帮紝涓嶈兘鐢ㄤ簬绫诲瀷澹版槑锛屼篃涓嶈兘鐢ㄤ綔鏁扮粍鐨勫厓绱犵被鍨?
[INFO] 8 warnings
[INFO] -------------------------------------------------------------
[INFO] -------------------------------------------------------------
[ERROR] COMPILATION ERROR :
[INFO] -------------------------------------------------------------
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/main/CrawlerManager.java:[178,20] main.CrawlerManager.MultiStorage不是抽象的, 并且未覆盖storage.DataStorage中的抽象方法findBooksBySource(java.lang.String)
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[106,9] 找不到符号 符号: 类 var
位置: 类 cli.CrawlerCLI
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[112,9] 找不到符号 符号: 类 var
位置: 类 cli.CrawlerCLI
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[118,9] 找不到符号 符号: 类 var
位置: 类 cli.CrawlerCLI
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[124,9] 找不到符号 符号: 类 var
位置: 类 cli.CrawlerCLI
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/storage/SQLiteStorage.java:[12,8] storage.SQLiteStorage不是抽象的, 并且未覆盖storage.DataStorage中的抽象方法findBooksBySource(java.lang.String)
[INFO] 6 errors
[INFO] -------------------------------------------------------------
[INFO] ------------------------------------------------------------------------
[INFO] BUILD FAILURE
[INFO] ------------------------------------------------------------------------
[INFO] Total time: 15.493 s
[INFO] Finished at: 2026-05-31T23:13:59+08:00
[INFO] ------------------------------------------------------------------------
[ERROR] Failed to execute goal org.apache.maven.plugins:maven-compiler-plugin:3.11.0:compile (default-compile) on project TestMaven: Compilation failure: Compilation failure:
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/main/CrawlerManager.java:[178,20] main.CrawlerManager.MultiStorage不是抽象的, 并且未覆盖storage.DataStorage中的抽象方法findBooksBySource(java.lang.String)
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[106,9] 找不到符号
[ERROR] 符号: 类 var
[ERROR] 位置: 类 cli.CrawlerCLI
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[112,9] 找不到符号
[ERROR] 符号: 类 var
[ERROR] 位置: 类 cli.CrawlerCLI
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[118,9] 找不到符号
[ERROR] 符号: 类 var
[ERROR] 位置: 类 cli.CrawlerCLI
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[124,9] 找不到符号
[ERROR] 符号: 类 var
[ERROR] 位置: 类 cli.CrawlerCLI
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/storage/SQLiteStorage.java:[12,8] storage.SQLiteStorage不是抽象的, 并且未覆盖storage.DataStorage中的抽象方法findBooksBySource(java.lang.String)
[ERROR] -> [Help 1]
[ERROR]
[ERROR] To see the full stack trace of the errors, re-run Maven with the -e switch.
[ERROR] Re-run Maven using the -X switch to enable full debug logging.
[ERROR]
[ERROR] For more information about the errors and possible solutions, please read the following articles:
[ERROR] [Help 1] http://cwiki.apache.org/confluence/display/MAVEN/MojoFailureException

92
project/build_log.txt

@ -0,0 +1,92 @@
mvn : WARNING: A restricted method in java.lang.System has been called
所在位置 行:1 字符: 72
+ ... JAVA_HOME = "D:\嘻嘻哈哈\Git"; mvn clean package -DskipTests 2>&1 | Out-F ...
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ CategoryInfo : NotSpecified: (WARNING: A rest...has been called
:String) [], RemoteException
+ FullyQualifiedErrorId : NativeCommandError
WARNING: java.lang.System::load has been called by org.fusesource.jansi.interna
l.JansiLoader in an unnamed module (file:/D:/嘻嘻哈哈/Git/java/apache-maven-3.9.6/l
ib/jansi-2.4.0.jar)
WARNING: Use --enable-native-access=ALL-UNNAMED to avoid a warning for callers
in this module
WARNING: Restricted methods will be blocked in a future release unless native a
ccess is enabled
WARNING: A terminally deprecated method in sun.misc.Unsafe has been called
WARNING: sun.misc.Unsafe::objectFieldOffset has been called by com.google.commo
n.util.concurrent.AbstractFuture$UnsafeAtomicHelper (file:/D:/嘻嘻哈哈/Git/java/apa
che-maven-3.9.6/lib/guava-32.0.1-jre.jar)
WARNING: Please consider reporting this to the maintainers of class com.google.
common.util.concurrent.AbstractFuture$UnsafeAtomicHelper
WARNING: sun.misc.Unsafe::objectFieldOffset will be removed in a future release
[INFO] Scanning for projects...
[INFO]
[INFO] -----------------------< com.example:TestMaven >------------------------
[INFO] Building TestMaven 1.0-SNAPSHOT
[INFO] from pom.xml
[INFO] --------------------------------[ jar ]---------------------------------
[INFO]
[INFO] --- clean:3.2.0:clean (default-clean) @ TestMaven ---
[INFO] Deleting D:\嘻嘻哈哈\Git\java爬虫\TestMaven\target
[INFO]
[INFO] --- resources:3.3.1:resources (default-resources) @ TestMaven ---
[INFO] Copying 0 resource from src\main\resources to target\classes
[INFO]
[INFO] --- compiler:3.11.0:compile (default-compile) @ TestMaven ---
[INFO] Changes detected - recompiling the module! :source
[INFO] Compiling 36 source files with javac [debug target 8] to target\classes
[INFO] -------------------------------------------------------------
[WARNING] COMPILATION WARNING :
[INFO] -------------------------------------------------------------
[WARNING] 鏈笌 -source 8 涓€璧疯缃紩瀵肩被璺緞
涓嶈缃紩瀵肩被璺緞鍙兘浼氬鑷寸被鏂囦欢鏃犳硶鍦?JDK 8 涓婅繍琛? 寤鸿浣跨敤 --release 8 鑰屼笉鏄?-source 8 -target 8锛屽洜涓哄畠浼氳嚜鍔ㄨ缃紩瀵肩被璺緞
[WARNING] 婧愬€?8 宸茶繃鏃讹紝灏嗗湪鏈潵鍙戣鐗堜腑鍒犻櫎
[WARNING] 鐩爣鍊?8 宸茶繃鏃讹紝灏嗗湪鏈潵鍙戣鐗堜腑鍒犻櫎
[WARNING] 瑕侀殣钘忔湁鍏冲凡杩囨椂閫夐」鐨勮鍛? 璇蜂娇鐢?-Xlint:-options銆?
[WARNING] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[106,13] 浠庡彂琛岀増 10 寮€濮嬶紝'var' 鏄彈闄愮被鍨嬪悕绉帮紝涓嶈兘鐢ㄤ簬绫诲瀷澹版槑锛屼篃涓嶈兘鐢ㄤ綔鏁扮粍鐨勫厓绱犵被鍨?
[WARNING] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[112,13] 浠庡彂琛岀増 10 寮€濮嬶紝'var' 鏄彈闄愮被鍨嬪悕绉帮紝涓嶈兘鐢ㄤ簬绫诲瀷澹版槑锛屼篃涓嶈兘鐢ㄤ綔鏁扮粍鐨勫厓绱犵被鍨?
[WARNING] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[118,13] 浠庡彂琛岀増 10 寮€濮嬶紝'var' 鏄彈闄愮被鍨嬪悕绉帮紝涓嶈兘鐢ㄤ簬绫诲瀷澹版槑锛屼篃涓嶈兘鐢ㄤ綔鏁扮粍鐨勫厓绱犵被鍨?
[WARNING] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[124,13] 浠庡彂琛岀増 10 寮€濮嬶紝'var' 鏄彈闄愮被鍨嬪悕绉帮紝涓嶈兘鐢ㄤ簬绫诲瀷澹版槑锛屼篃涓嶈兘鐢ㄤ綔鏁扮粍鐨勫厓绱犵被鍨?
[INFO] 8 warnings
[INFO] -------------------------------------------------------------
[INFO] -------------------------------------------------------------
[ERROR] COMPILATION ERROR :
[INFO] -------------------------------------------------------------
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[106,9] 找不到符号 符号: 类 var
位置: 类 cli.CrawlerCLI
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[112,9] 找不到符号 符号: 类 var
位置: 类 cli.CrawlerCLI
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[118,9] 找不到符号 符号: 类 var
位置: 类 cli.CrawlerCLI
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[124,9] 找不到符号 符号: 类 var
位置: 类 cli.CrawlerCLI
[INFO] 4 errors
[INFO] -------------------------------------------------------------
[INFO] ------------------------------------------------------------------------
[INFO] BUILD FAILURE
[INFO] ------------------------------------------------------------------------
[INFO] Total time: 15.527 s
[INFO] Finished at: 2026-05-31T22:16:51+08:00
[INFO] ------------------------------------------------------------------------
[ERROR] Failed to execute goal org.apache.maven.plugins:maven-compiler-plugin:3.11.0:compile (default-compile) on project TestMaven: Compilation failure: Compilation failure:
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[106,9] 找不到符号
[ERROR] 符号: 类 var
[ERROR] 位置: 类 cli.CrawlerCLI
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[112,9] 找不到符号
[ERROR] 符号: 类 var
[ERROR] 位置: 类 cli.CrawlerCLI
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[118,9] 找不到符号
[ERROR] 符号: 类 var
[ERROR] 位置: 类 cli.CrawlerCLI
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[124,9] 找不到符号
[ERROR] 符号: 类 var
[ERROR] 位置: 类 cli.CrawlerCLI
[ERROR] -> [Help 1]
[ERROR]
[ERROR] To see the full stack trace of the errors, re-run Maven with the -e switch.
[ERROR] Re-run Maven using the -X switch to enable full debug logging.
[ERROR]
[ERROR] For more information about the errors and possible solutions, please read the following articles:
[ERROR] [Help 1] http://cwiki.apache.org/confluence/display/MAVEN/MojoFailureException

BIN
project/build_output.txt

Binary file not shown.

22
project/crawler.properties

@ -0,0 +1,22 @@
# 爬虫配置文件
# 请求延迟(毫秒)- 避免请求过快被封
delay.ms=1500
# 请求超时时间(毫秒)
timeout.ms=15000
# User-Agent
user.agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36
# 数据库路径
db.path=crawler.db
# 输出目录
output.dir=output
# 是否启用数据库存储
enable.database=true
# 是否启用文件输出
enable.file=true

162
project/data/books_to_scrape.json

@ -0,0 +1,162 @@
[
{
"title": "A Light in the Attic",
"price": "£51.77",
"rating": "3",
"availability": "In stock",
"imageUrl": "http://books.toscrape.com/media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg",
"productUrl": "http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html"
},
{
"title": "Tipping the Velvet",
"price": "£53.74",
"rating": "1",
"availability": "In stock",
"imageUrl": "http://books.toscrape.com/media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg",
"productUrl": "http://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html"
},
{
"title": "Soumission",
"price": "£50.10",
"rating": "1",
"availability": "In stock",
"imageUrl": "http://books.toscrape.com/media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg",
"productUrl": "http://books.toscrape.com/catalogue/soumission_998/index.html"
},
{
"title": "Sharp Objects",
"price": "£47.82",
"rating": "4",
"availability": "In stock",
"imageUrl": "http://books.toscrape.com/media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg",
"productUrl": "http://books.toscrape.com/catalogue/sharp-objects_997/index.html"
},
{
"title": "Sapiens: A Brief History of Humankind",
"price": "£54.23",
"rating": "5",
"availability": "In stock",
"imageUrl": "http://books.toscrape.com/media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg",
"productUrl": "http://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996/index.html"
},
{
"title": "The Requiem Red",
"price": "£22.65",
"rating": "1",
"availability": "In stock",
"imageUrl": "http://books.toscrape.com/media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg",
"productUrl": "http://books.toscrape.com/catalogue/the-requiem-red_995/index.html"
},
{
"title": "The Dirty Little Secrets of Getting Your Dream Job",
"price": "£33.34",
"rating": "4",
"availability": "In stock",
"imageUrl": "http://books.toscrape.com/media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg",
"productUrl": "http://books.toscrape.com/catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994/index.html"
},
{
"title": "The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull",
"price": "£17.93",
"rating": "3",
"availability": "In stock",
"imageUrl": "http://books.toscrape.com/media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg",
"productUrl": "http://books.toscrape.com/catalogue/the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993/index.html"
},
{
"title": "The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics",
"price": "£22.60",
"rating": "4",
"availability": "In stock",
"imageUrl": "http://books.toscrape.com/media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg",
"productUrl": "http://books.toscrape.com/catalogue/the-boys-in-the-boat-nine-americans-and-their-epic-quest-for-gold-at-the-1936-berlin-olympics_992/index.html"
},
{
"title": "The Black Maria",
"price": "£52.15",
"rating": "1",
"availability": "In stock",
"imageUrl": "http://books.toscrape.com/media/cache/58/46/5846057e28022268153beff6d352b06c.jpg",
"productUrl": "http://books.toscrape.com/catalogue/the-black-maria_991/index.html"
},
{
"title": "Starving Hearts (Triangular Trade Trilogy, #1)",
"price": "£13.99",
"rating": "2",
"availability": "In stock",
"imageUrl": "http://books.toscrape.com/media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg",
"productUrl": "http://books.toscrape.com/catalogue/starving-hearts-triangular-trade-trilogy-1_990/index.html"
},
{
"title": "Shakespeare\u0027s Sonnets",
"price": "£20.66",
"rating": "4",
"availability": "In stock",
"imageUrl": "http://books.toscrape.com/media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg",
"productUrl": "http://books.toscrape.com/catalogue/shakespeares-sonnets_989/index.html"
},
{
"title": "Set Me Free",
"price": "£17.46",
"rating": "5",
"availability": "In stock",
"imageUrl": "http://books.toscrape.com/media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg",
"productUrl": "http://books.toscrape.com/catalogue/set-me-free_988/index.html"
},
{
"title": "Scott Pilgrim\u0027s Precious Little Life (Scott Pilgrim #1)",
"price": "£52.29",
"rating": "5",
"availability": "In stock",
"imageUrl": "http://books.toscrape.com/media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg",
"productUrl": "http://books.toscrape.com/catalogue/scott-pilgrims-precious-little-life-scott-pilgrim-1_987/index.html"
},
{
"title": "Rip it Up and Start Again",
"price": "£35.02",
"rating": "5",
"availability": "In stock",
"imageUrl": "http://books.toscrape.com/media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg",
"productUrl": "http://books.toscrape.com/catalogue/rip-it-up-and-start-again_986/index.html"
},
{
"title": "Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991",
"price": "£57.25",
"rating": "3",
"availability": "In stock",
"imageUrl": "http://books.toscrape.com/media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg",
"productUrl": "http://books.toscrape.com/catalogue/our-band-could-be-your-life-scenes-from-the-american-indie-underground-1981-1991_985/index.html"
},
{
"title": "Olio",
"price": "£23.88",
"rating": "1",
"availability": "In stock",
"imageUrl": "http://books.toscrape.com/media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg",
"productUrl": "http://books.toscrape.com/catalogue/olio_984/index.html"
},
{
"title": "Mesaerion: The Best Science Fiction Stories 1800-1849",
"price": "£37.59",
"rating": "1",
"availability": "In stock",
"imageUrl": "http://books.toscrape.com/media/cache/09/a3/09a3aef48557576e1a85ba7efea8ecb7.jpg",
"productUrl": "http://books.toscrape.com/catalogue/mesaerion-the-best-science-fiction-stories-1800-1849_983/index.html"
},
{
"title": "Libertarianism for Beginners",
"price": "£51.33",
"rating": "2",
"availability": "In stock",
"imageUrl": "http://books.toscrape.com/media/cache/0b/bc/0bbcd0a6f4bcd81ccb1049a52736406e.jpg",
"productUrl": "http://books.toscrape.com/catalogue/libertarianism-for-beginners_982/index.html"
},
{
"title": "It\u0027s Only the Himalayas",
"price": "£45.17",
"rating": "2",
"availability": "In stock",
"imageUrl": "http://books.toscrape.com/media/cache/27/a5/27a53d0bb95bdd88288eaf66c9230d7e.jpg",
"productUrl": "http://books.toscrape.com/catalogue/its-only-the-himalayas_981/index.html"
}
]

82
project/data/douban_books.json

@ -0,0 +1,82 @@
[
{
"title": "安定此心:我当精神科医生的12000天",
"url": "https://book.douban.com/subject/37502923/"
},
{
"title": "挽救计划",
"url": "https://book.douban.com/subject/38210508/"
},
{
"title": "咸的玩笑",
"url": "https://book.douban.com/subject/37833272/"
},
{
"title": "真事隐:康熙废储与正史虚构",
"url": "https://book.douban.com/subject/37920184/"
},
{
"title": "大厂小民:我在互联网公司的1480天",
"url": "https://book.douban.com/subject/38208793/"
},
{
"title": "天色已晚",
"url": "https://book.douban.com/subject/37890167/"
},
{
"title": "她和她的决心",
"url": "https://book.douban.com/subject/38178826/"
},
{
"title": "凯罗斯",
"url": "https://book.douban.com/subject/37825000/"
},
{
"title": "幸福蒙太奇",
"url": "https://book.douban.com/subject/37841159/"
},
{
"title": "螃蟹的邀请",
"url": "https://book.douban.com/subject/37496217/"
},
{
"title": "抄写员巴托比",
"url": "https://book.douban.com/subject/38392174/"
},
{
"title": "我收养了一个朋友",
"url": "https://book.douban.com/subject/37938861/"
},
{
"title": "哈萨比斯:谷歌AI之脑",
"url": "https://book.douban.com/subject/38357884/"
},
{
"title": "像女孩那样丢球",
"url": "https://book.douban.com/subject/37126780/"
},
{
"title": "刚刚离开的世界",
"url": "https://book.douban.com/subject/37447242/"
},
{
"title": "故纸浮生.1-2",
"url": "https://book.douban.com/subject/37648813/"
},
{
"title": "在世与认知",
"url": "https://book.douban.com/subject/37112076/"
},
{
"title": "呼啸山庄",
"url": "https://book.douban.com/subject/30471282/"
},
{
"title": "我们如何理解这个世界:与齐格蒙特·鲍曼对谈",
"url": "https://book.douban.com/subject/37930972/"
},
{
"title": "刮风下雨",
"url": "https://book.douban.com/subject/38240709/"
}
]

62
project/data/douban_movies.json

@ -0,0 +1,62 @@
[
{
"title": "爱情抓马",
"rating": "6.9",
"ratingCount": "(34363人评价)",
"url": "https://movie.douban.com/subject/36995126/"
},
{
"title": "世界的主人",
"rating": "9.1",
"ratingCount": "(116736人评价)",
"url": "https://movie.douban.com/subject/37116612/"
},
{
"title": "木乃伊",
"rating": "6.2",
"ratingCount": "(13705人评价)",
"url": "https://movie.douban.com/subject/36929221/"
},
{
"title": "蜂蜜的针",
"rating": "6.7",
"ratingCount": "(48214人评价)",
"url": "https://movie.douban.com/subject/26022233/"
},
{
"title": "杀的就是你",
"rating": "6.9",
"ratingCount": "(21794人评价)",
"url": "https://movie.douban.com/subject/36926954/"
},
{
"title": "惩罚者:最后一击",
"rating": "6.8",
"ratingCount": "(5478人评价)",
"url": "https://movie.douban.com/subject/37259325/"
},
{
"title": "蒙特利尔,我的美人",
"rating": "7.6",
"ratingCount": "(14162人评价)",
"url": "https://movie.douban.com/subject/37019075/"
},
{
"title": "与王生活的男人",
"rating": "7.4",
"ratingCount": "(10007人评价)",
"url": "https://movie.douban.com/subject/36978169/"
},
{
"title": "挽救计划",
"rating": "8.6",
"ratingCount": "(463129人评价)",
"url": "https://movie.douban.com/subject/35010610/"
},
{
"title": "长夜将尽",
"rating": "6.5",
"ratingCount": "(10878人评价)",
"url": "https://movie.douban.com/subject/35590993/"
}
]

250
project/douban_top250.txt

@ -0,0 +1,250 @@
排名:1 电影:肖申克的救赎 评分:9.7
排名:2 电影:霸王别姬 评分:9.6
排名:3 电影:泰坦尼克号 评分:9.5
排名:4 电影:阿甘正传 评分:9.5
排名:5 电影:千与千寻 评分:9.4
排名:6 电影:美丽人生 评分:9.5
排名:7 电影:星际穿越 评分:9.4
排名:8 电影:这个杀手不太冷 评分:9.4
排名:9 电影:盗梦空间 评分:9.4
排名:10 电影:楚门的世界 评分:9.4
排名:11 电影:辛德勒的名单 评分:9.5
排名:12 电影:忠犬八公的故事 评分:9.4
排名:13 电影:海上钢琴师 评分:9.3
排名:14 电影:疯狂动物城 评分:9.3
排名:15 电影:三傻大闹宝莱坞 评分:9.2
排名:16 电影:机器人总动员 评分:9.3
排名:17 电影:放牛班的春天 评分:9.3
排名:18 电影:无间道 评分:9.3
排名:19 电影:控方证人 评分:9.6
排名:20 电影:寻梦环游记 评分:9.1
排名:21 电影:大话西游之大圣娶亲 评分:9.2
排名:22 电影:熔炉 评分:9.3
排名:23 电影:触不可及 评分:9.3
排名:24 电影:教父 评分:9.3
排名:25 电影:末代皇帝 评分:9.3
排名:26 电影:哈利·波特与魔法石 评分:9.2
排名:27 电影:当幸福来敲门 评分:9.1
排名:28 电影:龙猫 评分:9.2
排名:29 电影:活着 评分:9.3
排名:30 电影:怦然心动 评分:9.1
排名:31 电影:蝙蝠侠:黑暗骑士 评分:9.2
排名:32 电影:指环王3:王者无敌 评分:9.3
排名:33 电影:我不是药神 评分:9.0
排名:34 电影:乱世佳人 评分:9.3
排名:35 电影:飞屋环游记 评分:9.1
排名:36 电影:让子弹飞 评分:9.0
排名:37 电影:哈尔的移动城堡 评分:9.1
排名:38 电影:十二怒汉 评分:9.4
排名:39 电影:海蒂和爷爷 评分:9.3
排名:40 电影:素媛 评分:9.3
排名:41 电影:猫鼠游戏 评分:9.1
排名:42 电影:天空之城 评分:9.2
排名:43 电影:鬼子来了 评分:9.3
排名:44 电影:摔跤吧!爸爸 评分:9.0
排名:45 电影:少年派的奇幻漂流 评分:9.1
排名:46 电影:钢琴家 评分:9.3
排名:47 电影:指环王2:双塔奇兵 评分:9.2
排名:48 电影:死亡诗社 评分:9.2
排名:49 电影:大话西游之月光宝盒 评分:9.0
排名:50 电影:绿皮书 评分:8.9
排名:51 电影:何以为家 评分:9.1
排名:52 电影:闻香识女人 评分:9.1
排名:53 电影:大闹天宫 评分:9.4
排名:54 电影:黑客帝国 评分:9.1
排名:55 电影:指环王1:护戒使者 评分:9.1
排名:56 电影:罗马假日 评分:9.1
排名:57 电影:教父2 评分:9.3
排名:58 电影:狮子王 评分:9.1
排名:59 电影:天堂电影院 评分:9.2
排名:60 电影:饮食男女 评分:9.2
排名:61 电影:辩护人 评分:9.2
排名:62 电影:本杰明·巴顿奇事 评分:9.0
排名:63 电影:搏击俱乐部 评分:9.0
排名:64 电影:美丽心灵 评分:9.1
排名:65 电影:穿条纹睡衣的男孩 评分:9.2
排名:66 电影:哈利·波特与死亡圣器(下) 评分:9.0
排名:67 电影:情书 评分:8.9
排名:68 电影:两杆大烟枪 评分:9.1
排名:69 电影:窃听风暴 评分:9.2
排名:70 电影:音乐之声 评分:9.1
排名:71 电影:功夫 评分:8.9
排名:72 电影:哈利·波特与阿兹卡班的囚徒 评分:9.0
排名:73 电影:阿凡达 评分:8.8
排名:74 电影:西西里的美丽传说 评分:8.9
排名:75 电影:看不见的客人 评分:8.8
排名:76 电影:拯救大兵瑞恩 评分:9.1
排名:77 电影:沉默的羔羊 评分:8.9
排名:78 电影:小鞋子 评分:9.2
排名:79 电影:布达佩斯大饭店 评分:8.9
排名:80 电影:蝴蝶效应 评分:8.9
排名:81 电影:飞越疯人院 评分:9.1
排名:82 电影:还有明天 评分:9.3
排名:83 电影:禁闭岛 评分:8.9
排名:84 电影:心灵捕手 评分:9.0
排名:85 电影:致命魔术 评分:8.9
排名:86 电影:低俗小说 评分:8.9
排名:87 电影:哈利·波特与密室 评分:8.9
排名:88 电影:超脱 评分:9.0
排名:89 电影:一一 评分:9.1
排名:90 电影:喜剧之王 评分:8.8
排名:91 电影:杀人回忆 评分:8.9
排名:92 电影:致命ID 评分:8.9
排名:93 电影:摩登时代 评分:9.3
排名:94 电影:春光乍泄 评分:9.0
排名:95 电影:加勒比海盗 评分:8.8
排名:96 电影:海豚湾 评分:9.3
排名:97 电影:美国往事 评分:9.1
排名:98 电影:红辣椒 评分:9.0
排名:99 电影:七宗罪 评分:8.8
排名:100 电影:唐伯虎点秋香 评分:8.8
排名:101 电影:狩猎 评分:9.1
排名:102 电影:幽灵公主 评分:8.9
排名:103 电影:甜蜜蜜 评分:8.9
排名:104 电影:寄生虫 评分:8.8
排名:105 电影:天书奇谭 评分:9.2
排名:106 电影:蝙蝠侠:黑暗骑士崛起 评分:8.9
排名:107 电影:超能陆战队 评分:8.8
排名:108 电影:7号房的礼物 评分:8.9
排名:109 电影:茶馆 评分:9.5
排名:110 电影:第六感 评分:8.9
排名:111 电影:爱在黎明破晓前 评分:8.8
排名:112 电影:爱在日落黄昏时 评分:8.9
排名:113 电影:被嫌弃的松子的一生 评分:8.8
排名:114 电影:头脑特工队 评分:8.8
排名:115 电影:哈利·波特与火焰杯 评分:8.8
排名:116 电影:未麻的部屋 评分:9.1
排名:117 电影:重庆森林 评分:8.8
排名:118 电影:借东西的小人阿莉埃蒂 评分:8.9
排名:119 电影:菊次郎的夏天 评分:8.9
排名:120 电影:入殓师 评分:8.9
排名:121 电影:断背山 评分:8.8
排名:122 电影:剪刀手爱德华 评分:8.7
排名:123 电影:勇敢的心 评分:8.9
排名:124 电影:时空恋旅人 评分:8.8
排名:125 电影:驯龙高手 评分:8.8
排名:126 电影:消失的爱人 评分:8.7
排名:127 电影:无人知晓 评分:9.1
排名:128 电影:傲慢与偏见 评分:8.7
排名:129 电影:倩女幽魂 评分:8.8
排名:130 电影:新世界 评分:8.9
排名:131 电影:花样年华 评分:8.8
排名:132 电影:玩具总动员3 评分:8.9
排名:133 电影:一个叫欧维的男人决定去死 评分:8.9
排名:134 电影:色,戒 评分:8.7
排名:135 电影:完美的世界 评分:9.1
排名:136 电影:阳光灿烂的日子 评分:8.8
排名:137 电影:怪兽电力公司 评分:8.8
排名:138 电影:教父3 评分:9.0
排名:139 电影:小森林 夏秋篇 评分:9.0
排名:140 电影:天使爱美丽 评分:8.7
排名:141 电影:侧耳倾听 评分:8.9
排名:142 电影:哪吒闹海 评分:9.2
排名:143 电影:九品芝麻官 评分:8.8
排名:144 电影:被解救的姜戈 评分:8.8
排名:145 电影:请以你的名字呼唤我 评分:8.8
排名:146 电影:幸福终点站 评分:8.8
排名:147 电影:釜山行 评分:8.6
排名:148 电影:神偷奶爸 评分:8.7
排名:149 电影:小森林 冬春篇 评分:9.0
排名:150 电影:喜宴 评分:9.0
排名:151 电影:萤火之森 评分:8.8
排名:152 电影:告白 评分:8.8
排名:153 电影:玛丽和麦克斯 评分:9.0
排名:154 电影:七武士 评分:9.3
排名:155 电影:头号玩家 评分:8.6
排名:156 电影:模仿游戏 评分:8.8
排名:157 电影:惊魂记 评分:9.0
排名:158 电影:大鱼 评分:8.8
排名:159 电影:机器人之梦 评分:9.1
排名:160 电影:心灵奇旅 评分:8.7
排名:161 电影:背靠背,脸对脸 评分:9.5
排名:162 电影:射雕英雄传之东成西就 评分:8.7
排名:163 电影:血战钢锯岭 评分:8.7
排名:164 电影:你的名字。 评分:8.5
排名:165 电影:我是山姆 评分:9.0
排名:166 电影:阳光姐妹淘 评分:8.8
排名:167 电影:恐怖直播 评分:8.7
排名:168 电影:黑客帝国3:矩阵革命 评分:8.8
排名:169 电影:末路狂花 评分:9.0
排名:170 电影:高山下的花环 评分:9.5
排名:171 电影:小丑 评分:8.7
排名:172 电影:谍影重重3 评分:8.9
排名:173 电影:三块广告牌 评分:8.7
排名:174 电影:电锯惊魂 评分:8.7
排名:175 电影:无间道2 评分:8.8
排名:176 电影:达拉斯买家俱乐部 评分:8.8
排名:177 电影:疯狂原始人 评分:8.7
排名:178 电影:绿里奇迹 评分:8.9
排名:179 电影:爱在午夜降临前 评分:8.9
排名:180 电影:疯狂的石头 评分:8.6
排名:181 电影:雨中曲 评分:9.1
排名:182 电影:2001太空漫游 评分:8.9
排名:183 电影:海街日记 评分:8.8
排名:184 电影:风之谷 评分:8.9
排名:185 电影:上帝之城 评分:9.0
排名:186 电影:心迷宫 评分:8.7
排名:187 电影:英雄本色 评分:8.6
排名:188 电影:记忆碎片 评分:8.7
排名:189 电影:纵横四海 评分:8.8
排名:190 电影:无敌破坏王 评分:8.7
排名:191 电影:卢旺达饭店 评分:8.9
排名:192 电影:牯岭街少年杀人事件 评分:8.9
排名:193 电影:恐怖游轮 评分:8.5
排名:194 电影:东京教父 评分:9.0
排名:195 电影:小偷家族 评分:8.7
排名:196 电影:魔女宅急便 评分:8.7
排名:197 电影:冰川时代 评分:8.7
排名:198 电影:芙蓉镇 评分:9.3
排名:199 电影:忠犬八公物语 评分:9.2
排名:200 电影:岁月神偷 评分:8.7
排名:201 电影:遗愿清单 评分:8.7
排名:202 电影:荒蛮故事 评分:8.7
排名:203 电影:大佛普拉斯 评分:8.7
排名:204 电影:源代码 评分:8.6
排名:205 电影:花束般的恋爱 评分:8.6
排名:206 电影:白日梦想家 评分:8.6
排名:207 电影:爱乐之城 评分:8.4
排名:208 电影:疯狂的麦克斯4:狂暴之路 评分:8.7
排名:209 电影:可可西里 评分:8.9
排名:210 电影:你看起来好像很好吃 评分:8.9
排名:211 电影:贫民窟的百万富翁 评分:8.6
排名:212 电影:波西米亚狂想曲 评分:8.6
排名:213 电影:城市之光 评分:9.3
排名:214 电影:爆裂鼓手 评分:8.6
排名:215 电影:青蛇 评分:8.6
排名:216 电影:哈利·波特与死亡圣器(上) 评分:8.6
排名:217 电影:无耻混蛋 评分:8.7
排名:218 电影:东邪西毒 评分:8.6
排名:219 电影:终结者2:审判日 评分:8.8
排名:220 电影:大红灯笼高高挂 评分:8.8
排名:221 电影:黑天鹅 评分:8.6
排名:222 电影:新龙门客栈 评分:8.7
排名:223 电影:初恋这件小事 评分:8.5
排名:224 电影:千钧一发 评分:8.8
排名:225 电影:人工智能 评分:8.7
排名:226 电影:崖上的波妞 评分:8.6
排名:227 电影:雨人 评分:8.7
排名:228 电影:虎口脱险 评分:8.9
排名:229 电影:哈利·波特与凤凰社 评分:8.6
排名:230 电影:彗星来的那一夜 评分:8.6
排名:231 电影:罗生门 评分:8.8
排名:232 电影:海边的曼彻斯特 评分:8.6
排名:233 电影:恋恋笔记本 评分:8.5
排名:234 电影:火星救援 评分:8.5
排名:235 电影:真爱至上 评分:8.5
排名:236 电影:黑客帝国2:重装上阵 评分:8.7
排名:237 电影:冰雪奇缘 评分:8.5
排名:238 电影:步履不停 评分:8.8
排名:239 电影:奇迹男孩 评分:8.6
排名:240 电影:千年女优 评分:8.8
排名:241 电影:谍影重重2 评分:8.7
排名:242 电影:战争之王 评分:8.7
排名:243 电影:蜘蛛侠:平行宇宙 评分:8.6
排名:244 电影:攻壳机动队 评分:9.0
排名:245 电影:血钻 评分:8.7
排名:246 电影:小姐 评分:8.5
排名:247 电影:隐藏人物 评分:8.9
排名:248 电影:血观音 评分:8.6
排名:249 电影:魂断蓝桥 评分:8.8
排名:250 电影:房间 评分:8.7

9002
project/output/BooksToScrape_20260531_233112.json

File diff suppressed because it is too large

5006
project/output/BooksToScrape_20260531_233112.txt

File diff suppressed because it is too large

74
project/pom.xml

@ -0,0 +1,74 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Build descriptor for the web crawler.

  The packaged artifact is launched with `java -jar`, so the jar must carry its
  runtime dependencies (jsoup, gson, slf4j, picocli). The shade plugin below
  bundles them into the main jar during the package phase.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.crawler</groupId>
    <artifactId>web-crawler</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>
    <name>Web Crawler</name>
    <description>Multi-site web crawler with CLI, MVC, Command pattern and Strategy pattern</description>
    <properties>
        <maven.compiler.source>11</maven.compiler.source>
        <maven.compiler.target>11</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    <dependencies>
        <!-- HTML fetching and parsing -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.16.1</version>
        </dependency>
        <!-- JSON serialization of crawl results -->
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.10.1</version>
        </dependency>
        <!-- Logging facade and its simple binding -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>2.0.9</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-simple</artifactId>
            <version>2.0.9</version>
        </dependency>
        <!-- Command-line option parsing -->
        <dependency>
            <groupId>info.picocli</groupId>
            <artifactId>picocli</artifactId>
            <version>4.7.5</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
                <version>3.3.0</version>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.crawler.Main</mainClass>
                        </manifest>
                    </archive>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.11.0</version>
                <configuration>
                    <source>11</source>
                    <target>11</target>
                </configuration>
            </plugin>
            <!--
              Without this, the jar contains only the project's own classes and
              `java -jar target/web-crawler-1.0-SNAPSHOT.jar` fails with
              NoClassDefFoundError for the third-party libraries.
            -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.5.1</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <createDependencyReducedPom>false</createDependencyReducedPom>
                            <transformers>
                                <!-- Merge META-INF/services entries (used by SLF4J 2.x provider lookup) -->
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>com.crawler.Main</mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

24
project/run.bat

@ -0,0 +1,24 @@
@echo off
rem Launcher for the web crawler: builds the jar when it is missing, then runs
rem it, forwarding all command-line arguments to the application.
echo ========================================
echo Web Crawler Application
echo ========================================
echo.
rem Maven is itself a batch script on Windows. It must be started with CALL,
rem otherwise control never returns to this script and neither the error check
rem nor the java invocation below would run.
if not exist "target\web-crawler-1.0-SNAPSHOT.jar" (
    echo Compiling project...
    call mvn clean package
    if errorlevel 1 (
        echo Compilation failed!
        pause
        exit /b 1
    )
    echo.
)
echo Running crawler...
echo.
java -jar target\web-crawler-1.0-SNAPSHOT.jar %*
echo.
pause

1
project/src/main/java/CrawlerManager.java

@ -0,0 +1 @@
// 此文件已废弃,请使用 main.CrawlerManager

133
project/src/main/java/cli/CrawlerCLI.java

@ -0,0 +1,133 @@
package cli;
import command.*;
import controller.CrawlerController;
import exception.CrawlerException;
import exception.CrawlerResult;
import exception.ValidationException;
import view.CrawlerView;
import java.util.Arrays;
/**
 * Command-line front end: maps the first argument to a command, runs it through
 * the controller and prints the outcome through the view.
 */
public class CrawlerCLI {
    private final CrawlerController controller;
    private final CrawlerView view;
    private final CommandRegistry commandRegistry;

    /** Creates a CLI backed by a default-configured controller. */
    public CrawlerCLI() {
        this.controller = new CrawlerController();
        this.view = new CrawlerView();
        this.commandRegistry = new CommandRegistry();
        initCommands();
    }

    /**
     * Creates a CLI whose controller is constructed with the given output directory.
     *
     * @param outputDir value passed to the controller's constructor
     */
    public CrawlerCLI(String outputDir) {
        this.controller = new CrawlerController(outputDir);
        this.view = new CrawlerView();
        this.commandRegistry = new CommandRegistry();
        initCommands();
    }

    /** Registers the commands that are later looked up by name in the handlers. */
    private void initCommands() {
        commandRegistry.register(new RunAllCommand(controller));
        commandRegistry.register(new ListCrawlersCommand(controller));
        commandRegistry.register(new StatsCommand(controller));
        commandRegistry.register(new ClearCommand(controller));
    }

    /**
     * Shows the welcome banner and dispatches on {@code args[0]}.
     * With no arguments the help text is shown instead.
     *
     * @param args raw command-line arguments; may be {@code null} or empty
     */
    public void run(String[] args) {
        view.showWelcome();
        if (args == null || args.length == 0) {
            view.showHelp();
            return;
        }
        // Locale.ROOT keeps the mapping locale-independent: with the default
        // locale, e.g. Turkish, "LIST" would lower-case to "lıst" and never match.
        String commandName = args[0].trim().toLowerCase(java.util.Locale.ROOT);
        try {
            switch (commandName) {
                case "help":
                case "-h":
                case "--help":
                    view.showHelp();
                    break;
                case "list":
                case "ls":
                    handleList();
                    break;
                case "run":
                    handleRun(args);
                    break;
                case "run-all":
                case "all":
                    handleRunAll();
                    break;
                case "stats":
                    handleStats();
                    break;
                case "clear":
                    handleClear();
                    break;
                default:
                    view.showError("未知命令: " + commandName + "\n使用 'help' 查看可用命令");
            }
        } catch (ValidationException e) {
            // Validation problems are followed by the help text.
            view.showError(e.getMessage());
            view.showHelp();
        } catch (CrawlerException e) {
            view.showError("爬虫错误 [" + e.getErrorCode() + "]: " + e.getMessage());
        } catch (Exception e) {
            view.showError("系统错误: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /** Runs the registered "list" command, then prints the controller's crawler names. */
    private void handleList() {
        Command cmd = commandRegistry.getCommand("list");
        cmd.execute();
        view.showCrawlerList(controller.getAllCrawlerNames());
    }

    /**
     * Runs the single crawler named by {@code args[1]}; when the name is missing,
     * prints an error followed by the available crawler names.
     */
    private void handleRun(String[] args) {
        if (args.length < 2) {
            view.showError("请指定爬虫名称\n示例: run 豆瓣电影Top250");
            view.showCrawlerList(controller.getAllCrawlerNames());
            return;
        }
        String crawlerName = args[1];
        Command cmd = new RunSingleCommand(controller, crawlerName);
        CrawlerResult result = cmd.execute();
        view.showResult(result);
    }

    /** Runs the registered "run-all" command and shows its result. */
    private void handleRunAll() {
        Command cmd = commandRegistry.getCommand("run-all");
        CrawlerResult result = cmd.execute();
        view.showResult(result);
    }

    /** Runs the registered "stats" command and shows only the result message. */
    private void handleStats() {
        Command cmd = commandRegistry.getCommand("stats");
        CrawlerResult result = cmd.execute();
        view.showMessage(result.getMessage());
    }

    /** Runs the registered "clear" command and shows its result. */
    private void handleClear() {
        Command cmd = commandRegistry.getCommand("clear");
        CrawlerResult result = cmd.execute();
        view.showResult(result);
    }

    /** Program entry point: builds a default CLI and runs it with {@code args}. */
    public static void main(String[] args) {
        CrawlerCLI cli = new CrawlerCLI();
        cli.run(args);
    }
}

34
project/src/main/java/com/crawler/Main.java

@ -0,0 +1,34 @@
package com.crawler;
import com.crawler.controller.CrawlerController;
import picocli.CommandLine;
import picocli.CommandLine.Command;
import picocli.CommandLine.Option;
/**
 * Picocli entry point. Parses {@code --site} / {@code --interactive} and hands
 * control to a {@link CrawlerController}.
 */
@Command(name = "crawler", mixinStandardHelpOptions = true, version = "1.0",
        description = "Web Crawler - Crawl Douban Movies, Douban Books, and Books to Scrape")
public class Main implements Runnable {

    /** Site selector; picocli fills in "all" when the option is absent. */
    @Option(names = {"-s", "--site"}, defaultValue = "all",
            description = "Site to crawl: douban-movie, douban-book, books-to-scrape, all")
    private String site;

    /** When set, the menu-driven mode is used and {@link #site} is ignored. */
    @Option(names = {"-i", "--interactive"}, description = "Run in interactive mode")
    private boolean interactive;

    /** Executes the command line and exits the JVM with picocli's exit code. */
    public static void main(String[] args) {
        CommandLine commandLine = new CommandLine(new Main());
        System.exit(commandLine.execute(args));
    }

    /** Invoked by picocli after option parsing. */
    @Override
    public void run() {
        CrawlerController controller = new CrawlerController();
        if (!interactive) {
            controller.crawlBySite(site);
            return;
        }
        controller.runInteractive();
    }
}

8
project/src/main/java/com/crawler/command/Command.java

@ -0,0 +1,8 @@
package com.crawler.command;
import com.crawler.exception.CrawlerException;
/**
 * A unit of crawler work that can be executed and described to the user.
 */
public interface Command {

    /**
     * Performs the command's work.
     *
     * @throws CrawlerException if the work fails
     */
    void execute() throws CrawlerException;

    /**
     * @return a short human-readable label for this command
     */
    String getDescription();
}

37
project/src/main/java/com/crawler/command/CrawlAllCommand.java

@ -0,0 +1,37 @@
package com.crawler.command;
import com.crawler.exception.CrawlerException;
import java.util.ArrayList;
import java.util.List;
/**
 * Composite command that runs a list of child commands in the order they were added.
 *
 * <p>Execution is best-effort: a failing child does not stop the remaining
 * children. Failures are no longer hidden from the caller, though; once every
 * child has run, a single {@link CrawlerException} summarising them is thrown.
 */
public class CrawlAllCommand implements Command {
    private final List<Command> commands;
    private final String description;

    /**
     * @param description label returned by {@link #getDescription()}
     */
    public CrawlAllCommand(String description) {
        this.commands = new ArrayList<>();
        this.description = description;
    }

    /**
     * Appends a child command.
     *
     * @param command the command to run as part of this composite; must not be null
     * @throws NullPointerException if {@code command} is null
     */
    public void addCommand(Command command) {
        // Fail at registration time rather than with an NPE in the middle of execute().
        commands.add(java.util.Objects.requireNonNull(command, "command"));
    }

    /**
     * Runs every child command, logging each failure to standard error.
     *
     * @throws CrawlerException after all children have run, if at least one of
     *         them failed; the first failure is the cause and later failures are
     *         attached to it as suppressed exceptions
     */
    @Override
    public void execute() throws CrawlerException {
        List<String> failedDescriptions = new ArrayList<>();
        CrawlerException firstFailure = null;
        for (Command command : commands) {
            try {
                command.execute();
            } catch (CrawlerException e) {
                System.err.println("Error executing command: " + command.getDescription());
                System.err.println("Error: " + e.getMessage());
                failedDescriptions.add(command.getDescription());
                if (firstFailure == null) {
                    firstFailure = e;
                } else {
                    firstFailure.addSuppressed(e);
                }
            }
        }
        if (firstFailure != null) {
            throw new CrawlerException(
                    failedDescriptions.size() + " of " + commands.size() + " commands failed: "
                            + String.join(", ", failedDescriptions),
                    firstFailure);
        }
    }

    @Override
    public String getDescription() {
        return description;
    }
}

28
project/src/main/java/com/crawler/command/CrawlCommand.java

@ -0,0 +1,28 @@
package com.crawler.command;
import com.crawler.exception.CrawlerException;
import com.crawler.strategy.CrawlerStrategy;
import com.crawler.util.FileUtil;
import java.util.List;
/**
 * Command that runs one {@link CrawlerStrategy} and stores its result as JSON.
 *
 * @param <T> element type produced by the strategy
 */
public class CrawlCommand<T> implements Command {
    private final CrawlerStrategy<T> strategy;
    private final String description;

    /**
     * @param strategy    strategy that produces the data and names the output file
     * @param description label returned by {@link #getDescription()}
     */
    public CrawlCommand(CrawlerStrategy<T> strategy, String description) {
        this.description = description;
        this.strategy = strategy;
    }

    /**
     * Crawls first, then writes the crawled list to the strategy's output file.
     *
     * @throws CrawlerException if crawling or saving fails
     */
    @Override
    public void execute() throws CrawlerException {
        final List<T> crawled = strategy.crawl();
        final String targetFile = strategy.getOutputFileName();
        FileUtil.saveToJsonFile(crawled, targetFile);
    }

    @Override
    public String getDescription() {
        return description;
    }
}

119
project/src/main/java/com/crawler/controller/CrawlerController.java

@ -0,0 +1,119 @@
package com.crawler.controller;
import com.crawler.command.Command;
import com.crawler.command.CrawlAllCommand;
import com.crawler.command.CrawlCommand;
import com.crawler.exception.CrawlerException;
import com.crawler.strategy.BooksToScrapeStrategy;
import com.crawler.strategy.CrawlerStrategy;
import com.crawler.strategy.DoubanBookStrategy;
import com.crawler.strategy.DoubanMovieStrategy;
import com.crawler.view.ConsoleView;
import java.util.Scanner;
/**
 * Controller that turns user choices (interactive menu or a site name) into
 * crawl commands and reports progress through a {@link ConsoleView}.
 */
public class CrawlerController {
    private final ConsoleView view;
    private final Scanner scanner;

    /** Creates a controller that reads menu choices from standard input. */
    public CrawlerController() {
        this.view = new ConsoleView();
        this.scanner = new Scanner(System.in);
    }

    /**
     * Runs the menu loop until the user chooses 0 or the input ends.
     * Non-numeric and unknown choices are reported and the menu is shown again.
     */
    public void runInteractive() {
        view.displayWelcome();
        while (true) {
            view.displayMenu();
            // End of input (closed or piped stdin) would make nextLine() throw
            // NoSuchElementException; treat it as a request to exit instead.
            if (!scanner.hasNextLine()) {
                view.displayGoodbye();
                return;
            }
            String input = scanner.nextLine().trim();
            try {
                int choice = Integer.parseInt(input);
                switch (choice) {
                    case 1:
                        crawlDoubanMovies();
                        break;
                    case 2:
                        crawlDoubanBooks();
                        break;
                    case 3:
                        crawlBooksToScrape();
                        break;
                    case 4:
                        crawlAll();
                        break;
                    case 0:
                        view.displayGoodbye();
                        return;
                    default:
                        view.displayInvalidChoice();
                }
            } catch (NumberFormatException e) {
                view.displayInvalidChoice();
            }
        }
    }

    /** Crawls the Douban movie chart and saves the result. */
    public void crawlDoubanMovies() {
        crawlSingle(new DoubanMovieStrategy(), "Douban Movies");
    }

    /** Crawls the Douban book chart and saves the result. */
    public void crawlDoubanBooks() {
        crawlSingle(new DoubanBookStrategy(), "Douban Books");
    }

    /** Crawls the Books to Scrape front page and saves the result. */
    public void crawlBooksToScrape() {
        crawlSingle(new BooksToScrapeStrategy(), "Books to Scrape");
    }

    /** Runs all three site crawls through one composite command. */
    public void crawlAll() {
        CrawlAllCommand allCommand = new CrawlAllCommand("Crawl All");
        allCommand.addCommand(new CrawlCommand<>(new DoubanMovieStrategy(), "Douban Movies"));
        allCommand.addCommand(new CrawlCommand<>(new DoubanBookStrategy(), "Douban Books"));
        allCommand.addCommand(new CrawlCommand<>(new BooksToScrapeStrategy(), "Books to Scrape"));
        try {
            view.displayCrawling("All Websites");
            allCommand.execute();
            view.displaySuccess("data/ (all files)");
        } catch (CrawlerException e) {
            view.displayError(e.getMessage());
        }
    }

    /**
     * Dispatches on a site name, compared case-insensitively.
     *
     * @param site one of {@code douban-movie}, {@code douban-book},
     *             {@code books-to-scrape} or {@code all}; anything else,
     *             including {@code null}, is reported as an unknown site
     */
    public void crawlBySite(String site) {
        if (site == null) {
            view.displayError("Unknown site: " + site);
            return;
        }
        // Locale.ROOT: the default locale may map 'I' to a dotless 'ı' (Turkish),
        // which would stop upper-case input from matching the ASCII labels below.
        switch (site.toLowerCase(java.util.Locale.ROOT)) {
            case "douban-movie":
                crawlDoubanMovies();
                break;
            case "douban-book":
                crawlDoubanBooks();
                break;
            case "books-to-scrape":
                crawlBooksToScrape();
                break;
            case "all":
                crawlAll();
                break;
            default:
                view.displayError("Unknown site: " + site);
        }
    }

    /** Wraps one strategy in a command and runs it, reporting its output file on success. */
    private <T> void crawlSingle(CrawlerStrategy<T> strategy, String description) {
        Command command = new CrawlCommand<>(strategy, description);
        executeCommand(command, strategy.getOutputFileName());
    }

    /** Announces the command, executes it, and shows either the saved file name or the error. */
    private void executeCommand(Command command, String fileName) {
        try {
            view.displayCrawling(command.getDescription());
            command.execute();
            view.displaySuccess(fileName);
        } catch (CrawlerException e) {
            view.displayError(e.getMessage());
        }
    }
}

11
project/src/main/java/com/crawler/exception/CrawlerException.java

@ -0,0 +1,11 @@
package com.crawler.exception;
/**
 * Base checked exception for crawler failures; the other exception types in
 * this package extend it.
 */
public class CrawlerException extends Exception {

    /**
     * @param message description of the failure
     */
    public CrawlerException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception that triggered this one
     */
    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }
}

11
project/src/main/java/com/crawler/exception/FileException.java

@ -0,0 +1,11 @@
package com.crawler.exception;
/**
 * {@link CrawlerException} raised for file-related failures.
 */
public class FileException extends CrawlerException {

    /**
     * @param message description of the failure
     */
    public FileException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception that triggered this one
     */
    public FileException(String message, Throwable cause) {
        super(message, cause);
    }
}

11
project/src/main/java/com/crawler/exception/NetworkException.java

@ -0,0 +1,11 @@
package com.crawler.exception;
/**
 * {@link CrawlerException} raised for network-related failures.
 */
public class NetworkException extends CrawlerException {

    /**
     * @param message description of the failure
     */
    public NetworkException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception that triggered this one
     */
    public NetworkException(String message, Throwable cause) {
        super(message, cause);
    }
}

11
project/src/main/java/com/crawler/exception/ParseException.java

@ -0,0 +1,11 @@
package com.crawler.exception;
/**
 * {@link CrawlerException} raised for parsing-related failures.
 */
public class ParseException extends CrawlerException {

    /**
     * @param message description of the failure
     */
    public ParseException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception that triggered this one
     */
    public ParseException(String message, Throwable cause) {
        super(message, cause);
    }
}

105
project/src/main/java/com/crawler/model/Book.java

@ -0,0 +1,105 @@
package com.crawler.model;
/**
 * Mutable data holder for one book record. All fields are plain strings and
 * stay {@code null} until their setter is called.
 */
public class Book {
    private String title;
    private String author;
    private String rating;
    private String ratingCount;
    private String publisher;
    private String publishDate;
    private String price;
    private String isbn;
    private String summary;
    private String url;

    /** Creates an empty record. */
    public Book() {
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getAuthor() {
        return this.author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getRating() {
        return this.rating;
    }

    public void setRating(String rating) {
        this.rating = rating;
    }

    public String getRatingCount() {
        return this.ratingCount;
    }

    public void setRatingCount(String ratingCount) {
        this.ratingCount = ratingCount;
    }

    public String getPublisher() {
        return this.publisher;
    }

    public void setPublisher(String publisher) {
        this.publisher = publisher;
    }

    public String getPublishDate() {
        return this.publishDate;
    }

    public void setPublishDate(String publishDate) {
        this.publishDate = publishDate;
    }

    public String getPrice() {
        return this.price;
    }

    public void setPrice(String price) {
        this.price = price;
    }

    public String getIsbn() {
        return this.isbn;
    }

    public void setIsbn(String isbn) {
        this.isbn = isbn;
    }

    public String getSummary() {
        return this.summary;
    }

    public void setSummary(String summary) {
        this.summary = summary;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    /** Short form listing only title, author and rating. */
    @Override
    public String toString() {
        return String.format("Book{title='%s', author='%s', rating='%s'}", title, author, rating);
    }
}

96
project/src/main/java/com/crawler/model/Movie.java

@ -0,0 +1,96 @@
package com.crawler.model;
/**
 * Mutable data holder for one movie record. All fields are plain strings and
 * stay {@code null} until their setter is called.
 */
public class Movie {
    private String title;
    private String rating;
    private String ratingCount;
    private String year;
    private String director;
    private String actors;
    private String genre;
    private String summary;
    private String url;

    /** Creates an empty record. */
    public Movie() {
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getRating() {
        return this.rating;
    }

    public void setRating(String rating) {
        this.rating = rating;
    }

    public String getRatingCount() {
        return this.ratingCount;
    }

    public void setRatingCount(String ratingCount) {
        this.ratingCount = ratingCount;
    }

    public String getYear() {
        return this.year;
    }

    public void setYear(String year) {
        this.year = year;
    }

    public String getDirector() {
        return this.director;
    }

    public void setDirector(String director) {
        this.director = director;
    }

    public String getActors() {
        return this.actors;
    }

    public void setActors(String actors) {
        this.actors = actors;
    }

    public String getGenre() {
        return this.genre;
    }

    public void setGenre(String genre) {
        this.genre = genre;
    }

    public String getSummary() {
        return this.summary;
    }

    public void setSummary(String summary) {
        this.summary = summary;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    /** Short form listing only title, rating and year. */
    @Override
    public String toString() {
        return String.format("Movie{title='%s', rating='%s', year='%s'}", title, rating, year);
    }
}

69
project/src/main/java/com/crawler/model/ScrapeBook.java

@ -0,0 +1,69 @@
package com.crawler.model;
/**
 * Mutable data holder for one catalogue entry with title, price, rating,
 * availability and two URLs. Fields stay {@code null} until set.
 */
public class ScrapeBook {
    private String title;
    private String price;
    private String rating;
    private String availability;
    private String imageUrl;
    private String productUrl;

    /** Creates an empty record. */
    public ScrapeBook() {
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getPrice() {
        return this.price;
    }

    public void setPrice(String price) {
        this.price = price;
    }

    public String getRating() {
        return this.rating;
    }

    public void setRating(String rating) {
        this.rating = rating;
    }

    public String getAvailability() {
        return this.availability;
    }

    public void setAvailability(String availability) {
        this.availability = availability;
    }

    public String getImageUrl() {
        return this.imageUrl;
    }

    public void setImageUrl(String imageUrl) {
        this.imageUrl = imageUrl;
    }

    public String getProductUrl() {
        return this.productUrl;
    }

    public void setProductUrl(String productUrl) {
        this.productUrl = productUrl;
    }

    /** Short form listing only title, price and rating. */
    @Override
    public String toString() {
        return String.format("ScrapeBook{title='%s', price='%s', rating='%s'}", title, price, rating);
    }
}

72
project/src/main/java/com/crawler/strategy/BooksToScrapeStrategy.java

@ -0,0 +1,72 @@
package com.crawler.strategy;
import com.crawler.exception.CrawlerException;
import com.crawler.exception.NetworkException;
import com.crawler.exception.ParseException;
import com.crawler.model.ScrapeBook;
import com.crawler.util.HttpUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Strategy that reads the product tiles on the Books to Scrape front page and
 * maps each tile to a {@link ScrapeBook}.
 */
public class BooksToScrapeStrategy implements CrawlerStrategy<ScrapeBook> {
    private static final String URL = "http://books.toscrape.com/";

    /** Class-name fragments checked in this order; position + 1 is the stored rating. */
    private static final String[] RATING_WORDS = {"One", "Two", "Three", "Four", "Five"};

    /**
     * Fetches the front page and converts every {@code article.product_pod} element.
     *
     * @return one entry per product tile, in page order
     * @throws CrawlerException a {@link NetworkException} from the fetch is passed
     *         through unchanged; any other exception is wrapped in a {@link ParseException}
     */
    @Override
    public List<ScrapeBook> crawl() throws CrawlerException {
        List<ScrapeBook> collected = new ArrayList<>();
        try {
            Document page = HttpUtil.getDocument(URL);
            for (Element tile : page.select("article.product_pod")) {
                collected.add(toScrapeBook(tile));
            }
        } catch (NetworkException e) {
            throw e;
        } catch (Exception e) {
            throw new ParseException("Failed to parse Books to Scrape page", e);
        }
        return collected;
    }

    /** Builds one record from a product tile; fields whose element is missing stay unset. */
    private ScrapeBook toScrapeBook(Element tile) {
        ScrapeBook entry = new ScrapeBook();

        Element link = tile.selectFirst("h3 a");
        if (link != null) {
            entry.setTitle(link.attr("title"));
            // The link target is appended to the site root as-is.
            entry.setProductUrl(URL + link.attr("href"));
        }

        Element price = tile.selectFirst("p.price_color");
        if (price != null) {
            entry.setPrice(price.text());
        }

        Element stock = tile.selectFirst("p.instock");
        if (stock != null) {
            entry.setAvailability(stock.text().trim());
        }

        Element stars = tile.selectFirst("p.star-rating");
        if (stars != null) {
            String cssClasses = stars.className();
            for (int i = 0; i < RATING_WORDS.length; i++) {
                if (cssClasses.contains(RATING_WORDS[i])) {
                    entry.setRating(String.valueOf(i + 1));
                    break;
                }
            }
        }

        Element image = tile.selectFirst("img");
        if (image != null) {
            entry.setImageUrl(URL + image.attr("src"));
        }
        return entry;
    }

    @Override
    public String getOutputFileName() {
        return "data/books_to_scrape.json";
    }
}

9
project/src/main/java/com/crawler/strategy/CrawlerStrategy.java

@ -0,0 +1,9 @@
package com.crawler.strategy;
import com.crawler.exception.CrawlerException;
import java.util.List;
/**
 * A site-specific crawl: produces a list of records and names the file they
 * should be written to.
 *
 * @param <T> record type produced by the crawl
 */
public interface CrawlerStrategy<T> {

    /**
     * Runs the crawl.
     *
     * @return the records that were extracted
     * @throws CrawlerException if the crawl fails
     */
    List<T> crawl() throws CrawlerException;

    /**
     * @return path of the file the crawl result is meant to be saved to
     */
    String getOutputFileName();
}

69
project/src/main/java/com/crawler/strategy/DoubanBookStrategy.java

@ -0,0 +1,69 @@
package com.crawler.strategy;
import com.crawler.exception.CrawlerException;
import com.crawler.exception.NetworkException;
import com.crawler.exception.ParseException;
import com.crawler.model.Book;
import com.crawler.util.HttpUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Strategy that reads the Douban book chart page and maps each
 * {@code li.media} element to a {@link Book}.
 */
public class DoubanBookStrategy implements CrawlerStrategy<Book> {
    private static final String URL = "https://book.douban.com/chart";

    /**
     * Fetches the chart page and converts every list item.
     *
     * @return one entry per list item, in page order
     * @throws CrawlerException a {@link NetworkException} from the fetch is passed
     *         through unchanged; any other exception is wrapped in a {@link ParseException}
     */
    @Override
    public List<Book> crawl() throws CrawlerException {
        List<Book> collected = new ArrayList<>();
        try {
            Document page = HttpUtil.getDocument(URL);
            for (Element entry : page.select("li.media")) {
                collected.add(toBook(entry));
            }
        } catch (NetworkException e) {
            throw e;
        } catch (Exception e) {
            throw new ParseException("Failed to parse Douban book page", e);
        }
        return collected;
    }

    /** Builds one record from a list item; fields whose element is missing stay unset. */
    private Book toBook(Element entry) {
        Book record = new Book();

        Element link = entry.selectFirst("h2 a");
        if (link != null) {
            record.setTitle(link.text().trim());
            record.setUrl(link.attr("href"));
        }

        Element score = entry.selectFirst("span.rating_nums");
        if (score != null) {
            record.setRating(score.text());
        }

        Element votes = entry.selectFirst("span.pl");
        if (votes != null) {
            record.setRatingCount(votes.text());
        }

        Element publication = entry.selectFirst("div.pub");
        if (publication != null) {
            applyPublicationInfo(record, publication.text());
        }
        return record;
    }

    /**
     * Splits the slash-separated publication line. The first segment is stored as
     * the author and the last three as publisher, publish date and price. Lines
     * with fewer than three segments are ignored.
     */
    private void applyPublicationInfo(Book record, String line) {
        String[] segments = line.split("/");
        int count = segments.length;
        if (count < 3) {
            return;
        }
        // NOTE(review): with exactly three segments, author and publisher both
        // resolve to segments[0] — confirm whether that is intended.
        record.setAuthor(segments[0].trim());
        record.setPublisher(segments[count - 3].trim());
        record.setPublishDate(segments[count - 2].trim());
        record.setPrice(segments[count - 1].trim());
    }

    @Override
    public String getOutputFileName() {
        return "data/douban_books.json";
    }
}

74
project/src/main/java/com/crawler/strategy/DoubanMovieStrategy.java

@ -0,0 +1,74 @@
package com.crawler.strategy;
import com.crawler.exception.CrawlerException;
import com.crawler.exception.NetworkException;
import com.crawler.exception.ParseException;
import com.crawler.model.Movie;
import com.crawler.util.HttpUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Strategy that reads the Douban movie chart page and maps each
 * {@code tr.item} row to a {@link Movie}.
 */
public class DoubanMovieStrategy implements CrawlerStrategy<Movie> {
    private static final String URL = "https://movie.douban.com/chart";

    /** A segment qualifies as the "year" when it starts with four digits. Compiled once. */
    private static final java.util.regex.Pattern YEAR_SEGMENT =
            java.util.regex.Pattern.compile("\\d{4}.*");

    /**
     * Fetches the chart page and converts every row.
     *
     * @return one entry per row, in page order
     * @throws CrawlerException a {@link NetworkException} from the fetch is passed
     *         through unchanged; any other exception is wrapped in a {@link ParseException}
     */
    @Override
    public List<Movie> crawl() throws CrawlerException {
        List<Movie> movies = new ArrayList<>();
        try {
            Document doc = HttpUtil.getDocument(URL);
            Elements items = doc.select("tr.item");
            for (Element item : items) {
                Movie movie = new Movie();

                Element titleEl = item.selectFirst("div.pl2 a");
                if (titleEl != null) {
                    movie.setTitle(primaryTitle(titleEl.text()));
                    movie.setUrl(titleEl.attr("href"));
                }

                Element ratingEl = item.selectFirst("span.rating_nums");
                if (ratingEl != null) {
                    movie.setRating(ratingEl.text());
                }

                Element ratingCountEl = item.selectFirst("span.pl");
                if (ratingCountEl != null) {
                    movie.setRatingCount(ratingCountEl.text());
                }

                Element infoEl = item.selectFirst("p.pl");
                if (infoEl != null) {
                    movie.setYear(extractYear(infoEl.text()));
                }

                movies.add(movie);
            }
        } catch (NetworkException e) {
            throw e;
        } catch (Exception e) {
            throw new ParseException("Failed to parse Douban movie page", e);
        }
        return movies;
    }

    /**
     * Returns the text before the first {@code '/'}, trimmed.
     *
     * <p>{@code text.split("/")[0]} is avoided on purpose: {@code split} drops
     * trailing empty strings, so a text made only of slashes yields an empty
     * array and indexing it would abort the whole crawl.
     */
    private String primaryTitle(String linkText) {
        int slash = linkText.indexOf('/');
        String head = slash >= 0 ? linkText.substring(0, slash) : linkText;
        return head.trim();
    }

    /**
     * Returns the first slash-separated segment that starts with four digits.
     * The whole trimmed segment is returned, not only the digits; the result is
     * an empty string when no segment qualifies.
     */
    private String extractYear(String info) {
        for (String part : info.split("/")) {
            String candidate = part.trim();
            if (YEAR_SEGMENT.matcher(candidate).matches()) {
                return candidate;
            }
        }
        return "";
    }

    @Override
    public String getOutputFileName() {
        return "data/douban_movies.json";
    }
}

60
project/src/main/java/com/crawler/util/FileUtil.java

@ -0,0 +1,60 @@
package com.crawler.util;
import com.crawler.exception.FileException;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.reflect.TypeToken;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.lang.reflect.Type;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
/**
 * File output helpers for crawl results. Both writers create the parent
 * directory when it is missing, overwrite an existing file, and always write
 * UTF-8.
 */
public class FileUtil {
    private static final Gson GSON = new GsonBuilder().setPrettyPrinting().create();

    /**
     * Serialises {@code data} as pretty-printed JSON into {@code filePath}.
     *
     * @param data     records to serialise
     * @param filePath destination file; missing parent directories are created
     * @throws FileException if the directory or file cannot be created or written
     */
    public static <T> void saveToJsonFile(List<T> data, String filePath) throws FileException {
        try (BufferedWriter writer = openUtf8Writer(filePath)) {
            GSON.toJson(data, writer);
        } catch (IOException e) {
            throw new FileException("Failed to save data to file: " + filePath, e);
        }
    }

    /**
     * Writes a header line followed by one line per record.
     *
     * <p>No quoting or escaping is applied here: headers are joined with commas
     * and each row is written exactly as {@code rowMapper} returns it.
     *
     * @param data      records to write
     * @param filePath  destination file; missing parent directories are created
     * @param headers   column names for the first line
     * @param rowMapper converts one record into its complete CSV line
     * @throws FileException if the directory or file cannot be created or written
     */
    public static <T> void saveToCsvFile(List<T> data, String filePath, String[] headers, CsvRowMapper<T> rowMapper) throws FileException {
        try (BufferedWriter writer = openUtf8Writer(filePath)) {
            writer.write(String.join(",", headers));
            writer.newLine();
            for (T item : data) {
                writer.write(rowMapper.mapToCsvRow(item));
                writer.newLine();
            }
        } catch (IOException e) {
            throw new FileException("Failed to save data to CSV file: " + filePath, e);
        }
    }

    /**
     * Creates the parent directory if needed and opens the file for writing.
     *
     * <p>The charset is fixed to UTF-8. {@code FileWriter(String)} would use the
     * platform default charset, which on a non-UTF-8 platform turns the crawled
     * Chinese text into bytes that are not valid UTF-8 JSON.
     */
    private static BufferedWriter openUtf8Writer(String filePath) throws IOException {
        Path path = Paths.get(filePath);
        Path parentDir = path.getParent();
        if (parentDir != null && !Files.exists(parentDir)) {
            Files.createDirectories(parentDir);
        }
        return new BufferedWriter(new java.io.OutputStreamWriter(
                Files.newOutputStream(path), java.nio.charset.StandardCharsets.UTF_8));
    }

    /** Converts one record into a single, already formatted CSV line. */
    public interface CsvRowMapper<T> {
        String mapToCsvRow(T item);
    }
}

24
project/src/main/java/com/crawler/util/HttpUtil.java

@ -0,0 +1,24 @@
package com.crawler.util;
import com.crawler.exception.NetworkException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
/**
 * HTTP helper that fetches a page with jsoup using a browser-like user agent
 * and a fixed timeout.
 */
public class HttpUtil {
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
    /** Timeout passed to jsoup, in milliseconds. */
    private static final int TIMEOUT = 10000;

    /**
     * Fetches and parses the document at {@code url} with an HTTP GET.
     *
     * <p>HTTP error responses are not ignored: with jsoup's default setting a
     * 4xx/5xx status raises an {@code IOException}, which is reported here as a
     * {@link NetworkException}. Parsing an error page as if it were the real
     * content would make a blocked request look like a successful, empty crawl.
     *
     * @param url address of the page to fetch
     * @return the parsed document
     * @throws NetworkException if the request fails, times out, or returns an
     *         HTTP error status
     */
    public static Document getDocument(String url) throws NetworkException {
        try {
            return Jsoup.connect(url)
                    .userAgent(USER_AGENT)
                    .timeout(TIMEOUT)
                    .get();
        } catch (IOException e) {
            throw new NetworkException("Failed to fetch URL: " + url, e);
        }
    }
}

59
project/src/main/java/com/crawler/view/ConsoleView.java

@ -0,0 +1,59 @@
package com.crawler.view;
import java.util.List;
/**
 * Console output for the crawler: banners, the menu, progress and results.
 * Errors go to standard error; everything else goes to standard output.
 */
public class ConsoleView {
    private static final String BANNER_RULE = "========================================";
    private static final String SECTION_RULE = "----------------------------------------";

    /** Lines of the main menu, printed in this order. */
    private static final String[] MENU_LINES = {
        "Please select an option:",
        "1. Crawl Douban Movies",
        "2. Crawl Douban Books",
        "3. Crawl Books to Scrape",
        "4. Crawl All",
        "0. Exit"
    };

    /** Prints the application banner followed by a blank line. */
    public void displayWelcome() {
        printBanner(" Web Crawler Application");
        System.out.println();
    }

    /** Prints the menu and leaves the cursor after the input prompt. */
    public void displayMenu() {
        for (String line : MENU_LINES) {
            System.out.println(line);
        }
        System.out.println();
        System.out.print("Enter your choice: ");
    }

    /** Prints a framed "Crawling: ..." heading preceded by a blank line. */
    public void displayCrawling(String description) {
        System.out.println();
        System.out.println(SECTION_RULE);
        System.out.println("Crawling: " + description);
        System.out.println(SECTION_RULE);
    }

    /** Reports the file the data was saved to. */
    public void displaySuccess(String fileName) {
        System.out.println("✓ Data saved to: " + fileName);
        System.out.println();
    }

    /** Reports an error on standard error. */
    public void displayError(String message) {
        System.err.println("✗ Error: " + message);
        System.err.println();
    }

    /** Prints the item count and then one line per item. */
    public void displayResults(List<?> data) {
        System.out.println("Found " + data.size() + " items:");
        data.forEach(item -> System.out.println("- " + item));
        System.out.println();
    }

    /** Prints the farewell banner. */
    public void displayGoodbye() {
        printBanner(" Goodbye!");
    }

    /** Tells the user the menu choice was not understood. */
    public void displayInvalidChoice() {
        System.out.println("Invalid choice. Please try again.");
        System.out.println();
    }

    /** Prints {@code title} between two banner rules. */
    private void printBanner(String title) {
        System.out.println(BANNER_RULE);
        System.out.println(title);
        System.out.println(BANNER_RULE);
    }
}

38
project/src/main/java/command/ClearCommand.java

@ -0,0 +1,38 @@
package command;
import controller.CrawlerController;
import exception.CrawlerResult;
import java.util.Collections;
import java.util.List;
public class ClearCommand implements Command {
private final CrawlerController controller;
public ClearCommand(CrawlerController controller) {
this.controller = controller;
}
@Override
public String getName() {
return "clear";
}
@Override
public String getDescription() {
return "清空所有数据";
}
@Override
public CrawlerResult execute() {
controller.clearAllData();
return CrawlerResult.success("SYSTEM")
.message("数据已清空")
.dataCount(0)
.build();
}
@Override
public List<String> getRequiredSources() {
return Collections.emptyList();
}
}

11
project/src/main/java/command/Command.java

@ -0,0 +1,11 @@
package command;
import exception.CrawlerResult;
import java.util.List;
/**
 * A named, self-describing action that produces a {@link CrawlerResult} when executed.
 */
public interface Command {
/** Returns the command's name. */
String getName();
/** Returns a short human-readable description of what the command does. */
String getDescription();
/** Runs the command and returns its outcome. */
CrawlerResult execute();
/** Returns the names of the sources this command relates to; may be empty. */
List<String> getRequiredSources();
}

41
project/src/main/java/command/CommandRegistry.java

@ -0,0 +1,41 @@
package command;
import exception.CrawlerResult;
import exception.ValidationException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Registry that stores {@link Command} instances under their names.
 *
 * <p>Commands are kept in registration order, so {@link #getAllCommands()} and
 * {@link #getHelpText()} list them in a stable, predictable sequence.
 */
public class CommandRegistry {
    private final Map<String, Command> commands;

    /** Creates an empty registry. */
    public CommandRegistry() {
        // Insertion-ordered map: a plain HashMap would list commands in an unspecified order that
        // can differ between runs and JVMs.
        this.commands = new java.util.LinkedHashMap<>();
    }

    /**
     * Registers {@code command} under {@link Command#getName()}, replacing any command previously
     * stored under the same name.
     */
    public void register(Command command) {
        commands.put(command.getName(), command);
    }

    /**
     * Looks up a command by name.
     *
     * @param name name the command was registered under
     * @return the matching command, never {@code null}
     * @throws ValidationException if no command is registered under {@code name}
     */
    public Command getCommand(String name) {
        Command command = commands.get(name);
        if (command == null) {
            throw new ValidationException("未知命令: " + name);
        }
        return command;
    }

    /** Returns a new list containing every registered command, in registration order. */
    public List<Command> getAllCommands() {
        return new ArrayList<>(commands.values());
    }

    /** Builds the help text: a header line followed by one "name - description" line per command. */
    public String getHelpText() {
        StringBuilder sb = new StringBuilder();
        sb.append("可用命令:\n");
        for (Command cmd : commands.values()) {
            sb.append(String.format(" %-15s - %s\n", cmd.getName(), cmd.getDescription()));
        }
        return sb.toString();
    }
}

37
project/src/main/java/command/ListCrawlersCommand.java

@ -0,0 +1,37 @@
package command;
import controller.CrawlerController;
import exception.CrawlerResult;
import java.util.List;
public class ListCrawlersCommand implements Command {
private final CrawlerController controller;
public ListCrawlersCommand(CrawlerController controller) {
this.controller = controller;
}
@Override
public String getName() {
return "list";
}
@Override
public String getDescription() {
return "列出所有可用爬虫";
}
@Override
public CrawlerResult execute() {
List<String> crawlers = controller.getAllCrawlerNames();
return CrawlerResult.success("SYSTEM")
.message("获取爬虫列表成功")
.dataCount(crawlers.size())
.build();
}
@Override
public List<String> getRequiredSources() {
return controller.getAllCrawlerNames();
}
}

59
project/src/main/java/command/RunAllCommand.java

@ -0,0 +1,59 @@
package command;
import controller.CrawlerController;
import exception.CrawlerResult;
import java.util.List;
/**
 * Command that runs every registered crawler and condenses the individual outcomes into one
 * aggregate {@link CrawlerResult}.
 */
public class RunAllCommand implements Command {
    private final CrawlerController controller;

    /**
     * @param controller controller used to run the crawlers
     */
    public RunAllCommand(CrawlerController controller) {
        this.controller = controller;
    }

    @Override
    public String getName() {
        return "run-all";
    }

    @Override
    public String getDescription() {
        return "运行所有爬虫";
    }

    /**
     * Runs all crawlers and aggregates their results.
     *
     * <p>The aggregate is a success only when every crawler succeeded. Otherwise a
     * {@code PARTIAL_FAIL} failure is returned. In both cases the result carries the wall-clock
     * time of the whole run and the number of records collected by the crawlers that succeeded.
     */
    @Override
    public CrawlerResult execute() {
        long startTime = System.currentTimeMillis();
        List<CrawlerResult> results = controller.runAllCrawlers();
        long elapsedTime = System.currentTimeMillis() - startTime;

        int totalCount = results.size();
        int successCount = 0;
        int totalData = 0;
        for (CrawlerResult result : results) {
            if (result.isSuccess()) {
                successCount++;
                totalData += result.getDataCount();
            }
        }

        if (successCount == totalCount) {
            return CrawlerResult.success("ALL")
                    .message("所有爬虫执行成功")
                    .dataCount(totalData)
                    .elapsedTime(elapsedTime)
                    .build();
        }
        // Keep the collected record count on the structured result as well; previously it was
        // only embedded in the message, so getDataCount() reported 0 after a partial failure.
        return CrawlerResult.failure("ALL", "PARTIAL_FAIL",
                String.format("执行完成: %d/%d 成功, 获取 %d 条数据", successCount, totalCount, totalData))
                .dataCount(totalData)
                .elapsedTime(elapsedTime)
                .build();
    }

    /** Returns the names of every registered crawler. */
    @Override
    public List<String> getRequiredSources() {
        return controller.getAllCrawlerNames();
    }
}

37
project/src/main/java/command/RunSingleCommand.java

@ -0,0 +1,37 @@
package command;
import controller.CrawlerController;
import exception.CrawlerResult;
import exception.ValidationException;
import java.util.Collections;
import java.util.List;
/**
 * Command bound to one crawler name; executing it runs that crawler through the controller.
 */
public class RunSingleCommand implements Command {
    private static final String NAME = "run";
    private static final String DESCRIPTION_PREFIX = "运行指定爬虫: ";

    private final CrawlerController controller;
    private final String crawlerName;

    /**
     * @param controller  controller used to run the crawler
     * @param crawlerName name of the crawler this command runs
     */
    public RunSingleCommand(CrawlerController controller, String crawlerName) {
        this.crawlerName = crawlerName;
        this.controller = controller;
    }

    @Override
    public String getName() {
        return NAME;
    }

    /** Returns the description, which includes the bound crawler name. */
    @Override
    public String getDescription() {
        return DESCRIPTION_PREFIX + crawlerName;
    }

    /** Delegates to {@code CrawlerController.runCrawler} with the bound name. */
    @Override
    public CrawlerResult execute() {
        CrawlerResult outcome = controller.runCrawler(crawlerName);
        return outcome;
    }

    /** Returns a single-element list holding the bound crawler name. */
    @Override
    public List<String> getRequiredSources() {
        return Collections.singletonList(crawlerName);
    }
}

38
project/src/main/java/command/StatsCommand.java

@ -0,0 +1,38 @@
package command;
import controller.CrawlerController;
import exception.CrawlerResult;
import java.util.Collections;
import java.util.List;
public class StatsCommand implements Command {
private final CrawlerController controller;
public StatsCommand(CrawlerController controller) {
this.controller = controller;
}
@Override
public String getName() {
return "stats";
}
@Override
public String getDescription() {
return "显示统计信息";
}
@Override
public CrawlerResult execute() {
String stats = controller.getStats();
return CrawlerResult.success("STATS")
.message(stats)
.dataCount(0)
.build();
}
@Override
public List<String> getRequiredSources() {
return Collections.emptyList();
}
}

73
project/src/main/java/config/CrawlerConfig.java

@ -0,0 +1,73 @@
package config;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;
/**
 * Static holder for crawler settings.
 *
 * <p>Built-in defaults are installed when the class is initialised. {@link #load()} overlays them
 * with the values found in {@code crawler.properties}, resolved against the working directory.
 * Values are trimmed before being interpreted, and a numeric value that cannot be parsed falls
 * back to the built-in default instead of raising {@link NumberFormatException}.
 */
public class CrawlerConfig {
    private static final String CONFIG_FILE = "crawler.properties";
    private static final Properties props = new Properties();

    // Built-in defaults, used for every key the configuration file does not override.
    static {
        props.setProperty("delay.ms", "1000");
        props.setProperty("timeout.ms", "15000");
        props.setProperty("user.agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
        props.setProperty("db.path", "crawler.db");
        props.setProperty("output.dir", "output");
        props.setProperty("enable.database", "true");
        props.setProperty("enable.file", "true");
    }

    /**
     * Loads the configuration file over the current settings. If the file cannot be read, the
     * current settings are kept and a notice is printed.
     */
    public static void load() {
        try (InputStream is = new FileInputStream(CONFIG_FILE)) {
            props.load(is);
            System.out.println("配置文件加载成功: " + CONFIG_FILE);
        } catch (IOException e) {
            System.out.println("使用默认配置(未找到配置文件: " + CONFIG_FILE + ")");
        }
    }

    /** Returns the delay between requests in milliseconds (default 1000). */
    public static int getDelayMs() {
        return intProperty("delay.ms", 1000);
    }

    /** Returns the request timeout in milliseconds (default 15000). */
    public static int getTimeoutMs() {
        return intProperty("timeout.ms", 15000);
    }

    /** Returns the configured user-agent string. */
    public static String getUserAgent() {
        return props.getProperty("user.agent");
    }

    /** Returns the database path (default {@code crawler.db}). */
    public static String getDbPath() {
        return props.getProperty("db.path", "crawler.db");
    }

    /** Returns the output directory (default {@code output}). */
    public static String getOutputDir() {
        return props.getProperty("output.dir", "output");
    }

    /** Returns whether database storage is enabled (default {@code true}). */
    public static boolean isDatabaseEnabled() {
        return booleanProperty("enable.database", true);
    }

    /** Returns whether file output is enabled (default {@code true}). */
    public static boolean isFileOutputEnabled() {
        return booleanProperty("enable.file", true);
    }

    /** Returns the raw value stored under {@code key}, or {@code null} if there is none. */
    public static String getProperty(String key) {
        return props.getProperty(key);
    }

    /** Returns the raw value stored under {@code key}, or {@code defaultValue} if there is none. */
    public static String getProperty(String key, String defaultValue) {
        return props.getProperty(key, defaultValue);
    }

    /** Reads an integer setting, tolerating surrounding whitespace and malformed values. */
    private static int intProperty(String key, int fallback) {
        String raw = props.getProperty(key);
        if (raw == null) {
            return fallback;
        }
        try {
            // Properties.load keeps trailing whitespace in values, which Integer.parseInt rejects.
            return Integer.parseInt(raw.trim());
        } catch (NumberFormatException e) {
            System.err.println("配置项 " + key + " 的值无效: " + raw + ",使用默认值 " + fallback);
            return fallback;
        }
    }

    /** Reads a boolean setting, tolerating surrounding whitespace. */
    private static boolean booleanProperty(String key, boolean fallback) {
        String raw = props.getProperty(key);
        return raw == null ? fallback : Boolean.parseBoolean(raw.trim());
    }
}

177
project/src/main/java/controller/CrawlerController.java

@ -0,0 +1,177 @@
package controller;
import exception.CrawlerResult;
import exception.ValidationException;
import model.Movie;
import storage.DataStorage;
import storage.FileStorage;
import storage.StorageStats;
import strategy.CrawlerStrategy;
import strategy.BookCrawlerStrategy;
import strategy.impl.DoubanStrategy;
import strategy.impl.MaoyanStrategy;
import strategy.impl.RottenTomatoesStrategy;
import strategy.impl.DoubanBookStrategy;
import strategy.impl.BooksToScrapeStrategy;
import util.Logger;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class CrawlerController {
private Map<String, CrawlerStrategy> movieCrawlers;
private Map<String, BookCrawlerStrategy> bookCrawlers;
private DataStorage storage;
private String outputDir;
public CrawlerController() {
this.movieCrawlers = new HashMap<>();
this.bookCrawlers = new HashMap<>();
this.outputDir = "output";
initStorage();
registerDefaultCrawlers();
}
public CrawlerController(String outputDir) {
this.movieCrawlers = new HashMap<>();
this.bookCrawlers = new HashMap<>();
this.outputDir = outputDir;
initStorage();
registerDefaultCrawlers();
}
private void initStorage() {
this.storage = new FileStorage(outputDir);
Logger.info("文件存储初始化完成,输出目录: " + outputDir);
}
private void registerDefaultCrawlers() {
registerMovieCrawler(new DoubanStrategy());
registerMovieCrawler(new MaoyanStrategy());
registerMovieCrawler(new RottenTomatoesStrategy());
registerBookCrawler(new DoubanBookStrategy());
registerBookCrawler(new BooksToScrapeStrategy());
}
public void registerMovieCrawler(CrawlerStrategy strategy) {
strategy.setStorage(storage);
movieCrawlers.put(strategy.getName(), strategy);
Logger.info("已注册电影爬虫: " + strategy.getName());
}
public void registerBookCrawler(BookCrawlerStrategy strategy) {
strategy.setStorage(storage);
bookCrawlers.put(strategy.getName(), strategy);
Logger.info("已注册图书爬虫: " + strategy.getName());
}
public void registerCrawler(CrawlerStrategy strategy) {
registerMovieCrawler(strategy);
}
public void registerCrawler(CrawlerStrategy strategy, DataStorage customStorage) {
strategy.setStorage(customStorage);
movieCrawlers.put(strategy.getName(), strategy);
Logger.info("已注册爬虫: " + strategy.getName());
}
public List<String> getAllCrawlerNames() {
List<String> names = new ArrayList<>();
names.addAll(movieCrawlers.keySet());
names.addAll(bookCrawlers.keySet());
return names;
}
public List<String> getMovieCrawlerNames() {
return new ArrayList<>(movieCrawlers.keySet());
}
public List<String> getBookCrawlerNames() {
return new ArrayList<>(bookCrawlers.keySet());
}
public CrawlerResult runCrawler(String name) {
if (movieCrawlers.containsKey(name)) {
CrawlerStrategy strategy = movieCrawlers.get(name);
Logger.info("开始执行电影爬虫: " + name);
CrawlerResult result = strategy.execute();
Logger.info("爬虫执行完成: " + result);
return result;
} else if (bookCrawlers.containsKey(name)) {
BookCrawlerStrategy strategy = bookCrawlers.get(name);
Logger.info("开始执行图书爬虫: " + name);
CrawlerResult result = strategy.execute();
Logger.info("爬虫执行完成: " + result);
return result;
} else {
throw new ValidationException("未找到爬虫: " + name);
}
}
public List<CrawlerResult> runAllCrawlers() {
List<CrawlerResult> results = new ArrayList<>();
int total = movieCrawlers.size() + bookCrawlers.size();
Logger.info("开始执行所有爬虫,共 " + total + " 个");
for (CrawlerStrategy strategy : movieCrawlers.values()) {
try {
CrawlerResult result = strategy.execute();
results.add(result);
} catch (Exception e) {
Logger.error("爬虫执行失败: " + strategy.getName(), e);
results.add(CrawlerResult.failure(strategy.getName(), "EXEC_ERROR", e.getMessage()).build());
}
}
for (BookCrawlerStrategy strategy : bookCrawlers.values()) {
try {
CrawlerResult result = strategy.execute();
results.add(result);
} catch (Exception e) {
Logger.error("爬虫执行失败: " + strategy.getName(), e);
results.add(CrawlerResult.failure(strategy.getName(), "EXEC_ERROR", e.getMessage()).build());
}
}
return results;
}
public String getStats() {
StringBuilder sb = new StringBuilder();
sb.append("========== 爬虫统计 ==========\n");
sb.append("电影爬虫数量: ").append(movieCrawlers.size()).append("\n");
sb.append("图书爬虫数量: ").append(bookCrawlers.size()).append("\n");
sb.append("总爬虫数量: ").append(movieCrawlers.size() + bookCrawlers.size()).append("\n");
sb.append("\n电影爬虫列表:\n");
for (String name : movieCrawlers.keySet()) {
sb.append(" - ").append(name).append("\n");
}
sb.append("\n图书爬虫列表:\n");
for (String name : bookCrawlers.keySet()) {
sb.append(" - ").append(name).append("\n");
}
sb.append("=============================");
return sb.toString();
}
public void clearAllData() {
if (storage != null) {
storage.clearAll();
Logger.info("所有数据已清空");
}
}
public DataStorage getStorage() {
return storage;
}
public Map<String, CrawlerStrategy> getCrawlers() {
return movieCrawlers;
}
public Map<String, BookCrawlerStrategy> getBookCrawlers() {
return bookCrawlers;
}
}

139
project/src/main/java/crawler/BaseCrawler.java

@ -0,0 +1,139 @@
package crawler;
import model.Movie;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import storage.DataStorage;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Abstract base class for page-by-page movie crawlers.
 *
 * <p>{@link #crawl()} is a template method: subclasses supply the list of page URLs and the
 * parsing of a single page; this class handles fetching, pacing, error reporting and saving.
 */
public abstract class BaseCrawler {
    protected String name; // crawler name, also used as the data source of every record
    protected String baseUrl; // base URL of the site being crawled
    protected int delayMs; // pause between two page requests, in milliseconds
    protected DataStorage storage; // destination for the collected records; may be null

    /** Creates a crawler with a 1000 ms delay between requests. */
    public BaseCrawler(String name, String baseUrl) {
        this(name, baseUrl, 1000);
    }

    /**
     * @param name    crawler name
     * @param baseUrl base URL of the site
     * @param delayMs pause between two page requests, in milliseconds
     */
    public BaseCrawler(String name, String baseUrl, int delayMs) {
        this.name = name;
        this.baseUrl = baseUrl;
        this.delayMs = delayMs;
    }

    /** Sets the storage that receives the collected records. */
    public void setStorage(DataStorage storage) {
        this.storage = storage;
    }

    /** Returns the crawler name. */
    public String getName() {
        return name;
    }

    /**
     * Downloads and parses the page at {@code url} using browser-like request headers and a
     * 15 second timeout.
     *
     * @throws IOException if the request fails
     */
    protected Document fetchDocument(String url) throws IOException {
        return Jsoup.connect(url)
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
                        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
                .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
                .timeout(15000)
                .get();
    }

    /**
     * Sleeps for {@code delayMs}. If the thread is interrupted, the interrupt flag is restored so
     * that the crawl loop can notice it and stop.
     */
    protected void delay() {
        try {
            Thread.sleep(delayMs);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Runs the whole crawl: fetches every URL from {@link #getUrls()}, parses each page, tags the
     * records with this crawler's name, saves them in one batch and prints a summary.
     *
     * <p>A page that fails to download or parse is reported and skipped; the records collected
     * from the other pages are still saved. The crawl stops early, keeping what was collected so
     * far, if the thread is interrupted.
     */
    public final void crawl() {
        System.out.println("========================================");
        System.out.println("开始爬取: " + name);
        System.out.println("目标URL: " + baseUrl);
        System.out.println("========================================");
        long startTime = System.currentTimeMillis();
        List<Movie> allMovies = new ArrayList<>();
        try {
            List<String> urls = getUrls();
            System.out.println("共 " + urls.size() + " 个页面需要爬取");
            for (int i = 0; i < urls.size(); i++) {
                // After an interrupt Thread.sleep returns immediately, so carrying on would send
                // the remaining requests back-to-back; stop instead.
                if (Thread.currentThread().isInterrupted()) {
                    System.err.println("爬取被中断,已停止后续页面");
                    break;
                }
                String url = urls.get(i);
                System.out.println("\n正在爬取第 " + (i + 1) + "/" + urls.size() + " 页: " + url);
                try {
                    Document doc = fetchDocument(url);
                    List<Movie> movies = parsePage(doc);
                    // Record which crawler produced each entry.
                    for (Movie movie : movies) {
                        movie.setSource(name);
                    }
                    allMovies.addAll(movies);
                    System.out.println("本页获取 " + movies.size() + " 条数据");
                } catch (IOException | RuntimeException e) {
                    // A parser failure on one page must not discard the pages already collected.
                    System.err.println("爬取页面失败: " + url + " - " + e.getMessage());
                }
                // Pause between requests, also after a failed page, to avoid hammering the site.
                if (i < urls.size() - 1) {
                    delay();
                }
            }
            if (!allMovies.isEmpty() && storage != null) {
                storage.saveBatch(allMovies);
            }
            long endTime = System.currentTimeMillis();
            System.out.println("\n========================================");
            System.out.println("爬取完成!");
            System.out.println("总数据量: " + allMovies.size());
            System.out.println("耗时: " + (endTime - startTime) / 1000 + " 秒");
            System.out.println("========================================");
        } catch (Exception e) {
            System.err.println("爬取过程出错: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /** Returns every page URL to crawl, in crawl order. Implemented by subclasses. */
    protected abstract List<String> getUrls();

    /** Extracts the movies found on one downloaded page. Implemented by subclasses. */
    protected abstract List<Movie> parsePage(Document doc);
}

113
project/src/main/java/crawler/DoubanCrawler.java

@ -0,0 +1,113 @@
package crawler;
import model.Movie;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawler for the Douban movie Top 250 list.
 */
public class DoubanCrawler extends BaseCrawler {
    /** Matches a standalone four-digit number; used to pick the year out of the info text. */
    private static final java.util.regex.Pattern YEAR_PATTERN =
            java.util.regex.Pattern.compile("(?<!\\d)\\d{4}(?!\\d)");
    private static final String DIRECTOR_LABEL = "导演: ";
    private static final String CAST_LABEL = "主演";

    public DoubanCrawler() {
        super("豆瓣电影Top250", "https://movie.douban.com/top250", 1500);
    }

    /** Returns the ten list pages: 25 movies per page, addressed by the {@code start} offset. */
    @Override
    protected List<String> getUrls() {
        List<String> urls = new ArrayList<>();
        for (int i = 0; i < 10; i++) {
            urls.add(baseUrl + "?start=" + (i * 25));
        }
        return urls;
    }

    /**
     * Parses every {@code div.item} entry on the page. An entry that cannot be parsed (for example
     * because its rank or rating is not numeric) is reported and skipped.
     */
    @Override
    protected List<Movie> parsePage(Document doc) {
        List<Movie> movies = new ArrayList<>();
        Elements items = doc.select("div.item");
        for (Element item : items) {
            try {
                movies.add(parseItem(item));
            } catch (Exception e) {
                System.err.println("解析电影数据出错: " + e.getMessage());
            }
        }
        return movies;
    }

    /**
     * Builds a movie from one list entry. Only the rank is mandatory; every other field is filled
     * in when its element is present, so a missing optional element no longer drops the entry.
     */
    private Movie parseItem(Element item) {
        Movie movie = new Movie();

        // Rank
        String rankStr = item.select("em").text();
        movie.setRank(Integer.parseInt(rankStr));

        // Title (the first of possibly several title spans)
        Element titleElement = item.select("span.title").first();
        if (titleElement != null) {
            movie.setName(titleElement.text());
        }

        // Rating
        String ratingStr = item.select("span.rating_num").text();
        if (!ratingStr.isEmpty()) {
            movie.setRating(Double.parseDouble(ratingStr));
        }

        // Number of ratings; last() returns null when the selection is empty.
        Element countElement = item.select("div.star span").last();
        if (countElement != null) {
            String ratingCountStr = countElement.text();
            if (ratingCountStr.contains("人评价")) {
                Integer count = parseNumber(ratingCountStr.replace("人评价", "").trim());
                if (count != null) {
                    movie.setRatingCount(count);
                }
            }
        }

        // Director and year from the info paragraph; first() returns null when it is absent.
        Element infoElement = item.select("div.bd p").first();
        if (infoElement != null) {
            String info = infoElement.text();
            String director = extractDirector(info);
            if (director != null) {
                movie.setDirector(director);
            }
            java.util.regex.Matcher year = YEAR_PATTERN.matcher(info);
            if (year.find()) {
                movie.setYear(year.group());
            }
        }

        // Detail page link
        movie.setUrl(item.select("div.hd a").attr("href"));

        // Poster image
        movie.setImageUrl(item.select("div.pic img").attr("src"));

        return movie;
    }

    /**
     * Extracts the director from the flattened info text, or returns {@code null} when the text
     * has no director label.
     *
     * <p>NOTE(review): this assumes the flattened text reads roughly
     * "导演: … 主演: … /... 1994 / 美国 / 剧情", i.e. the cast and the year share the first
     * " / "-separated segment with the director. Confirm against the live markup.
     */
    private static String extractDirector(String info) {
        String firstSegment = info.split(" / ")[0];
        int labelAt = firstSegment.indexOf(DIRECTOR_LABEL);
        if (labelAt < 0) {
            return null;
        }
        String director = firstSegment.substring(labelAt + DIRECTOR_LABEL.length());
        int castAt = director.indexOf(CAST_LABEL);
        if (castAt >= 0) {
            director = director.substring(0, castAt);
        }
        // Without a cast section the year can trail the director name in the same segment.
        return director.replaceFirst("\\s+\\d{4}$", "").trim();
    }

    /**
     * Parses a count that may use thousands separators or the "万" (ten thousand) suffix.
     *
     * @return the parsed number, or {@code null} if the text is not numeric
     */
    private Integer parseNumber(String str) {
        try {
            if (str.contains("万")) {
                return (int) (Double.parseDouble(str.replace("万", "")) * 10000);
            }
            return Integer.parseInt(str.replace(",", ""));
        } catch (NumberFormatException e) {
            return null;
        }
    }
}

100
project/src/main/java/crawler/ImdbCrawler.java

@ -0,0 +1,100 @@
package crawler;
import model.Movie;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Example crawler for the IMDb Top 250 chart.
 */
public class ImdbCrawler extends BaseCrawler {
    public ImdbCrawler() {
        super("IMDB电影Top250", "https://www.imdb.com/chart/top/", 2000);
    }

    /** The whole chart is on a single page. */
    @Override
    protected List<String> getUrls() {
        List<String> urls = new ArrayList<>();
        urls.add(baseUrl);
        return urls;
    }

    /**
     * Parses every chart entry on the page. Ranks are assigned by position on the page. An entry
     * that cannot be parsed is reported and skipped, but still consumes its rank.
     */
    @Override
    protected List<Movie> parsePage(Document doc) {
        List<Movie> movies = new ArrayList<>();
        Elements items = doc.select("li.ipc-metadata-list-summary-item");
        int rank = 1;
        for (Element item : items) {
            try {
                Movie movie = new Movie();
                movie.setRank(rank++);

                // Title, without the leading "1. " style rank prefix
                Element titleElement = item.select("h3.ipc-title__text").first();
                if (titleElement != null) {
                    String fullTitle = titleElement.text();
                    if (fullTitle.matches("\\d+\\..*")) {
                        fullTitle = fullTitle.substring(fullTitle.indexOf(".") + 1).trim();
                    }
                    movie.setName(fullTitle);
                }

                // Rating
                String ratingStr = item.select("span.ipc-rating-star--rating").text();
                if (!ratingStr.isEmpty()) {
                    movie.setRating(Double.parseDouble(ratingStr));
                }

                // Number of votes, e.g. "(2.9M)"
                String countStr = item.select("span.ipc-rating-star--voteCount").text();
                if (!countStr.isEmpty()) {
                    Integer count = parseNumber(countStr.replaceAll("[()\\s]", ""));
                    if (count != null) {
                        movie.setRatingCount(count);
                    }
                }

                // Year: first() returns null when the metadata span is missing, which used to
                // raise a NullPointerException and drop the whole entry.
                Element yearElement = item.select("span.cli-title-metadata-item").first();
                if (yearElement != null) {
                    String yearStr = yearElement.text();
                    if (yearStr.matches("\\d{4}")) {
                        movie.setYear(yearStr);
                    }
                }

                // Detail page link (site-relative in the markup)
                String link = item.select("a.ipc-title-link-wrapper").attr("href");
                if (!link.isEmpty()) {
                    movie.setUrl("https://www.imdb.com" + link);
                }

                // Poster image
                String imgUrl = item.select("img.ipc-image").attr("src");
                movie.setImageUrl(imgUrl);

                movies.add(movie);
            } catch (Exception e) {
                System.err.println("解析电影数据出错: " + e.getMessage());
            }
        }
        return movies;
    }

    /**
     * Parses a count that may use thousands separators or an "M"/"K" suffix.
     *
     * @return the parsed number, or {@code null} if the text is not numeric
     */
    private Integer parseNumber(String str) {
        try {
            if (str.contains("M")) {
                return (int) (Double.parseDouble(str.replace("M", "")) * 1000000);
            }
            if (str.contains("K")) {
                return (int) (Double.parseDouble(str.replace("K", "")) * 1000);
            }
            return Integer.parseInt(str.replace(",", ""));
        } catch (NumberFormatException e) {
            return null;
        }
    }
}

92
project/src/main/java/crawler/MaoyanCrawler.java

@ -0,0 +1,92 @@
package crawler;
import model.Movie;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawler for the Maoyan movie Top 100 board.
 */
public class MaoyanCrawler extends BaseCrawler {
    private static final int PAGE_COUNT = 10;
    private static final int PAGE_SIZE = 10;

    public MaoyanCrawler() {
        super("猫眼电影Top100", "https://maoyan.com/board/4", 1500);
    }

    /** Returns the ten board pages: ten movies per page, addressed by the {@code offset} value. */
    @Override
    protected List<String> getUrls() {
        List<String> pages = new ArrayList<>();
        int page = 0;
        while (page < PAGE_COUNT) {
            pages.add(baseUrl + "?offset=" + (page * PAGE_SIZE));
            page++;
        }
        return pages;
    }

    /**
     * Parses every {@code dd} entry of the board. An entry that cannot be parsed is reported and
     * skipped.
     */
    @Override
    protected List<Movie> parsePage(Document doc) {
        List<Movie> parsed = new ArrayList<>();
        for (Element entry : doc.select("dl.board-wrapper dd")) {
            try {
                parsed.add(parseEntry(entry));
            } catch (Exception e) {
                System.err.println("解析猫眼电影数据出错: " + e.getMessage());
            }
        }
        return parsed;
    }

    /** Builds a movie from one board entry. */
    private Movie parseEntry(Element entry) {
        Movie movie = new Movie();

        // Rank
        movie.setRank(Integer.parseInt(entry.select("i.board-index").text()));

        // Title
        movie.setName(entry.select("p.name a").text());

        // Rating: the page splits it into an integer part and a fraction part
        String score = entry.select("i.integer").text() + entry.select("i.fraction").text();
        if (!score.isEmpty()) {
            movie.setRating(Double.parseDouble(score));
        }

        // Leading actors
        String starLine = entry.select("p.star").text();
        if (starLine != null && starLine.contains("主演:")) {
            movie.setActors(starLine.replace("主演:", "").trim());
        }

        // Release date; only the leading four-digit year is kept
        String releaseLine = entry.select("p.releasetime").text();
        if (releaseLine != null && releaseLine.contains("上映时间:")) {
            String released = releaseLine.replace("上映时间:", "").trim();
            if (released.matches("\\d{4}.*")) {
                movie.setYear(released.substring(0, 4));
            }
        }

        // Detail page link (site-relative in the markup)
        String href = entry.select("p.name a").attr("href");
        if (!href.isEmpty()) {
            movie.setUrl("https://maoyan.com" + href);
        }

        // Poster image: prefer the data-src attribute, fall back to src
        String poster = entry.select("img.board-img").attr("data-src");
        if (poster.isEmpty()) {
            poster = entry.select("img.board-img").attr("src");
        }
        movie.setImageUrl(poster);

        return movie;
    }
}

102
project/src/main/java/crawler/RottenTomatoesCrawler.java

@ -0,0 +1,102 @@
package crawler;
import model.Movie;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawler for the Rotten Tomatoes Top 100 list.
 */
public class RottenTomatoesCrawler extends BaseCrawler {
    private static final String SITE_ROOT = "https://www.rottentomatoes.com";

    public RottenTomatoesCrawler() {
        super("烂番茄Top100", "https://www.rottentomatoes.com/top/bestofrt/", 2000);
    }

    /** The whole list is on a single page. */
    @Override
    protected List<String> getUrls() {
        List<String> pages = new ArrayList<>();
        pages.add(baseUrl);
        return pages;
    }

    /**
     * Parses the rows of the ranking table. Rows without a {@code td.rank} cell (such as the
     * header) are ignored; a row that cannot be parsed is reported and skipped.
     */
    @Override
    protected List<Movie> parsePage(Document doc) {
        List<Movie> parsed = new ArrayList<>();
        Elements rows = doc.select("table.table tr");
        // Running counter, used only for rows whose rank cell is empty.
        int fallbackRank = 0;
        for (Element row : rows) {
            try {
                Element rankCell = row.selectFirst("td.rank");
                if (rankCell == null) {
                    continue;
                }
                Movie movie = new Movie();

                // Rank
                String rankText = rankCell.text();
                if (rankText.isEmpty()) {
                    fallbackRank++;
                    movie.setRank(fallbackRank);
                } else {
                    movie.setRank(Integer.parseInt(rankText));
                }

                // Title, year and detail link
                Element titleLink = row.selectFirst("td.title a");
                if (titleLink != null) {
                    String fullTitle = titleLink.text();
                    int open = fullTitle.lastIndexOf("(");
                    int close = fullTitle.lastIndexOf(")");
                    if (open > 0 && close > open) {
                        // The trailing parenthesised part is treated as the year when it is
                        // exactly four digits; it is removed from the name either way.
                        String inParens = fullTitle.substring(open + 1, close);
                        if (inParens.matches("\\d{4}")) {
                            movie.setYear(inParens);
                        }
                        movie.setName(fullTitle.substring(0, open).trim());
                    } else {
                        movie.setName(fullTitle);
                    }

                    String href = titleLink.attr("href");
                    if (!href.isEmpty()) {
                        movie.setUrl(href.startsWith("/") ? SITE_ROOT + href : href);
                    }
                }

                // Tomatometer percentage, converted to a ten-point scale with one decimal
                Element scoreCell = row.selectFirst("td.score span.tMeterScore");
                if (scoreCell != null) {
                    String percent = scoreCell.text();
                    if (percent.matches("\\d+%")) {
                        double outOfTen = Double.parseDouble(percent.replace("%", "")) / 10;
                        movie.setRating(Math.round(outOfTen * 10) / 10.0);
                    }
                }

                parsed.add(movie);
            } catch (Exception e) {
                System.err.println("解析烂番茄数据出错: " + e.getMessage());
            }
        }
        return parsed;
    }
}

55
project/src/main/java/exception/CrawlerException.java

@ -0,0 +1,55 @@
package exception;
public class CrawlerException extends RuntimeException {
private final String source;
private final String errorCode;
public CrawlerException(String message) {
super(message);
this.source = "UNKNOWN";
this.errorCode = "CRAWLER_001";
}
public CrawlerException(String message, String source) {
super(message);
this.source = source;
this.errorCode = "CRAWLER_001";
}
public CrawlerException(String message, Throwable cause) {
super(message, cause);
this.source = "UNKNOWN";
this.errorCode = "CRAWLER_002";
}
public CrawlerException(String message, String source, Throwable cause) {
super(message, cause);
this.source = source;
this.errorCode = "CRAWLER_002";
}
public CrawlerException(String message, String source, String errorCode) {
super(message);
this.source = source;
this.errorCode = errorCode;
}
public CrawlerException(String message, String source, String errorCode, Throwable cause) {
super(message, cause);
this.source = source;
this.errorCode = errorCode;
}
public String getSource() {
return source;
}
public String getErrorCode() {
return errorCode;
}
@Override
public String toString() {
return String.format("[%s] [%s] %s (source: %s)", errorCode, getClass().getSimpleName(), getMessage(), source);
}
}

103
project/src/main/java/exception/CrawlerResult.java

@ -0,0 +1,103 @@
package exception;
/**
 * Immutable outcome of a crawler or command run. Instances are created through {@link Builder},
 * normally starting from {@link #success(String)} or {@link #failure(String, String, String)}.
 */
public class CrawlerResult {
    private final boolean success;
    private final String source;
    private final String message;
    private final int dataCount;
    private final long elapsedTime;
    private final String errorCode;

    private CrawlerResult(Builder builder) {
        this.errorCode = builder.errorCode;
        this.elapsedTime = builder.elapsedTime;
        this.dataCount = builder.dataCount;
        this.message = builder.message;
        this.source = builder.source;
        this.success = builder.success;
    }

    /** Starts a builder for a successful result from {@code source}. */
    public static Builder success(String source) {
        Builder builder = new Builder();
        builder.success(true);
        builder.source(source);
        return builder;
    }

    /** Starts a builder for a failed result from {@code source} with the given code and message. */
    public static Builder failure(String source, String errorCode, String message) {
        Builder builder = new Builder();
        builder.success(false);
        builder.source(source);
        builder.errorCode(errorCode);
        builder.message(message);
        return builder;
    }

    /** Returns whether the run succeeded. */
    public boolean isSuccess() {
        return success;
    }

    /** Returns the name of the source the result belongs to. */
    public String getSource() {
        return source;
    }

    /** Returns the message, or {@code null} if none was set. */
    public String getMessage() {
        return message;
    }

    /** Returns the number of records; zero if none was set. */
    public int getDataCount() {
        return dataCount;
    }

    /** Returns the elapsed time; {@code toString()} prints it with an "ms" suffix. */
    public long getElapsedTime() {
        return elapsedTime;
    }

    /** Returns the error code, or {@code null} if none was set. */
    public String getErrorCode() {
        return errorCode;
    }

    /** Formats a one-line summary; the layout depends on whether the result is a success. */
    @Override
    public String toString() {
        return success
                ? String.format("[SUCCESS] %s - 获取 %d 条数据 (耗时: %dms)", source, dataCount, elapsedTime)
                : String.format("[FAILURE] [%s] %s - %s", errorCode, source, message);
    }

    /** Fluent builder for {@link CrawlerResult}; every setter returns the builder itself. */
    public static class Builder {
        private boolean success;
        private String source;
        private String message;
        private int dataCount;
        private long elapsedTime;
        private String errorCode;

        public Builder success(boolean success) {
            this.success = success;
            return this;
        }

        public Builder source(String source) {
            this.source = source;
            return this;
        }

        public Builder message(String message) {
            this.message = message;
            return this;
        }

        public Builder dataCount(int dataCount) {
            this.dataCount = dataCount;
            return this;
        }

        public Builder elapsedTime(long elapsedTime) {
            this.elapsedTime = elapsedTime;
            return this;
        }

        public Builder errorCode(String errorCode) {
            this.errorCode = errorCode;
            return this;
        }

        /** Creates the result from the values set so far. */
        public CrawlerResult build() {
            return new CrawlerResult(this);
        }
    }
}

20
project/src/main/java/exception/NetworkException.java

@ -0,0 +1,20 @@
package exception;
/**
 * {@link CrawlerException} subtype for network failures. It adds no state of its own; every
 * constructor forwards to the matching {@code CrawlerException} constructor.
 */
public class NetworkException extends CrawlerException {

    /** @param message description of the failure */
    public NetworkException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception
     */
    public NetworkException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param message description of the failure
     * @param source  name of the source that failed
     */
    public NetworkException(String message, String source) {
        super(message, source);
    }

    /**
     * @param message description of the failure
     * @param source  name of the source that failed
     * @param cause   underlying exception
     */
    public NetworkException(String message, String source, Throwable cause) {
        super(message, source, cause);
    }
}

20
project/src/main/java/exception/ParseException.java

@ -0,0 +1,20 @@
package exception;
/**
 * {@link CrawlerException} subtype for parsing failures. It adds no state of its own; every
 * constructor forwards to the matching {@code CrawlerException} constructor.
 */
public class ParseException extends CrawlerException {

    /** @param message description of the failure */
    public ParseException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception
     */
    public ParseException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param message description of the failure
     * @param source  name of the source that failed
     */
    public ParseException(String message, String source) {
        super(message, source);
    }

    /**
     * @param message description of the failure
     * @param source  name of the source that failed
     * @param cause   underlying exception
     */
    public ParseException(String message, String source, Throwable cause) {
        super(message, source, cause);
    }
}

20
project/src/main/java/exception/StorageException.java

@ -0,0 +1,20 @@
package exception;
/**
 * {@link CrawlerException} subtype for storage failures. It adds no state of its own; every
 * constructor forwards to the matching {@code CrawlerException} constructor.
 */
public class StorageException extends CrawlerException {

    /** @param message description of the failure */
    public StorageException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception
     */
    public StorageException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param message description of the failure
     * @param source  name of the source that failed
     */
    public StorageException(String message, String source) {
        super(message, source);
    }

    /**
     * @param message description of the failure
     * @param source  name of the source that failed
     * @param cause   underlying exception
     */
    public StorageException(String message, String source, Throwable cause) {
        super(message, source, cause);
    }
}

20
project/src/main/java/exception/ValidationException.java

@ -0,0 +1,20 @@
package exception;
/**
 * {@link CrawlerException} subtype for invalid input, such as an unknown command or crawler name.
 * It adds no state of its own; every constructor forwards to the matching
 * {@code CrawlerException} constructor.
 */
public class ValidationException extends CrawlerException {

    /** @param message description of the failure */
    public ValidationException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception
     */
    public ValidationException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param message description of the failure
     * @param source  name of the source that failed
     */
    public ValidationException(String message, String source) {
        super(message, source);
    }

    /**
     * @param message description of the failure
     * @param source  name of the source that failed
     * @param cause   underlying exception
     */
    public ValidationException(String message, String source, Throwable cause) {
        super(message, source, cause);
    }
}

236
project/src/main/java/main/CrawlerManager.java

@ -0,0 +1,236 @@
package main;
import config.CrawlerConfig;
import crawler.BaseCrawler;
import crawler.DoubanCrawler;
import crawler.ImdbCrawler;
import crawler.MaoyanCrawler;
import crawler.RottenTomatoesCrawler;
import model.Book;
import model.Movie;
import storage.DataStorage;
import storage.FileStorage;
import storage.SQLiteStorage;
import storage.StorageStats;
import util.Logger;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
public class CrawlerManager {
private List<BaseCrawler> crawlers;
private DataStorage databaseStorage;
private DataStorage fileStorage;
/**
 * Loads the configuration, creates the storages that the configuration enables, and registers
 * the Douban, Maoyan and Rotten Tomatoes crawlers.
 */
public CrawlerManager() {
crawlers = new ArrayList<>();
// Configuration must be loaded before the enable.* switches below are read.
CrawlerConfig.load();
if (CrawlerConfig.isDatabaseEnabled()) {
databaseStorage = new SQLiteStorage();
Logger.info("数据库存储已启用");
}
if (CrawlerConfig.isFileOutputEnabled()) {
fileStorage = new FileStorage(CrawlerConfig.getOutputDir());
Logger.info("文件输出已启用");
}
// Crawlers are registered last because registerCrawler hands them the storages created above.
registerCrawler(new DoubanCrawler());
registerCrawler(new MaoyanCrawler());
registerCrawler(new RottenTomatoesCrawler());
}
/**
 * Assigns a storage to {@code crawler} and adds it to the list of crawlers. When database storage
 * exists the crawler gets a combined storage over the database and file storages; otherwise it
 * gets the file storage alone.
 */
public void registerCrawler(BaseCrawler crawler) {
    DataStorage target = (databaseStorage == null)
            ? fileStorage
            : new MultiStorage(databaseStorage, fileStorage);
    crawler.setStorage(target);
    crawlers.add(crawler);
    Logger.info("已注册爬虫: " + crawler.getName());
}
public void runAll() {
Logger.info("开始运行所有爬虫,共 " + crawlers.size() + " 个");
for (BaseCrawler crawler : crawlers) {
crawler.crawl();
System.out.println();
}
showStats();
}
public void runCrawler(String name) {
for (BaseCrawler crawler : crawlers) {
if (crawler.getName().equals(name)) {
crawler.crawl();
showStats();
return;
}
}
Logger.error("未找到爬虫: " + name);
}
public void showStats() {
if (databaseStorage != null) {
StorageStats stats = databaseStorage.getStats();
System.out.println("\n========== 数据库统计 ==========");
System.out.println("总记录数: " + stats.getTotalCount());
System.out.println("数据源数量: " + stats.getSourceCount());
System.out.println("================================\n");
}
}
public void showMenu() {
System.out.println("\n========== 爬虫管理系统 ==========");
System.out.println("1. 运行所有爬虫");
System.out.println("2. 运行指定爬虫");
System.out.println("3. 查看统计信息");
System.out.println("4. 清空数据库");
System.out.println("5. 退出");
System.out.println("==================================");
System.out.print("请选择操作: ");
}
public void interactive() {
Scanner scanner = new Scanner(System.in);
while (true) {
showMenu();
String choice = scanner.nextLine().trim();
switch (choice) {
case "1":
runAll();
break;
case "2":
System.out.println("\n可用爬虫:");
for (int i = 0; i < crawlers.size(); i++) {
System.out.println((i + 1) + ". " + crawlers.get(i).getName());
}
System.out.print("请输入爬虫名称: ");
String crawlerName = scanner.nextLine().trim();
runCrawler(crawlerName);
break;
case "3":
showStats();
break;
case "4":
System.out.print("确定要清空所有数据吗?(yes/no): ");
String confirm = scanner.nextLine().trim();
if ("yes".equalsIgnoreCase(confirm) && databaseStorage != null) {
databaseStorage.clearAll();
}
break;
case "5":
System.out.println("再见!");
close();
return;
default:
System.out.println("无效选择,请重试");
}
}
}
public void close() {
if (databaseStorage != null) {
databaseStorage.close();
}
}
private static class MultiStorage implements DataStorage {
private DataStorage primary;
private DataStorage secondary;
public MultiStorage(DataStorage primary, DataStorage secondary) {
this.primary = primary;
this.secondary = secondary;
}
@Override
public void save(Movie movie) {
primary.save(movie);
if (secondary != null) secondary.save(movie);
}
@Override
public void saveBatch(List<Movie> movies) {
primary.saveBatch(movies);
if (secondary != null) secondary.saveBatch(movies);
}
@Override
public List<Movie> findAll() {
return primary.findAll();
}
@Override
public List<Movie> findBySource(String source) {
return primary.findBySource(source);
}
@Override
public List<Movie> findByRankRange(int start, int end) {
return primary.findByRankRange(start, end);
}
@Override
public void saveBook(Book book) {
primary.saveBook(book);
if (secondary != null) secondary.saveBook(book);
}
@Override
public void saveBookBatch(List<Book> books) {
primary.saveBookBatch(books);
if (secondary != null) secondary.saveBookBatch(books);
}
@Override
public List<Book> findAllBooks() {
return primary.findAllBooks();
}
@Override
public List<Book> findBooksBySource(String source) {
return primary.findBooksBySource(source);
}
@Override
public void deleteBySource(String source) {
primary.deleteBySource(source);
}
@Override
public void clearAll() {
primary.clearAll();
}
@Override
public StorageStats getStats() {
return primary.getStats();
}
@Override
public void close() {
primary.close();
}
}
public static void main(String[] args) {
CrawlerManager manager = new CrawlerManager();
if (args.length > 0 && args[0].equals("--auto")) {
manager.runAll();
manager.close();
} else {
manager.interactive();
}
}
}

86
project/src/main/java/model/Book.java

@ -0,0 +1,86 @@
package model;
import java.time.LocalDateTime;
/**
 * Mutable bean holding the fields collected for one crawled book.
 * All fields are nullable; no validation is performed by the accessors.
 */
public class Book {
    private Integer id;
    private String source;
    private Integer rank;
    private String title;
    private String author;
    private String publisher;
    private String year;
    private Double price;
    private Double rating;
    private Integer ratingCount;
    private String category;
    private String description;
    private String url;
    private String imageUrl;
    private String isbn;
    private LocalDateTime crawlTime;

    /** Creates an empty book; every field, including the crawl time, is {@code null}. */
    public Book() {
    }

    /**
     * Creates a book with its core fields and stamps the crawl time with
     * {@link LocalDateTime#now()}.
     */
    public Book(String source, Integer rank, String title, Double rating) {
        this.crawlTime = LocalDateTime.now();
        this.source = source;
        this.rank = rank;
        this.title = title;
        this.rating = rating;
    }

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getSource() {
        return source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public Integer getRank() {
        return rank;
    }

    public void setRank(Integer rank) {
        this.rank = rank;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getPublisher() {
        return publisher;
    }

    public void setPublisher(String publisher) {
        this.publisher = publisher;
    }

    public String getYear() {
        return year;
    }

    public void setYear(String year) {
        this.year = year;
    }

    public Double getPrice() {
        return price;
    }

    public void setPrice(Double price) {
        this.price = price;
    }

    public Double getRating() {
        return rating;
    }

    public void setRating(Double rating) {
        this.rating = rating;
    }

    public Integer getRatingCount() {
        return ratingCount;
    }

    public void setRatingCount(Integer ratingCount) {
        this.ratingCount = ratingCount;
    }

    public String getCategory() {
        return category;
    }

    public void setCategory(String category) {
        this.category = category;
    }

    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getImageUrl() {
        return imageUrl;
    }

    public void setImageUrl(String imageUrl) {
        this.imageUrl = imageUrl;
    }

    public String getIsbn() {
        return isbn;
    }

    public void setIsbn(String isbn) {
        this.isbn = isbn;
    }

    public LocalDateTime getCrawlTime() {
        return crawlTime;
    }

    public void setCrawlTime(LocalDateTime crawlTime) {
        this.crawlTime = crawlTime;
    }

    /** Short summary of source, rank, title and rating; a missing rating is shown as 0.0. */
    @Override
    public String toString() {
        double shownRating = 0.0;
        if (rating != null) {
            shownRating = rating;
        }
        return String.format("Book{source='%s', rank=%d, title='%s', rating=%.1f}",
                source, rank, title, shownRating);
    }
}

78
project/src/main/java/model/Movie.java

@ -0,0 +1,78 @@
package model;
import java.time.LocalDateTime;
/**
 * Movie data model: a mutable bean holding the fields collected for one crawled movie.
 * All fields are nullable; no validation is performed by the accessors.
 */
public class Movie {
    private Integer id;
    private String source;            // site the record was crawled from
    private Integer rank;             // ranking position
    private String name;              // movie title
    private String director;          // director
    private String actors;            // cast
    private String year;              // release year
    private Double rating;            // rating score
    private Integer ratingCount;      // number of ratings
    private String description;       // synopsis
    private String url;               // detail page link
    private String imageUrl;          // poster image
    private LocalDateTime crawlTime;  // time of crawling

    /** Creates an empty movie; every field, including the crawl time, is {@code null}. */
    public Movie() {}

    /**
     * Creates a movie with its core fields and stamps the crawl time with
     * {@link LocalDateTime#now()}.
     */
    public Movie(String source, Integer rank, String name, Double rating) {
        this.source = source;
        this.rank = rank;
        this.name = name;
        this.rating = rating;
        this.crawlTime = LocalDateTime.now();
    }

    // Getters and setters
    public Integer getId() { return id; }
    public void setId(Integer id) { this.id = id; }
    public String getSource() { return source; }
    public void setSource(String source) { this.source = source; }
    public Integer getRank() { return rank; }
    public void setRank(Integer rank) { this.rank = rank; }
    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public String getDirector() { return director; }
    public void setDirector(String director) { this.director = director; }
    public String getActors() { return actors; }
    public void setActors(String actors) { this.actors = actors; }
    public String getYear() { return year; }
    public void setYear(String year) { this.year = year; }
    public Double getRating() { return rating; }
    public void setRating(Double rating) { this.rating = rating; }
    public Integer getRatingCount() { return ratingCount; }
    public void setRatingCount(Integer ratingCount) { this.ratingCount = ratingCount; }
    public String getDescription() { return description; }
    public void setDescription(String description) { this.description = description; }
    public String getUrl() { return url; }
    public void setUrl(String url) { this.url = url; }
    public String getImageUrl() { return imageUrl; }
    public void setImageUrl(String imageUrl) { this.imageUrl = imageUrl; }
    public LocalDateTime getCrawlTime() { return crawlTime; }
    public void setCrawlTime(LocalDateTime crawlTime) { this.crawlTime = crawlTime; }

    /**
     * Short summary of source, rank, name and rating. A missing rating is shown as 0.0,
     * matching {@code Book.toString()}.
     */
    @Override
    public String toString() {
        // Formatter applies the precision of "%.1f" to the text "null", so a null rating
        // would otherwise be rendered as the single character "n".
        return String.format("Movie{source='%s', rank=%d, name='%s', rating=%.1f}",
                source, rank, name, rating != null ? rating : 0.0);
    }
}

34
project/src/main/java/storage/DataStorage.java

@ -0,0 +1,34 @@
package storage;
import model.Movie;
import model.Book;
import java.util.List;
/**
 * Storage contract for crawled movies and books. The file declares no semantics beyond
 * the signatures; the notes below describe what the implementations in this project do.
 */
public interface DataStorage {

    // ---- movies ----

    /** Stores a single movie. */
    void save(Movie movie);

    /** Stores several movies in one call. */
    void saveBatch(List<Movie> movies);

    /** Returns all stored movies. */
    List<Movie> findAll();

    /** Returns the stored movies whose source equals {@code source}. */
    List<Movie> findBySource(String source);

    /**
     * Returns the stored movies whose rank lies between {@code start} and {@code end}
     * (the SQLite implementation uses SQL {@code BETWEEN}, i.e. both bounds inclusive).
     */
    List<Movie> findByRankRange(int start, int end);

    // ---- books ----

    /** Stores a single book. */
    void saveBook(Book book);

    /** Stores several books in one call. */
    void saveBookBatch(List<Book> books);

    /** Returns all stored books. */
    List<Book> findAllBooks();

    /** Returns the stored books whose source equals {@code source}. */
    List<Book> findBooksBySource(String source);

    // ---- maintenance ----

    /** Removes the records of one source (the SQLite implementation deletes movies and books). */
    void deleteBySource(String source);

    /** Removes all stored records. */
    void clearAll();

    /** Returns record and source counts for this storage. */
    StorageStats getStats();

    /** Releases any resources held by this storage. */
    void close();
}

237
project/src/main/java/storage/FileStorage.java

@ -0,0 +1,237 @@
package storage;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import model.Book;
import model.Movie;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
/**
 * Write-only {@link DataStorage} that exports each saved batch as a pretty-printed JSON
 * file and a human-readable TXT file in the output directory. Query, delete and
 * statistics operations are no-ops that return empty results.
 */
public class FileStorage implements DataStorage {
    private static final Gson gson = new GsonBuilder()
            .setPrettyPrinting()
            .registerTypeAdapter(LocalDateTime.class, new LocalDateTimeAdapter())
            .create();
    /** Timestamp used in file names; built once because DateTimeFormatter is immutable. */
    private static final DateTimeFormatter FILE_TIMESTAMP =
            DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss");
    private static final String HEADER_RULE = "==========================================";
    private static final String RECORD_RULE = "------------------------------------------";

    private final String outputDir;

    /** Uses the directory {@code output} relative to the working directory. */
    public FileStorage() {
        this("output");
    }

    /**
     * @param outputDir directory the export files are written to; created if missing
     */
    public FileStorage(String outputDir) {
        this.outputDir = outputDir;
        File dir = new File(outputDir);
        if (!dir.exists() && !dir.mkdirs() && !dir.isDirectory()) {
            // Later saves will fail and report their own errors; make the root cause visible.
            System.err.println("创建输出目录失败: " + outputDir);
        }
    }

    @Override
    public void save(Movie movie) {
        List<Movie> list = new ArrayList<>();
        list.add(movie);
        saveBatch(list);
    }

    /**
     * Exports the movies as one JSON and one TXT file named after the source of the first
     * movie. A {@code null} or empty list writes nothing.
     */
    @Override
    public void saveBatch(List<Movie> movies) {
        if (movies == null || movies.isEmpty()) return;
        String source = movies.get(0).getSource();
        // One stem per batch so the JSON and TXT files always carry the same name.
        String stem = newFileStem(source);
        saveAsJson(movies, stem + ".json");
        saveMoviesAsTxt(movies, source, stem + ".txt");
    }

    @Override
    public void saveBook(Book book) {
        List<Book> list = new ArrayList<>();
        list.add(book);
        saveBookBatch(list);
    }

    /**
     * Exports the books as one JSON and one TXT file named after the source of the first
     * book. A {@code null} or empty list writes nothing.
     */
    @Override
    public void saveBookBatch(List<Book> books) {
        if (books == null || books.isEmpty()) return;
        String source = books.get(0).getSource();
        String stem = newFileStem(source);
        saveAsJson(books, stem + ".json");
        saveBooksAsTxt(books, source, stem + ".txt");
    }

    /**
     * Builds {@code <outputDir>/<source>_<yyyyMMdd_HHmmss>} and, if a JSON or TXT file with
     * that stem already exists (several saves within the same second), appends
     * {@code _1}, {@code _2}, ... so earlier exports are not overwritten.
     */
    private String newFileStem(String source) {
        String base = outputDir + "/" + sanitizeFilename(source) + "_"
                + LocalDateTime.now().format(FILE_TIMESTAMP);
        String stem = base;
        for (int n = 1; new File(stem + ".json").exists() || new File(stem + ".txt").exists(); n++) {
            stem = base + "_" + n;
        }
        return stem;
    }

    /** Serializes {@code data} with Gson into a UTF-8 file; failures are reported on stderr. */
    private void saveAsJson(Object data, String filename) {
        try (Writer writer = new OutputStreamWriter(
                new FileOutputStream(filename), StandardCharsets.UTF_8)) {
            gson.toJson(data, writer);
            System.out.println("JSON文件已保存: " + filename);
        } catch (IOException e) {
            System.err.println("保存JSON失败: " + e.getMessage());
        }
    }

    /** Writes the movie report; optional fields are omitted when they are {@code null}. */
    private void saveMoviesAsTxt(List<Movie> movies, String source, String filename) {
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(filename), StandardCharsets.UTF_8))) {
            writeHeader(writer, source, " 电影数量: " + movies.size());
            for (Movie movie : movies) {
                // Guarded like the book report: "%.1f" renders a null rating as "n".
                if (movie.getRank() != null) {
                    writeLine(writer, String.format("排名: %d", movie.getRank()));
                }
                writeLine(writer, String.format("电影: %s", movie.getName()));
                if (movie.getRating() != null) {
                    writeLine(writer, String.format("评分: %.1f", movie.getRating()));
                }
                if (movie.getDirector() != null) {
                    writeLine(writer, String.format("导演: %s", movie.getDirector()));
                }
                if (movie.getYear() != null) {
                    writeLine(writer, String.format("年份: %s", movie.getYear()));
                }
                writeLine(writer, RECORD_RULE);
            }
            System.out.println("TXT文件已保存: " + filename);
        } catch (IOException e) {
            System.err.println("保存TXT失败: " + e.getMessage());
        }
    }

    /** Writes the book report; optional fields are omitted when they are {@code null}. */
    private void saveBooksAsTxt(List<Book> books, String source, String filename) {
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(filename), StandardCharsets.UTF_8))) {
            writeHeader(writer, source, " 图书数量: " + books.size());
            for (Book book : books) {
                if (book.getRank() != null) {
                    writeLine(writer, String.format("排名: %d", book.getRank()));
                }
                writeLine(writer, String.format("书名: %s", book.getTitle()));
                if (book.getRating() != null) {
                    writeLine(writer, String.format("评分: %.1f", book.getRating()));
                }
                if (book.getAuthor() != null) {
                    writeLine(writer, String.format("作者: %s", book.getAuthor()));
                }
                if (book.getPublisher() != null) {
                    writeLine(writer, String.format("出版社: %s", book.getPublisher()));
                }
                if (book.getPrice() != null) {
                    writeLine(writer, String.format("价格: %.2f", book.getPrice()));
                }
                if (book.getYear() != null) {
                    writeLine(writer, String.format("年份: %s", book.getYear()));
                }
                writeLine(writer, RECORD_RULE);
            }
            System.out.println("TXT文件已保存: " + filename);
        } catch (IOException e) {
            System.err.println("保存TXT失败: " + e.getMessage());
        }
    }

    /** Writes the report banner: source, current time, the given count line, blank line. */
    private static void writeHeader(BufferedWriter writer, String source, String countLine)
            throws IOException {
        writeLine(writer, HEADER_RULE);
        writeLine(writer, " 数据来源: " + source);
        writeLine(writer, " 爬取时间: " + LocalDateTime.now());
        writeLine(writer, countLine);
        writeLine(writer, HEADER_RULE);
        writer.newLine();
    }

    private static void writeLine(BufferedWriter writer, String text) throws IOException {
        writer.write(text);
        writer.newLine();
    }

    /**
     * Replaces characters that are not allowed in file names with {@code _};
     * a {@code null} name becomes {@code unknown}.
     */
    private String sanitizeFilename(String filename) {
        if (filename == null) {
            return "unknown";
        }
        return filename.replaceAll("[\\\\/:*?\"<>|]", "_");
    }

    /** Not supported by file storage; always returns an empty list. */
    @Override
    public List<Movie> findAll() {
        return new ArrayList<>();
    }

    /** Not supported by file storage; always returns an empty list. */
    @Override
    public List<Movie> findBySource(String source) {
        return new ArrayList<>();
    }

    /** Not supported by file storage; always returns an empty list. */
    @Override
    public List<Movie> findByRankRange(int start, int end) {
        return new ArrayList<>();
    }

    /** Not supported by file storage; always returns an empty list. */
    @Override
    public List<Book> findAllBooks() {
        return new ArrayList<>();
    }

    /** Not supported by file storage; always returns an empty list. */
    @Override
    public List<Book> findBooksBySource(String source) {
        return new ArrayList<>();
    }

    /** No-op: exported files are never deleted. */
    @Override
    public void deleteBySource(String source) {}

    /** No-op: exported files are never deleted. */
    @Override
    public void clearAll() {}

    /** Always reports zero records and zero sources. */
    @Override
    public StorageStats getStats() {
        return new StorageStats(0, 0);
    }

    /** No-op: every file is closed as soon as it has been written. */
    @Override
    public void close() {}
}

25
project/src/main/java/storage/LocalDateTimeAdapter.java

@ -0,0 +1,25 @@
package storage;
import com.google.gson.*;
import java.lang.reflect.Type;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
/**
 * Gson adapter that writes {@link LocalDateTime} values as ISO-8601 local date-time
 * strings ({@link DateTimeFormatter#ISO_LOCAL_DATE_TIME}) and reads them back.
 */
public class LocalDateTimeAdapter implements JsonSerializer<LocalDateTime>, JsonDeserializer<LocalDateTime> {
    private static final DateTimeFormatter formatter = DateTimeFormatter.ISO_LOCAL_DATE_TIME;

    /**
     * @return a JSON string holding the formatted value, or JSON null for a {@code null} input
     */
    @Override
    public JsonElement serialize(LocalDateTime src, Type typeOfSrc, JsonSerializationContext context) {
        if (src == null) {
            return JsonNull.INSTANCE;
        }
        return new JsonPrimitive(formatter.format(src));
    }

    /**
     * @return the parsed value, or {@code null} for JSON null
     * @throws JsonParseException if the element is not a string in ISO local date-time format
     */
    @Override
    public LocalDateTime deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context)
            throws JsonParseException {
        if (json == null || json.isJsonNull()) {
            return null;
        }
        try {
            return LocalDateTime.parse(json.getAsString(), formatter);
        } catch (java.time.format.DateTimeParseException
                | UnsupportedOperationException
                | IllegalStateException e) {
            // getAsString() rejects objects/arrays and parse() rejects malformed text; surface
            // both as the exception type this method declares, keeping the original cause.
            throw new JsonParseException("Invalid LocalDateTime value: " + json, e);
        }
    }
}

414
project/src/main/java/storage/SQLiteStorage.java

@ -0,0 +1,414 @@
package storage;
import model.Book;
import model.Movie;
import java.sql.*;
import java.util.ArrayList;
import java.util.List;
/**
 * {@link DataStorage} backed by the SQLite database file {@code crawler.db}, reached
 * through a single JDBC connection. SQL failures in the data operations are reported on
 * stderr and never thrown; only a failure to open or initialise the database propagates,
 * from the constructor.
 */
public class SQLiteStorage implements DataStorage {
    private static final String DB_URL = "jdbc:sqlite:crawler.db";
    private static final String INSERT_MOVIE_SQL =
            "INSERT INTO movies (source, rank, name, director, actors, year, " +
            "rating, rating_count, description, url, image_url, crawl_time) " +
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)";
    private static final String INSERT_BOOK_SQL =
            "INSERT INTO books (source, rank, title, author, publisher, year, " +
            "price, rating, rating_count, category, description, url, image_url, isbn, crawl_time) " +
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)";

    private Connection connection;

    /** Binds one item to the parameters of an INSERT statement. */
    @FunctionalInterface
    private interface RowBinder<T> {
        void bind(PreparedStatement pstmt, T item) throws SQLException;
    }

    /** Binds the parameters of a SELECT statement. */
    @FunctionalInterface
    private interface StatementBinder {
        void bind(PreparedStatement pstmt) throws SQLException;
    }

    /** Converts the current row of a result set into an object. */
    @FunctionalInterface
    private interface RowMapper<T> {
        T map(ResultSet rs) throws SQLException;
    }

    /**
     * Opens the database and creates the tables and indexes if they do not exist.
     *
     * @throws RuntimeException if the connection cannot be opened or the schema cannot be created
     */
    public SQLiteStorage() {
        try {
            connection = DriverManager.getConnection(DB_URL);
            initTable();
        } catch (SQLException e) {
            // Do not leak the connection when schema creation fails.
            closeConnection();
            throw new RuntimeException("数据库连接失败: " + e.getMessage(), e);
        }
    }

    /** Creates the {@code movies} and {@code books} tables and their source indexes. */
    private void initTable() throws SQLException {
        String movieSql = "CREATE TABLE IF NOT EXISTS movies (" +
                "id INTEGER PRIMARY KEY AUTOINCREMENT," +
                "source TEXT NOT NULL," +
                "rank INTEGER," +
                "name TEXT NOT NULL," +
                "director TEXT," +
                "actors TEXT," +
                "year TEXT," +
                "rating REAL," +
                "rating_count INTEGER," +
                "description TEXT," +
                "url TEXT," +
                "image_url TEXT," +
                "crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP" +
                ")";
        String bookSql = "CREATE TABLE IF NOT EXISTS books (" +
                "id INTEGER PRIMARY KEY AUTOINCREMENT," +
                "source TEXT NOT NULL," +
                "rank INTEGER," +
                "title TEXT NOT NULL," +
                "author TEXT," +
                "publisher TEXT," +
                "year TEXT," +
                "price REAL," +
                "rating REAL," +
                "rating_count INTEGER," +
                "category TEXT," +
                "description TEXT," +
                "url TEXT," +
                "image_url TEXT," +
                "isbn TEXT," +
                "crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP" +
                ")";
        String indexSql1 = "CREATE INDEX IF NOT EXISTS idx_movie_source ON movies(source)";
        String indexSql2 = "CREATE INDEX IF NOT EXISTS idx_book_source ON books(source)";
        try (Statement stmt = connection.createStatement()) {
            stmt.execute(movieSql);
            stmt.execute(bookSql);
            stmt.execute(indexSql1);
            stmt.execute(indexSql2);
        }
    }

    /** Inserts one movie; a failure is reported on stderr. */
    @Override
    public void save(Movie movie) {
        try (PreparedStatement pstmt = connection.prepareStatement(INSERT_MOVIE_SQL)) {
            bindMovie(pstmt, movie);
            pstmt.executeUpdate();
        } catch (SQLException e) {
            System.err.println("保存电影失败: " + movie.getName() + " - " + e.getMessage());
        }
    }

    /**
     * Inserts all movies in one transaction; on failure the whole batch is rolled back and
     * the error is reported on stderr. A {@code null} or empty list is ignored.
     */
    @Override
    public void saveBatch(List<Movie> movies) {
        insertBatch(INSERT_MOVIE_SQL, movies, this::bindMovie, " 条数据成功", "批量保存失败: ");
    }

    /** Inserts one book; a failure is reported on stderr. */
    @Override
    public void saveBook(Book book) {
        try (PreparedStatement pstmt = connection.prepareStatement(INSERT_BOOK_SQL)) {
            bindBook(pstmt, book);
            pstmt.executeUpdate();
        } catch (SQLException e) {
            System.err.println("保存图书失败: " + book.getTitle() + " - " + e.getMessage());
        }
    }

    /**
     * Inserts all books in one transaction; on failure the whole batch is rolled back and
     * the error is reported on stderr. A {@code null} or empty list is ignored.
     */
    @Override
    public void saveBookBatch(List<Book> books) {
        insertBatch(INSERT_BOOK_SQL, books, this::bindBook, " 条图书数据成功", "批量保存图书失败: ");
    }

    /** Returns all movies ordered by source, then rank; empty on error. */
    @Override
    public List<Movie> findAll() {
        return query("SELECT * FROM movies ORDER BY source, rank",
                pstmt -> { },
                this::mapResultSetToMovie);
    }

    /** Returns the movies of one source ordered by rank; empty on error. */
    @Override
    public List<Movie> findBySource(String source) {
        return query("SELECT * FROM movies WHERE source = ? ORDER BY rank",
                pstmt -> pstmt.setString(1, source),
                this::mapResultSetToMovie);
    }

    /** Returns the movies with {@code start <= rank <= end} ordered by rank; empty on error. */
    @Override
    public List<Movie> findByRankRange(int start, int end) {
        return query("SELECT * FROM movies WHERE rank BETWEEN ? AND ? ORDER BY rank",
                pstmt -> {
                    pstmt.setInt(1, start);
                    pstmt.setInt(2, end);
                },
                this::mapResultSetToMovie);
    }

    /** Returns all books ordered by source, then rank; empty on error. */
    @Override
    public List<Book> findAllBooks() {
        return query("SELECT * FROM books ORDER BY source, rank",
                pstmt -> { },
                this::mapResultSetToBook);
    }

    /** Returns the books of one source ordered by rank; empty on error. */
    @Override
    public List<Book> findBooksBySource(String source) {
        return query("SELECT * FROM books WHERE source = ? ORDER BY rank",
                pstmt -> pstmt.setString(1, source),
                this::mapResultSetToBook);
    }

    /** Deletes the movies and books of one source and prints how many rows were removed. */
    @Override
    public void deleteBySource(String source) {
        String sql1 = "DELETE FROM movies WHERE source = ?";
        String sql2 = "DELETE FROM books WHERE source = ?";
        try (PreparedStatement pstmt1 = connection.prepareStatement(sql1);
             PreparedStatement pstmt2 = connection.prepareStatement(sql2)) {
            pstmt1.setString(1, source);
            pstmt2.setString(1, source);
            int count1 = pstmt1.executeUpdate();
            int count2 = pstmt2.executeUpdate();
            System.out.println("删除 " + source + " 的 " + (count1 + count2) + " 条数据");
        } catch (SQLException e) {
            System.err.println("删除失败: " + e.getMessage());
        }
    }

    /** Deletes every row from both tables. */
    @Override
    public void clearAll() {
        try (Statement stmt = connection.createStatement()) {
            stmt.execute("DELETE FROM movies");
            stmt.execute("DELETE FROM books");
            System.out.println("清空所有数据");
        } catch (SQLException e) {
            System.err.println("清空失败: " + e.getMessage());
        }
    }

    /**
     * Counts all rows of both tables and the distinct sources across both tables. A source
     * that appears in both movies and books is counted once.
     */
    @Override
    public StorageStats getStats() {
        int totalCount = 0;
        int sourceCount = 0;
        try (Statement stmt = connection.createStatement()) {
            totalCount = queryCount(stmt, "SELECT COUNT(*) FROM movies");
            totalCount += queryCount(stmt, "SELECT COUNT(*) FROM books");
            // UNION removes duplicates, so a source shared by both tables is not counted twice.
            sourceCount = queryCount(stmt,
                    "SELECT COUNT(*) FROM (SELECT source FROM movies UNION SELECT source FROM books)");
        } catch (SQLException e) {
            System.err.println("统计失败: " + e.getMessage());
        }
        return new StorageStats(totalCount, sourceCount);
    }

    /** Closes the connection if it is open; safe to call more than once. */
    @Override
    public void close() {
        closeConnection();
    }

    private void closeConnection() {
        try {
            if (connection != null && !connection.isClosed()) {
                connection.close();
            }
        } catch (SQLException e) {
            System.err.println("关闭连接失败: " + e.getMessage());
        }
    }

    /** Runs a batch INSERT inside one transaction and restores auto-commit afterwards. */
    private <T> void insertBatch(String sql, List<T> items, RowBinder<T> binder,
                                 String successSuffix, String failurePrefix) {
        if (items == null || items.isEmpty()) {
            return;
        }
        try (PreparedStatement pstmt = connection.prepareStatement(sql)) {
            connection.setAutoCommit(false);
            for (T item : items) {
                binder.bind(pstmt, item);
                pstmt.addBatch();
            }
            pstmt.executeBatch();
            connection.commit();
            System.out.println("批量保存 " + items.size() + successSuffix);
        } catch (SQLException e) {
            try {
                connection.rollback();
            } catch (SQLException ex) {
                System.err.println("回滚失败: " + ex.getMessage());
            }
            System.err.println(failurePrefix + e.getMessage());
        } finally {
            try {
                connection.setAutoCommit(true);
            } catch (SQLException e) {
                System.err.println("恢复自动提交失败: " + e.getMessage());
            }
        }
    }

    /** Runs a SELECT and maps every row; returns what was read so far if an error occurs. */
    private <T> List<T> query(String sql, StatementBinder binder, RowMapper<T> mapper) {
        List<T> rows = new ArrayList<>();
        try (PreparedStatement pstmt = connection.prepareStatement(sql)) {
            binder.bind(pstmt);
            try (ResultSet rs = pstmt.executeQuery()) {
                while (rs.next()) {
                    rows.add(mapper.map(rs));
                }
            }
        } catch (SQLException e) {
            System.err.println("查询失败: " + e.getMessage());
        }
        return rows;
    }

    /** Returns the first column of the first row of a counting query, or 0 if there is no row. */
    private static int queryCount(Statement stmt, String sql) throws SQLException {
        try (ResultSet rs = stmt.executeQuery(sql)) {
            return rs.next() ? rs.getInt(1) : 0;
        }
    }

    private void bindMovie(PreparedStatement pstmt, Movie movie) throws SQLException {
        pstmt.setString(1, movie.getSource());
        pstmt.setObject(2, movie.getRank());
        pstmt.setString(3, movie.getName());
        pstmt.setString(4, movie.getDirector());
        pstmt.setString(5, movie.getActors());
        pstmt.setString(6, movie.getYear());
        pstmt.setObject(7, movie.getRating());
        pstmt.setObject(8, movie.getRatingCount());
        pstmt.setString(9, movie.getDescription());
        pstmt.setString(10, movie.getUrl());
        pstmt.setString(11, movie.getImageUrl());
        pstmt.setTimestamp(12, movie.getCrawlTime() != null ?
                Timestamp.valueOf(movie.getCrawlTime()) : null);
    }

    private void bindBook(PreparedStatement pstmt, Book book) throws SQLException {
        pstmt.setString(1, book.getSource());
        pstmt.setObject(2, book.getRank());
        pstmt.setString(3, book.getTitle());
        pstmt.setString(4, book.getAuthor());
        pstmt.setString(5, book.getPublisher());
        pstmt.setString(6, book.getYear());
        pstmt.setObject(7, book.getPrice());
        pstmt.setObject(8, book.getRating());
        pstmt.setObject(9, book.getRatingCount());
        pstmt.setString(10, book.getCategory());
        pstmt.setString(11, book.getDescription());
        pstmt.setString(12, book.getUrl());
        pstmt.setString(13, book.getImageUrl());
        pstmt.setString(14, book.getIsbn());
        pstmt.setTimestamp(15, book.getCrawlTime() != null ?
                Timestamp.valueOf(book.getCrawlTime()) : null);
    }

    /** Maps the current row to a movie, keeping SQL NULL numeric columns as {@code null}. */
    private Movie mapResultSetToMovie(ResultSet rs) throws SQLException {
        Movie movie = new Movie();
        movie.setId(rs.getInt("id"));
        movie.setSource(rs.getString("source"));
        movie.setRank(getNullableInt(rs, "rank"));
        movie.setName(rs.getString("name"));
        movie.setDirector(rs.getString("director"));
        movie.setActors(rs.getString("actors"));
        movie.setYear(rs.getString("year"));
        movie.setRating(getNullableDouble(rs, "rating"));
        movie.setRatingCount(getNullableInt(rs, "rating_count"));
        movie.setDescription(rs.getString("description"));
        movie.setUrl(rs.getString("url"));
        movie.setImageUrl(rs.getString("image_url"));
        Timestamp ts = rs.getTimestamp("crawl_time");
        if (ts != null) {
            movie.setCrawlTime(ts.toLocalDateTime());
        }
        return movie;
    }

    /** Maps the current row to a book, keeping SQL NULL numeric columns as {@code null}. */
    private Book mapResultSetToBook(ResultSet rs) throws SQLException {
        Book book = new Book();
        book.setId(rs.getInt("id"));
        book.setSource(rs.getString("source"));
        book.setRank(getNullableInt(rs, "rank"));
        book.setTitle(rs.getString("title"));
        book.setAuthor(rs.getString("author"));
        book.setPublisher(rs.getString("publisher"));
        book.setYear(rs.getString("year"));
        book.setPrice(getNullableDouble(rs, "price"));
        book.setRating(getNullableDouble(rs, "rating"));
        book.setRatingCount(getNullableInt(rs, "rating_count"));
        book.setCategory(rs.getString("category"));
        book.setDescription(rs.getString("description"));
        book.setUrl(rs.getString("url"));
        book.setImageUrl(rs.getString("image_url"));
        book.setIsbn(rs.getString("isbn"));
        Timestamp ts = rs.getTimestamp("crawl_time");
        if (ts != null) {
            book.setCrawlTime(ts.toLocalDateTime());
        }
        return book;
    }

    /** getInt() reports SQL NULL as 0; wasNull() tells the two apart. */
    private static Integer getNullableInt(ResultSet rs, String column) throws SQLException {
        int value = rs.getInt(column);
        if (rs.wasNull()) {
            return null;
        }
        return value;
    }

    /** getDouble() reports SQL NULL as 0.0; wasNull() tells the two apart. */
    private static Double getNullableDouble(ResultSet rs, String column) throws SQLException {
        double value = rs.getDouble(column);
        if (rs.wasNull()) {
            return null;
        }
        return value;
    }
}

23
project/src/main/java/storage/StorageStats.java

@ -0,0 +1,23 @@
package storage;
/**
 * Storage statistics: an immutable pair of counters reported by a storage back-end.
 */
public class StorageStats {
    /** Total number of stored records. */
    private final int totalCount;
    /** Number of data sources. */
    private final int sourceCount;

    /**
     * @param totalCount  total number of stored records
     * @param sourceCount number of data sources
     */
    public StorageStats(int totalCount, int sourceCount) {
        this.sourceCount = sourceCount;
        this.totalCount = totalCount;
    }

    public int getTotalCount() {
        return totalCount;
    }

    public int getSourceCount() {
        return sourceCount;
    }

    @Override
    public String toString() {
        final String pattern = "StorageStats{totalCount=%d, sourceCount=%d}";
        return String.format(pattern, totalCount, sourceCount);
    }
}

115
project/src/main/java/strategy/AbstractBookCrawlerStrategy.java

@ -0,0 +1,115 @@
package strategy;
import exception.CrawlerResult;
import exception.NetworkException;
import exception.ParseException;
import model.Book;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import storage.DataStorage;
import util.Logger;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public abstract class AbstractBookCrawlerStrategy implements BookCrawlerStrategy {
protected DataStorage storage;
protected int delayMs = 1500;
@Override
public void setStorage(DataStorage storage) {
this.storage = storage;
}
@Override
public int getDelayMs() {
return delayMs;
}
protected Document fetchDocument(String url) throws IOException {
return Jsoup.connect(url)
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
.timeout(15000)
.get();
}
protected void delay() {
try {
Thread.sleep(delayMs);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}
@Override
public CrawlerResult execute() {
long startTime = System.currentTimeMillis();
String sourceName = getName();
List<Book> allBooks = new ArrayList<>();
try {
List<String> urls = getPageUrls();
Logger.info(String.format("[%s] 开始爬取,共 %d 个页面", sourceName, urls.size()));
for (int i = 0; i < urls.size(); i++) {
String url = urls.get(i);
Logger.info(String.format("[%s] 爬取第 %d/%d 页: %s", sourceName, i + 1, urls.size(), url));
try {
Document doc = fetchDocument(url);
List<Book> books = parseBooks(doc.html());
for (Book book : books) {
book.setSource(sourceName);
}
allBooks.addAll(books);
Logger.info(String.format("[%s] 第 %d 页获取 %d 条数据", sourceName, i + 1, books.size()));
} catch (IOException e) {
Logger.error(String.format("[%s] 网络请求失败: %s", sourceName, url), e);
throw new NetworkException("网络请求失败: " + url, sourceName, e);
} catch (Exception e) {
Logger.error(String.format("[%s] 解析页面失败: %s", sourceName, url), e);
throw new ParseException("解析页面失败: " + url, sourceName, e);
}
if (i < urls.size() - 1) {
delay();
}
}
if (storage != null && !allBooks.isEmpty()) {
storage.saveBookBatch(allBooks);
Logger.info(String.format("[%s] 数据已保存到存储", sourceName));
}
long elapsedTime = System.currentTimeMillis() - startTime;
return CrawlerResult.success(sourceName)
.message("爬取成功")
.dataCount(allBooks.size())
.elapsedTime(elapsedTime)
.build();
} catch (NetworkException e) {
long elapsedTime = System.currentTimeMillis() - startTime;
return CrawlerResult.failure(sourceName, "NETWORK_ERROR", e.getMessage())
.elapsedTime(elapsedTime)
.build();
} catch (ParseException e) {
long elapsedTime = System.currentTimeMillis() - startTime;
return CrawlerResult.failure(sourceName, "PARSE_ERROR", e.getMessage())
.elapsedTime(elapsedTime)
.build();
} catch (Exception e) {
long elapsedTime = System.currentTimeMillis() - startTime;
return CrawlerResult.failure(sourceName, "UNKNOWN_ERROR", e.getMessage())
.elapsedTime(elapsedTime)
.build();
}
}
}

114
project/src/main/java/strategy/AbstractCrawlerStrategy.java

@ -0,0 +1,114 @@
package strategy;
import exception.CrawlerResult;
import exception.NetworkException;
import exception.ParseException;
import model.Movie;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import storage.DataStorage;
import util.Logger;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Base class for movie crawlers.
 *
 * <p>{@link #execute()} walks every URL returned by {@code getPageUrls()}, downloads the page,
 * passes the HTML to {@code parseMovies(String)}, tags each movie with the crawler name and
 * finally saves the collected movies through the configured {@link DataStorage}.
 * Subclasses supply the name, the page URLs and the parsing logic.
 */
public abstract class AbstractCrawlerStrategy implements CrawlerStrategy {
    /** Destination for crawled movies; when {@code null} nothing is saved. */
    protected DataStorage storage;

    /** Pause between two consecutive page requests, in milliseconds. */
    protected int delayMs = 1500;

    @Override
    public void setStorage(DataStorage storage) {
        this.storage = storage;
    }

    @Override
    public int getDelayMs() {
        return delayMs;
    }

    /**
     * Downloads a page with browser-like request headers and a 15 second timeout.
     *
     * @param url address of the page to fetch
     * @return the parsed document
     * @throws IOException if the request fails
     */
    protected Document fetchDocument(String url) throws IOException {
        return Jsoup.connect(url)
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
                        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
                .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
                .timeout(15000)
                .get();
    }

    /**
     * Sleeps for {@link #delayMs} milliseconds. If the thread is interrupted the interrupt
     * flag is restored so that the caller can notice it.
     */
    protected void delay() {
        try {
            Thread.sleep(delayMs);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Crawls all pages and saves the result.
     *
     * @return a success result carrying the number of movies and the elapsed time, or a failure
     *         result with error code {@code NETWORK_ERROR}, {@code PARSE_ERROR},
     *         {@code INTERRUPTED} or {@code UNKNOWN_ERROR}
     */
    @Override
    public CrawlerResult execute() {
        long startTime = System.currentTimeMillis();
        String sourceName = getName();
        List<Movie> allMovies = new ArrayList<>();
        try {
            List<String> urls = getPageUrls();
            Logger.info(String.format("[%s] 开始爬取,共 %d 个页面", sourceName, urls.size()));
            for (int i = 0; i < urls.size(); i++) {
                String url = urls.get(i);
                Logger.info(String.format("[%s] 爬取第 %d/%d 页: %s", sourceName, i + 1, urls.size(), url));
                List<Movie> movies = crawlPage(sourceName, url);
                allMovies.addAll(movies);
                Logger.info(String.format("[%s] 第 %d 页获取 %d 条数据", sourceName, i + 1, movies.size()));
                if (i < urls.size() - 1) {
                    delay();
                    // Once interrupted, every further sleep returns immediately; stop here
                    // instead of requesting the remaining pages back-to-back.
                    if (Thread.currentThread().isInterrupted()) {
                        Logger.warn(String.format("[%s] 爬取被中断", sourceName));
                        return buildFailure(sourceName, "INTERRUPTED", "爬取被中断", startTime);
                    }
                }
            }
            if (storage != null && !allMovies.isEmpty()) {
                storage.saveBatch(allMovies);
                Logger.info(String.format("[%s] 数据已保存到存储", sourceName));
            }
            long elapsedTime = System.currentTimeMillis() - startTime;
            return CrawlerResult.success(sourceName)
                    .message("爬取成功")
                    .dataCount(allMovies.size())
                    .elapsedTime(elapsedTime)
                    .build();
        } catch (NetworkException e) {
            return buildFailure(sourceName, "NETWORK_ERROR", e.getMessage(), startTime);
        } catch (ParseException e) {
            return buildFailure(sourceName, "PARSE_ERROR", e.getMessage(), startTime);
        } catch (Exception e) {
            return buildFailure(sourceName, "UNKNOWN_ERROR", e.getMessage(), startTime);
        }
    }

    /** Fetches and parses one page, tagging every movie with the crawler name. */
    private List<Movie> crawlPage(String sourceName, String url) throws NetworkException, ParseException {
        try {
            Document doc = fetchDocument(url);
            List<Movie> movies = parseMovies(doc.html());
            for (Movie movie : movies) {
                movie.setSource(sourceName);
            }
            return movies;
        } catch (IOException e) {
            Logger.error(String.format("[%s] 网络请求失败: %s", sourceName, url), e);
            throw new NetworkException("网络请求失败: " + url, sourceName, e);
        } catch (Exception e) {
            Logger.error(String.format("[%s] 解析页面失败: %s", sourceName, url), e);
            throw new ParseException("解析页面失败: " + url, sourceName, e);
        }
    }

    /** Builds a failure result stamped with the time elapsed since {@code startTime}. */
    private CrawlerResult buildFailure(String sourceName, String errorCode, String message, long startTime) {
        long elapsedTime = System.currentTimeMillis() - startTime;
        return CrawlerResult.failure(sourceName, errorCode, message)
                .elapsedTime(elapsedTime)
                .build();
    }
}

15
project/src/main/java/strategy/BookCrawlerStrategy.java

@ -0,0 +1,15 @@
package strategy;
import exception.CrawlerResult;
import storage.DataStorage;
import java.util.List;
/**
 * Contract of a book crawler: it names itself, lists the pages to visit, turns page HTML
 * into {@code model.Book} objects and can run a complete crawl.
 */
public interface BookCrawlerStrategy {
/** Returns the name identifying this crawler. */
String getName();
/** Returns the base URL of the site this crawler targets. */
String getBaseUrl();
/** Returns the URLs of the pages to crawl. */
List<String> getPageUrls();
/**
 * Extracts books from the HTML of one page.
 *
 * @param htmlContent HTML source of a page
 * @return the books found on that page
 */
List<model.Book> parseBooks(String htmlContent);
/** Runs the crawl and returns its outcome. */
CrawlerResult execute();
/** Returns the configured delay in milliseconds (presumably the pause between page requests; confirm against the implementation). */
int getDelayMs();
/** Sets the storage the crawler uses for its results. */
void setStorage(DataStorage storage);
}

16
project/src/main/java/strategy/CrawlerStrategy.java

@ -0,0 +1,16 @@
package strategy;
import exception.CrawlerResult;
import model.Movie;
import storage.DataStorage;
import java.util.List;
/**
 * Contract of a movie crawler: it names itself, lists the pages to visit, turns page HTML
 * into {@link Movie} objects and can run a complete crawl.
 */
public interface CrawlerStrategy {
/** Returns the name identifying this crawler. */
String getName();
/** Returns the base URL of the site this crawler targets. */
String getBaseUrl();
/** Returns the URLs of the pages to crawl. */
List<String> getPageUrls();
/**
 * Extracts movies from the HTML of one page.
 *
 * @param htmlContent HTML source of a page
 * @return the movies found on that page
 */
List<Movie> parseMovies(String htmlContent);
/** Runs the crawl and returns its outcome. */
CrawlerResult execute();
/** Returns the configured delay in milliseconds (presumably the pause between page requests; confirm against the implementation). */
int getDelayMs();
/** Sets the storage the crawler uses for its results. */
void setStorage(DataStorage storage);
}

116
project/src/main/java/strategy/impl/BooksToScrapeStrategy.java

@ -0,0 +1,116 @@
package strategy.impl;
import model.Book;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import strategy.AbstractBookCrawlerStrategy;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawler for the book listing pages of books.toscrape.com.
 *
 * <p>It visits the home page followed by catalogue pages 2 to 50 and reads title, detail URL,
 * price, star rating and cover image from every {@code article.product_pod} element.
 */
public class BooksToScrapeStrategy extends AbstractBookCrawlerStrategy {
    private static final String NAME = "BooksToScrape";
    private static final String BASE_URL = "https://books.toscrape.com";
    /** Site-relative directory under which product detail pages are addressed. */
    private static final String CATALOGUE_PREFIX = "catalogue/";

    public BooksToScrapeStrategy() {
        this.delayMs = 1000;
    }

    @Override
    public String getName() {
        return NAME;
    }

    @Override
    public String getBaseUrl() {
        return BASE_URL;
    }

    /** Returns the home page followed by {@code /catalogue/page-2.html} .. {@code page-50.html}. */
    @Override
    public List<String> getPageUrls() {
        List<String> urls = new ArrayList<>();
        urls.add(BASE_URL);
        for (int i = 2; i <= 50; i++) {
            urls.add(BASE_URL + "/catalogue/page-" + i + ".html");
        }
        return urls;
    }

    /**
     * Extracts the books listed on one page.
     *
     * <p>The rank assigned to each book is its 1-based position on this page; it restarts at 1
     * for every call. Entries without a title are dropped, and an entry whose extraction throws
     * is skipped so that one malformed entry does not discard the rest of the page.
     *
     * @param htmlContent HTML source of a listing page
     * @return the books found on the page, in document order
     */
    @Override
    public List<Book> parseBooks(String htmlContent) {
        List<Book> books = new ArrayList<>();
        Document doc = Jsoup.parse(htmlContent);
        Elements items = doc.select("article.product_pod");
        int rank = 1;
        for (Element item : items) {
            try {
                Book book = new Book();
                Element titleElement = item.select("h3 a").first();
                if (titleElement != null) {
                    book.setTitle(titleElement.attr("title"));
                    String href = titleElement.attr("href");
                    if (!href.isEmpty()) {
                        book.setUrl(toProductUrl(href));
                    }
                }
                Element priceElement = item.select("p.price_color").first();
                if (priceElement != null) {
                    // Keep digits and the decimal point only, so the currency sign and any
                    // mis-decoded bytes in front of it are dropped regardless of their form.
                    String priceStr = priceElement.text().replaceAll("[^0-9.]", "");
                    try {
                        book.setPrice(Double.parseDouble(priceStr));
                    } catch (NumberFormatException ignored) {
                        // Price stays unset when the text holds no parsable number.
                    }
                }
                Element ratingElement = item.select("p.star-rating").first();
                if (ratingElement != null) {
                    int rating = parseRating(ratingElement.className());
                    book.setRating((double) rating);
                }
                Element imgElement = item.select("img").first();
                if (imgElement != null) {
                    book.setImageUrl(BASE_URL + "/" + stripParentSegments(imgElement.attr("src")));
                }
                book.setRank(rank++);
                if (book.getTitle() != null && !book.getTitle().isEmpty()) {
                    books.add(book);
                }
            } catch (Exception ignored) {
                // Deliberate best effort: skip an entry that cannot be read.
            }
        }
        return books;
    }

    /**
     * Builds the absolute detail-page URL for a listing link.
     *
     * <p>The parser does not know which page the HTML came from, so the link is normalised
     * instead of resolved: leading {@code ../} segments are removed and the {@code catalogue/}
     * directory is added when missing. This covers links written relative to the home page
     * ({@code catalogue/x/index.html}) as well as links written relative to a page inside
     * {@code /catalogue/} ({@code x/index.html} or {@code ../x/index.html}).
     */
    private static String toProductUrl(String href) {
        if (href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        String path = stripParentSegments(href);
        if (!path.startsWith(CATALOGUE_PREFIX)) {
            path = CATALOGUE_PREFIX + path;
        }
        return BASE_URL + "/" + path;
    }

    /** Removes every leading {@code ../} segment from a relative path. */
    private static String stripParentSegments(String path) {
        String rest = path;
        while (rest.startsWith("../")) {
            rest = rest.substring(3);
        }
        return rest;
    }

    /** Maps the number word contained in the star-rating class attribute to 1..5, or 0 if none is present. */
    private int parseRating(String ratingClass) {
        if (ratingClass.contains("One")) return 1;
        if (ratingClass.contains("Two")) return 2;
        if (ratingClass.contains("Three")) return 3;
        if (ratingClass.contains("Four")) return 4;
        if (ratingClass.contains("Five")) return 5;
        return 0;
    }
}

159
project/src/main/java/strategy/impl/DoubanBookStrategy.java

@ -0,0 +1,159 @@
package strategy.impl;
import model.Book;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import strategy.AbstractBookCrawlerStrategy;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawler for the Douban Books Top 250 list.
 *
 * <p>It visits ten pages of 25 entries each and reads rank, title, URL, rating, rating count,
 * publication details and cover image from every {@code tr.item} row.
 */
public class DoubanBookStrategy extends AbstractBookCrawlerStrategy {
    private static final String NAME = "豆瓣读书Top250";
    private static final String BASE_URL = "https://book.douban.com/top250";

    public DoubanBookStrategy() {
        this.delayMs = 2000;
    }

    @Override
    public String getName() {
        return NAME;
    }

    @Override
    public String getBaseUrl() {
        return BASE_URL;
    }

    /** Returns the ten list pages, addressed by {@code ?start=0, 25, ..., 225}. */
    @Override
    public List<String> getPageUrls() {
        List<String> urls = new ArrayList<>();
        for (int i = 0; i < 10; i++) {
            urls.add(BASE_URL + "?start=" + (i * 25));
        }
        return urls;
    }

    /**
     * Extracts the books listed on one page.
     *
     * <p>Entries without a title are dropped, and an entry whose extraction throws is skipped
     * so that one malformed row does not discard the rest of the page.
     *
     * @param htmlContent HTML source of a list page
     * @return the books found on the page, in document order
     */
    @Override
    public List<Book> parseBooks(String htmlContent) {
        List<Book> books = new ArrayList<>();
        Document doc = Jsoup.parse(htmlContent);
        Elements items = doc.select("tr.item");
        for (Element item : items) {
            try {
                Book book = new Book();
                Element indent = item.select("td.indent").first();
                if (indent != null) {
                    String rankStr = indent.select("div.starcount").text();
                    if (!rankStr.isEmpty()) {
                        book.setRank(parseNumber(rankStr));
                    }
                }
                if (book.getRank() == null) {
                    Element order = item.select("div.starcount").first();
                    if (order != null) {
                        book.setRank(parseNumber(order.text()));
                    }
                }
                Element titleElement = item.select("div.pl2 a").first();
                if (titleElement != null) {
                    String title = titleElement.attr("title");
                    if (title.isEmpty()) {
                        // Fall back to the first whitespace-delimited token of the link text.
                        title = titleElement.text().split("\\s")[0];
                    }
                    book.setTitle(title.trim());
                    book.setUrl(titleElement.attr("href"));
                }
                Element ratingElement = item.select("span.rating_nums").first();
                if (ratingElement != null) {
                    String ratingStr = ratingElement.text();
                    if (!ratingStr.isEmpty()) {
                        book.setRating(Double.parseDouble(ratingStr));
                    }
                }
                Element countElement = item.select("span.pl").first();
                if (countElement != null) {
                    String countText = countElement.text();
                    if (countText.contains("人评价")) {
                        String num = countText.replace("人评价", "").replace("(", "").replace(")", "").trim();
                        book.setRatingCount(parseNumber(num));
                    }
                }
                Element infoElement = item.select("p.pl").first();
                if (infoElement != null) {
                    parseBookInfo(book, infoElement.text());
                }
                Element imgElement = item.select("img").first();
                if (imgElement != null) {
                    book.setImageUrl(imgElement.attr("src"));
                }
                if (book.getTitle() != null && !book.getTitle().isEmpty()) {
                    books.add(book);
                }
            } catch (Exception ignored) {
                // Deliberate best effort: skip a row that cannot be read.
            }
        }
        return books;
    }

    /**
     * Fills author, year, publisher, price and ISBN from the {@code " / "}-separated info line.
     *
     * <p>The first segment is taken as the author unless it looks like a date or a price.
     * A segment that starts with four digits, optionally followed by a date separator
     * (for example {@code 2006}, {@code 2006-5} or {@code 1991年5月}), provides the year.
     * A segment ending in {@code 元} provides the price; any text around the number is ignored.
     */
    private void parseBookInfo(Book book, String info) {
        String[] parts = info.split(" / ");
        for (int i = 0; i < parts.length; i++) {
            String part = parts[i].trim();
            if (i == 0 && !part.matches("\\d{4}.*") && !part.matches(".*\\d+\\.\\d+.*")) {
                book.setAuthor(part);
            }
            boolean isPrice = part.matches(".*\\d+(\\.\\d+)?\\s*元");
            // Checked against isPrice so that a price such as "2006.50元" is not read as a year.
            if (!isPrice && part.matches("\\d{4}([-./年].*)?")) {
                book.setYear(part.substring(0, 4));
            }
            if (part.contains("出版社")) {
                book.setPublisher(part.replace("出版社", "").trim());
            }
            if (isPrice) {
                String priceStr = part.replaceAll("[^0-9.]", "");
                try {
                    book.setPrice(Double.parseDouble(priceStr));
                } catch (NumberFormatException ignored) {
                    // Price stays unset when the segment holds no single parsable number.
                }
            }
            if (part.matches("ISBN.*")) {
                book.setIsbn(part.replace("ISBN", "").trim());
            }
        }
    }

    /**
     * Parses the digits contained in a string, ignoring every other character.
     *
     * @return the number truncated to an integer, or {@code null} when the string is
     *         {@code null}, holds no digits or does not form a valid number
     */
    private Integer parseNumber(String str) {
        try {
            if (str == null || str.isEmpty()) return null;
            str = str.replaceAll("[^0-9.]", "");
            if (str.isEmpty()) return null;
            if (str.contains(".")) {
                return (int) Double.parseDouble(str);
            }
            return Integer.parseInt(str);
        } catch (NumberFormatException e) {
            return null;
        }
    }
}

111
project/src/main/java/strategy/impl/DoubanStrategy.java

@ -0,0 +1,111 @@
package strategy.impl;
import model.Movie;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import strategy.AbstractCrawlerStrategy;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawler for the Douban Movies Top 250 list.
 *
 * <p>It visits ten pages of 25 entries each and reads rank, name, rating, rating count,
 * director, year, URL and poster image from every {@code div.item} element.
 */
public class DoubanStrategy extends AbstractCrawlerStrategy {
    private static final String NAME = "豆瓣电影Top250";
    private static final String BASE_URL = "https://movie.douban.com/top250";
    private static final String DIRECTOR_LABEL = "导演:";
    private static final String ACTORS_LABEL = "主演";
    /** A stand-alone four-digit year in the 1900s or 2000s. */
    private static final java.util.regex.Pattern YEAR_PATTERN =
            java.util.regex.Pattern.compile("(?<!\\d)(?:19|20)\\d{2}(?!\\d)");

    public DoubanStrategy() {
        this.delayMs = 1500;
    }

    @Override
    public String getName() {
        return NAME;
    }

    @Override
    public String getBaseUrl() {
        return BASE_URL;
    }

    /** Returns the ten list pages, addressed by {@code ?start=0, 25, ..., 225}. */
    @Override
    public List<String> getPageUrls() {
        List<String> urls = new ArrayList<>();
        for (int i = 0; i < 10; i++) {
            urls.add(BASE_URL + "?start=" + (i * 25));
        }
        return urls;
    }

    /**
     * Extracts the movies listed on one page.
     *
     * <p>An entry whose rank or rating cannot be parsed is skipped; missing optional parts
     * (rating count, info paragraph) simply leave the corresponding fields unset.
     *
     * @param htmlContent HTML source of a list page
     * @return the movies found on the page, in document order
     */
    @Override
    public List<Movie> parseMovies(String htmlContent) {
        List<Movie> movies = new ArrayList<>();
        Document doc = Jsoup.parse(htmlContent);
        Elements items = doc.select("div.item");
        for (Element item : items) {
            try {
                Movie movie = new Movie();
                String rankStr = item.select("em").text();
                movie.setRank(Integer.parseInt(rankStr));
                Element titleElement = item.select("span.title").first();
                if (titleElement != null) {
                    movie.setName(titleElement.text());
                }
                String ratingStr = item.select("span.rating_num").text();
                if (!ratingStr.isEmpty()) {
                    movie.setRating(Double.parseDouble(ratingStr));
                }
                Element ratingCountElement = item.select("div.star span").last();
                if (ratingCountElement != null) {
                    String ratingCountStr = ratingCountElement.text();
                    if (ratingCountStr.contains("人评价")) {
                        String num = ratingCountStr.replace("人评价", "").trim();
                        movie.setRatingCount(parseNumber(num));
                    }
                }
                Element infoElement = item.select("div.bd p").first();
                if (infoElement != null) {
                    applyInfo(movie, infoElement.text());
                }
                String link = item.select("div.hd a").attr("href");
                movie.setUrl(link);
                String imgUrl = item.select("div.pic img").attr("src");
                movie.setImageUrl(imgUrl);
                movies.add(movie);
            } catch (Exception ignored) {
                // Deliberate best effort: skip an entry that cannot be read.
            }
        }
        return movies;
    }

    /**
     * Fills director and year from the info paragraph.
     *
     * <p>The paragraph carries the director, the actors and the year/country/genre line in one
     * run of text, so the fields are located by their labels rather than by splitting:
     * the director is the text after {@code 导演:} up to the actors label or the year,
     * whichever comes first, and the year is the first stand-alone four-digit year.
     * Non-breaking spaces are treated as ordinary spaces.
     */
    private void applyInfo(Movie movie, String rawInfo) {
        String info = rawInfo.replace('\u00A0', ' ');
        java.util.regex.Matcher yearMatcher = YEAR_PATTERN.matcher(info);
        boolean hasYear = yearMatcher.find();
        if (hasYear) {
            movie.setYear(yearMatcher.group());
        }
        int labelAt = info.indexOf(DIRECTOR_LABEL);
        if (labelAt < 0) {
            return;
        }
        int from = labelAt + DIRECTOR_LABEL.length();
        int to = info.length();
        int actorsAt = info.indexOf(ACTORS_LABEL, from);
        if (actorsAt >= 0) {
            to = actorsAt;
        }
        if (hasYear && yearMatcher.start() >= from && yearMatcher.start() < to) {
            to = yearMatcher.start();
        }
        String director = info.substring(from, to).trim();
        if (!director.isEmpty()) {
            movie.setDirector(director);
        }
    }

    /**
     * Parses a count such as {@code 1234567}, {@code 1,234,567} or {@code 12.3万}.
     *
     * @return the parsed value, or {@code null} when the text is {@code null}, empty or not a number
     */
    private Integer parseNumber(String str) {
        if (str == null || str.isEmpty()) {
            return null;
        }
        try {
            if (str.contains("万")) {
                return (int) (Double.parseDouble(str.replace("万", "")) * 10000);
            }
            return Integer.parseInt(str.replace(",", ""));
        } catch (NumberFormatException e) {
            return null;
        }
    }
}

94
project/src/main/java/strategy/impl/MaoyanStrategy.java

@ -0,0 +1,94 @@
package strategy.impl;
import model.Movie;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import strategy.AbstractCrawlerStrategy;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawler for the Maoyan Top 100 movie board.
 *
 * <p>It visits ten pages of ten entries each and reads rank, name, rating, actors, release
 * year, URL and poster image from every {@code dd} element of the board.
 */
public class MaoyanStrategy extends AbstractCrawlerStrategy {
    private static final String NAME = "猫眼电影Top100";
    private static final String BASE_URL = "https://maoyan.com/board/4";

    public MaoyanStrategy() {
        this.delayMs = 1500;
    }

    @Override
    public String getName() {
        return NAME;
    }

    @Override
    public String getBaseUrl() {
        return BASE_URL;
    }

    /** Returns the ten board pages, addressed by {@code ?offset=0, 10, ..., 90}. */
    @Override
    public List<String> getPageUrls() {
        List<String> pageUrls = new ArrayList<>();
        for (int offset = 0; offset < 100; offset += 10) {
            pageUrls.add(BASE_URL + "?offset=" + offset);
        }
        return pageUrls;
    }

    /**
     * Extracts the movies listed on one board page; an entry whose extraction throws
     * (for example because its rank is not a number) is skipped.
     *
     * @param htmlContent HTML source of a board page
     * @return the movies found on the page, in document order
     */
    @Override
    public List<Movie> parseMovies(String htmlContent) {
        List<Movie> parsed = new ArrayList<>();
        Elements entries = Jsoup.parse(htmlContent).select("dl.board-wrapper dd");
        for (Element entry : entries) {
            try {
                parsed.add(toMovie(entry));
            } catch (Exception e) {
                // skip invalid item
            }
        }
        return parsed;
    }

    /** Builds a movie from one board entry; throws if the rank or rating text is not numeric. */
    private Movie toMovie(Element entry) {
        Movie result = new Movie();
        result.setRank(Integer.parseInt(entry.select("i.board-index").text()));

        Elements nameLink = entry.select("p.name a");
        result.setName(nameLink.text());

        // The score is rendered as two adjacent elements: integer part and fraction part.
        String score = entry.select("i.integer").text() + entry.select("i.fraction").text();
        if (!score.isEmpty()) {
            result.setRating(Double.parseDouble(score));
        }

        String starLine = entry.select("p.star").text();
        if (starLine.contains("主演:")) {
            result.setActors(starLine.replace("主演:", "").trim());
        }

        String releaseLine = entry.select("p.releasetime").text();
        if (releaseLine.contains("上映时间:")) {
            String released = releaseLine.replace("上映时间:", "").trim();
            if (released.matches("\\d{4}.*")) {
                result.setYear(released.substring(0, 4));
            }
        }

        String href = nameLink.attr("href");
        if (!href.isEmpty()) {
            result.setUrl("https://maoyan.com" + href);
        }

        // Prefer the lazy-loading attribute and fall back to the plain source attribute.
        Elements poster = entry.select("img.board-img");
        String posterUrl = poster.attr("data-src");
        result.setImageUrl(posterUrl.isEmpty() ? poster.attr("src") : posterUrl);
        return result;
    }
}

105
project/src/main/java/strategy/impl/RottenTomatoesStrategy.java

@ -0,0 +1,105 @@
package strategy.impl;
import model.Movie;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import strategy.AbstractCrawlerStrategy;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawler for the Rotten Tomatoes "best of" table.
 *
 * <p>It reads a single page and extracts rank, name, year, URL and score from every table
 * row that has a {@code td.rank} cell.
 */
public class RottenTomatoesStrategy extends AbstractCrawlerStrategy {
    private static final String NAME = "烂番茄Top100";
    private static final String BASE_URL = "https://www.rottentomatoes.com/top/bestofrt/";

    public RottenTomatoesStrategy() {
        this.delayMs = 2000;
    }

    @Override
    public String getName() {
        return NAME;
    }

    @Override
    public String getBaseUrl() {
        return BASE_URL;
    }

    /** Returns the single page that holds the table. */
    @Override
    public List<String> getPageUrls() {
        List<String> pageUrls = new ArrayList<>();
        pageUrls.add(BASE_URL);
        return pageUrls;
    }

    /**
     * Extracts the movies from the table rows; rows without a rank cell are ignored and a row
     * whose extraction throws is skipped.
     *
     * @param htmlContent HTML source of the page
     * @return the movies found, in document order
     */
    @Override
    public List<Movie> parseMovies(String htmlContent) {
        List<Movie> parsed = new ArrayList<>();
        Elements rows = Jsoup.parse(htmlContent).select("table.table tr");
        int fallbackRank = 0;
        for (Element row : rows) {
            try {
                Element rankCell = row.selectFirst("td.rank");
                if (rankCell == null) {
                    continue;
                }
                Movie entry = new Movie();
                String rankText = rankCell.text();
                if (rankText.isEmpty()) {
                    // No printed rank: number such rows with a running counter.
                    entry.setRank(++fallbackRank);
                } else {
                    entry.setRank(Integer.parseInt(rankText));
                }
                Element titleLink = row.selectFirst("td.title a");
                if (titleLink != null) {
                    applyTitle(entry, titleLink.text());
                    applyLink(entry, titleLink.attr("href"));
                }
                Element scoreCell = row.selectFirst("td.score span.tMeterScore");
                if (scoreCell != null) {
                    applyScore(entry, scoreCell.text());
                }
                parsed.add(entry);
            } catch (Exception e) {
                // skip invalid item
            }
        }
        return parsed;
    }

    /**
     * Splits a title of the form {@code Name (1999)} into name and year. The name is the text
     * before the last opening parenthesis; the year is set only when the parenthesised text is
     * exactly four digits. Titles without such a suffix are used unchanged as the name.
     */
    private void applyTitle(Movie entry, String fullTitle) {
        int open = fullTitle.lastIndexOf("(");
        int close = fullTitle.lastIndexOf(")");
        if (open > 0 && close > open) {
            String inside = fullTitle.substring(open + 1, close);
            if (inside.matches("\\d{4}")) {
                entry.setYear(inside);
            }
            entry.setName(fullTitle.substring(0, open).trim());
            return;
        }
        entry.setName(fullTitle);
    }

    /** Sets the movie URL, prefixing the site origin when the link is site-relative. */
    private void applyLink(Movie entry, String href) {
        if (href.isEmpty()) {
            return;
        }
        entry.setUrl(href.startsWith("/") ? "https://www.rottentomatoes.com" + href : href);
    }

    /** Converts a percentage such as {@code 97%} to a 0-10 rating with one decimal place. */
    private void applyScore(Movie entry, String scoreText) {
        if (!scoreText.matches("\\d+%")) {
            return;
        }
        double rating = Double.parseDouble(scoreText.replace("%", "")) / 10;
        entry.setRating(Math.round(rating * 10) / 10.0);
    }
}

54
project/src/main/java/util/Logger.java

@ -0,0 +1,54 @@
package util;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
/**
* 简单日志工具类
*/
/**
 * Minimal console logger.
 *
 * <p>Messages at or above the current level are written to standard output as
 * {@code [timestamp] [LEVEL] [thread] message}. The level is a process-wide setting.
 */
public class Logger {
    private static final DateTimeFormatter formatter =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    /** Severity levels in ascending order. */
    public enum Level {
        DEBUG, INFO, WARN, ERROR
    }

    // volatile: the level may be changed by one thread while others are logging.
    private static volatile Level currentLevel = Level.INFO;

    /**
     * Sets the minimum level that is written.
     *
     * @param level the new threshold, must not be {@code null}
     * @throws NullPointerException if {@code level} is {@code null}
     */
    public static void setLevel(Level level) {
        if (level == null) {
            throw new NullPointerException("level");
        }
        currentLevel = level;
    }

    /** Writes the message when {@code level} is at or above the current threshold. */
    private static void log(Level level, String message) {
        if (level.compareTo(currentLevel) >= 0) {
            String timestamp = LocalDateTime.now().format(formatter);
            String threadName = Thread.currentThread().getName();
            System.out.printf("[%s] [%s] [%s] %s%n",
                    timestamp, level, threadName, message);
        }
    }

    /** Logs a message at DEBUG level. */
    public static void debug(String message) {
        log(Level.DEBUG, message);
    }

    /** Logs a message at INFO level. */
    public static void info(String message) {
        log(Level.INFO, message);
    }

    /** Logs a message at WARN level. */
    public static void warn(String message) {
        log(Level.WARN, message);
    }

    /** Logs a message at ERROR level. */
    public static void error(String message) {
        log(Level.ERROR, message);
    }

    /**
     * Logs a message at ERROR level followed by a description of the cause. The stack trace
     * is printed only while the current level is DEBUG.
     *
     * @param message what failed
     * @param e the cause; when {@code null} only the message is logged
     */
    public static void error(String message, Throwable e) {
        if (e == null) {
            log(Level.ERROR, message);
            return;
        }
        // Exceptions created without a message would otherwise be logged as "null".
        String detail = e.getMessage() != null ? e.getMessage() : e.toString();
        log(Level.ERROR, message + " - " + detail);
        if (currentLevel == Level.DEBUG) {
            e.printStackTrace();
        }
    }
}

109
project/src/main/java/view/CrawlerView.java

@ -0,0 +1,109 @@
package view;
import exception.CrawlerResult;
import java.util.List;
/**
 * Console presentation layer: prints the banner, help text, crawler list, crawl results
 * and error messages to standard output.
 */
public class CrawlerView {
    /** Prints each given line followed by a line break. */
    private static void printLines(String... lines) {
        for (String line : lines) {
            System.out.println(line);
        }
    }

    /** Prints the application banner. */
    public void showWelcome() {
        printLines(
                "",
                "╔════════════════════════════════════════════════╗",
                "║ Java 爬虫管理系统 v3.0 (电影+图书) ║",
                "╚════════════════════════════════════════════════╝",
                "");
    }

    /** Prints the command reference, the available crawlers and usage examples. */
    public void showHelp() {
        printLines(
                "",
                "═══════════════ 使用帮助 ═══════════════",
                " java -jar crawler.jar <命令> [参数]",
                "",
                "可用命令:",
                " list - 列出所有爬虫",
                " run <爬虫名> - 运行指定爬虫",
                " run-all - 运行所有爬虫",
                " stats - 显示统计信息",
                " clear - 清空所有数据",
                " help - 显示帮助信息",
                "",
                "电影爬虫:",
                " - 豆瓣电影Top250",
                " - 猫眼电影Top100",
                " - RottenTomatoes",
                "",
                "图书爬虫:",
                " - 豆瓣读书Top250",
                " - BooksToScrape",
                "",
                "示例:",
                " java -jar crawler.jar list",
                " java -jar crawler.jar run 豆瓣电影Top250",
                " java -jar crawler.jar run 豆瓣读书Top250",
                " java -jar crawler.jar run BooksToScrape",
                " java -jar crawler.jar run-all",
                "═══════════════════════════════════════════",
                "");
    }

    /**
     * Prints the crawler names as a numbered list starting at 1.
     *
     * @param crawlers names to list
     */
    public void showCrawlerList(List<String> crawlers) {
        printLines("", "═══════════════ 爬虫列表 ═══════════════");
        int number = 1;
        for (String crawlerName : crawlers) {
            System.out.println(" " + number + ". " + crawlerName);
            number++;
        }
        printLines("═══════════════════════════════════════════", "");
    }

    /**
     * Prints one crawl result in a box: source, record count and elapsed time on success,
     * error code, source and message on failure.
     *
     * @param result the result to display
     */
    public void showResult(CrawlerResult result) {
        System.out.println();
        System.out.println("╔════════════════════════════════════════╗");
        if (!result.isSuccess()) {
            System.out.printf("║ FAILURE: [%s] %-20s ║%n", result.getErrorCode(), result.getSource());
            System.out.printf("║ 错误信息: %-28s ║%n", result.getMessage());
        } else {
            System.out.printf("║ SUCCESS: %-30s ║%n", result.getSource());
            System.out.printf("║ 数据条数: %-28d ║%n", result.getDataCount());
            System.out.printf("║ 耗时: %-30dms ║%n", result.getElapsedTime());
        }
        System.out.println("╚════════════════════════════════════════╝");
        System.out.println();
    }

    /**
     * Prints every result on its own line followed by a summary of how many crawls succeeded
     * and how many records the successful ones produced.
     *
     * @param results the results to display
     */
    public void showResults(List<CrawlerResult> results) {
        printLines("", "═══════════════ 执行结果 ═══════════════");
        int succeeded = 0;
        int recordTotal = 0;
        for (CrawlerResult each : results) {
            System.out.println(each.toString());
            if (!each.isSuccess()) {
                continue;
            }
            succeeded++;
            recordTotal += each.getDataCount();
        }
        System.out.println("─────────────────────────────────────────");
        System.out.printf(" 成功: %d/%d | 总数据: %d 条%n",
                succeeded, results.size(), recordTotal);
        printLines("═══════════════════════════════════════════", "");
    }

    /**
     * Prints an error message in a box.
     *
     * @param message the text to display
     */
    public void showError(String message) {
        printLines(
                "",
                "╔════════════════════════════════════════╗",
                "║ 错误信息 ║");
        System.out.printf("║ %-36s ║%n", message);
        printLines("╚════════════════════════════════════════╝", "");
    }

    /** Prints the message on its own line. */
    public void showMessage(String message) {
        System.out.println(message);
    }
}

88
project/src/test/TestMain.java/TestMain.java

@ -0,0 +1,88 @@
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.FileWriter;
import java.io.IOException;
// 抽象父类:封装通用爬虫逻辑
/**
 * Abstract parent that holds the shared crawling logic: it walks the paginated list under
 * a base URL and hands every downloaded page to {@link #parse(Document, FileWriter)}.
 */
abstract class BaseCrawler {
    private static final int PAGE_COUNT = 10;
    private static final int PAGE_SIZE = 25;
    private static final long PAGE_DELAY_MS = 1000;

    // Encapsulated state: the list URL without the paging query.
    private final String baseUrl;

    public BaseCrawler(String baseUrl) {
        this.baseUrl = baseUrl;
    }

    /**
     * Downloads a page with an 8 second timeout.
     *
     * @param url address of the page to fetch
     * @return the parsed document
     * @throws IOException if the request fails
     */
    protected Document getDoc(String url) throws IOException {
        return Jsoup.connect(url)
                .userAgent("Mozilla/5.0")
                .timeout(8000)
                .get();
    }

    /**
     * Extracts the data of one page and writes it out; implemented by subclasses.
     *
     * @param doc the downloaded page
     * @param writer destination for the extracted lines
     * @throws IOException if writing fails
     */
    public abstract void parse(Document doc, FileWriter writer) throws IOException;

    /**
     * Single entry point: crawls ten pages ({@code ?start=0, 25, ..., 225}) with a one second
     * pause between requests. Any failure is printed and ends the crawl; an interrupt
     * additionally restores the thread's interrupt flag.
     *
     * @param writer destination passed on to {@link #parse(Document, FileWriter)}
     */
    public void start(FileWriter writer) {
        try {
            for (int i = 0; i < PAGE_COUNT; i++) {
                int start = i * PAGE_SIZE;
                String url = baseUrl + "?start=" + start;
                System.out.println("正在爬取第 " + (i + 1) + " 页");
                Document doc = getDoc(url);
                parse(doc, writer); // polymorphic call into the subclass
                if (i < PAGE_COUNT - 1) {
                    // No need to wait after the final page.
                    Thread.sleep(PAGE_DELAY_MS);
                }
            }
        } catch (InterruptedException e) {
            // Keep the interrupt visible to whoever runs this thread.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
// 子类:继承父类,实现豆瓣电影解析
/** Subclass that parses the Douban Movies Top 250 list pages. */
class DoubanCrawler extends BaseCrawler {
    public DoubanCrawler() {
        super("https://movie.douban.com/top250");
    }

    /**
     * Prints and writes one line per movie containing its rank, title and score.
     * Entries without a title element are skipped.
     *
     * @param doc the downloaded list page
     * @param writer destination for the lines
     * @throws IOException if writing fails
     */
    @Override
    public void parse(Document doc, FileWriter writer) throws IOException {
        Elements items = doc.select("div.item");
        for (Element item : items) {
            Element titleElement = item.select("span.title").first();
            if (titleElement == null) {
                // A missing title would otherwise throw and abort the whole crawl.
                continue;
            }
            String rank = item.select("em").text();
            String name = titleElement.text();
            String score = item.select("span.rating_num").text();
            String line = "排名:" + rank + " 电影:" + name + " 评分:" + score;
            System.out.println(line);
            writer.write(line + "\r\n"); // append to the output file
        }
    }
}
// 主类
/** Entry point: crawls the Douban movie list and writes it to a text file. */
public class TestMain {
    /** Output file, created relative to the working directory. */
    private static final String OUTPUT_FILE = "douban_top250.txt";

    public static void main(String[] args) {
        // try-with-resources closes the file even when the crawl throws.
        try (FileWriter writer = new FileWriter(OUTPUT_FILE)) {
            // Polymorphism: a parent-type reference pointing at a subclass instance.
            BaseCrawler crawler = new DoubanCrawler();
            crawler.start(writer);
            System.out.println("===== 全部爬完,文件已保存到 " + OUTPUT_FILE + " =====");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

BIN
project/项目报告v1(1).docx

Binary file not shown.

27
w1/BMICalculator.java

@ -0,0 +1,27 @@
import java.util.Scanner;
/**
 * Console program that reads height (metres) and weight (kilograms), prints the body mass
 * index and the category it falls into.
 */
public class BMICalculator {
    /**
     * Reads the two values from standard input and prints the result. Input that is not a
     * number, or not a positive finite number, produces an error line instead of a result.
     */
    public static void main(String[] args) {
        try (Scanner scanner = new Scanner(System.in)) {
            System.out.print("请输入身高(米):");
            double height = readPositiveNumber(scanner);
            System.out.print("请输入体重(千克):");
            double weight = readPositiveNumber(scanner);
            double bmi = weight / (height * height);
            if (!Double.isFinite(bmi)) {
                // Extremely small heights can still overflow the division.
                throw new IllegalArgumentException("数值超出可计算范围");
            }
            System.out.printf("你的 BMI 值为:%.2f%n", bmi);
            System.out.println(classify(bmi));
        } catch (IllegalArgumentException e) {
            System.out.println("输入无效:" + e.getMessage());
        }
    }

    /** Reads the next token as a positive finite number or throws IllegalArgumentException. */
    private static double readPositiveNumber(Scanner scanner) {
        if (!scanner.hasNextDouble()) {
            throw new IllegalArgumentException("请输入数字");
        }
        double value = scanner.nextDouble();
        if (!Double.isFinite(value) || value <= 0) {
            throw new IllegalArgumentException("数值必须大于 0");
        }
        return value;
    }

    /** Maps a BMI value to its category label using the thresholds 18.5, 24 and 28. */
    private static String classify(double bmi) {
        if (bmi < 18.5) {
            return "体重过轻";
        }
        if (bmi < 24) {
            return "正常范围";
        }
        if (bmi < 28) {
            return "超重";
        }
        return "肥胖";
    }
}

0
w1/Student.java

4
w5/ShapeTest.java

@ -0,0 +1,4 @@
package PACKAGE_NAME;
/** Empty placeholder class; it declares no members. */
public class ShapeTest {
}

4
w6/AnimalTest.java

@ -0,0 +1,4 @@
package PACKAGE_NAME;
/** Empty placeholder class; it declares no members. */
public class AnimalTest {
}

4
爬虫/DoubanMovieCrawler.java

@ -0,0 +1,4 @@
package PACKAGE_NAME;
/** Empty placeholder class; it declares no members. */
public class DoubanMovieCrawler {
}
Loading…
Cancel
Save