84 changed files with 19633 additions and 0 deletions
@ -0,0 +1,39 @@ |
|||
target/ |
|||
!.mvn/wrapper/maven-wrapper.jar |
|||
!**/src/main/**/target/ |
|||
!**/src/test/**/target/ |
|||
.kotlin |
|||
|
|||
### IntelliJ IDEA ### |
|||
.idea/modules.xml |
|||
.idea/jarRepositories.xml |
|||
.idea/compiler.xml |
|||
.idea/libraries/ |
|||
*.iws |
|||
*.iml |
|||
*.ipr |
|||
|
|||
### Eclipse ### |
|||
.apt_generated |
|||
.classpath |
|||
.factorypath |
|||
.project |
|||
.settings |
|||
.springBeans |
|||
.sts4-cache |
|||
|
|||
### NetBeans ### |
|||
/nbproject/private/ |
|||
/nbbuild/ |
|||
/dist/ |
|||
/nbdist/ |
|||
/.nb-gradle/ |
|||
build/ |
|||
!**/src/main/**/build/ |
|||
!**/src/test/**/build/ |
|||
|
|||
### VS Code ### |
|||
.vscode/ |
|||
|
|||
### Mac OS ### |
|||
.DS_Store |
|||
@ -0,0 +1,10 @@ |
|||
# 默认忽略的文件 |
|||
/shelf/ |
|||
/workspace.xml |
|||
# 已忽略包含查询文件的默认文件夹 |
|||
/queries/ |
|||
# Datasource local storage ignored files |
|||
/dataSources/ |
|||
/dataSources.local.xml |
|||
# 基于编辑器的 HTTP 客户端请求 |
|||
/httpRequests/ |
|||
@ -0,0 +1,7 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project version="4"> |
|||
<component name="Encoding"> |
|||
<file url="file://$PROJECT_DIR$/src/main/Douban.java" charset="UTF-8" /> |
|||
<file url="file://$PROJECT_DIR$/src/main/resources" charset="UTF-8" /> |
|||
</component> |
|||
</project> |
|||
@ -0,0 +1,14 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project version="4"> |
|||
<component name="ExternalStorageConfigurationManager" enabled="true" /> |
|||
<component name="MavenProjectsManager"> |
|||
<option name="originalFiles"> |
|||
<list> |
|||
<option value="$PROJECT_DIR$/pom.xml" /> |
|||
</list> |
|||
</option> |
|||
</component> |
|||
<component name="ProjectRootManager" version="2" project-jdk-name="25" project-jdk-type="JavaSDK"> |
|||
<output url="file://$PROJECT_DIR$/out" /> |
|||
</component> |
|||
</project> |
|||
@ -0,0 +1,87 @@ |
|||
# Web Crawler Application |
|||
|
|||
基于 Java 的多网站爬虫应用,采用 MVC 模式、命令模式、策略模式和完整的异常体系设计。 |
|||
|
|||
## 功能特性 |
|||
|
|||
- 爬取豆瓣电影排行榜 |
|||
- 爬取豆瓣读书排行榜 |
|||
- 爬取 Books to Scrape 网站 |
|||
- 数据保存为 JSON 格式文件 |
|||
- 支持交互式和命令行模式 |
|||
|
|||
## 项目架构 |
|||
|
|||
### 设计模式 |
|||
|
|||
1. **MVC 模式** |
|||
- Model: `Movie`, `Book`, `ScrapeBook` |
|||
- View: `ConsoleView` |
|||
- Controller: `CrawlerController` |
|||
|
|||
2. **策略模式 (Strategy Pattern)** |
|||
- `CrawlerStrategy` 接口 |
|||
- `DoubanMovieStrategy` - 豆瓣电影策略 |
|||
- `DoubanBookStrategy` - 豆瓣读书策略 |
|||
- `BooksToScrapeStrategy` - Books to Scrape 策略 |
|||
|
|||
3. **命令模式 (Command Pattern)** |
|||
- `Command` 接口 |
|||
- `CrawlCommand` - 单个爬虫命令 |
|||
- `CrawlAllCommand` - 组合命令,执行所有爬虫 |
|||
|
|||
4. **异常体系** |
|||
- `CrawlerException` - 基类异常 |
|||
- `NetworkException` - 网络异常 |
|||
- `ParseException` - 解析异常 |
|||
- `FileException` - 文件操作异常 |
|||
|
|||
## 使用方法 |
|||
|
|||
### 编译项目 |
|||
|
|||
```bash |
|||
mvn clean package |
|||
``` |
|||
|
|||
### 运行方式 |
|||
|
|||
#### 1. 交互式模式 |
|||
|
|||
```bash |
|||
java -jar target/web-crawler-1.0-SNAPSHOT.jar -i |
|||
``` |
|||
|
|||
#### 2. 命令行模式 |
|||
|
|||
爬取所有网站: |
|||
```bash |
|||
java -jar target/web-crawler-1.0-SNAPSHOT.jar |
|||
``` |
|||
|
|||
爬取指定网站: |
|||
```bash |
|||
java -jar target/web-crawler-1.0-SNAPSHOT.jar -s douban-movie |
|||
java -jar target/web-crawler-1.0-SNAPSHOT.jar -s douban-book |
|||
java -jar target/web-crawler-1.0-SNAPSHOT.jar -s books-to-scrape |
|||
``` |
|||
|
|||
查看帮助: |
|||
```bash |
|||
java -jar target/web-crawler-1.0-SNAPSHOT.jar --help |
|||
``` |
|||
|
|||
## 输出文件 |
|||
|
|||
爬取的数据将保存到 `data/` 目录下: |
|||
|
|||
- `douban_movies.json` - 豆瓣电影数据 |
|||
- `douban_books.json` - 豆瓣读书数据 |
|||
- `books_to_scrape.json` - Books to Scrape 数据 |
|||
|
|||
## 依赖项 |
|||
|
|||
- Jsoup - HTML 解析 |
|||
- Gson - JSON 处理 |
|||
- Picocli - 命令行解析 |
|||
- SLF4J - 日志框架 |
|||
@ -0,0 +1,96 @@ |
|||
mvn : WARNING: A restricted method in java.lang.System has been called |
|||
所在位置 行:1 字符: 72 |
|||
+ ... 嘻哈哈\Git\java爬虫\TestMaven"; mvn clean package -DskipTests 2>&1 | Out-F ... |
|||
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
|||
+ CategoryInfo : NotSpecified: (WARNING: A rest...has been called |
|||
:String) [], RemoteException |
|||
+ FullyQualifiedErrorId : NativeCommandError |
|||
|
|||
WARNING: java.lang.System::load has been called by org.fusesource.jansi.interna |
|||
l.JansiLoader in an unnamed module (file:/D:/嘻嘻哈哈/Git/java/apache-maven-3.9.6/l |
|||
ib/jansi-2.4.0.jar) |
|||
WARNING: Use --enable-native-access=ALL-UNNAMED to avoid a warning for callers |
|||
in this module |
|||
WARNING: Restricted methods will be blocked in a future release unless native a |
|||
ccess is enabled |
|||
|
|||
WARNING: A terminally deprecated method in sun.misc.Unsafe has been called |
|||
WARNING: sun.misc.Unsafe::objectFieldOffset has been called by com.google.commo |
|||
n.util.concurrent.AbstractFuture$UnsafeAtomicHelper (file:/D:/嘻嘻哈哈/Git/java/apa |
|||
che-maven-3.9.6/lib/guava-32.0.1-jre.jar) |
|||
WARNING: Please consider reporting this to the maintainers of class com.google. |
|||
common.util.concurrent.AbstractFuture$UnsafeAtomicHelper |
|||
WARNING: sun.misc.Unsafe::objectFieldOffset will be removed in a future release |
|||
[INFO] Scanning for projects... |
|||
[INFO] |
|||
[INFO] -----------------------< com.example:TestMaven >------------------------ |
|||
[INFO] Building TestMaven 1.0-SNAPSHOT |
|||
[INFO] from pom.xml |
|||
[INFO] --------------------------------[ jar ]--------------------------------- |
|||
[INFO] |
|||
[INFO] --- clean:3.2.0:clean (default-clean) @ TestMaven --- |
|||
[INFO] Deleting D:\嘻嘻哈哈\Git\java爬虫\TestMaven\target |
|||
[INFO] |
|||
[INFO] --- resources:3.3.1:resources (default-resources) @ TestMaven --- |
|||
[INFO] Copying 0 resource from src\main\resources to target\classes |
|||
[INFO] |
|||
[INFO] --- compiler:3.11.0:compile (default-compile) @ TestMaven --- |
|||
[INFO] Changes detected - recompiling the module! :source |
|||
[INFO] Compiling 41 source files with javac [debug target 8] to target\classes |
|||
[INFO] ------------------------------------------------------------- |
|||
[WARNING] COMPILATION WARNING : |
|||
[INFO] ------------------------------------------------------------- |
|||
[WARNING] 未与 -source 8 一起设置引导类路径 |
|||
  不设置引导类路径可能会导致类文件无法在 JDK 8 上运行 建议使用 --release 8 而不是 -source 8 -target 8,因为它会自动设置引导类路径 |
|||
[WARNING] 源值 8 已过时,将在未来发行版中删除 |
|||
[WARNING] 目标值 8 已过时,将在未来发行版中删除 |
|||
[WARNING] 要隐藏有关已过时选项的警告, 请使用 -Xlint:-options。 |
|||
[WARNING] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[106,13] 从发行版 10 开始,'var' 是受限类型名称,不能用于类型声明,也不能用作数组的元素类型 |
|||
[WARNING] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[112,13] 从发行版 10 开始,'var' 是受限类型名称,不能用于类型声明,也不能用作数组的元素类型 |
|||
[WARNING] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[118,13] 从发行版 10 开始,'var' 是受限类型名称,不能用于类型声明,也不能用作数组的元素类型 |
|||
[WARNING] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[124,13] 从发行版 10 开始,'var' 是受限类型名称,不能用于类型声明,也不能用作数组的元素类型 |
|||
[INFO] 8 warnings |
|||
[INFO] ------------------------------------------------------------- |
|||
[INFO] ------------------------------------------------------------- |
|||
[ERROR] COMPILATION ERROR : |
|||
[INFO] ------------------------------------------------------------- |
|||
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/main/CrawlerManager.java:[178,20] main.CrawlerManager.MultiStorage不是抽象的, 并且未覆盖storage.DataStorage中的抽象方法findBooksBySource(java.lang.String) |
|||
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[106,9] 找不到符号 符号: 类 var |
|||
  位置: 类 cli.CrawlerCLI |
|||
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[112,9] 找不到符号 符号: 类 var |
|||
  位置: 类 cli.CrawlerCLI |
|||
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[118,9] 找不到符号 符号: 类 var |
|||
  位置: 类 cli.CrawlerCLI |
|||
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[124,9] 找不到符号 符号: 类 var |
|||
  位置: 类 cli.CrawlerCLI |
|||
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/storage/SQLiteStorage.java:[12,8] storage.SQLiteStorage不是抽象的, 并且未覆盖storage.DataStorage中的抽象方法findBooksBySource(java.lang.String) |
|||
[INFO] 6 errors |
|||
[INFO] ------------------------------------------------------------- |
|||
[INFO] ------------------------------------------------------------------------ |
|||
[INFO] BUILD FAILURE |
|||
[INFO] ------------------------------------------------------------------------ |
|||
[INFO] Total time: 15.493 s |
|||
[INFO] Finished at: 2026-05-31T23:13:59+08:00 |
|||
[INFO] ------------------------------------------------------------------------ |
|||
[ERROR] Failed to execute goal org.apache.maven.plugins:maven-compiler-plugin:3.11.0:compile (default-compile) on project TestMaven: Compilation failure: Compilation failure: |
|||
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/main/CrawlerManager.java:[178,20] main.CrawlerManager.MultiStorage不是抽象的, 并且未覆盖storage.DataStorage中的抽象方法findBooksBySource(java.lang.String) |
|||
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[106,9] 找不到符号 |
|||
[ERROR] 符号: 类 var |
|||
[ERROR] 位置: 类 cli.CrawlerCLI |
|||
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[112,9] 找不到符号 |
|||
[ERROR] 符号: 类 var |
|||
[ERROR] 位置: 类 cli.CrawlerCLI |
|||
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[118,9] 找不到符号 |
|||
[ERROR] 符号: 类 var |
|||
[ERROR] 位置: 类 cli.CrawlerCLI |
|||
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[124,9] 找不到符号 |
|||
[ERROR] 符号: 类 var |
|||
[ERROR] 位置: 类 cli.CrawlerCLI |
|||
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/storage/SQLiteStorage.java:[12,8] storage.SQLiteStorage不是抽象的, 并且未覆盖storage.DataStorage中的抽象方法findBooksBySource(java.lang.String) |
|||
[ERROR] -> [Help 1] |
|||
[ERROR] |
|||
[ERROR] To see the full stack trace of the errors, re-run Maven with the -e switch. |
|||
[ERROR] Re-run Maven using the -X switch to enable full debug logging. |
|||
[ERROR] |
|||
[ERROR] For more information about the errors and possible solutions, please read the following articles: |
|||
[ERROR] [Help 1] http://cwiki.apache.org/confluence/display/MAVEN/MojoFailureException |
|||
@ -0,0 +1,92 @@ |
|||
mvn : WARNING: A restricted method in java.lang.System has been called |
|||
所在位置 行:1 字符: 72 |
|||
+ ... JAVA_HOME = "D:\嘻嘻哈哈\Git"; mvn clean package -DskipTests 2>&1 | Out-F ... |
|||
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
|||
+ CategoryInfo : NotSpecified: (WARNING: A rest...has been called |
|||
:String) [], RemoteException |
|||
+ FullyQualifiedErrorId : NativeCommandError |
|||
|
|||
WARNING: java.lang.System::load has been called by org.fusesource.jansi.interna |
|||
l.JansiLoader in an unnamed module (file:/D:/嘻嘻哈哈/Git/java/apache-maven-3.9.6/l |
|||
ib/jansi-2.4.0.jar) |
|||
WARNING: Use --enable-native-access=ALL-UNNAMED to avoid a warning for callers |
|||
in this module |
|||
WARNING: Restricted methods will be blocked in a future release unless native a |
|||
ccess is enabled |
|||
|
|||
WARNING: A terminally deprecated method in sun.misc.Unsafe has been called |
|||
WARNING: sun.misc.Unsafe::objectFieldOffset has been called by com.google.commo |
|||
n.util.concurrent.AbstractFuture$UnsafeAtomicHelper (file:/D:/嘻嘻哈哈/Git/java/apa |
|||
che-maven-3.9.6/lib/guava-32.0.1-jre.jar) |
|||
WARNING: Please consider reporting this to the maintainers of class com.google. |
|||
common.util.concurrent.AbstractFuture$UnsafeAtomicHelper |
|||
WARNING: sun.misc.Unsafe::objectFieldOffset will be removed in a future release |
|||
[INFO] Scanning for projects... |
|||
[INFO] |
|||
[INFO] -----------------------< com.example:TestMaven >------------------------ |
|||
[INFO] Building TestMaven 1.0-SNAPSHOT |
|||
[INFO] from pom.xml |
|||
[INFO] --------------------------------[ jar ]--------------------------------- |
|||
[INFO] |
|||
[INFO] --- clean:3.2.0:clean (default-clean) @ TestMaven --- |
|||
[INFO] Deleting D:\嘻嘻哈哈\Git\java爬虫\TestMaven\target |
|||
[INFO] |
|||
[INFO] --- resources:3.3.1:resources (default-resources) @ TestMaven --- |
|||
[INFO] Copying 0 resource from src\main\resources to target\classes |
|||
[INFO] |
|||
[INFO] --- compiler:3.11.0:compile (default-compile) @ TestMaven --- |
|||
[INFO] Changes detected - recompiling the module! :source |
|||
[INFO] Compiling 36 source files with javac [debug target 8] to target\classes |
|||
[INFO] ------------------------------------------------------------- |
|||
[WARNING] COMPILATION WARNING : |
|||
[INFO] ------------------------------------------------------------- |
|||
[WARNING] 未与 -source 8 一起设置引导类路径 |
|||
  不设置引导类路径可能会导致类文件无法在 JDK 8 上运行 建议使用 --release 8 而不是 -source 8 -target 8,因为它会自动设置引导类路径 |
|||
[WARNING] 源值 8 已过时,将在未来发行版中删除 |
|||
[WARNING] 目标值 8 已过时,将在未来发行版中删除 |
|||
[WARNING] 要隐藏有关已过时选项的警告, 请使用 -Xlint:-options。 |
|||
[WARNING] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[106,13] 从发行版 10 开始,'var' 是受限类型名称,不能用于类型声明,也不能用作数组的元素类型 |
|||
[WARNING] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[112,13] 从发行版 10 开始,'var' 是受限类型名称,不能用于类型声明,也不能用作数组的元素类型 |
|||
[WARNING] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[118,13] 从发行版 10 开始,'var' 是受限类型名称,不能用于类型声明,也不能用作数组的元素类型 |
|||
[WARNING] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[124,13] 从发行版 10 开始,'var' 是受限类型名称,不能用于类型声明,也不能用作数组的元素类型 |
|||
[INFO] 8 warnings |
|||
[INFO] ------------------------------------------------------------- |
|||
[INFO] ------------------------------------------------------------- |
|||
[ERROR] COMPILATION ERROR : |
|||
[INFO] ------------------------------------------------------------- |
|||
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[106,9] 找不到符号 符号: 类 var |
|||
  位置: 类 cli.CrawlerCLI |
|||
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[112,9] 找不到符号 符号: 类 var |
|||
  位置: 类 cli.CrawlerCLI |
|||
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[118,9] 找不到符号 符号: 类 var |
|||
  位置: 类 cli.CrawlerCLI |
|||
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[124,9] 找不到符号 符号: 类 var |
|||
  位置: 类 cli.CrawlerCLI |
|||
[INFO] 4 errors |
|||
[INFO] ------------------------------------------------------------- |
|||
[INFO] ------------------------------------------------------------------------ |
|||
[INFO] BUILD FAILURE |
|||
[INFO] ------------------------------------------------------------------------ |
|||
[INFO] Total time: 15.527 s |
|||
[INFO] Finished at: 2026-05-31T22:16:51+08:00 |
|||
[INFO] ------------------------------------------------------------------------ |
|||
[ERROR] Failed to execute goal org.apache.maven.plugins:maven-compiler-plugin:3.11.0:compile (default-compile) on project TestMaven: Compilation failure: Compilation failure: |
|||
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[106,9] 找不到符号 |
|||
[ERROR] 符号: 类 var |
|||
[ERROR] 位置: 类 cli.CrawlerCLI |
|||
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[112,9] 找不到符号 |
|||
[ERROR] 符号: 类 var |
|||
[ERROR] 位置: 类 cli.CrawlerCLI |
|||
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[118,9] 找不到符号 |
|||
[ERROR] 符号: 类 var |
|||
[ERROR] 位置: 类 cli.CrawlerCLI |
|||
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[124,9] 找不到符号 |
|||
[ERROR] 符号: 类 var |
|||
[ERROR] 位置: 类 cli.CrawlerCLI |
|||
[ERROR] -> [Help 1] |
|||
[ERROR] |
|||
[ERROR] To see the full stack trace of the errors, re-run Maven with the -e switch. |
|||
[ERROR] Re-run Maven using the -X switch to enable full debug logging. |
|||
[ERROR] |
|||
[ERROR] For more information about the errors and possible solutions, please read the following articles: |
|||
[ERROR] [Help 1] http://cwiki.apache.org/confluence/display/MAVEN/MojoFailureException |
|||
Binary file not shown.
@ -0,0 +1,22 @@ |
|||
# 爬虫配置文件 |
|||
|
|||
# 请求延迟(毫秒)- 避免请求过快被封 |
|||
delay.ms=1500 |
|||
|
|||
# 请求超时时间(毫秒) |
|||
timeout.ms=15000 |
|||
|
|||
# User-Agent |
|||
user.agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 |
|||
|
|||
# 数据库路径 |
|||
db.path=crawler.db |
|||
|
|||
# 输出目录 |
|||
output.dir=output |
|||
|
|||
# 是否启用数据库存储 |
|||
enable.database=true |
|||
|
|||
# 是否启用文件输出 |
|||
enable.file=true |
|||
@ -0,0 +1,162 @@ |
|||
[ |
|||
{ |
|||
"title": "A Light in the Attic", |
|||
"price": "£51.77", |
|||
"rating": "3", |
|||
"availability": "In stock", |
|||
"imageUrl": "http://books.toscrape.com/media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg", |
|||
"productUrl": "http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html" |
|||
}, |
|||
{ |
|||
"title": "Tipping the Velvet", |
|||
"price": "£53.74", |
|||
"rating": "1", |
|||
"availability": "In stock", |
|||
"imageUrl": "http://books.toscrape.com/media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", |
|||
"productUrl": "http://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html" |
|||
}, |
|||
{ |
|||
"title": "Soumission", |
|||
"price": "£50.10", |
|||
"rating": "1", |
|||
"availability": "In stock", |
|||
"imageUrl": "http://books.toscrape.com/media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", |
|||
"productUrl": "http://books.toscrape.com/catalogue/soumission_998/index.html" |
|||
}, |
|||
{ |
|||
"title": "Sharp Objects", |
|||
"price": "£47.82", |
|||
"rating": "4", |
|||
"availability": "In stock", |
|||
"imageUrl": "http://books.toscrape.com/media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", |
|||
"productUrl": "http://books.toscrape.com/catalogue/sharp-objects_997/index.html" |
|||
}, |
|||
{ |
|||
"title": "Sapiens: A Brief History of Humankind", |
|||
"price": "£54.23", |
|||
"rating": "5", |
|||
"availability": "In stock", |
|||
"imageUrl": "http://books.toscrape.com/media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", |
|||
"productUrl": "http://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996/index.html" |
|||
}, |
|||
{ |
|||
"title": "The Requiem Red", |
|||
"price": "£22.65", |
|||
"rating": "1", |
|||
"availability": "In stock", |
|||
"imageUrl": "http://books.toscrape.com/media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", |
|||
"productUrl": "http://books.toscrape.com/catalogue/the-requiem-red_995/index.html" |
|||
}, |
|||
{ |
|||
"title": "The Dirty Little Secrets of Getting Your Dream Job", |
|||
"price": "£33.34", |
|||
"rating": "4", |
|||
"availability": "In stock", |
|||
"imageUrl": "http://books.toscrape.com/media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", |
|||
"productUrl": "http://books.toscrape.com/catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994/index.html" |
|||
}, |
|||
{ |
|||
"title": "The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull", |
|||
"price": "£17.93", |
|||
"rating": "3", |
|||
"availability": "In stock", |
|||
"imageUrl": "http://books.toscrape.com/media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", |
|||
"productUrl": "http://books.toscrape.com/catalogue/the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993/index.html" |
|||
}, |
|||
{ |
|||
"title": "The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics", |
|||
"price": "£22.60", |
|||
"rating": "4", |
|||
"availability": "In stock", |
|||
"imageUrl": "http://books.toscrape.com/media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", |
|||
"productUrl": "http://books.toscrape.com/catalogue/the-boys-in-the-boat-nine-americans-and-their-epic-quest-for-gold-at-the-1936-berlin-olympics_992/index.html" |
|||
}, |
|||
{ |
|||
"title": "The Black Maria", |
|||
"price": "£52.15", |
|||
"rating": "1", |
|||
"availability": "In stock", |
|||
"imageUrl": "http://books.toscrape.com/media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", |
|||
"productUrl": "http://books.toscrape.com/catalogue/the-black-maria_991/index.html" |
|||
}, |
|||
{ |
|||
"title": "Starving Hearts (Triangular Trade Trilogy, #1)", |
|||
"price": "£13.99", |
|||
"rating": "2", |
|||
"availability": "In stock", |
|||
"imageUrl": "http://books.toscrape.com/media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", |
|||
"productUrl": "http://books.toscrape.com/catalogue/starving-hearts-triangular-trade-trilogy-1_990/index.html" |
|||
}, |
|||
{ |
|||
"title": "Shakespeare\u0027s Sonnets", |
|||
"price": "£20.66", |
|||
"rating": "4", |
|||
"availability": "In stock", |
|||
"imageUrl": "http://books.toscrape.com/media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", |
|||
"productUrl": "http://books.toscrape.com/catalogue/shakespeares-sonnets_989/index.html" |
|||
}, |
|||
{ |
|||
"title": "Set Me Free", |
|||
"price": "£17.46", |
|||
"rating": "5", |
|||
"availability": "In stock", |
|||
"imageUrl": "http://books.toscrape.com/media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", |
|||
"productUrl": "http://books.toscrape.com/catalogue/set-me-free_988/index.html" |
|||
}, |
|||
{ |
|||
"title": "Scott Pilgrim\u0027s Precious Little Life (Scott Pilgrim #1)", |
|||
"price": "£52.29", |
|||
"rating": "5", |
|||
"availability": "In stock", |
|||
"imageUrl": "http://books.toscrape.com/media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", |
|||
"productUrl": "http://books.toscrape.com/catalogue/scott-pilgrims-precious-little-life-scott-pilgrim-1_987/index.html" |
|||
}, |
|||
{ |
|||
"title": "Rip it Up and Start Again", |
|||
"price": "£35.02", |
|||
"rating": "5", |
|||
"availability": "In stock", |
|||
"imageUrl": "http://books.toscrape.com/media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", |
|||
"productUrl": "http://books.toscrape.com/catalogue/rip-it-up-and-start-again_986/index.html" |
|||
}, |
|||
{ |
|||
"title": "Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991", |
|||
"price": "£57.25", |
|||
"rating": "3", |
|||
"availability": "In stock", |
|||
"imageUrl": "http://books.toscrape.com/media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", |
|||
"productUrl": "http://books.toscrape.com/catalogue/our-band-could-be-your-life-scenes-from-the-american-indie-underground-1981-1991_985/index.html" |
|||
}, |
|||
{ |
|||
"title": "Olio", |
|||
"price": "£23.88", |
|||
"rating": "1", |
|||
"availability": "In stock", |
|||
"imageUrl": "http://books.toscrape.com/media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg", |
|||
"productUrl": "http://books.toscrape.com/catalogue/olio_984/index.html" |
|||
}, |
|||
{ |
|||
"title": "Mesaerion: The Best Science Fiction Stories 1800-1849", |
|||
"price": "£37.59", |
|||
"rating": "1", |
|||
"availability": "In stock", |
|||
"imageUrl": "http://books.toscrape.com/media/cache/09/a3/09a3aef48557576e1a85ba7efea8ecb7.jpg", |
|||
"productUrl": "http://books.toscrape.com/catalogue/mesaerion-the-best-science-fiction-stories-1800-1849_983/index.html" |
|||
}, |
|||
{ |
|||
"title": "Libertarianism for Beginners", |
|||
"price": "£51.33", |
|||
"rating": "2", |
|||
"availability": "In stock", |
|||
"imageUrl": "http://books.toscrape.com/media/cache/0b/bc/0bbcd0a6f4bcd81ccb1049a52736406e.jpg", |
|||
"productUrl": "http://books.toscrape.com/catalogue/libertarianism-for-beginners_982/index.html" |
|||
}, |
|||
{ |
|||
"title": "It\u0027s Only the Himalayas", |
|||
"price": "£45.17", |
|||
"rating": "2", |
|||
"availability": "In stock", |
|||
"imageUrl": "http://books.toscrape.com/media/cache/27/a5/27a53d0bb95bdd88288eaf66c9230d7e.jpg", |
|||
"productUrl": "http://books.toscrape.com/catalogue/its-only-the-himalayas_981/index.html" |
|||
} |
|||
] |
|||
@ -0,0 +1,82 @@ |
|||
[ |
|||
{ |
|||
"title": "安定此心:我当精神科医生的12000天", |
|||
"url": "https://book.douban.com/subject/37502923/" |
|||
}, |
|||
{ |
|||
"title": "挽救计划", |
|||
"url": "https://book.douban.com/subject/38210508/" |
|||
}, |
|||
{ |
|||
"title": "咸的玩笑", |
|||
"url": "https://book.douban.com/subject/37833272/" |
|||
}, |
|||
{ |
|||
"title": "真事隐:康熙废储与正史虚构", |
|||
"url": "https://book.douban.com/subject/37920184/" |
|||
}, |
|||
{ |
|||
"title": "大厂小民:我在互联网公司的1480天", |
|||
"url": "https://book.douban.com/subject/38208793/" |
|||
}, |
|||
{ |
|||
"title": "天色已晚", |
|||
"url": "https://book.douban.com/subject/37890167/" |
|||
}, |
|||
{ |
|||
"title": "她和她的决心", |
|||
"url": "https://book.douban.com/subject/38178826/" |
|||
}, |
|||
{ |
|||
"title": "凯罗斯", |
|||
"url": "https://book.douban.com/subject/37825000/" |
|||
}, |
|||
{ |
|||
"title": "幸福蒙太奇", |
|||
"url": "https://book.douban.com/subject/37841159/" |
|||
}, |
|||
{ |
|||
"title": "螃蟹的邀请", |
|||
"url": "https://book.douban.com/subject/37496217/" |
|||
}, |
|||
{ |
|||
"title": "抄写员巴托比", |
|||
"url": "https://book.douban.com/subject/38392174/" |
|||
}, |
|||
{ |
|||
"title": "我收养了一个朋友", |
|||
"url": "https://book.douban.com/subject/37938861/" |
|||
}, |
|||
{ |
|||
"title": "哈萨比斯:谷歌AI之脑", |
|||
"url": "https://book.douban.com/subject/38357884/" |
|||
}, |
|||
{ |
|||
"title": "像女孩那样丢球", |
|||
"url": "https://book.douban.com/subject/37126780/" |
|||
}, |
|||
{ |
|||
"title": "刚刚离开的世界", |
|||
"url": "https://book.douban.com/subject/37447242/" |
|||
}, |
|||
{ |
|||
"title": "故纸浮生.1-2", |
|||
"url": "https://book.douban.com/subject/37648813/" |
|||
}, |
|||
{ |
|||
"title": "在世与认知", |
|||
"url": "https://book.douban.com/subject/37112076/" |
|||
}, |
|||
{ |
|||
"title": "呼啸山庄", |
|||
"url": "https://book.douban.com/subject/30471282/" |
|||
}, |
|||
{ |
|||
"title": "我们如何理解这个世界:与齐格蒙特·鲍曼对谈", |
|||
"url": "https://book.douban.com/subject/37930972/" |
|||
}, |
|||
{ |
|||
"title": "刮风下雨", |
|||
"url": "https://book.douban.com/subject/38240709/" |
|||
} |
|||
] |
|||
@ -0,0 +1,62 @@ |
|||
[ |
|||
{ |
|||
"title": "爱情抓马", |
|||
"rating": "6.9", |
|||
"ratingCount": "(34363人评价)", |
|||
"url": "https://movie.douban.com/subject/36995126/" |
|||
}, |
|||
{ |
|||
"title": "世界的主人", |
|||
"rating": "9.1", |
|||
"ratingCount": "(116736人评价)", |
|||
"url": "https://movie.douban.com/subject/37116612/" |
|||
}, |
|||
{ |
|||
"title": "木乃伊", |
|||
"rating": "6.2", |
|||
"ratingCount": "(13705人评价)", |
|||
"url": "https://movie.douban.com/subject/36929221/" |
|||
}, |
|||
{ |
|||
"title": "蜂蜜的针", |
|||
"rating": "6.7", |
|||
"ratingCount": "(48214人评价)", |
|||
"url": "https://movie.douban.com/subject/26022233/" |
|||
}, |
|||
{ |
|||
"title": "杀的就是你", |
|||
"rating": "6.9", |
|||
"ratingCount": "(21794人评价)", |
|||
"url": "https://movie.douban.com/subject/36926954/" |
|||
}, |
|||
{ |
|||
"title": "惩罚者:最后一击", |
|||
"rating": "6.8", |
|||
"ratingCount": "(5478人评价)", |
|||
"url": "https://movie.douban.com/subject/37259325/" |
|||
}, |
|||
{ |
|||
"title": "蒙特利尔,我的美人", |
|||
"rating": "7.6", |
|||
"ratingCount": "(14162人评价)", |
|||
"url": "https://movie.douban.com/subject/37019075/" |
|||
}, |
|||
{ |
|||
"title": "与王生活的男人", |
|||
"rating": "7.4", |
|||
"ratingCount": "(10007人评价)", |
|||
"url": "https://movie.douban.com/subject/36978169/" |
|||
}, |
|||
{ |
|||
"title": "挽救计划", |
|||
"rating": "8.6", |
|||
"ratingCount": "(463129人评价)", |
|||
"url": "https://movie.douban.com/subject/35010610/" |
|||
}, |
|||
{ |
|||
"title": "长夜将尽", |
|||
"rating": "6.5", |
|||
"ratingCount": "(10878人评价)", |
|||
"url": "https://movie.douban.com/subject/35590993/" |
|||
} |
|||
] |
|||
@ -0,0 +1,250 @@ |
|||
排名:1 电影:肖申克的救赎 评分:9.7 |
|||
排名:2 电影:霸王别姬 评分:9.6 |
|||
排名:3 电影:泰坦尼克号 评分:9.5 |
|||
排名:4 电影:阿甘正传 评分:9.5 |
|||
排名:5 电影:千与千寻 评分:9.4 |
|||
排名:6 电影:美丽人生 评分:9.5 |
|||
排名:7 电影:星际穿越 评分:9.4 |
|||
排名:8 电影:这个杀手不太冷 评分:9.4 |
|||
排名:9 电影:盗梦空间 评分:9.4 |
|||
排名:10 电影:楚门的世界 评分:9.4 |
|||
排名:11 电影:辛德勒的名单 评分:9.5 |
|||
排名:12 电影:忠犬八公的故事 评分:9.4 |
|||
排名:13 电影:海上钢琴师 评分:9.3 |
|||
排名:14 电影:疯狂动物城 评分:9.3 |
|||
排名:15 电影:三傻大闹宝莱坞 评分:9.2 |
|||
排名:16 电影:机器人总动员 评分:9.3 |
|||
排名:17 电影:放牛班的春天 评分:9.3 |
|||
排名:18 电影:无间道 评分:9.3 |
|||
排名:19 电影:控方证人 评分:9.6 |
|||
排名:20 电影:寻梦环游记 评分:9.1 |
|||
排名:21 电影:大话西游之大圣娶亲 评分:9.2 |
|||
排名:22 电影:熔炉 评分:9.3 |
|||
排名:23 电影:触不可及 评分:9.3 |
|||
排名:24 电影:教父 评分:9.3 |
|||
排名:25 电影:末代皇帝 评分:9.3 |
|||
排名:26 电影:哈利·波特与魔法石 评分:9.2 |
|||
排名:27 电影:当幸福来敲门 评分:9.1 |
|||
排名:28 电影:龙猫 评分:9.2 |
|||
排名:29 电影:活着 评分:9.3 |
|||
排名:30 电影:怦然心动 评分:9.1 |
|||
排名:31 电影:蝙蝠侠:黑暗骑士 评分:9.2 |
|||
排名:32 电影:指环王3:王者无敌 评分:9.3 |
|||
排名:33 电影:我不是药神 评分:9.0 |
|||
排名:34 电影:乱世佳人 评分:9.3 |
|||
排名:35 电影:飞屋环游记 评分:9.1 |
|||
排名:36 电影:让子弹飞 评分:9.0 |
|||
排名:37 电影:哈尔的移动城堡 评分:9.1 |
|||
排名:38 电影:十二怒汉 评分:9.4 |
|||
排名:39 电影:海蒂和爷爷 评分:9.3 |
|||
排名:40 电影:素媛 评分:9.3 |
|||
排名:41 电影:猫鼠游戏 评分:9.1 |
|||
排名:42 电影:天空之城 评分:9.2 |
|||
排名:43 电影:鬼子来了 评分:9.3 |
|||
排名:44 电影:摔跤吧!爸爸 评分:9.0 |
|||
排名:45 电影:少年派的奇幻漂流 评分:9.1 |
|||
排名:46 电影:钢琴家 评分:9.3 |
|||
排名:47 电影:指环王2:双塔奇兵 评分:9.2 |
|||
排名:48 电影:死亡诗社 评分:9.2 |
|||
排名:49 电影:大话西游之月光宝盒 评分:9.0 |
|||
排名:50 电影:绿皮书 评分:8.9 |
|||
排名:51 电影:何以为家 评分:9.1 |
|||
排名:52 电影:闻香识女人 评分:9.1 |
|||
排名:53 电影:大闹天宫 评分:9.4 |
|||
排名:54 电影:黑客帝国 评分:9.1 |
|||
排名:55 电影:指环王1:护戒使者 评分:9.1 |
|||
排名:56 电影:罗马假日 评分:9.1 |
|||
排名:57 电影:教父2 评分:9.3 |
|||
排名:58 电影:狮子王 评分:9.1 |
|||
排名:59 电影:天堂电影院 评分:9.2 |
|||
排名:60 电影:饮食男女 评分:9.2 |
|||
排名:61 电影:辩护人 评分:9.2 |
|||
排名:62 电影:本杰明·巴顿奇事 评分:9.0 |
|||
排名:63 电影:搏击俱乐部 评分:9.0 |
|||
排名:64 电影:美丽心灵 评分:9.1 |
|||
排名:65 电影:穿条纹睡衣的男孩 评分:9.2 |
|||
排名:66 电影:哈利·波特与死亡圣器(下) 评分:9.0 |
|||
排名:67 电影:情书 评分:8.9 |
|||
排名:68 电影:两杆大烟枪 评分:9.1 |
|||
排名:69 电影:窃听风暴 评分:9.2 |
|||
排名:70 电影:音乐之声 评分:9.1 |
|||
排名:71 电影:功夫 评分:8.9 |
|||
排名:72 电影:哈利·波特与阿兹卡班的囚徒 评分:9.0 |
|||
排名:73 电影:阿凡达 评分:8.8 |
|||
排名:74 电影:西西里的美丽传说 评分:8.9 |
|||
排名:75 电影:看不见的客人 评分:8.8 |
|||
排名:76 电影:拯救大兵瑞恩 评分:9.1 |
|||
排名:77 电影:沉默的羔羊 评分:8.9 |
|||
排名:78 电影:小鞋子 评分:9.2 |
|||
排名:79 电影:布达佩斯大饭店 评分:8.9 |
|||
排名:80 电影:蝴蝶效应 评分:8.9 |
|||
排名:81 电影:飞越疯人院 评分:9.1 |
|||
排名:82 电影:还有明天 评分:9.3 |
|||
排名:83 电影:禁闭岛 评分:8.9 |
|||
排名:84 电影:心灵捕手 评分:9.0 |
|||
排名:85 电影:致命魔术 评分:8.9 |
|||
排名:86 电影:低俗小说 评分:8.9 |
|||
排名:87 电影:哈利·波特与密室 评分:8.9 |
|||
排名:88 电影:超脱 评分:9.0 |
|||
排名:89 电影:一一 评分:9.1 |
|||
排名:90 电影:喜剧之王 评分:8.8 |
|||
排名:91 电影:杀人回忆 评分:8.9 |
|||
排名:92 电影:致命ID 评分:8.9 |
|||
排名:93 电影:摩登时代 评分:9.3 |
|||
排名:94 电影:春光乍泄 评分:9.0 |
|||
排名:95 电影:加勒比海盗 评分:8.8 |
|||
排名:96 电影:海豚湾 评分:9.3 |
|||
排名:97 电影:美国往事 评分:9.1 |
|||
排名:98 电影:红辣椒 评分:9.0 |
|||
排名:99 电影:七宗罪 评分:8.8 |
|||
排名:100 电影:唐伯虎点秋香 评分:8.8 |
|||
排名:101 电影:狩猎 评分:9.1 |
|||
排名:102 电影:幽灵公主 评分:8.9 |
|||
排名:103 电影:甜蜜蜜 评分:8.9 |
|||
排名:104 电影:寄生虫 评分:8.8 |
|||
排名:105 电影:天书奇谭 评分:9.2 |
|||
排名:106 电影:蝙蝠侠:黑暗骑士崛起 评分:8.9 |
|||
排名:107 电影:超能陆战队 评分:8.8 |
|||
排名:108 电影:7号房的礼物 评分:8.9 |
|||
排名:109 电影:茶馆 评分:9.5 |
|||
排名:110 电影:第六感 评分:8.9 |
|||
排名:111 电影:爱在黎明破晓前 评分:8.8 |
|||
排名:112 电影:爱在日落黄昏时 评分:8.9 |
|||
排名:113 电影:被嫌弃的松子的一生 评分:8.8 |
|||
排名:114 电影:头脑特工队 评分:8.8 |
|||
排名:115 电影:哈利·波特与火焰杯 评分:8.8 |
|||
排名:116 电影:未麻的部屋 评分:9.1 |
|||
排名:117 电影:重庆森林 评分:8.8 |
|||
排名:118 电影:借东西的小人阿莉埃蒂 评分:8.9 |
|||
排名:119 电影:菊次郎的夏天 评分:8.9 |
|||
排名:120 电影:入殓师 评分:8.9 |
|||
排名:121 电影:断背山 评分:8.8 |
|||
排名:122 电影:剪刀手爱德华 评分:8.7 |
|||
排名:123 电影:勇敢的心 评分:8.9 |
|||
排名:124 电影:时空恋旅人 评分:8.8 |
|||
排名:125 电影:驯龙高手 评分:8.8 |
|||
排名:126 电影:消失的爱人 评分:8.7 |
|||
排名:127 电影:无人知晓 评分:9.1 |
|||
排名:128 电影:傲慢与偏见 评分:8.7 |
|||
排名:129 电影:倩女幽魂 评分:8.8 |
|||
排名:130 电影:新世界 评分:8.9 |
|||
排名:131 电影:花样年华 评分:8.8 |
|||
排名:132 电影:玩具总动员3 评分:8.9 |
|||
排名:133 电影:一个叫欧维的男人决定去死 评分:8.9 |
|||
排名:134 电影:色,戒 评分:8.7 |
|||
排名:135 电影:完美的世界 评分:9.1 |
|||
排名:136 电影:阳光灿烂的日子 评分:8.8 |
|||
排名:137 电影:怪兽电力公司 评分:8.8 |
|||
排名:138 电影:教父3 评分:9.0 |
|||
排名:139 电影:小森林 夏秋篇 评分:9.0 |
|||
排名:140 电影:天使爱美丽 评分:8.7 |
|||
排名:141 电影:侧耳倾听 评分:8.9 |
|||
排名:142 电影:哪吒闹海 评分:9.2 |
|||
排名:143 电影:九品芝麻官 评分:8.8 |
|||
排名:144 电影:被解救的姜戈 评分:8.8 |
|||
排名:145 电影:请以你的名字呼唤我 评分:8.8 |
|||
排名:146 电影:幸福终点站 评分:8.8 |
|||
排名:147 电影:釜山行 评分:8.6 |
|||
排名:148 电影:神偷奶爸 评分:8.7 |
|||
排名:149 电影:小森林 冬春篇 评分:9.0 |
|||
排名:150 电影:喜宴 评分:9.0 |
|||
排名:151 电影:萤火之森 评分:8.8 |
|||
排名:152 电影:告白 评分:8.8 |
|||
排名:153 电影:玛丽和麦克斯 评分:9.0 |
|||
排名:154 电影:七武士 评分:9.3 |
|||
排名:155 电影:头号玩家 评分:8.6 |
|||
排名:156 电影:模仿游戏 评分:8.8 |
|||
排名:157 电影:惊魂记 评分:9.0 |
|||
排名:158 电影:大鱼 评分:8.8 |
|||
排名:159 电影:机器人之梦 评分:9.1 |
|||
排名:160 电影:心灵奇旅 评分:8.7 |
|||
排名:161 电影:背靠背,脸对脸 评分:9.5 |
|||
排名:162 电影:射雕英雄传之东成西就 评分:8.7 |
|||
排名:163 电影:血战钢锯岭 评分:8.7 |
|||
排名:164 电影:你的名字。 评分:8.5 |
|||
排名:165 电影:我是山姆 评分:9.0 |
|||
排名:166 电影:阳光姐妹淘 评分:8.8 |
|||
排名:167 电影:恐怖直播 评分:8.7 |
|||
排名:168 电影:黑客帝国3:矩阵革命 评分:8.8 |
|||
排名:169 电影:末路狂花 评分:9.0 |
|||
排名:170 电影:高山下的花环 评分:9.5 |
|||
排名:171 电影:小丑 评分:8.7 |
|||
排名:172 电影:谍影重重3 评分:8.9 |
|||
排名:173 电影:三块广告牌 评分:8.7 |
|||
排名:174 电影:电锯惊魂 评分:8.7 |
|||
排名:175 电影:无间道2 评分:8.8 |
|||
排名:176 电影:达拉斯买家俱乐部 评分:8.8 |
|||
排名:177 电影:疯狂原始人 评分:8.7 |
|||
排名:178 电影:绿里奇迹 评分:8.9 |
|||
排名:179 电影:爱在午夜降临前 评分:8.9 |
|||
排名:180 电影:疯狂的石头 评分:8.6 |
|||
排名:181 电影:雨中曲 评分:9.1 |
|||
排名:182 电影:2001太空漫游 评分:8.9 |
|||
排名:183 电影:海街日记 评分:8.8 |
|||
排名:184 电影:风之谷 评分:8.9 |
|||
排名:185 电影:上帝之城 评分:9.0 |
|||
排名:186 电影:心迷宫 评分:8.7 |
|||
排名:187 电影:英雄本色 评分:8.6 |
|||
排名:188 电影:记忆碎片 评分:8.7 |
|||
排名:189 电影:纵横四海 评分:8.8 |
|||
排名:190 电影:无敌破坏王 评分:8.7 |
|||
排名:191 电影:卢旺达饭店 评分:8.9 |
|||
排名:192 电影:牯岭街少年杀人事件 评分:8.9 |
|||
排名:193 电影:恐怖游轮 评分:8.5 |
|||
排名:194 电影:东京教父 评分:9.0 |
|||
排名:195 电影:小偷家族 评分:8.7 |
|||
排名:196 电影:魔女宅急便 评分:8.7 |
|||
排名:197 电影:冰川时代 评分:8.7 |
|||
排名:198 电影:芙蓉镇 评分:9.3 |
|||
排名:199 电影:忠犬八公物语 评分:9.2 |
|||
排名:200 电影:岁月神偷 评分:8.7 |
|||
排名:201 电影:遗愿清单 评分:8.7 |
|||
排名:202 电影:荒蛮故事 评分:8.7 |
|||
排名:203 电影:大佛普拉斯 评分:8.7 |
|||
排名:204 电影:源代码 评分:8.6 |
|||
排名:205 电影:花束般的恋爱 评分:8.6 |
|||
排名:206 电影:白日梦想家 评分:8.6 |
|||
排名:207 电影:爱乐之城 评分:8.4 |
|||
排名:208 电影:疯狂的麦克斯4:狂暴之路 评分:8.7 |
|||
排名:209 电影:可可西里 评分:8.9 |
|||
排名:210 电影:你看起来好像很好吃 评分:8.9 |
|||
排名:211 电影:贫民窟的百万富翁 评分:8.6 |
|||
排名:212 电影:波西米亚狂想曲 评分:8.6 |
|||
排名:213 电影:城市之光 评分:9.3 |
|||
排名:214 电影:爆裂鼓手 评分:8.6 |
|||
排名:215 电影:青蛇 评分:8.6 |
|||
排名:216 电影:哈利·波特与死亡圣器(上) 评分:8.6 |
|||
排名:217 电影:无耻混蛋 评分:8.7 |
|||
排名:218 电影:东邪西毒 评分:8.6 |
|||
排名:219 电影:终结者2:审判日 评分:8.8 |
|||
排名:220 电影:大红灯笼高高挂 评分:8.8 |
|||
排名:221 电影:黑天鹅 评分:8.6 |
|||
排名:222 电影:新龙门客栈 评分:8.7 |
|||
排名:223 电影:初恋这件小事 评分:8.5 |
|||
排名:224 电影:千钧一发 评分:8.8 |
|||
排名:225 电影:人工智能 评分:8.7 |
|||
排名:226 电影:崖上的波妞 评分:8.6 |
|||
排名:227 电影:雨人 评分:8.7 |
|||
排名:228 电影:虎口脱险 评分:8.9 |
|||
排名:229 电影:哈利·波特与凤凰社 评分:8.6 |
|||
排名:230 电影:彗星来的那一夜 评分:8.6 |
|||
排名:231 电影:罗生门 评分:8.8 |
|||
排名:232 电影:海边的曼彻斯特 评分:8.6 |
|||
排名:233 电影:恋恋笔记本 评分:8.5 |
|||
排名:234 电影:火星救援 评分:8.5 |
|||
排名:235 电影:真爱至上 评分:8.5 |
|||
排名:236 电影:黑客帝国2:重装上阵 评分:8.7 |
|||
排名:237 电影:冰雪奇缘 评分:8.5 |
|||
排名:238 电影:步履不停 评分:8.8 |
|||
排名:239 电影:奇迹男孩 评分:8.6 |
|||
排名:240 电影:千年女优 评分:8.8 |
|||
排名:241 电影:谍影重重2 评分:8.7 |
|||
排名:242 电影:战争之王 评分:8.7 |
|||
排名:243 电影:蜘蛛侠:平行宇宙 评分:8.6 |
|||
排名:244 电影:攻壳机动队 评分:9.0 |
|||
排名:245 电影:血钻 评分:8.7 |
|||
排名:246 电影:小姐 评分:8.5 |
|||
排名:247 电影:隐藏人物 评分:8.9 |
|||
排名:248 电影:血观音 评分:8.6 |
|||
排名:249 电影:魂断蓝桥 评分:8.8 |
|||
排名:250 电影:房间 评分:8.7 |
|||
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -0,0 +1,74 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the web crawler: compiles for Java 11 and produces a runnable jar. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.crawler</groupId>
    <artifactId>web-crawler</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>Web Crawler</name>
    <description>Multi-site web crawler with CLI, MVC, Command pattern and Strategy pattern</description>

    <properties>
        <maven.compiler.source>11</maven.compiler.source>
        <maven.compiler.target>11</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.16.1</version>
        </dependency>
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.10.1</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>2.0.9</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-simple</artifactId>
            <version>2.0.9</version>
        </dependency>
        <dependency>
            <groupId>info.picocli</groupId>
            <artifactId>picocli</artifactId>
            <version>4.7.5</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
                <version>3.3.0</version>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.crawler.Main</mainClass>
                        </manifest>
                    </archive>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.11.0</version>
                <configuration>
                    <source>11</source>
                    <target>11</target>
                </configuration>
            </plugin>
            <!--
                Bundle the runtime dependencies (jsoup, gson, slf4j, picocli) into the packaged jar.
                The manifest only names the main class, so without this step running the jar
                directly cannot find the third-party classes.
            -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.5.1</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <createDependencyReducedPom>false</createDependencyReducedPom>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>com.crawler.Main</mainClass>
                                </transformer>
                                <!-- Keep META-INF/services entries intact (SLF4J 2.x locates its provider through them). -->
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
|||
@ -0,0 +1,24 @@ |
|||
@echo off
rem Launcher for the Web Crawler: builds the jar with Maven when it is missing,
rem then runs it and forwards all command-line arguments to the application.
echo ========================================
echo Web Crawler Application
echo ========================================
echo.

if not exist "target\web-crawler-1.0-SNAPSHOT.jar" (
    echo Compiling project...
    rem mvn is itself a batch script; it must be started with CALL, otherwise
    rem control does not come back to this script once the build has finished.
    call mvn clean package
    if errorlevel 1 (
        echo Compilation failed!
        pause
        exit /b 1
    )
    echo.
)

echo Running crawler...
echo.

java -jar target\web-crawler-1.0-SNAPSHOT.jar %*

echo.
pause
|||
@ -0,0 +1 @@ |
|||
// 此文件已废弃,请使用 main.CrawlerManager
|
|||
@ -0,0 +1,133 @@ |
|||
package cli; |
|||
|
|||
import command.*; |
|||
import controller.CrawlerController; |
|||
import exception.CrawlerException; |
|||
import exception.CrawlerResult; |
|||
import exception.ValidationException; |
|||
import view.CrawlerView; |
|||
|
|||
import java.util.Arrays; |
|||
|
|||
public class CrawlerCLI { |
|||
private final CrawlerController controller; |
|||
private final CrawlerView view; |
|||
private final CommandRegistry commandRegistry; |
|||
|
|||
public CrawlerCLI() { |
|||
this.controller = new CrawlerController(); |
|||
this.view = new CrawlerView(); |
|||
this.commandRegistry = new CommandRegistry(); |
|||
initCommands(); |
|||
} |
|||
|
|||
public CrawlerCLI(String outputDir) { |
|||
this.controller = new CrawlerController(outputDir); |
|||
this.view = new CrawlerView(); |
|||
this.commandRegistry = new CommandRegistry(); |
|||
initCommands(); |
|||
} |
|||
|
|||
private void initCommands() { |
|||
commandRegistry.register(new RunAllCommand(controller)); |
|||
commandRegistry.register(new ListCrawlersCommand(controller)); |
|||
commandRegistry.register(new StatsCommand(controller)); |
|||
commandRegistry.register(new ClearCommand(controller)); |
|||
} |
|||
|
|||
public void run(String[] args) { |
|||
view.showWelcome(); |
|||
|
|||
if (args == null || args.length == 0) { |
|||
view.showHelp(); |
|||
return; |
|||
} |
|||
|
|||
String commandName = args[0].toLowerCase().trim(); |
|||
|
|||
try { |
|||
switch (commandName) { |
|||
case "help": |
|||
case "-h": |
|||
case "--help": |
|||
view.showHelp(); |
|||
break; |
|||
|
|||
case "list": |
|||
case "ls": |
|||
handleList(); |
|||
break; |
|||
|
|||
case "run": |
|||
handleRun(args); |
|||
break; |
|||
|
|||
case "run-all": |
|||
case "all": |
|||
handleRunAll(); |
|||
break; |
|||
|
|||
case "stats": |
|||
handleStats(); |
|||
break; |
|||
|
|||
case "clear": |
|||
handleClear(); |
|||
break; |
|||
|
|||
default: |
|||
view.showError("未知命令: " + commandName + "\n使用 'help' 查看可用命令"); |
|||
} |
|||
} catch (ValidationException e) { |
|||
view.showError(e.getMessage()); |
|||
view.showHelp(); |
|||
} catch (CrawlerException e) { |
|||
view.showError("爬虫错误 [" + e.getErrorCode() + "]: " + e.getMessage()); |
|||
} catch (Exception e) { |
|||
view.showError("系统错误: " + e.getMessage()); |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
|
|||
private void handleList() { |
|||
Command cmd = commandRegistry.getCommand("list"); |
|||
cmd.execute(); |
|||
view.showCrawlerList(controller.getAllCrawlerNames()); |
|||
} |
|||
|
|||
private void handleRun(String[] args) { |
|||
if (args.length < 2) { |
|||
view.showError("请指定爬虫名称\n示例: run 豆瓣电影Top250"); |
|||
view.showCrawlerList(controller.getAllCrawlerNames()); |
|||
return; |
|||
} |
|||
|
|||
String crawlerName = args[1]; |
|||
Command cmd = new RunSingleCommand(controller, crawlerName); |
|||
CrawlerResult result = cmd.execute(); |
|||
view.showResult(result); |
|||
} |
|||
|
|||
private void handleRunAll() { |
|||
Command cmd = commandRegistry.getCommand("run-all"); |
|||
CrawlerResult result = cmd.execute(); |
|||
view.showResult(result); |
|||
} |
|||
|
|||
private void handleStats() { |
|||
Command cmd = commandRegistry.getCommand("stats"); |
|||
CrawlerResult result = cmd.execute(); |
|||
view.showMessage(result.getMessage()); |
|||
} |
|||
|
|||
private void handleClear() { |
|||
Command cmd = commandRegistry.getCommand("clear"); |
|||
CrawlerResult result = cmd.execute(); |
|||
view.showResult(result); |
|||
} |
|||
|
|||
public static void main(String[] args) { |
|||
CrawlerCLI cli = new CrawlerCLI(); |
|||
cli.run(args); |
|||
} |
|||
} |
|||
@ -0,0 +1,34 @@ |
|||
package com.crawler; |
|||
|
|||
import com.crawler.controller.CrawlerController; |
|||
import picocli.CommandLine; |
|||
import picocli.CommandLine.Command; |
|||
import picocli.CommandLine.Option; |
|||
|
|||
@Command(name = "crawler", mixinStandardHelpOptions = true, version = "1.0", |
|||
description = "Web Crawler - Crawl Douban Movies, Douban Books, and Books to Scrape") |
|||
public class Main implements Runnable { |
|||
|
|||
@Option(names = {"-s", "--site"}, description = "Site to crawl: douban-movie, douban-book, books-to-scrape, all", |
|||
defaultValue = "all") |
|||
private String site; |
|||
|
|||
@Option(names = {"-i", "--interactive"}, description = "Run in interactive mode") |
|||
private boolean interactive; |
|||
|
|||
public static void main(String[] args) { |
|||
int exitCode = new CommandLine(new Main()).execute(args); |
|||
System.exit(exitCode); |
|||
} |
|||
|
|||
@Override |
|||
public void run() { |
|||
CrawlerController controller = new CrawlerController(); |
|||
|
|||
if (interactive) { |
|||
controller.runInteractive(); |
|||
} else { |
|||
controller.crawlBySite(site); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,8 @@ |
|||
package com.crawler.command; |
|||
|
|||
import com.crawler.exception.CrawlerException; |
|||
|
|||
public interface Command { |
|||
void execute() throws CrawlerException; |
|||
String getDescription(); |
|||
} |
|||
@ -0,0 +1,37 @@ |
|||
package com.crawler.command; |
|||
|
|||
import com.crawler.exception.CrawlerException; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class CrawlAllCommand implements Command { |
|||
private final List<Command> commands; |
|||
private final String description; |
|||
|
|||
public CrawlAllCommand(String description) { |
|||
this.commands = new ArrayList<>(); |
|||
this.description = description; |
|||
} |
|||
|
|||
public void addCommand(Command command) { |
|||
commands.add(command); |
|||
} |
|||
|
|||
@Override |
|||
public void execute() throws CrawlerException { |
|||
for (Command command : commands) { |
|||
try { |
|||
command.execute(); |
|||
} catch (CrawlerException e) { |
|||
System.err.println("Error executing command: " + command.getDescription()); |
|||
System.err.println("Error: " + e.getMessage()); |
|||
} |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public String getDescription() { |
|||
return description; |
|||
} |
|||
} |
|||
@ -0,0 +1,28 @@ |
|||
package com.crawler.command; |
|||
|
|||
import com.crawler.exception.CrawlerException; |
|||
import com.crawler.strategy.CrawlerStrategy; |
|||
import com.crawler.util.FileUtil; |
|||
|
|||
import java.util.List; |
|||
|
|||
public class CrawlCommand<T> implements Command { |
|||
private final CrawlerStrategy<T> strategy; |
|||
private final String description; |
|||
|
|||
public CrawlCommand(CrawlerStrategy<T> strategy, String description) { |
|||
this.strategy = strategy; |
|||
this.description = description; |
|||
} |
|||
|
|||
@Override |
|||
public void execute() throws CrawlerException { |
|||
List<T> data = strategy.crawl(); |
|||
FileUtil.saveToJsonFile(data, strategy.getOutputFileName()); |
|||
} |
|||
|
|||
@Override |
|||
public String getDescription() { |
|||
return description; |
|||
} |
|||
} |
|||
@ -0,0 +1,119 @@ |
|||
package com.crawler.controller; |
|||
|
|||
import com.crawler.command.Command; |
|||
import com.crawler.command.CrawlAllCommand; |
|||
import com.crawler.command.CrawlCommand; |
|||
import com.crawler.exception.CrawlerException; |
|||
import com.crawler.strategy.BooksToScrapeStrategy; |
|||
import com.crawler.strategy.CrawlerStrategy; |
|||
import com.crawler.strategy.DoubanBookStrategy; |
|||
import com.crawler.strategy.DoubanMovieStrategy; |
|||
import com.crawler.view.ConsoleView; |
|||
|
|||
import java.util.Scanner; |
|||
|
|||
public class CrawlerController { |
|||
private final ConsoleView view; |
|||
private final Scanner scanner; |
|||
|
|||
public CrawlerController() { |
|||
this.view = new ConsoleView(); |
|||
this.scanner = new Scanner(System.in); |
|||
} |
|||
|
|||
public void runInteractive() { |
|||
view.displayWelcome(); |
|||
|
|||
while (true) { |
|||
view.displayMenu(); |
|||
String input = scanner.nextLine().trim(); |
|||
|
|||
try { |
|||
int choice = Integer.parseInt(input); |
|||
switch (choice) { |
|||
case 1: |
|||
crawlDoubanMovies(); |
|||
break; |
|||
case 2: |
|||
crawlDoubanBooks(); |
|||
break; |
|||
case 3: |
|||
crawlBooksToScrape(); |
|||
break; |
|||
case 4: |
|||
crawlAll(); |
|||
break; |
|||
case 0: |
|||
view.displayGoodbye(); |
|||
return; |
|||
default: |
|||
view.displayInvalidChoice(); |
|||
} |
|||
} catch (NumberFormatException e) { |
|||
view.displayInvalidChoice(); |
|||
} |
|||
} |
|||
} |
|||
|
|||
public void crawlDoubanMovies() { |
|||
CrawlerStrategy<?> strategy = new DoubanMovieStrategy(); |
|||
Command command = new CrawlCommand<>(strategy, "Douban Movies"); |
|||
executeCommand(command, strategy.getOutputFileName()); |
|||
} |
|||
|
|||
public void crawlDoubanBooks() { |
|||
CrawlerStrategy<?> strategy = new DoubanBookStrategy(); |
|||
Command command = new CrawlCommand<>(strategy, "Douban Books"); |
|||
executeCommand(command, strategy.getOutputFileName()); |
|||
} |
|||
|
|||
public void crawlBooksToScrape() { |
|||
CrawlerStrategy<?> strategy = new BooksToScrapeStrategy(); |
|||
Command command = new CrawlCommand<>(strategy, "Books to Scrape"); |
|||
executeCommand(command, strategy.getOutputFileName()); |
|||
} |
|||
|
|||
public void crawlAll() { |
|||
CrawlAllCommand allCommand = new CrawlAllCommand("Crawl All"); |
|||
allCommand.addCommand(new CrawlCommand<>(new DoubanMovieStrategy(), "Douban Movies")); |
|||
allCommand.addCommand(new CrawlCommand<>(new DoubanBookStrategy(), "Douban Books")); |
|||
allCommand.addCommand(new CrawlCommand<>(new BooksToScrapeStrategy(), "Books to Scrape")); |
|||
|
|||
try { |
|||
view.displayCrawling("All Websites"); |
|||
allCommand.execute(); |
|||
view.displaySuccess("data/ (all files)"); |
|||
} catch (CrawlerException e) { |
|||
view.displayError(e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
public void crawlBySite(String site) { |
|||
switch (site.toLowerCase()) { |
|||
case "douban-movie": |
|||
crawlDoubanMovies(); |
|||
break; |
|||
case "douban-book": |
|||
crawlDoubanBooks(); |
|||
break; |
|||
case "books-to-scrape": |
|||
crawlBooksToScrape(); |
|||
break; |
|||
case "all": |
|||
crawlAll(); |
|||
break; |
|||
default: |
|||
view.displayError("Unknown site: " + site); |
|||
} |
|||
} |
|||
|
|||
private void executeCommand(Command command, String fileName) { |
|||
try { |
|||
view.displayCrawling(command.getDescription()); |
|||
command.execute(); |
|||
view.displaySuccess(fileName); |
|||
} catch (CrawlerException e) { |
|||
view.displayError(e.getMessage()); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,11 @@ |
|||
package com.crawler.exception; |
|||
|
|||
/**
 * Base checked exception for failures raised by the crawler.
 */
public class CrawlerException extends Exception {

    /**
     * Creates an exception that wraps an underlying failure.
     *
     * @param message description of what went wrong
     * @param cause   the underlying failure
     */
    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * Creates an exception without an underlying cause.
     *
     * @param message description of what went wrong
     */
    public CrawlerException(String message) {
        super(message);
    }
}
|||
@ -0,0 +1,11 @@ |
|||
package com.crawler.exception; |
|||
|
|||
public class FileException extends CrawlerException { |
|||
public FileException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public FileException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,11 @@ |
|||
package com.crawler.exception; |
|||
|
|||
public class NetworkException extends CrawlerException { |
|||
public NetworkException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public NetworkException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,11 @@ |
|||
package com.crawler.exception; |
|||
|
|||
public class ParseException extends CrawlerException { |
|||
public ParseException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public ParseException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,105 @@ |
|||
package com.crawler.model; |
|||
|
|||
/**
 * Mutable data holder for one book record. All properties are plain strings and default to
 * {@code null} until set.
 */
public class Book {
    private String title;
    private String author;
    private String rating;
    private String ratingCount;
    private String publisher;
    private String publishDate;
    private String price;
    private String isbn;
    private String summary;
    private String url;

    /** Creates a book with every property unset. */
    public Book() {
    }

    public String getTitle() { return this.title; }

    public void setTitle(String title) { this.title = title; }

    public String getAuthor() { return this.author; }

    public void setAuthor(String author) { this.author = author; }

    public String getRating() { return this.rating; }

    public void setRating(String rating) { this.rating = rating; }

    public String getRatingCount() { return this.ratingCount; }

    public void setRatingCount(String ratingCount) { this.ratingCount = ratingCount; }

    public String getPublisher() { return this.publisher; }

    public void setPublisher(String publisher) { this.publisher = publisher; }

    public String getPublishDate() { return this.publishDate; }

    public void setPublishDate(String publishDate) { this.publishDate = publishDate; }

    public String getPrice() { return this.price; }

    public void setPrice(String price) { this.price = price; }

    public String getIsbn() { return this.isbn; }

    public void setIsbn(String isbn) { this.isbn = isbn; }

    public String getSummary() { return this.summary; }

    public void setSummary(String summary) { this.summary = summary; }

    public String getUrl() { return this.url; }

    public void setUrl(String url) { this.url = url; }

    /** Returns a short summary containing only title, author and rating. */
    @Override
    public String toString() {
        return String.format("Book{title='%s', author='%s', rating='%s'}", title, author, rating);
    }
}
|||
@ -0,0 +1,96 @@ |
|||
package com.crawler.model; |
|||
|
|||
/**
 * Mutable data holder for one movie record. All properties are plain strings and default to
 * {@code null} until set.
 */
public class Movie {
    private String title;
    private String rating;
    private String ratingCount;
    private String year;
    private String director;
    private String actors;
    private String genre;
    private String summary;
    private String url;

    /** Creates a movie with every property unset. */
    public Movie() {
    }

    public String getTitle() { return this.title; }

    public void setTitle(String title) { this.title = title; }

    public String getRating() { return this.rating; }

    public void setRating(String rating) { this.rating = rating; }

    public String getRatingCount() { return this.ratingCount; }

    public void setRatingCount(String ratingCount) { this.ratingCount = ratingCount; }

    public String getYear() { return this.year; }

    public void setYear(String year) { this.year = year; }

    public String getDirector() { return this.director; }

    public void setDirector(String director) { this.director = director; }

    public String getActors() { return this.actors; }

    public void setActors(String actors) { this.actors = actors; }

    public String getGenre() { return this.genre; }

    public void setGenre(String genre) { this.genre = genre; }

    public String getSummary() { return this.summary; }

    public void setSummary(String summary) { this.summary = summary; }

    public String getUrl() { return this.url; }

    public void setUrl(String url) { this.url = url; }

    /** Returns a short summary containing only title, rating and year. */
    @Override
    public String toString() {
        return String.format("Movie{title='%s', rating='%s', year='%s'}", title, rating, year);
    }
}
|||
@ -0,0 +1,69 @@ |
|||
package com.crawler.model; |
|||
|
|||
/**
 * Mutable data holder for one scraped book listing. All properties are plain strings and
 * default to {@code null} until set.
 */
public class ScrapeBook {
    private String title;
    private String price;
    private String rating;
    private String availability;
    private String imageUrl;
    private String productUrl;

    /** Creates a listing with every property unset. */
    public ScrapeBook() {
    }

    public String getTitle() { return this.title; }

    public void setTitle(String title) { this.title = title; }

    public String getPrice() { return this.price; }

    public void setPrice(String price) { this.price = price; }

    public String getRating() { return this.rating; }

    public void setRating(String rating) { this.rating = rating; }

    public String getAvailability() { return this.availability; }

    public void setAvailability(String availability) { this.availability = availability; }

    public String getImageUrl() { return this.imageUrl; }

    public void setImageUrl(String imageUrl) { this.imageUrl = imageUrl; }

    public String getProductUrl() { return this.productUrl; }

    public void setProductUrl(String productUrl) { this.productUrl = productUrl; }

    /** Returns a short summary containing only title, price and rating. */
    @Override
    public String toString() {
        return String.format("ScrapeBook{title='%s', price='%s', rating='%s'}", title, price, rating);
    }
}
|||
@ -0,0 +1,72 @@ |
|||
package com.crawler.strategy; |
|||
|
|||
import com.crawler.exception.CrawlerException; |
|||
import com.crawler.exception.NetworkException; |
|||
import com.crawler.exception.ParseException; |
|||
import com.crawler.model.ScrapeBook; |
|||
import com.crawler.util.HttpUtil; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class BooksToScrapeStrategy implements CrawlerStrategy<ScrapeBook> { |
|||
private static final String URL = "http://books.toscrape.com/"; |
|||
|
|||
@Override |
|||
public List<ScrapeBook> crawl() throws CrawlerException { |
|||
List<ScrapeBook> books = new ArrayList<>(); |
|||
try { |
|||
Document doc = HttpUtil.getDocument(URL); |
|||
Elements items = doc.select("article.product_pod"); |
|||
|
|||
for (Element item : items) { |
|||
ScrapeBook book = new ScrapeBook(); |
|||
Element titleEl = item.selectFirst("h3 a"); |
|||
if (titleEl != null) { |
|||
book.setTitle(titleEl.attr("title")); |
|||
book.setProductUrl(URL + titleEl.attr("href")); |
|||
} |
|||
|
|||
Element priceEl = item.selectFirst("p.price_color"); |
|||
if (priceEl != null) { |
|||
book.setPrice(priceEl.text()); |
|||
} |
|||
|
|||
Element availabilityEl = item.selectFirst("p.instock"); |
|||
if (availabilityEl != null) { |
|||
book.setAvailability(availabilityEl.text().trim()); |
|||
} |
|||
|
|||
Element starRatingEl = item.selectFirst("p.star-rating"); |
|||
if (starRatingEl != null) { |
|||
String classes = starRatingEl.className(); |
|||
if (classes.contains("One")) book.setRating("1"); |
|||
else if (classes.contains("Two")) book.setRating("2"); |
|||
else if (classes.contains("Three")) book.setRating("3"); |
|||
else if (classes.contains("Four")) book.setRating("4"); |
|||
else if (classes.contains("Five")) book.setRating("5"); |
|||
} |
|||
|
|||
Element imgEl = item.selectFirst("img"); |
|||
if (imgEl != null) { |
|||
book.setImageUrl(URL + imgEl.attr("src")); |
|||
} |
|||
|
|||
books.add(book); |
|||
} |
|||
} catch (NetworkException e) { |
|||
throw e; |
|||
} catch (Exception e) { |
|||
throw new ParseException("Failed to parse Books to Scrape page", e); |
|||
} |
|||
return books; |
|||
} |
|||
|
|||
@Override |
|||
public String getOutputFileName() { |
|||
return "data/books_to_scrape.json"; |
|||
} |
|||
} |
|||
@ -0,0 +1,9 @@ |
|||
package com.crawler.strategy; |
|||
|
|||
import com.crawler.exception.CrawlerException; |
|||
import java.util.List; |
|||
|
|||
public interface CrawlerStrategy<T> { |
|||
List<T> crawl() throws CrawlerException; |
|||
String getOutputFileName(); |
|||
} |
|||
@ -0,0 +1,69 @@ |
|||
package com.crawler.strategy; |
|||
|
|||
import com.crawler.exception.CrawlerException; |
|||
import com.crawler.exception.NetworkException; |
|||
import com.crawler.exception.ParseException; |
|||
import com.crawler.model.Book; |
|||
import com.crawler.util.HttpUtil; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class DoubanBookStrategy implements CrawlerStrategy<Book> { |
|||
private static final String URL = "https://book.douban.com/chart"; |
|||
|
|||
@Override |
|||
public List<Book> crawl() throws CrawlerException { |
|||
List<Book> books = new ArrayList<>(); |
|||
try { |
|||
Document doc = HttpUtil.getDocument(URL); |
|||
Elements items = doc.select("li.media"); |
|||
|
|||
for (Element item : items) { |
|||
Book book = new Book(); |
|||
Element titleEl = item.selectFirst("h2 a"); |
|||
if (titleEl != null) { |
|||
book.setTitle(titleEl.text().trim()); |
|||
book.setUrl(titleEl.attr("href")); |
|||
} |
|||
|
|||
Element ratingEl = item.selectFirst("span.rating_nums"); |
|||
if (ratingEl != null) { |
|||
book.setRating(ratingEl.text()); |
|||
} |
|||
|
|||
Element ratingCountEl = item.selectFirst("span.pl"); |
|||
if (ratingCountEl != null) { |
|||
book.setRatingCount(ratingCountEl.text()); |
|||
} |
|||
|
|||
Element infoEl = item.selectFirst("div.pub"); |
|||
if (infoEl != null) { |
|||
String info = infoEl.text(); |
|||
String[] parts = info.split("/"); |
|||
if (parts.length >= 3) { |
|||
book.setAuthor(parts[0].trim()); |
|||
book.setPublisher(parts[parts.length - 3].trim()); |
|||
book.setPublishDate(parts[parts.length - 2].trim()); |
|||
book.setPrice(parts[parts.length - 1].trim()); |
|||
} |
|||
} |
|||
|
|||
books.add(book); |
|||
} |
|||
} catch (NetworkException e) { |
|||
throw e; |
|||
} catch (Exception e) { |
|||
throw new ParseException("Failed to parse Douban book page", e); |
|||
} |
|||
return books; |
|||
} |
|||
|
|||
@Override |
|||
public String getOutputFileName() { |
|||
return "data/douban_books.json"; |
|||
} |
|||
} |
|||
@ -0,0 +1,74 @@ |
|||
package com.crawler.strategy; |
|||
|
|||
import com.crawler.exception.CrawlerException; |
|||
import com.crawler.exception.NetworkException; |
|||
import com.crawler.exception.ParseException; |
|||
import com.crawler.model.Movie; |
|||
import com.crawler.util.HttpUtil; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class DoubanMovieStrategy implements CrawlerStrategy<Movie> { |
|||
private static final String URL = "https://movie.douban.com/chart"; |
|||
|
|||
@Override |
|||
public List<Movie> crawl() throws CrawlerException { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
try { |
|||
Document doc = HttpUtil.getDocument(URL); |
|||
Elements items = doc.select("tr.item"); |
|||
|
|||
for (Element item : items) { |
|||
Movie movie = new Movie(); |
|||
Element titleEl = item.selectFirst("div.pl2 a"); |
|||
if (titleEl != null) { |
|||
movie.setTitle(titleEl.text().split("/")[0].trim()); |
|||
movie.setUrl(titleEl.attr("href")); |
|||
} |
|||
|
|||
Element ratingEl = item.selectFirst("span.rating_nums"); |
|||
if (ratingEl != null) { |
|||
movie.setRating(ratingEl.text()); |
|||
} |
|||
|
|||
Element ratingCountEl = item.selectFirst("span.pl"); |
|||
if (ratingCountEl != null) { |
|||
movie.setRatingCount(ratingCountEl.text()); |
|||
} |
|||
|
|||
Element infoEl = item.selectFirst("p.pl"); |
|||
if (infoEl != null) { |
|||
String info = infoEl.text(); |
|||
movie.setYear(extractYear(info)); |
|||
} |
|||
|
|||
movies.add(movie); |
|||
} |
|||
} catch (NetworkException e) { |
|||
throw e; |
|||
} catch (Exception e) { |
|||
throw new ParseException("Failed to parse Douban movie page", e); |
|||
} |
|||
return movies; |
|||
} |
|||
|
|||
private String extractYear(String info) { |
|||
String[] parts = info.split("/"); |
|||
for (String part : parts) { |
|||
part = part.trim(); |
|||
if (part.matches("\\d{4}.*")) { |
|||
return part; |
|||
} |
|||
} |
|||
return ""; |
|||
} |
|||
|
|||
@Override |
|||
public String getOutputFileName() { |
|||
return "data/douban_movies.json"; |
|||
} |
|||
} |
|||
@ -0,0 +1,60 @@ |
|||
package com.crawler.util; |
|||
|
|||
import com.crawler.exception.FileException; |
|||
import com.google.gson.Gson; |
|||
import com.google.gson.GsonBuilder; |
|||
import com.google.gson.reflect.TypeToken; |
|||
|
|||
import java.io.BufferedWriter; |
|||
import java.io.FileWriter; |
|||
import java.io.IOException; |
|||
import java.lang.reflect.Type; |
|||
import java.nio.file.Files; |
|||
import java.nio.file.Path; |
|||
import java.nio.file.Paths; |
|||
import java.util.List; |
|||
|
|||
public class FileUtil { |
|||
private static final Gson GSON = new GsonBuilder().setPrettyPrinting().create(); |
|||
|
|||
public static <T> void saveToJsonFile(List<T> data, String filePath) throws FileException { |
|||
try { |
|||
Path path = Paths.get(filePath); |
|||
Path parentDir = path.getParent(); |
|||
if (parentDir != null && !Files.exists(parentDir)) { |
|||
Files.createDirectories(parentDir); |
|||
} |
|||
|
|||
try (BufferedWriter writer = new BufferedWriter(new FileWriter(filePath))) { |
|||
GSON.toJson(data, writer); |
|||
} |
|||
} catch (IOException e) { |
|||
throw new FileException("Failed to save data to file: " + filePath, e); |
|||
} |
|||
} |
|||
|
|||
public static <T> void saveToCsvFile(List<T> data, String filePath, String[] headers, CsvRowMapper<T> rowMapper) throws FileException { |
|||
try { |
|||
Path path = Paths.get(filePath); |
|||
Path parentDir = path.getParent(); |
|||
if (parentDir != null && !Files.exists(parentDir)) { |
|||
Files.createDirectories(parentDir); |
|||
} |
|||
|
|||
try (BufferedWriter writer = new BufferedWriter(new FileWriter(filePath))) { |
|||
writer.write(String.join(",", headers)); |
|||
writer.newLine(); |
|||
for (T item : data) { |
|||
writer.write(rowMapper.mapToCsvRow(item)); |
|||
writer.newLine(); |
|||
} |
|||
} |
|||
} catch (IOException e) { |
|||
throw new FileException("Failed to save data to CSV file: " + filePath, e); |
|||
} |
|||
} |
|||
|
|||
public interface CsvRowMapper<T> { |
|||
String mapToCsvRow(T item); |
|||
} |
|||
} |
|||
@ -0,0 +1,24 @@ |
|||
package com.crawler.util; |
|||
|
|||
import com.crawler.exception.NetworkException; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
|
|||
import java.io.IOException; |
|||
|
|||
public class HttpUtil { |
|||
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"; |
|||
private static final int TIMEOUT = 10000; |
|||
|
|||
public static Document getDocument(String url) throws NetworkException { |
|||
try { |
|||
return Jsoup.connect(url) |
|||
.userAgent(USER_AGENT) |
|||
.timeout(TIMEOUT) |
|||
.ignoreHttpErrors(true) |
|||
.get(); |
|||
} catch (IOException e) { |
|||
throw new NetworkException("Failed to fetch URL: " + url, e); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,59 @@ |
|||
package com.crawler.view; |
|||
|
|||
import java.util.List; |
|||
|
|||
/**
 * Console presentation layer: prints banners, the menu, progress and results to standard
 * output, and errors to standard error.
 */
public class ConsoleView {
    private static final String DOUBLE_RULE = "========================================";
    private static final String SINGLE_RULE = "----------------------------------------";

    /** Prints the application banner followed by a blank line. */
    public void displayWelcome() {
        printBanner(" Web Crawler Application");
        System.out.println();
    }

    /** Prints the numbered menu and leaves the cursor after the input prompt. */
    public void displayMenu() {
        String[] menuLines = {
            "Please select an option:",
            "1. Crawl Douban Movies",
            "2. Crawl Douban Books",
            "3. Crawl Books to Scrape",
            "4. Crawl All",
            "0. Exit",
        };
        for (String menuLine : menuLines) {
            System.out.println(menuLine);
        }
        System.out.println();
        System.out.print("Enter your choice: ");
    }

    /**
     * Announces that a crawl is starting.
     *
     * @param description label of what is being crawled
     */
    public void displayCrawling(String description) {
        System.out.println();
        System.out.println(SINGLE_RULE);
        System.out.println("Crawling: " + description);
        System.out.println(SINGLE_RULE);
    }

    /**
     * Reports where the data was saved.
     *
     * @param fileName location shown to the user
     */
    public void displaySuccess(String fileName) {
        System.out.println("✓ Data saved to: " + fileName);
        System.out.println();
    }

    /**
     * Reports an error on standard error.
     *
     * @param message the error text
     */
    public void displayError(String message) {
        System.err.println("✗ Error: " + message);
        System.err.println();
    }

    /**
     * Prints the number of items and then each item on its own line.
     *
     * @param data items to list
     */
    public void displayResults(List<?> data) {
        System.out.println("Found " + data.size() + " items:");
        for (Object entry : data) {
            System.out.println("- " + entry);
        }
        System.out.println();
    }

    /** Prints the farewell banner. */
    public void displayGoodbye() {
        printBanner(" Goodbye!");
    }

    /** Tells the user the menu choice was not understood. */
    public void displayInvalidChoice() {
        System.out.println("Invalid choice. Please try again.");
        System.out.println();
    }

    /** Prints a title line framed by two double rules. */
    private static void printBanner(String title) {
        System.out.println(DOUBLE_RULE);
        System.out.println(title);
        System.out.println(DOUBLE_RULE);
    }
}
|||
@ -0,0 +1,38 @@ |
|||
package command; |
|||
|
|||
import controller.CrawlerController; |
|||
import exception.CrawlerResult; |
|||
import java.util.Collections; |
|||
import java.util.List; |
|||
|
|||
public class ClearCommand implements Command { |
|||
private final CrawlerController controller; |
|||
|
|||
public ClearCommand(CrawlerController controller) { |
|||
this.controller = controller; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "clear"; |
|||
} |
|||
|
|||
@Override |
|||
public String getDescription() { |
|||
return "清空所有数据"; |
|||
} |
|||
|
|||
@Override |
|||
public CrawlerResult execute() { |
|||
controller.clearAllData(); |
|||
return CrawlerResult.success("SYSTEM") |
|||
.message("数据已清空") |
|||
.dataCount(0) |
|||
.build(); |
|||
} |
|||
|
|||
@Override |
|||
public List<String> getRequiredSources() { |
|||
return Collections.emptyList(); |
|||
} |
|||
} |
|||
@ -0,0 +1,11 @@ |
|||
package command; |
|||
|
|||
import exception.CrawlerResult; |
|||
import java.util.List; |
|||
|
|||
public interface Command { |
|||
String getName(); |
|||
String getDescription(); |
|||
CrawlerResult execute(); |
|||
List<String> getRequiredSources(); |
|||
} |
|||
@ -0,0 +1,41 @@ |
|||
package command; |
|||
|
|||
import exception.CrawlerResult; |
|||
import exception.ValidationException; |
|||
import java.util.ArrayList; |
|||
import java.util.HashMap; |
|||
import java.util.List; |
|||
import java.util.Map; |
|||
|
|||
public class CommandRegistry { |
|||
private final Map<String, Command> commands; |
|||
|
|||
public CommandRegistry() { |
|||
this.commands = new HashMap<>(); |
|||
} |
|||
|
|||
public void register(Command command) { |
|||
commands.put(command.getName(), command); |
|||
} |
|||
|
|||
public Command getCommand(String name) { |
|||
Command command = commands.get(name); |
|||
if (command == null) { |
|||
throw new ValidationException("未知命令: " + name); |
|||
} |
|||
return command; |
|||
} |
|||
|
|||
public List<Command> getAllCommands() { |
|||
return new ArrayList<>(commands.values()); |
|||
} |
|||
|
|||
public String getHelpText() { |
|||
StringBuilder sb = new StringBuilder(); |
|||
sb.append("可用命令:\n"); |
|||
for (Command cmd : commands.values()) { |
|||
sb.append(String.format(" %-15s - %s\n", cmd.getName(), cmd.getDescription())); |
|||
} |
|||
return sb.toString(); |
|||
} |
|||
} |
|||
@ -0,0 +1,37 @@ |
|||
package command; |
|||
|
|||
import controller.CrawlerController; |
|||
import exception.CrawlerResult; |
|||
import java.util.List; |
|||
|
|||
public class ListCrawlersCommand implements Command { |
|||
private final CrawlerController controller; |
|||
|
|||
public ListCrawlersCommand(CrawlerController controller) { |
|||
this.controller = controller; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "list"; |
|||
} |
|||
|
|||
@Override |
|||
public String getDescription() { |
|||
return "列出所有可用爬虫"; |
|||
} |
|||
|
|||
@Override |
|||
public CrawlerResult execute() { |
|||
List<String> crawlers = controller.getAllCrawlerNames(); |
|||
return CrawlerResult.success("SYSTEM") |
|||
.message("获取爬虫列表成功") |
|||
.dataCount(crawlers.size()) |
|||
.build(); |
|||
} |
|||
|
|||
@Override |
|||
public List<String> getRequiredSources() { |
|||
return controller.getAllCrawlerNames(); |
|||
} |
|||
} |
|||
@ -0,0 +1,59 @@ |
|||
package command; |
|||
|
|||
import controller.CrawlerController; |
|||
import exception.CrawlerResult; |
|||
import java.util.List; |
|||
|
|||
public class RunAllCommand implements Command { |
|||
private final CrawlerController controller; |
|||
|
|||
public RunAllCommand(CrawlerController controller) { |
|||
this.controller = controller; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "run-all"; |
|||
} |
|||
|
|||
@Override |
|||
public String getDescription() { |
|||
return "运行所有爬虫"; |
|||
} |
|||
|
|||
@Override |
|||
public CrawlerResult execute() { |
|||
long startTime = System.currentTimeMillis(); |
|||
List<CrawlerResult> results = controller.runAllCrawlers(); |
|||
long elapsedTime = System.currentTimeMillis() - startTime; |
|||
|
|||
int successCount = 0; |
|||
int totalCount = results.size(); |
|||
int totalData = 0; |
|||
|
|||
for (CrawlerResult result : results) { |
|||
if (result.isSuccess()) { |
|||
successCount++; |
|||
totalData += result.getDataCount(); |
|||
} |
|||
} |
|||
|
|||
if (successCount == totalCount) { |
|||
return CrawlerResult.success("ALL") |
|||
.message("所有爬虫执行成功") |
|||
.dataCount(totalData) |
|||
.elapsedTime(elapsedTime) |
|||
.build(); |
|||
} else { |
|||
return CrawlerResult.failure("ALL", "PARTIAL_FAIL", |
|||
String.format("执行完成: %d/%d 成功, 获取 %d 条数据", successCount, totalCount, totalData)) |
|||
.elapsedTime(elapsedTime) |
|||
.build(); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public List<String> getRequiredSources() { |
|||
return controller.getAllCrawlerNames(); |
|||
} |
|||
} |
|||
@ -0,0 +1,37 @@ |
|||
package command; |
|||
|
|||
import controller.CrawlerController; |
|||
import exception.CrawlerResult; |
|||
import exception.ValidationException; |
|||
import java.util.Collections; |
|||
import java.util.List; |
|||
|
|||
public class RunSingleCommand implements Command { |
|||
private final CrawlerController controller; |
|||
private final String crawlerName; |
|||
|
|||
public RunSingleCommand(CrawlerController controller, String crawlerName) { |
|||
this.controller = controller; |
|||
this.crawlerName = crawlerName; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "run"; |
|||
} |
|||
|
|||
@Override |
|||
public String getDescription() { |
|||
return "运行指定爬虫: " + crawlerName; |
|||
} |
|||
|
|||
@Override |
|||
public CrawlerResult execute() { |
|||
return controller.runCrawler(crawlerName); |
|||
} |
|||
|
|||
@Override |
|||
public List<String> getRequiredSources() { |
|||
return Collections.singletonList(crawlerName); |
|||
} |
|||
} |
|||
@ -0,0 +1,38 @@ |
|||
package command; |
|||
|
|||
import controller.CrawlerController; |
|||
import exception.CrawlerResult; |
|||
import java.util.Collections; |
|||
import java.util.List; |
|||
|
|||
public class StatsCommand implements Command { |
|||
private final CrawlerController controller; |
|||
|
|||
public StatsCommand(CrawlerController controller) { |
|||
this.controller = controller; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "stats"; |
|||
} |
|||
|
|||
@Override |
|||
public String getDescription() { |
|||
return "显示统计信息"; |
|||
} |
|||
|
|||
@Override |
|||
public CrawlerResult execute() { |
|||
String stats = controller.getStats(); |
|||
return CrawlerResult.success("STATS") |
|||
.message(stats) |
|||
.dataCount(0) |
|||
.build(); |
|||
} |
|||
|
|||
@Override |
|||
public List<String> getRequiredSources() { |
|||
return Collections.emptyList(); |
|||
} |
|||
} |
|||
@ -0,0 +1,73 @@ |
|||
package config; |
|||
|
|||
import java.io.FileInputStream; |
|||
import java.io.IOException; |
|||
import java.io.InputStream; |
|||
import java.util.Properties; |
|||
|
|||
/**
 * Crawler configuration backed by a static {@link Properties} table.
 *
 * <p>Built-in defaults are installed when the class is initialised; {@link #load()}
 * overlays values read from {@code crawler.properties} in the working directory.
 * Typed getters fall back to their default when a value is missing or malformed.
 */
public class CrawlerConfig {
    private static final String CONFIG_FILE = "crawler.properties";
    private static final Properties props = new Properties();

    // Built-in defaults.
    static {
        props.setProperty("delay.ms", "1000");
        props.setProperty("timeout.ms", "15000");
        props.setProperty("user.agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
        props.setProperty("db.path", "crawler.db");
        props.setProperty("output.dir", "output");
        props.setProperty("enable.database", "true");
        props.setProperty("enable.file", "true");
    }

    /**
     * Loads {@code crawler.properties} and overlays its entries on the current values.
     *
     * <p>The file is read into a scratch table first, so a read failure half-way
     * through never leaves a partial overlay behind. A missing file and a read
     * error are reported separately; in both cases the current values stay in effect.
     */
    public static void load() {
        Properties loaded = new Properties();
        try (InputStream is = new FileInputStream(CONFIG_FILE)) {
            loaded.load(is);
        } catch (java.io.FileNotFoundException e) {
            System.out.println("使用默认配置(未找到配置文件: " + CONFIG_FILE + ")");
            return;
        } catch (IOException e) {
            System.err.println("读取配置文件失败,使用默认配置: " + CONFIG_FILE + " - " + e.getMessage());
            return;
        }
        props.putAll(loaded);
        System.out.println("配置文件加载成功: " + CONFIG_FILE);
    }

    /** @return the value of {@code delay.ms}, or 1000 when missing or not an integer */
    public static int getDelayMs() {
        return getInt("delay.ms", 1000);
    }

    /** @return the value of {@code timeout.ms}, or 15000 when missing or not an integer */
    public static int getTimeoutMs() {
        return getInt("timeout.ms", 15000);
    }

    /** @return the value of {@code user.agent} */
    public static String getUserAgent() {
        return props.getProperty("user.agent");
    }

    /** @return the value of {@code db.path}, defaulting to "crawler.db" */
    public static String getDbPath() {
        return props.getProperty("db.path", "crawler.db");
    }

    /** @return the value of {@code output.dir}, defaulting to "output" */
    public static String getOutputDir() {
        return props.getProperty("output.dir", "output");
    }

    /** @return whether {@code enable.database} is "true" (case-insensitive), defaulting to true */
    public static boolean isDatabaseEnabled() {
        return getBoolean("enable.database", true);
    }

    /** @return whether {@code enable.file} is "true" (case-insensitive), defaulting to true */
    public static boolean isFileOutputEnabled() {
        return getBoolean("enable.file", true);
    }

    /**
     * @param key property name
     * @return the raw property value, or {@code null} when absent
     */
    public static String getProperty(String key) {
        return props.getProperty(key);
    }

    /**
     * @param key          property name
     * @param defaultValue value returned when the property is absent
     * @return the raw property value or {@code defaultValue}
     */
    public static String getProperty(String key, String defaultValue) {
        return props.getProperty(key, defaultValue);
    }

    /**
     * Reads an integer property. Surrounding whitespace is ignored because
     * {@link Properties#load} keeps trailing blanks in values; an unparsable value
     * is reported on stderr and replaced by {@code defaultValue} instead of
     * throwing {@link NumberFormatException} at the caller.
     */
    private static int getInt(String key, int defaultValue) {
        String raw = props.getProperty(key);
        if (raw == null) {
            return defaultValue;
        }
        try {
            return Integer.parseInt(raw.trim());
        } catch (NumberFormatException e) {
            System.err.println("配置项 " + key + " 不是有效整数: \"" + raw + "\",使用默认值 " + defaultValue);
            return defaultValue;
        }
    }

    /** Reads a boolean property, ignoring surrounding whitespace. */
    private static boolean getBoolean(String key, boolean defaultValue) {
        String raw = props.getProperty(key);
        if (raw == null) {
            return defaultValue;
        }
        return Boolean.parseBoolean(raw.trim());
    }
}
|||
@ -0,0 +1,177 @@ |
|||
package controller; |
|||
|
|||
import exception.CrawlerResult; |
|||
import exception.ValidationException; |
|||
import model.Movie; |
|||
import storage.DataStorage; |
|||
import storage.FileStorage; |
|||
import storage.StorageStats; |
|||
import strategy.CrawlerStrategy; |
|||
import strategy.BookCrawlerStrategy; |
|||
import strategy.impl.DoubanStrategy; |
|||
import strategy.impl.MaoyanStrategy; |
|||
import strategy.impl.RottenTomatoesStrategy; |
|||
import strategy.impl.DoubanBookStrategy; |
|||
import strategy.impl.BooksToScrapeStrategy; |
|||
import util.Logger; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.HashMap; |
|||
import java.util.List; |
|||
import java.util.Map; |
|||
|
|||
public class CrawlerController { |
|||
private Map<String, CrawlerStrategy> movieCrawlers; |
|||
private Map<String, BookCrawlerStrategy> bookCrawlers; |
|||
private DataStorage storage; |
|||
private String outputDir; |
|||
|
|||
public CrawlerController() { |
|||
this.movieCrawlers = new HashMap<>(); |
|||
this.bookCrawlers = new HashMap<>(); |
|||
this.outputDir = "output"; |
|||
initStorage(); |
|||
registerDefaultCrawlers(); |
|||
} |
|||
|
|||
public CrawlerController(String outputDir) { |
|||
this.movieCrawlers = new HashMap<>(); |
|||
this.bookCrawlers = new HashMap<>(); |
|||
this.outputDir = outputDir; |
|||
initStorage(); |
|||
registerDefaultCrawlers(); |
|||
} |
|||
|
|||
private void initStorage() { |
|||
this.storage = new FileStorage(outputDir); |
|||
Logger.info("文件存储初始化完成,输出目录: " + outputDir); |
|||
} |
|||
|
|||
private void registerDefaultCrawlers() { |
|||
registerMovieCrawler(new DoubanStrategy()); |
|||
registerMovieCrawler(new MaoyanStrategy()); |
|||
registerMovieCrawler(new RottenTomatoesStrategy()); |
|||
registerBookCrawler(new DoubanBookStrategy()); |
|||
registerBookCrawler(new BooksToScrapeStrategy()); |
|||
} |
|||
|
|||
public void registerMovieCrawler(CrawlerStrategy strategy) { |
|||
strategy.setStorage(storage); |
|||
movieCrawlers.put(strategy.getName(), strategy); |
|||
Logger.info("已注册电影爬虫: " + strategy.getName()); |
|||
} |
|||
|
|||
public void registerBookCrawler(BookCrawlerStrategy strategy) { |
|||
strategy.setStorage(storage); |
|||
bookCrawlers.put(strategy.getName(), strategy); |
|||
Logger.info("已注册图书爬虫: " + strategy.getName()); |
|||
} |
|||
|
|||
public void registerCrawler(CrawlerStrategy strategy) { |
|||
registerMovieCrawler(strategy); |
|||
} |
|||
|
|||
public void registerCrawler(CrawlerStrategy strategy, DataStorage customStorage) { |
|||
strategy.setStorage(customStorage); |
|||
movieCrawlers.put(strategy.getName(), strategy); |
|||
Logger.info("已注册爬虫: " + strategy.getName()); |
|||
} |
|||
|
|||
public List<String> getAllCrawlerNames() { |
|||
List<String> names = new ArrayList<>(); |
|||
names.addAll(movieCrawlers.keySet()); |
|||
names.addAll(bookCrawlers.keySet()); |
|||
return names; |
|||
} |
|||
|
|||
public List<String> getMovieCrawlerNames() { |
|||
return new ArrayList<>(movieCrawlers.keySet()); |
|||
} |
|||
|
|||
public List<String> getBookCrawlerNames() { |
|||
return new ArrayList<>(bookCrawlers.keySet()); |
|||
} |
|||
|
|||
public CrawlerResult runCrawler(String name) { |
|||
if (movieCrawlers.containsKey(name)) { |
|||
CrawlerStrategy strategy = movieCrawlers.get(name); |
|||
Logger.info("开始执行电影爬虫: " + name); |
|||
CrawlerResult result = strategy.execute(); |
|||
Logger.info("爬虫执行完成: " + result); |
|||
return result; |
|||
} else if (bookCrawlers.containsKey(name)) { |
|||
BookCrawlerStrategy strategy = bookCrawlers.get(name); |
|||
Logger.info("开始执行图书爬虫: " + name); |
|||
CrawlerResult result = strategy.execute(); |
|||
Logger.info("爬虫执行完成: " + result); |
|||
return result; |
|||
} else { |
|||
throw new ValidationException("未找到爬虫: " + name); |
|||
} |
|||
} |
|||
|
|||
public List<CrawlerResult> runAllCrawlers() { |
|||
List<CrawlerResult> results = new ArrayList<>(); |
|||
int total = movieCrawlers.size() + bookCrawlers.size(); |
|||
Logger.info("开始执行所有爬虫,共 " + total + " 个"); |
|||
|
|||
for (CrawlerStrategy strategy : movieCrawlers.values()) { |
|||
try { |
|||
CrawlerResult result = strategy.execute(); |
|||
results.add(result); |
|||
} catch (Exception e) { |
|||
Logger.error("爬虫执行失败: " + strategy.getName(), e); |
|||
results.add(CrawlerResult.failure(strategy.getName(), "EXEC_ERROR", e.getMessage()).build()); |
|||
} |
|||
} |
|||
|
|||
for (BookCrawlerStrategy strategy : bookCrawlers.values()) { |
|||
try { |
|||
CrawlerResult result = strategy.execute(); |
|||
results.add(result); |
|||
} catch (Exception e) { |
|||
Logger.error("爬虫执行失败: " + strategy.getName(), e); |
|||
results.add(CrawlerResult.failure(strategy.getName(), "EXEC_ERROR", e.getMessage()).build()); |
|||
} |
|||
} |
|||
|
|||
return results; |
|||
} |
|||
|
|||
public String getStats() { |
|||
StringBuilder sb = new StringBuilder(); |
|||
sb.append("========== 爬虫统计 ==========\n"); |
|||
sb.append("电影爬虫数量: ").append(movieCrawlers.size()).append("\n"); |
|||
sb.append("图书爬虫数量: ").append(bookCrawlers.size()).append("\n"); |
|||
sb.append("总爬虫数量: ").append(movieCrawlers.size() + bookCrawlers.size()).append("\n"); |
|||
sb.append("\n电影爬虫列表:\n"); |
|||
for (String name : movieCrawlers.keySet()) { |
|||
sb.append(" - ").append(name).append("\n"); |
|||
} |
|||
sb.append("\n图书爬虫列表:\n"); |
|||
for (String name : bookCrawlers.keySet()) { |
|||
sb.append(" - ").append(name).append("\n"); |
|||
} |
|||
sb.append("============================="); |
|||
return sb.toString(); |
|||
} |
|||
|
|||
public void clearAllData() { |
|||
if (storage != null) { |
|||
storage.clearAll(); |
|||
Logger.info("所有数据已清空"); |
|||
} |
|||
} |
|||
|
|||
public DataStorage getStorage() { |
|||
return storage; |
|||
} |
|||
|
|||
public Map<String, CrawlerStrategy> getCrawlers() { |
|||
return movieCrawlers; |
|||
} |
|||
|
|||
public Map<String, BookCrawlerStrategy> getBookCrawlers() { |
|||
return bookCrawlers; |
|||
} |
|||
} |
|||
@ -0,0 +1,139 @@ |
|||
package crawler; |
|||
|
|||
import model.Movie; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import storage.DataStorage; |
|||
|
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
/** |
|||
* 爬虫抽象基类 |
|||
*/ |
|||
public abstract class BaseCrawler { |
|||
protected String name; // 爬虫名称
|
|||
protected String baseUrl; // 基础URL
|
|||
protected int delayMs; // 请求延迟(毫秒)
|
|||
protected DataStorage storage; // 数据存储
|
|||
|
|||
public BaseCrawler(String name, String baseUrl) { |
|||
this(name, baseUrl, 1000); |
|||
} |
|||
|
|||
public BaseCrawler(String name, String baseUrl, int delayMs) { |
|||
this.name = name; |
|||
this.baseUrl = baseUrl; |
|||
this.delayMs = delayMs; |
|||
} |
|||
|
|||
/** |
|||
* 设置数据存储 |
|||
*/ |
|||
public void setStorage(DataStorage storage) { |
|||
this.storage = storage; |
|||
} |
|||
|
|||
/** |
|||
* 获取爬虫名称 |
|||
*/ |
|||
public String getName() { |
|||
return name; |
|||
} |
|||
|
|||
/** |
|||
* 获取网页文档 |
|||
*/ |
|||
protected Document fetchDocument(String url) throws IOException { |
|||
return Jsoup.connect(url) |
|||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + |
|||
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") |
|||
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") |
|||
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8") |
|||
.timeout(15000) |
|||
.get(); |
|||
} |
|||
|
|||
/** |
|||
* 延迟等待 |
|||
*/ |
|||
protected void delay() { |
|||
try { |
|||
Thread.sleep(delayMs); |
|||
} catch (InterruptedException e) { |
|||
Thread.currentThread().interrupt(); |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 开始爬取(模板方法模式) |
|||
*/ |
|||
public final void crawl() { |
|||
System.out.println("========================================"); |
|||
System.out.println("开始爬取: " + name); |
|||
System.out.println("目标URL: " + baseUrl); |
|||
System.out.println("========================================"); |
|||
|
|||
long startTime = System.currentTimeMillis(); |
|||
List<Movie> allMovies = new ArrayList<>(); |
|||
|
|||
try { |
|||
// 获取所有需要爬取的URL列表
|
|||
List<String> urls = getUrls(); |
|||
System.out.println("共 " + urls.size() + " 个页面需要爬取"); |
|||
|
|||
for (int i = 0; i < urls.size(); i++) { |
|||
String url = urls.get(i); |
|||
System.out.println("\n正在爬取第 " + (i + 1) + "/" + urls.size() + " 页: " + url); |
|||
|
|||
try { |
|||
Document doc = fetchDocument(url); |
|||
List<Movie> movies = parsePage(doc); |
|||
|
|||
// 设置数据来源
|
|||
for (Movie movie : movies) { |
|||
movie.setSource(name); |
|||
} |
|||
|
|||
allMovies.addAll(movies); |
|||
System.out.println("本页获取 " + movies.size() + " 条数据"); |
|||
|
|||
// 延迟,避免被封
|
|||
if (i < urls.size() - 1) { |
|||
delay(); |
|||
} |
|||
|
|||
} catch (IOException e) { |
|||
System.err.println("爬取页面失败: " + url + " - " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
// 保存数据
|
|||
if (!allMovies.isEmpty() && storage != null) { |
|||
storage.saveBatch(allMovies); |
|||
} |
|||
|
|||
long endTime = System.currentTimeMillis(); |
|||
System.out.println("\n========================================"); |
|||
System.out.println("爬取完成!"); |
|||
System.out.println("总数据量: " + allMovies.size()); |
|||
System.out.println("耗时: " + (endTime - startTime) / 1000 + " 秒"); |
|||
System.out.println("========================================"); |
|||
|
|||
} catch (Exception e) { |
|||
System.err.println("爬取过程出错: " + e.getMessage()); |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 获取所有需要爬取的URL列表(子类实现) |
|||
*/ |
|||
protected abstract List<String> getUrls(); |
|||
|
|||
/** |
|||
* 解析单个页面(子类实现) |
|||
*/ |
|||
protected abstract List<Movie> parsePage(Document doc); |
|||
} |
|||
@ -0,0 +1,113 @@ |
|||
package crawler; |
|||
|
|||
import model.Movie; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
/** |
|||
* 豆瓣电影Top250爬虫 |
|||
*/ |
|||
public class DoubanCrawler extends BaseCrawler { |
|||
|
|||
public DoubanCrawler() { |
|||
super("豆瓣电影Top250", "https://movie.douban.com/top250", 1500); |
|||
} |
|||
|
|||
@Override |
|||
protected List<String> getUrls() { |
|||
List<String> urls = new ArrayList<>(); |
|||
// 豆瓣Top250共10页,每页25部
|
|||
for (int i = 0; i < 10; i++) { |
|||
urls.add(baseUrl + "?start=" + (i * 25)); |
|||
} |
|||
return urls; |
|||
} |
|||
|
|||
@Override |
|||
protected List<Movie> parsePage(Document doc) { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
Elements items = doc.select("div.item"); |
|||
|
|||
for (Element item : items) { |
|||
try { |
|||
Movie movie = new Movie(); |
|||
|
|||
// 排名
|
|||
String rankStr = item.select("em").text(); |
|||
movie.setRank(Integer.parseInt(rankStr)); |
|||
|
|||
// 电影名称(取第一个标题)
|
|||
Element titleElement = item.select("span.title").first(); |
|||
if (titleElement != null) { |
|||
movie.setName(titleElement.text()); |
|||
} |
|||
|
|||
// 评分
|
|||
String ratingStr = item.select("span.rating_num").text(); |
|||
if (!ratingStr.isEmpty()) { |
|||
movie.setRating(Double.parseDouble(ratingStr)); |
|||
} |
|||
|
|||
// 评分人数
|
|||
String ratingCountStr = item.select("div.star span").last().text(); |
|||
if (ratingCountStr != null && ratingCountStr.contains("人评价")) { |
|||
String num = ratingCountStr.replace("人评价", "").trim(); |
|||
movie.setRatingCount(parseNumber(num)); |
|||
} |
|||
|
|||
// 其他信息(导演、年份等)
|
|||
String info = item.select("div.bd p").first().text(); |
|||
if (info != null) { |
|||
// 提取年份
|
|||
String[] parts = info.split(" / "); |
|||
if (parts.length > 0) { |
|||
String firstPart = parts[0]; |
|||
if (firstPart.contains("导演: ")) { |
|||
movie.setDirector(firstPart.replace("导演: ", "").trim()); |
|||
} |
|||
// 提取年份(通常是最后一个数字部分)
|
|||
for (String part : parts) { |
|||
if (part.matches("\\d{4}") || part.matches("\\d{4}.*")) { |
|||
movie.setYear(part.trim().split("\\s+")[0]); |
|||
break; |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
// 详情链接
|
|||
String link = item.select("div.hd a").attr("href"); |
|||
movie.setUrl(link); |
|||
|
|||
// 海报图片
|
|||
String imgUrl = item.select("div.pic img").attr("src"); |
|||
movie.setImageUrl(imgUrl); |
|||
|
|||
movies.add(movie); |
|||
|
|||
} catch (Exception e) { |
|||
System.err.println("解析电影数据出错: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
return movies; |
|||
} |
|||
|
|||
/** |
|||
* 解析数字(处理中文数字如"万") |
|||
*/ |
|||
private Integer parseNumber(String str) { |
|||
try { |
|||
if (str.contains("万")) { |
|||
return (int) (Double.parseDouble(str.replace("万", "")) * 10000); |
|||
} |
|||
return Integer.parseInt(str.replace(",", "")); |
|||
} catch (NumberFormatException e) { |
|||
return null; |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,100 @@ |
|||
package crawler; |
|||
|
|||
import model.Movie; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
/** |
|||
* IMDB Top250 爬虫示例 |
|||
*/ |
|||
public class ImdbCrawler extends BaseCrawler { |
|||
|
|||
public ImdbCrawler() { |
|||
super("IMDB电影Top250", "https://www.imdb.com/chart/top/", 2000); |
|||
} |
|||
|
|||
@Override |
|||
protected List<String> getUrls() { |
|||
List<String> urls = new ArrayList<>(); |
|||
urls.add(baseUrl); |
|||
return urls; |
|||
} |
|||
|
|||
@Override |
|||
protected List<Movie> parsePage(Document doc) { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
Elements items = doc.select("li.ipc-metadata-list-summary-item"); |
|||
|
|||
int rank = 1; |
|||
for (Element item : items) { |
|||
try { |
|||
Movie movie = new Movie(); |
|||
movie.setRank(rank++); |
|||
|
|||
// 电影名称
|
|||
Element titleElement = item.select("h3.ipc-title__text").first(); |
|||
if (titleElement != null) { |
|||
String fullTitle = titleElement.text(); |
|||
// 移除排名前缀如 "1. "
|
|||
if (fullTitle.matches("\\d+\\..*")) { |
|||
fullTitle = fullTitle.substring(fullTitle.indexOf(".") + 1).trim(); |
|||
} |
|||
movie.setName(fullTitle); |
|||
} |
|||
|
|||
// 评分
|
|||
String ratingStr = item.select("span.ipc-rating-star--rating").text(); |
|||
if (!ratingStr.isEmpty()) { |
|||
movie.setRating(Double.parseDouble(ratingStr)); |
|||
} |
|||
|
|||
// 评分人数
|
|||
String countStr = item.select("span.ipc-rating-star--voteCount").text(); |
|||
if (!countStr.isEmpty()) { |
|||
movie.setRatingCount(parseNumber(countStr.replaceAll("[()\\s]", ""))); |
|||
} |
|||
|
|||
// 年份
|
|||
String yearStr = item.select("span.cli-title-metadata-item").first().text(); |
|||
if (yearStr != null && yearStr.matches("\\d{4}")) { |
|||
movie.setYear(yearStr); |
|||
} |
|||
|
|||
// 详情链接
|
|||
String link = item.select("a.ipc-title-link-wrapper").attr("href"); |
|||
if (!link.isEmpty()) { |
|||
movie.setUrl("https://www.imdb.com" + link); |
|||
} |
|||
|
|||
// 海报图片
|
|||
String imgUrl = item.select("img.ipc-image").attr("src"); |
|||
movie.setImageUrl(imgUrl); |
|||
|
|||
movies.add(movie); |
|||
|
|||
} catch (Exception e) { |
|||
System.err.println("解析电影数据出错: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
return movies; |
|||
} |
|||
|
|||
private Integer parseNumber(String str) { |
|||
try { |
|||
if (str.contains("M")) { |
|||
return (int) (Double.parseDouble(str.replace("M", "")) * 1000000); |
|||
} |
|||
if (str.contains("K")) { |
|||
return (int) (Double.parseDouble(str.replace("K", "")) * 1000); |
|||
} |
|||
return Integer.parseInt(str.replace(",", "")); |
|||
} catch (NumberFormatException e) { |
|||
return null; |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,92 @@ |
|||
package crawler; |
|||
|
|||
import model.Movie; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
/** |
|||
* 猫眼电影 Top100 爬虫 |
|||
*/ |
|||
public class MaoyanCrawler extends BaseCrawler { |
|||
|
|||
public MaoyanCrawler() { |
|||
super("猫眼电影Top100", "https://maoyan.com/board/4", 1500); |
|||
} |
|||
|
|||
@Override |
|||
protected List<String> getUrls() { |
|||
List<String> urls = new ArrayList<>(); |
|||
// 猫眼Top100共10页,每页10部
|
|||
for (int i = 0; i < 10; i++) { |
|||
urls.add(baseUrl + "?offset=" + (i * 10)); |
|||
} |
|||
return urls; |
|||
} |
|||
|
|||
@Override |
|||
protected List<Movie> parsePage(Document doc) { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
Elements items = doc.select("dl.board-wrapper dd"); |
|||
|
|||
for (Element item : items) { |
|||
try { |
|||
Movie movie = new Movie(); |
|||
|
|||
// 排名
|
|||
String rankStr = item.select("i.board-index").text(); |
|||
movie.setRank(Integer.parseInt(rankStr)); |
|||
|
|||
// 电影名称
|
|||
String name = item.select("p.name a").text(); |
|||
movie.setName(name); |
|||
|
|||
// 评分
|
|||
String ratingStr = item.select("i.integer").text() + |
|||
item.select("i.fraction").text(); |
|||
if (!ratingStr.isEmpty()) { |
|||
movie.setRating(Double.parseDouble(ratingStr)); |
|||
} |
|||
|
|||
// 主演
|
|||
String actors = item.select("p.star").text(); |
|||
if (actors != null && actors.contains("主演:")) { |
|||
movie.setActors(actors.replace("主演:", "").trim()); |
|||
} |
|||
|
|||
// 上映时间
|
|||
String releaseTime = item.select("p.releasetime").text(); |
|||
if (releaseTime != null && releaseTime.contains("上映时间:")) { |
|||
String timeStr = releaseTime.replace("上映时间:", "").trim(); |
|||
// 提取年份
|
|||
if (timeStr.matches("\\d{4}.*")) { |
|||
movie.setYear(timeStr.substring(0, 4)); |
|||
} |
|||
} |
|||
|
|||
// 详情链接
|
|||
String link = item.select("p.name a").attr("href"); |
|||
if (!link.isEmpty()) { |
|||
movie.setUrl("https://maoyan.com" + link); |
|||
} |
|||
|
|||
// 海报图片
|
|||
String imgUrl = item.select("img.board-img").attr("data-src"); |
|||
if (imgUrl.isEmpty()) { |
|||
imgUrl = item.select("img.board-img").attr("src"); |
|||
} |
|||
movie.setImageUrl(imgUrl); |
|||
|
|||
movies.add(movie); |
|||
|
|||
} catch (Exception e) { |
|||
System.err.println("解析猫眼电影数据出错: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
return movies; |
|||
} |
|||
} |
|||
@ -0,0 +1,102 @@ |
|||
package crawler; |
|||
|
|||
import model.Movie; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
/** |
|||
* 烂番茄 (Rotten Tomatoes) Top100 爬虫 |
|||
*/ |
|||
public class RottenTomatoesCrawler extends BaseCrawler { |
|||
|
|||
public RottenTomatoesCrawler() { |
|||
super("烂番茄Top100", "https://www.rottentomatoes.com/top/bestofrt/", 2000); |
|||
} |
|||
|
|||
@Override |
|||
protected List<String> getUrls() { |
|||
List<String> urls = new ArrayList<>(); |
|||
urls.add(baseUrl); |
|||
return urls; |
|||
} |
|||
|
|||
@Override |
|||
protected List<Movie> parsePage(Document doc) { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
Elements items = doc.select("table.table tr"); |
|||
|
|||
// 跳过表头
|
|||
int rank = 0; |
|||
for (Element item : items) { |
|||
try { |
|||
// 跳过表头行
|
|||
Element rankElement = item.selectFirst("td.rank"); |
|||
if (rankElement == null) continue; |
|||
|
|||
Movie movie = new Movie(); |
|||
|
|||
// 排名
|
|||
String rankStr = rankElement.text(); |
|||
if (!rankStr.isEmpty()) { |
|||
movie.setRank(Integer.parseInt(rankStr)); |
|||
} else { |
|||
movie.setRank(++rank); |
|||
} |
|||
|
|||
// 电影名称和年份
|
|||
Element titleElement = item.selectFirst("td.title a"); |
|||
if (titleElement != null) { |
|||
String fullTitle = titleElement.text(); |
|||
// 提取年份(通常在括号里)
|
|||
if (fullTitle.contains("(") && fullTitle.contains(")")) { |
|||
int start = fullTitle.lastIndexOf("("); |
|||
int end = fullTitle.lastIndexOf(")"); |
|||
if (start > 0 && end > start) { |
|||
String yearStr = fullTitle.substring(start + 1, end); |
|||
if (yearStr.matches("\\d{4}")) { |
|||
movie.setYear(yearStr); |
|||
} |
|||
movie.setName(fullTitle.substring(0, start).trim()); |
|||
} else { |
|||
movie.setName(fullTitle); |
|||
} |
|||
} else { |
|||
movie.setName(fullTitle); |
|||
} |
|||
|
|||
// 详情链接
|
|||
String link = titleElement.attr("href"); |
|||
if (!link.isEmpty()) { |
|||
if (link.startsWith("/")) { |
|||
movie.setUrl("https://www.rottentomatoes.com" + link); |
|||
} else { |
|||
movie.setUrl(link); |
|||
} |
|||
} |
|||
} |
|||
|
|||
// 新鲜度评分(烂番茄特有)
|
|||
Element scoreElement = item.selectFirst("td.score span.tMeterScore"); |
|||
if (scoreElement != null) { |
|||
String scoreStr = scoreElement.text(); |
|||
if (scoreStr.matches("\\d+%")) { |
|||
// 转换为10分制
|
|||
double rating = Double.parseDouble(scoreStr.replace("%", "")) / 10; |
|||
movie.setRating(Math.round(rating * 10) / 10.0); |
|||
} |
|||
} |
|||
|
|||
movies.add(movie); |
|||
|
|||
} catch (Exception e) { |
|||
System.err.println("解析烂番茄数据出错: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
return movies; |
|||
} |
|||
} |
|||
@ -0,0 +1,55 @@ |
|||
package exception; |
|||
|
|||
public class CrawlerException extends RuntimeException { |
|||
private final String source; |
|||
private final String errorCode; |
|||
|
|||
public CrawlerException(String message) { |
|||
super(message); |
|||
this.source = "UNKNOWN"; |
|||
this.errorCode = "CRAWLER_001"; |
|||
} |
|||
|
|||
public CrawlerException(String message, String source) { |
|||
super(message); |
|||
this.source = source; |
|||
this.errorCode = "CRAWLER_001"; |
|||
} |
|||
|
|||
public CrawlerException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
this.source = "UNKNOWN"; |
|||
this.errorCode = "CRAWLER_002"; |
|||
} |
|||
|
|||
public CrawlerException(String message, String source, Throwable cause) { |
|||
super(message, cause); |
|||
this.source = source; |
|||
this.errorCode = "CRAWLER_002"; |
|||
} |
|||
|
|||
public CrawlerException(String message, String source, String errorCode) { |
|||
super(message); |
|||
this.source = source; |
|||
this.errorCode = errorCode; |
|||
} |
|||
|
|||
public CrawlerException(String message, String source, String errorCode, Throwable cause) { |
|||
super(message, cause); |
|||
this.source = source; |
|||
this.errorCode = errorCode; |
|||
} |
|||
|
|||
public String getSource() { |
|||
return source; |
|||
} |
|||
|
|||
public String getErrorCode() { |
|||
return errorCode; |
|||
} |
|||
|
|||
@Override |
|||
public String toString() { |
|||
return String.format("[%s] [%s] %s (source: %s)", errorCode, getClass().getSimpleName(), getMessage(), source); |
|||
} |
|||
} |
|||
@ -0,0 +1,103 @@ |
|||
package exception; |
|||
|
|||
/**
 * Immutable summary of one crawl attempt, created through {@link Builder}.
 *
 * <p>Use {@link #success(String)} or {@link #failure(String, String, String)}
 * to obtain a pre-filled builder, add any remaining values, then call
 * {@link Builder#build()}.
 */
public class CrawlerResult {

    private final boolean success;
    private final String source;
    private final String message;
    private final int dataCount;
    private final long elapsedTime;
    private final String errorCode;

    /** Copies every value out of the builder; instances are only made via {@link Builder#build()}. */
    private CrawlerResult(Builder builder) {
        this.success = builder.success;
        this.source = builder.source;
        this.message = builder.message;
        this.dataCount = builder.dataCount;
        this.elapsedTime = builder.elapsedTime;
        this.errorCode = builder.errorCode;
    }

    /**
     * Starts a builder for a successful result.
     *
     * @param source name of the data source that was crawled
     * @return a builder with the success flag set and the source filled in
     */
    public static Builder success(String source) {
        Builder builder = new Builder();
        builder.success(true);
        builder.source(source);
        return builder;
    }

    /**
     * Starts a builder for a failed result.
     *
     * @param source    name of the data source that was crawled
     * @param errorCode error code describing the failure
     * @param message   human-readable failure description
     * @return a builder with the success flag cleared and the given values filled in
     */
    public static Builder failure(String source, String errorCode, String message) {
        Builder builder = new Builder();
        builder.success(false);
        builder.source(source);
        builder.errorCode(errorCode);
        builder.message(message);
        return builder;
    }

    /** @return {@code true} if the result was built as a success */
    public boolean isSuccess() {
        return success;
    }

    /** @return the data source name */
    public String getSource() {
        return source;
    }

    /** @return the message, or {@code null} if none was set */
    public String getMessage() {
        return message;
    }

    /** @return the number of items reported (0 if never set) */
    public int getDataCount() {
        return dataCount;
    }

    /** @return the elapsed time reported; {@link #toString()} prints it with an {@code ms} suffix */
    public long getElapsedTime() {
        return elapsedTime;
    }

    /** @return the error code, or {@code null} if none was set */
    public String getErrorCode() {
        return errorCode;
    }

    /**
     * Renders a one-line summary: source, item count and elapsed time for a
     * success; error code, source and message for a failure.
     */
    @Override
    public String toString() {
        return success
                ? String.format("[SUCCESS] %s - 获取 %d 条数据 (耗时: %dms)", source, dataCount, elapsedTime)
                : String.format("[FAILURE] [%s] %s - %s", errorCode, source, message);
    }

    /** Fluent builder for {@link CrawlerResult}; every setter returns the builder itself. */
    public static class Builder {

        private boolean success;
        private String source;
        private String message;
        private int dataCount;
        private long elapsedTime;
        private String errorCode;

        /** Sets the success flag. */
        public Builder success(boolean success) {
            this.success = success;
            return this;
        }

        /** Sets the data source name. */
        public Builder source(String source) {
            this.source = source;
            return this;
        }

        /** Sets the descriptive message. */
        public Builder message(String message) {
            this.message = message;
            return this;
        }

        /** Sets the number of items obtained. */
        public Builder dataCount(int dataCount) {
            this.dataCount = dataCount;
            return this;
        }

        /** Sets the elapsed time. */
        public Builder elapsedTime(long elapsedTime) {
            this.elapsedTime = elapsedTime;
            return this;
        }

        /** Sets the error code. */
        public Builder errorCode(String errorCode) {
            this.errorCode = errorCode;
            return this;
        }

        /** @return a new result holding the values set so far */
        public CrawlerResult build() {
            return new CrawlerResult(this);
        }
    }
}
|||
@ -0,0 +1,20 @@ |
|||
package exception; |
|||
|
|||
public class NetworkException extends CrawlerException { |
|||
|
|||
public NetworkException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public NetworkException(String message, String source) { |
|||
super(message, source); |
|||
} |
|||
|
|||
public NetworkException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
|
|||
public NetworkException(String message, String source, Throwable cause) { |
|||
super(message, source, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,20 @@ |
|||
package exception; |
|||
|
|||
public class ParseException extends CrawlerException { |
|||
|
|||
public ParseException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public ParseException(String message, String source) { |
|||
super(message, source); |
|||
} |
|||
|
|||
public ParseException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
|
|||
public ParseException(String message, String source, Throwable cause) { |
|||
super(message, source, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,20 @@ |
|||
package exception; |
|||
|
|||
public class StorageException extends CrawlerException { |
|||
|
|||
public StorageException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public StorageException(String message, String source) { |
|||
super(message, source); |
|||
} |
|||
|
|||
public StorageException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
|
|||
public StorageException(String message, String source, Throwable cause) { |
|||
super(message, source, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,20 @@ |
|||
package exception; |
|||
|
|||
public class ValidationException extends CrawlerException { |
|||
|
|||
public ValidationException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public ValidationException(String message, String source) { |
|||
super(message, source); |
|||
} |
|||
|
|||
public ValidationException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
|
|||
public ValidationException(String message, String source, Throwable cause) { |
|||
super(message, source, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,236 @@ |
|||
package main; |
|||
|
|||
import config.CrawlerConfig; |
|||
import crawler.BaseCrawler; |
|||
import crawler.DoubanCrawler; |
|||
import crawler.ImdbCrawler; |
|||
import crawler.MaoyanCrawler; |
|||
import crawler.RottenTomatoesCrawler; |
|||
import model.Book; |
|||
import model.Movie; |
|||
import storage.DataStorage; |
|||
import storage.FileStorage; |
|||
import storage.SQLiteStorage; |
|||
import storage.StorageStats; |
|||
import util.Logger; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.Scanner; |
|||
|
|||
public class CrawlerManager { |
|||
private List<BaseCrawler> crawlers; |
|||
private DataStorage databaseStorage; |
|||
private DataStorage fileStorage; |
|||
|
|||
public CrawlerManager() { |
|||
crawlers = new ArrayList<>(); |
|||
|
|||
CrawlerConfig.load(); |
|||
|
|||
if (CrawlerConfig.isDatabaseEnabled()) { |
|||
databaseStorage = new SQLiteStorage(); |
|||
Logger.info("数据库存储已启用"); |
|||
} |
|||
if (CrawlerConfig.isFileOutputEnabled()) { |
|||
fileStorage = new FileStorage(CrawlerConfig.getOutputDir()); |
|||
Logger.info("文件输出已启用"); |
|||
} |
|||
|
|||
registerCrawler(new DoubanCrawler()); |
|||
registerCrawler(new MaoyanCrawler()); |
|||
registerCrawler(new RottenTomatoesCrawler()); |
|||
} |
|||
|
|||
public void registerCrawler(BaseCrawler crawler) { |
|||
if (databaseStorage != null) { |
|||
crawler.setStorage(new MultiStorage(databaseStorage, fileStorage)); |
|||
} else { |
|||
crawler.setStorage(fileStorage); |
|||
} |
|||
crawlers.add(crawler); |
|||
Logger.info("已注册爬虫: " + crawler.getName()); |
|||
} |
|||
|
|||
public void runAll() { |
|||
Logger.info("开始运行所有爬虫,共 " + crawlers.size() + " 个"); |
|||
for (BaseCrawler crawler : crawlers) { |
|||
crawler.crawl(); |
|||
System.out.println(); |
|||
} |
|||
showStats(); |
|||
} |
|||
|
|||
public void runCrawler(String name) { |
|||
for (BaseCrawler crawler : crawlers) { |
|||
if (crawler.getName().equals(name)) { |
|||
crawler.crawl(); |
|||
showStats(); |
|||
return; |
|||
} |
|||
} |
|||
Logger.error("未找到爬虫: " + name); |
|||
} |
|||
|
|||
public void showStats() { |
|||
if (databaseStorage != null) { |
|||
StorageStats stats = databaseStorage.getStats(); |
|||
System.out.println("\n========== 数据库统计 =========="); |
|||
System.out.println("总记录数: " + stats.getTotalCount()); |
|||
System.out.println("数据源数量: " + stats.getSourceCount()); |
|||
System.out.println("================================\n"); |
|||
} |
|||
} |
|||
|
|||
public void showMenu() { |
|||
System.out.println("\n========== 爬虫管理系统 =========="); |
|||
System.out.println("1. 运行所有爬虫"); |
|||
System.out.println("2. 运行指定爬虫"); |
|||
System.out.println("3. 查看统计信息"); |
|||
System.out.println("4. 清空数据库"); |
|||
System.out.println("5. 退出"); |
|||
System.out.println("=================================="); |
|||
System.out.print("请选择操作: "); |
|||
} |
|||
|
|||
public void interactive() { |
|||
Scanner scanner = new Scanner(System.in); |
|||
|
|||
while (true) { |
|||
showMenu(); |
|||
String choice = scanner.nextLine().trim(); |
|||
|
|||
switch (choice) { |
|||
case "1": |
|||
runAll(); |
|||
break; |
|||
|
|||
case "2": |
|||
System.out.println("\n可用爬虫:"); |
|||
for (int i = 0; i < crawlers.size(); i++) { |
|||
System.out.println((i + 1) + ". " + crawlers.get(i).getName()); |
|||
} |
|||
System.out.print("请输入爬虫名称: "); |
|||
String crawlerName = scanner.nextLine().trim(); |
|||
runCrawler(crawlerName); |
|||
break; |
|||
|
|||
case "3": |
|||
showStats(); |
|||
break; |
|||
|
|||
case "4": |
|||
System.out.print("确定要清空所有数据吗?(yes/no): "); |
|||
String confirm = scanner.nextLine().trim(); |
|||
if ("yes".equalsIgnoreCase(confirm) && databaseStorage != null) { |
|||
databaseStorage.clearAll(); |
|||
} |
|||
break; |
|||
|
|||
case "5": |
|||
System.out.println("再见!"); |
|||
close(); |
|||
return; |
|||
|
|||
default: |
|||
System.out.println("无效选择,请重试"); |
|||
} |
|||
} |
|||
} |
|||
|
|||
public void close() { |
|||
if (databaseStorage != null) { |
|||
databaseStorage.close(); |
|||
} |
|||
} |
|||
|
|||
private static class MultiStorage implements DataStorage { |
|||
private DataStorage primary; |
|||
private DataStorage secondary; |
|||
|
|||
public MultiStorage(DataStorage primary, DataStorage secondary) { |
|||
this.primary = primary; |
|||
this.secondary = secondary; |
|||
} |
|||
|
|||
@Override |
|||
public void save(Movie movie) { |
|||
primary.save(movie); |
|||
if (secondary != null) secondary.save(movie); |
|||
} |
|||
|
|||
@Override |
|||
public void saveBatch(List<Movie> movies) { |
|||
primary.saveBatch(movies); |
|||
if (secondary != null) secondary.saveBatch(movies); |
|||
} |
|||
|
|||
@Override |
|||
public List<Movie> findAll() { |
|||
return primary.findAll(); |
|||
} |
|||
|
|||
@Override |
|||
public List<Movie> findBySource(String source) { |
|||
return primary.findBySource(source); |
|||
} |
|||
|
|||
@Override |
|||
public List<Movie> findByRankRange(int start, int end) { |
|||
return primary.findByRankRange(start, end); |
|||
} |
|||
|
|||
@Override |
|||
public void saveBook(Book book) { |
|||
primary.saveBook(book); |
|||
if (secondary != null) secondary.saveBook(book); |
|||
} |
|||
|
|||
@Override |
|||
public void saveBookBatch(List<Book> books) { |
|||
primary.saveBookBatch(books); |
|||
if (secondary != null) secondary.saveBookBatch(books); |
|||
} |
|||
|
|||
@Override |
|||
public List<Book> findAllBooks() { |
|||
return primary.findAllBooks(); |
|||
} |
|||
|
|||
@Override |
|||
public List<Book> findBooksBySource(String source) { |
|||
return primary.findBooksBySource(source); |
|||
} |
|||
|
|||
@Override |
|||
public void deleteBySource(String source) { |
|||
primary.deleteBySource(source); |
|||
} |
|||
|
|||
@Override |
|||
public void clearAll() { |
|||
primary.clearAll(); |
|||
} |
|||
|
|||
@Override |
|||
public StorageStats getStats() { |
|||
return primary.getStats(); |
|||
} |
|||
|
|||
@Override |
|||
public void close() { |
|||
primary.close(); |
|||
} |
|||
} |
|||
|
|||
public static void main(String[] args) { |
|||
CrawlerManager manager = new CrawlerManager(); |
|||
|
|||
if (args.length > 0 && args[0].equals("--auto")) { |
|||
manager.runAll(); |
|||
manager.close(); |
|||
} else { |
|||
manager.interactive(); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,86 @@ |
|||
package model; |
|||
|
|||
import java.time.LocalDateTime; |
|||
|
|||
/**
 * Mutable data holder for one crawled book record.
 *
 * <p>All fields are nullable wrapper or reference types; the no-argument
 * constructor leaves every field {@code null}.
 */
public class Book {

    private Integer id;
    private String source;
    private Integer rank;
    private String title;
    private String author;
    private String publisher;
    private String year;
    private Double price;
    private Double rating;
    private Integer ratingCount;
    private String category;
    private String description;
    private String url;
    private String imageUrl;
    private String isbn;
    private LocalDateTime crawlTime;

    /** Creates an empty book; every field, including the crawl time, stays {@code null}. */
    public Book() {
    }

    /**
     * Creates a book with its core fields and stamps the crawl time with the
     * current local date-time.
     *
     * @param source name of the site the record came from
     * @param rank   position in the source's ranking
     * @param title  book title
     * @param rating rating value
     */
    public Book(String source, Integer rank, String title, Double rating) {
        this.crawlTime = LocalDateTime.now();
        this.source = source;
        this.rank = rank;
        this.title = title;
        this.rating = rating;
    }

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getSource() {
        return source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public Integer getRank() {
        return rank;
    }

    public void setRank(Integer rank) {
        this.rank = rank;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getPublisher() {
        return publisher;
    }

    public void setPublisher(String publisher) {
        this.publisher = publisher;
    }

    public String getYear() {
        return year;
    }

    public void setYear(String year) {
        this.year = year;
    }

    public Double getPrice() {
        return price;
    }

    public void setPrice(Double price) {
        this.price = price;
    }

    public Double getRating() {
        return rating;
    }

    public void setRating(Double rating) {
        this.rating = rating;
    }

    public Integer getRatingCount() {
        return ratingCount;
    }

    public void setRatingCount(Integer ratingCount) {
        this.ratingCount = ratingCount;
    }

    public String getCategory() {
        return category;
    }

    public void setCategory(String category) {
        this.category = category;
    }

    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getImageUrl() {
        return imageUrl;
    }

    public void setImageUrl(String imageUrl) {
        this.imageUrl = imageUrl;
    }

    public String getIsbn() {
        return isbn;
    }

    public void setIsbn(String isbn) {
        this.isbn = isbn;
    }

    public LocalDateTime getCrawlTime() {
        return crawlTime;
    }

    public void setCrawlTime(LocalDateTime crawlTime) {
        this.crawlTime = crawlTime;
    }

    /**
     * Short summary with source, rank, title and rating; a missing rating is
     * shown as {@code 0.0}.
     */
    @Override
    public String toString() {
        double shownRating = (rating == null) ? 0.0 : rating;
        return String.format("Book{source='%s', rank=%d, title='%s', rating=%.1f}",
                source, rank, title, shownRating);
    }
}
|||
@ -0,0 +1,78 @@ |
|||
package model; |
|||
|
|||
import java.time.LocalDateTime; |
|||
|
|||
/**
 * Mutable data holder for one crawled movie record.
 *
 * <p>All fields are nullable wrapper or reference types; the no-argument
 * constructor leaves every field {@code null}.
 */
public class Movie {
    private Integer id;
    private String source;            // site the record was crawled from
    private Integer rank;             // ranking position
    private String name;              // movie title
    private String director;          // director
    private String actors;            // cast
    private String year;              // release year
    private Double rating;            // rating score
    private Integer ratingCount;      // number of ratings
    private String description;       // synopsis
    private String url;               // detail-page link
    private String imageUrl;          // poster image
    private LocalDateTime crawlTime;  // when the record was crawled

    /** Creates an empty movie; every field, including the crawl time, stays {@code null}. */
    public Movie() {}

    /**
     * Creates a movie with its core fields and stamps the crawl time with the
     * current local date-time.
     *
     * @param source name of the site the record came from
     * @param rank   ranking position
     * @param name   movie title
     * @param rating rating score; may be {@code null}
     */
    public Movie(String source, Integer rank, String name, Double rating) {
        this.source = source;
        this.rank = rank;
        this.name = name;
        this.rating = rating;
        this.crawlTime = LocalDateTime.now();
    }

    // Getters and setters

    public Integer getId() { return id; }
    public void setId(Integer id) { this.id = id; }

    public String getSource() { return source; }
    public void setSource(String source) { this.source = source; }

    public Integer getRank() { return rank; }
    public void setRank(Integer rank) { this.rank = rank; }

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }

    public String getDirector() { return director; }
    public void setDirector(String director) { this.director = director; }

    public String getActors() { return actors; }
    public void setActors(String actors) { this.actors = actors; }

    public String getYear() { return year; }
    public void setYear(String year) { this.year = year; }

    public Double getRating() { return rating; }
    public void setRating(Double rating) { this.rating = rating; }

    public Integer getRatingCount() { return ratingCount; }
    public void setRatingCount(Integer ratingCount) { this.ratingCount = ratingCount; }

    public String getDescription() { return description; }
    public void setDescription(String description) { this.description = description; }

    public String getUrl() { return url; }
    public void setUrl(String url) { this.url = url; }

    public String getImageUrl() { return imageUrl; }
    public void setImageUrl(String imageUrl) { this.imageUrl = imageUrl; }

    public LocalDateTime getCrawlTime() { return crawlTime; }
    public void setCrawlTime(LocalDateTime crawlTime) { this.crawlTime = crawlTime; }

    /**
     * Short summary with source, rank, name and rating; a missing rating is
     * shown as {@code 0.0}, the same way {@code Book.toString()} does it.
     */
    @Override
    public String toString() {
        // A null argument to %.1f is not rendered as a number (the precision
        // is applied to the text "null"), so substitute a numeric default.
        return String.format("Movie{source='%s', rank=%d, name='%s', rating=%.1f}",
                source, rank, name, rating != null ? rating : 0.0);
    }
}
|||
@ -0,0 +1,34 @@ |
|||
package storage; |
|||
|
|||
import model.Movie; |
|||
import model.Book; |
|||
import java.util.List; |
|||
|
|||
public interface DataStorage { |
|||
|
|||
void save(Movie movie); |
|||
|
|||
void saveBatch(List<Movie> movies); |
|||
|
|||
List<Movie> findAll(); |
|||
|
|||
List<Movie> findBySource(String source); |
|||
|
|||
List<Movie> findByRankRange(int start, int end); |
|||
|
|||
void deleteBySource(String source); |
|||
|
|||
void clearAll(); |
|||
|
|||
StorageStats getStats(); |
|||
|
|||
void close(); |
|||
|
|||
void saveBook(model.Book book); |
|||
|
|||
void saveBookBatch(List<Book> books); |
|||
|
|||
List<Book> findAllBooks(); |
|||
|
|||
List<Book> findBooksBySource(String source); |
|||
} |
|||
@ -0,0 +1,237 @@ |
|||
package storage; |
|||
|
|||
import com.google.gson.Gson; |
|||
import com.google.gson.GsonBuilder; |
|||
import model.Book; |
|||
import model.Movie; |
|||
|
|||
import java.io.*; |
|||
import java.nio.charset.StandardCharsets; |
|||
import java.time.LocalDateTime; |
|||
import java.time.format.DateTimeFormatter; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class FileStorage implements DataStorage { |
|||
private static final Gson gson = new GsonBuilder() |
|||
.setPrettyPrinting() |
|||
.registerTypeAdapter(LocalDateTime.class, new LocalDateTimeAdapter()) |
|||
.create(); |
|||
|
|||
private final String outputDir; |
|||
|
|||
public FileStorage() { |
|||
this("output"); |
|||
} |
|||
|
|||
public FileStorage(String outputDir) { |
|||
this.outputDir = outputDir; |
|||
File dir = new File(outputDir); |
|||
if (!dir.exists()) { |
|||
dir.mkdirs(); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public void save(Movie movie) { |
|||
List<Movie> list = new ArrayList<>(); |
|||
list.add(movie); |
|||
saveBatch(list); |
|||
} |
|||
|
|||
@Override |
|||
public void saveBatch(List<Movie> movies) { |
|||
if (movies.isEmpty()) return; |
|||
|
|||
String source = movies.get(0).getSource(); |
|||
|
|||
saveMoviesAsJson(movies, source); |
|||
saveMoviesAsTxt(movies, source); |
|||
} |
|||
|
|||
private void saveMoviesAsJson(List<Movie> movies, String source) { |
|||
String filename = outputDir + "/" + sanitizeFilename(source) + "_" + |
|||
LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")) + ".json"; |
|||
|
|||
try (Writer writer = new OutputStreamWriter( |
|||
new FileOutputStream(filename), StandardCharsets.UTF_8)) { |
|||
gson.toJson(movies, writer); |
|||
System.out.println("JSON文件已保存: " + filename); |
|||
} catch (IOException e) { |
|||
System.err.println("保存JSON失败: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
private void saveMoviesAsTxt(List<Movie> movies, String source) { |
|||
String filename = outputDir + "/" + sanitizeFilename(source) + "_" + |
|||
LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")) + ".txt"; |
|||
|
|||
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter( |
|||
new FileOutputStream(filename), StandardCharsets.UTF_8))) { |
|||
|
|||
writer.write("=========================================="); |
|||
writer.newLine(); |
|||
writer.write(" 数据来源: " + source); |
|||
writer.newLine(); |
|||
writer.write(" 爬取时间: " + LocalDateTime.now()); |
|||
writer.newLine(); |
|||
writer.write(" 电影数量: " + movies.size()); |
|||
writer.newLine(); |
|||
writer.write("=========================================="); |
|||
writer.newLine(); |
|||
writer.newLine(); |
|||
|
|||
for (Movie movie : movies) { |
|||
writer.write(String.format("排名: %d", movie.getRank())); |
|||
writer.newLine(); |
|||
writer.write(String.format("电影: %s", movie.getName())); |
|||
writer.newLine(); |
|||
writer.write(String.format("评分: %.1f", movie.getRating())); |
|||
writer.newLine(); |
|||
if (movie.getDirector() != null) { |
|||
writer.write(String.format("导演: %s", movie.getDirector())); |
|||
writer.newLine(); |
|||
} |
|||
if (movie.getYear() != null) { |
|||
writer.write(String.format("年份: %s", movie.getYear())); |
|||
writer.newLine(); |
|||
} |
|||
writer.write("------------------------------------------"); |
|||
writer.newLine(); |
|||
} |
|||
|
|||
System.out.println("TXT文件已保存: " + filename); |
|||
} catch (IOException e) { |
|||
System.err.println("保存TXT失败: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public void saveBook(Book book) { |
|||
List<Book> list = new ArrayList<>(); |
|||
list.add(book); |
|||
saveBookBatch(list); |
|||
} |
|||
|
|||
@Override |
|||
public void saveBookBatch(List<Book> books) { |
|||
if (books.isEmpty()) return; |
|||
|
|||
String source = books.get(0).getSource(); |
|||
|
|||
saveBooksAsJson(books, source); |
|||
saveBooksAsTxt(books, source); |
|||
} |
|||
|
|||
private void saveBooksAsJson(List<Book> books, String source) { |
|||
String filename = outputDir + "/" + sanitizeFilename(source) + "_" + |
|||
LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")) + ".json"; |
|||
|
|||
try (Writer writer = new OutputStreamWriter( |
|||
new FileOutputStream(filename), StandardCharsets.UTF_8)) { |
|||
gson.toJson(books, writer); |
|||
System.out.println("JSON文件已保存: " + filename); |
|||
} catch (IOException e) { |
|||
System.err.println("保存JSON失败: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
private void saveBooksAsTxt(List<Book> books, String source) { |
|||
String filename = outputDir + "/" + sanitizeFilename(source) + "_" + |
|||
LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")) + ".txt"; |
|||
|
|||
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter( |
|||
new FileOutputStream(filename), StandardCharsets.UTF_8))) { |
|||
|
|||
writer.write("=========================================="); |
|||
writer.newLine(); |
|||
writer.write(" 数据来源: " + source); |
|||
writer.newLine(); |
|||
writer.write(" 爬取时间: " + LocalDateTime.now()); |
|||
writer.newLine(); |
|||
writer.write(" 图书数量: " + books.size()); |
|||
writer.newLine(); |
|||
writer.write("=========================================="); |
|||
writer.newLine(); |
|||
writer.newLine(); |
|||
|
|||
for (Book book : books) { |
|||
if (book.getRank() != null) { |
|||
writer.write(String.format("排名: %d", book.getRank())); |
|||
writer.newLine(); |
|||
} |
|||
writer.write(String.format("书名: %s", book.getTitle())); |
|||
writer.newLine(); |
|||
if (book.getRating() != null) { |
|||
writer.write(String.format("评分: %.1f", book.getRating())); |
|||
writer.newLine(); |
|||
} |
|||
if (book.getAuthor() != null) { |
|||
writer.write(String.format("作者: %s", book.getAuthor())); |
|||
writer.newLine(); |
|||
} |
|||
if (book.getPublisher() != null) { |
|||
writer.write(String.format("出版社: %s", book.getPublisher())); |
|||
writer.newLine(); |
|||
} |
|||
if (book.getPrice() != null) { |
|||
writer.write(String.format("价格: %.2f", book.getPrice())); |
|||
writer.newLine(); |
|||
} |
|||
if (book.getYear() != null) { |
|||
writer.write(String.format("年份: %s", book.getYear())); |
|||
writer.newLine(); |
|||
} |
|||
writer.write("------------------------------------------"); |
|||
writer.newLine(); |
|||
} |
|||
|
|||
System.out.println("TXT文件已保存: " + filename); |
|||
} catch (IOException e) { |
|||
System.err.println("保存TXT失败: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
private String sanitizeFilename(String filename) { |
|||
return filename.replaceAll("[\\\\/:*?\"<>|]", "_"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Movie> findAll() { |
|||
return new ArrayList<>(); |
|||
} |
|||
|
|||
@Override |
|||
public List<Movie> findBySource(String source) { |
|||
return new ArrayList<>(); |
|||
} |
|||
|
|||
@Override |
|||
public List<Movie> findByRankRange(int start, int end) { |
|||
return new ArrayList<>(); |
|||
} |
|||
|
|||
@Override |
|||
public List<Book> findAllBooks() { |
|||
return new ArrayList<>(); |
|||
} |
|||
|
|||
@Override |
|||
public List<Book> findBooksBySource(String source) { |
|||
return new ArrayList<>(); |
|||
} |
|||
|
|||
@Override |
|||
public void deleteBySource(String source) {} |
|||
|
|||
@Override |
|||
public void clearAll() {} |
|||
|
|||
@Override |
|||
public StorageStats getStats() { |
|||
return new StorageStats(0, 0); |
|||
} |
|||
|
|||
@Override |
|||
public void close() {} |
|||
} |
|||
@ -0,0 +1,25 @@ |
|||
package storage; |
|||
|
|||
import com.google.gson.*; |
|||
|
|||
import java.lang.reflect.Type; |
|||
import java.time.LocalDateTime; |
|||
import java.time.format.DateTimeFormatter; |
|||
|
|||
/** |
|||
* Gson LocalDateTime 适配器 |
|||
*/ |
|||
public class LocalDateTimeAdapter implements JsonSerializer<LocalDateTime>, JsonDeserializer<LocalDateTime> { |
|||
private static final DateTimeFormatter formatter = DateTimeFormatter.ISO_LOCAL_DATE_TIME; |
|||
|
|||
@Override |
|||
public JsonElement serialize(LocalDateTime src, Type typeOfSrc, JsonSerializationContext context) { |
|||
return new JsonPrimitive(formatter.format(src)); |
|||
} |
|||
|
|||
@Override |
|||
public LocalDateTime deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context) |
|||
throws JsonParseException { |
|||
return LocalDateTime.parse(json.getAsString(), formatter); |
|||
} |
|||
} |
|||
@ -0,0 +1,414 @@ |
|||
package storage; |
|||
|
|||
import model.Book; |
|||
import model.Movie; |
|||
|
|||
import java.sql.*; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class SQLiteStorage implements DataStorage { |
|||
private static final String DB_URL = "jdbc:sqlite:crawler.db"; |
|||
private Connection connection; |
|||
|
|||
public SQLiteStorage() { |
|||
try { |
|||
connection = DriverManager.getConnection(DB_URL); |
|||
initTable(); |
|||
} catch (SQLException e) { |
|||
throw new RuntimeException("数据库连接失败: " + e.getMessage(), e); |
|||
} |
|||
} |
|||
|
|||
private void initTable() throws SQLException { |
|||
String movieSql = "CREATE TABLE IF NOT EXISTS movies (" + |
|||
"id INTEGER PRIMARY KEY AUTOINCREMENT," + |
|||
"source TEXT NOT NULL," + |
|||
"rank INTEGER," + |
|||
"name TEXT NOT NULL," + |
|||
"director TEXT," + |
|||
"actors TEXT," + |
|||
"year TEXT," + |
|||
"rating REAL," + |
|||
"rating_count INTEGER," + |
|||
"description TEXT," + |
|||
"url TEXT," + |
|||
"image_url TEXT," + |
|||
"crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP" + |
|||
")"; |
|||
|
|||
String bookSql = "CREATE TABLE IF NOT EXISTS books (" + |
|||
"id INTEGER PRIMARY KEY AUTOINCREMENT," + |
|||
"source TEXT NOT NULL," + |
|||
"rank INTEGER," + |
|||
"title TEXT NOT NULL," + |
|||
"author TEXT," + |
|||
"publisher TEXT," + |
|||
"year TEXT," + |
|||
"price REAL," + |
|||
"rating REAL," + |
|||
"rating_count INTEGER," + |
|||
"category TEXT," + |
|||
"description TEXT," + |
|||
"url TEXT," + |
|||
"image_url TEXT," + |
|||
"isbn TEXT," + |
|||
"crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP" + |
|||
")"; |
|||
|
|||
try (Statement stmt = connection.createStatement()) { |
|||
stmt.execute(movieSql); |
|||
stmt.execute(bookSql); |
|||
} |
|||
|
|||
String indexSql1 = "CREATE INDEX IF NOT EXISTS idx_movie_source ON movies(source)"; |
|||
String indexSql2 = "CREATE INDEX IF NOT EXISTS idx_book_source ON books(source)"; |
|||
try (Statement stmt = connection.createStatement()) { |
|||
stmt.execute(indexSql1); |
|||
stmt.execute(indexSql2); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public void save(Movie movie) { |
|||
String sql = "INSERT INTO movies (source, rank, name, director, actors, year, " + |
|||
"rating, rating_count, description, url, image_url, crawl_time) " + |
|||
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"; |
|||
|
|||
try (PreparedStatement pstmt = connection.prepareStatement(sql)) { |
|||
pstmt.setString(1, movie.getSource()); |
|||
pstmt.setObject(2, movie.getRank()); |
|||
pstmt.setString(3, movie.getName()); |
|||
pstmt.setString(4, movie.getDirector()); |
|||
pstmt.setString(5, movie.getActors()); |
|||
pstmt.setString(6, movie.getYear()); |
|||
pstmt.setObject(7, movie.getRating()); |
|||
pstmt.setObject(8, movie.getRatingCount()); |
|||
pstmt.setString(9, movie.getDescription()); |
|||
pstmt.setString(10, movie.getUrl()); |
|||
pstmt.setString(11, movie.getImageUrl()); |
|||
pstmt.setTimestamp(12, movie.getCrawlTime() != null ? |
|||
Timestamp.valueOf(movie.getCrawlTime()) : null); |
|||
pstmt.executeUpdate(); |
|||
} catch (SQLException e) { |
|||
System.err.println("保存电影失败: " + movie.getName() + " - " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public void saveBatch(List<Movie> movies) { |
|||
String sql = "INSERT INTO movies (source, rank, name, director, actors, year, " + |
|||
"rating, rating_count, description, url, image_url, crawl_time) " + |
|||
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"; |
|||
|
|||
try (PreparedStatement pstmt = connection.prepareStatement(sql)) { |
|||
connection.setAutoCommit(false); |
|||
|
|||
for (Movie movie : movies) { |
|||
pstmt.setString(1, movie.getSource()); |
|||
pstmt.setObject(2, movie.getRank()); |
|||
pstmt.setString(3, movie.getName()); |
|||
pstmt.setString(4, movie.getDirector()); |
|||
pstmt.setString(5, movie.getActors()); |
|||
pstmt.setString(6, movie.getYear()); |
|||
pstmt.setObject(7, movie.getRating()); |
|||
pstmt.setObject(8, movie.getRatingCount()); |
|||
pstmt.setString(9, movie.getDescription()); |
|||
pstmt.setString(10, movie.getUrl()); |
|||
pstmt.setString(11, movie.getImageUrl()); |
|||
pstmt.setTimestamp(12, movie.getCrawlTime() != null ? |
|||
Timestamp.valueOf(movie.getCrawlTime()) : null); |
|||
pstmt.addBatch(); |
|||
} |
|||
|
|||
pstmt.executeBatch(); |
|||
connection.commit(); |
|||
System.out.println("批量保存 " + movies.size() + " 条数据成功"); |
|||
} catch (SQLException e) { |
|||
try { |
|||
connection.rollback(); |
|||
} catch (SQLException ex) { |
|||
ex.printStackTrace(); |
|||
} |
|||
System.err.println("批量保存失败: " + e.getMessage()); |
|||
} finally { |
|||
try { |
|||
connection.setAutoCommit(true); |
|||
} catch (SQLException e) { |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public void saveBook(Book book) { |
|||
String sql = "INSERT INTO books (source, rank, title, author, publisher, year, " + |
|||
"price, rating, rating_count, category, description, url, image_url, isbn, crawl_time) " + |
|||
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"; |
|||
|
|||
try (PreparedStatement pstmt = connection.prepareStatement(sql)) { |
|||
pstmt.setString(1, book.getSource()); |
|||
pstmt.setObject(2, book.getRank()); |
|||
pstmt.setString(3, book.getTitle()); |
|||
pstmt.setString(4, book.getAuthor()); |
|||
pstmt.setString(5, book.getPublisher()); |
|||
pstmt.setString(6, book.getYear()); |
|||
pstmt.setObject(7, book.getPrice()); |
|||
pstmt.setObject(8, book.getRating()); |
|||
pstmt.setObject(9, book.getRatingCount()); |
|||
pstmt.setString(10, book.getCategory()); |
|||
pstmt.setString(11, book.getDescription()); |
|||
pstmt.setString(12, book.getUrl()); |
|||
pstmt.setString(13, book.getImageUrl()); |
|||
pstmt.setString(14, book.getIsbn()); |
|||
pstmt.setTimestamp(15, book.getCrawlTime() != null ? |
|||
Timestamp.valueOf(book.getCrawlTime()) : null); |
|||
pstmt.executeUpdate(); |
|||
} catch (SQLException e) { |
|||
System.err.println("保存图书失败: " + book.getTitle() + " - " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public void saveBookBatch(List<Book> books) { |
|||
String sql = "INSERT INTO books (source, rank, title, author, publisher, year, " + |
|||
"price, rating, rating_count, category, description, url, image_url, isbn, crawl_time) " + |
|||
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"; |
|||
|
|||
try (PreparedStatement pstmt = connection.prepareStatement(sql)) { |
|||
connection.setAutoCommit(false); |
|||
|
|||
for (Book book : books) { |
|||
pstmt.setString(1, book.getSource()); |
|||
pstmt.setObject(2, book.getRank()); |
|||
pstmt.setString(3, book.getTitle()); |
|||
pstmt.setString(4, book.getAuthor()); |
|||
pstmt.setString(5, book.getPublisher()); |
|||
pstmt.setString(6, book.getYear()); |
|||
pstmt.setObject(7, book.getPrice()); |
|||
pstmt.setObject(8, book.getRating()); |
|||
pstmt.setObject(9, book.getRatingCount()); |
|||
pstmt.setString(10, book.getCategory()); |
|||
pstmt.setString(11, book.getDescription()); |
|||
pstmt.setString(12, book.getUrl()); |
|||
pstmt.setString(13, book.getImageUrl()); |
|||
pstmt.setString(14, book.getIsbn()); |
|||
pstmt.setTimestamp(15, book.getCrawlTime() != null ? |
|||
Timestamp.valueOf(book.getCrawlTime()) : null); |
|||
pstmt.addBatch(); |
|||
} |
|||
|
|||
pstmt.executeBatch(); |
|||
connection.commit(); |
|||
System.out.println("批量保存 " + books.size() + " 条图书数据成功"); |
|||
} catch (SQLException e) { |
|||
try { |
|||
connection.rollback(); |
|||
} catch (SQLException ex) { |
|||
ex.printStackTrace(); |
|||
} |
|||
System.err.println("批量保存图书失败: " + e.getMessage()); |
|||
} finally { |
|||
try { |
|||
connection.setAutoCommit(true); |
|||
} catch (SQLException e) { |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public List<Movie> findAll() { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
String sql = "SELECT * FROM movies ORDER BY source, rank"; |
|||
|
|||
try (Statement stmt = connection.createStatement(); |
|||
ResultSet rs = stmt.executeQuery(sql)) { |
|||
while (rs.next()) { |
|||
movies.add(mapResultSetToMovie(rs)); |
|||
} |
|||
} catch (SQLException e) { |
|||
System.err.println("查询失败: " + e.getMessage()); |
|||
} |
|||
return movies; |
|||
} |
|||
|
|||
@Override |
|||
public List<Movie> findBySource(String source) { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
String sql = "SELECT * FROM movies WHERE source = ? ORDER BY rank"; |
|||
|
|||
try (PreparedStatement pstmt = connection.prepareStatement(sql)) { |
|||
pstmt.setString(1, source); |
|||
ResultSet rs = pstmt.executeQuery(); |
|||
while (rs.next()) { |
|||
movies.add(mapResultSetToMovie(rs)); |
|||
} |
|||
} catch (SQLException e) { |
|||
System.err.println("查询失败: " + e.getMessage()); |
|||
} |
|||
return movies; |
|||
} |
|||
|
|||
@Override |
|||
public List<Movie> findByRankRange(int start, int end) { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
String sql = "SELECT * FROM movies WHERE rank BETWEEN ? AND ? ORDER BY rank"; |
|||
|
|||
try (PreparedStatement pstmt = connection.prepareStatement(sql)) { |
|||
pstmt.setInt(1, start); |
|||
pstmt.setInt(2, end); |
|||
ResultSet rs = pstmt.executeQuery(); |
|||
while (rs.next()) { |
|||
movies.add(mapResultSetToMovie(rs)); |
|||
} |
|||
} catch (SQLException e) { |
|||
System.err.println("查询失败: " + e.getMessage()); |
|||
} |
|||
return movies; |
|||
} |
|||
|
|||
@Override |
|||
public List<Book> findAllBooks() { |
|||
List<Book> books = new ArrayList<>(); |
|||
String sql = "SELECT * FROM books ORDER BY source, rank"; |
|||
|
|||
try (Statement stmt = connection.createStatement(); |
|||
ResultSet rs = stmt.executeQuery(sql)) { |
|||
while (rs.next()) { |
|||
books.add(mapResultSetToBook(rs)); |
|||
} |
|||
} catch (SQLException e) { |
|||
System.err.println("查询失败: " + e.getMessage()); |
|||
} |
|||
return books; |
|||
} |
|||
|
|||
@Override |
|||
public List<Book> findBooksBySource(String source) { |
|||
List<Book> books = new ArrayList<>(); |
|||
String sql = "SELECT * FROM books WHERE source = ? ORDER BY rank"; |
|||
|
|||
try (PreparedStatement pstmt = connection.prepareStatement(sql)) { |
|||
pstmt.setString(1, source); |
|||
ResultSet rs = pstmt.executeQuery(); |
|||
while (rs.next()) { |
|||
books.add(mapResultSetToBook(rs)); |
|||
} |
|||
} catch (SQLException e) { |
|||
System.err.println("查询失败: " + e.getMessage()); |
|||
} |
|||
return books; |
|||
} |
|||
|
|||
@Override |
|||
public void deleteBySource(String source) { |
|||
String sql1 = "DELETE FROM movies WHERE source = ?"; |
|||
String sql2 = "DELETE FROM books WHERE source = ?"; |
|||
try (PreparedStatement pstmt1 = connection.prepareStatement(sql1); |
|||
PreparedStatement pstmt2 = connection.prepareStatement(sql2)) { |
|||
pstmt1.setString(1, source); |
|||
pstmt2.setString(1, source); |
|||
int count1 = pstmt1.executeUpdate(); |
|||
int count2 = pstmt2.executeUpdate(); |
|||
System.out.println("删除 " + source + " 的 " + (count1 + count2) + " 条数据"); |
|||
} catch (SQLException e) { |
|||
System.err.println("删除失败: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public void clearAll() { |
|||
try (Statement stmt = connection.createStatement()) { |
|||
stmt.execute("DELETE FROM movies"); |
|||
stmt.execute("DELETE FROM books"); |
|||
System.out.println("清空所有数据"); |
|||
} catch (SQLException e) { |
|||
System.err.println("清空失败: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public StorageStats getStats() { |
|||
int totalCount = 0; |
|||
int sourceCount = 0; |
|||
|
|||
try (Statement stmt = connection.createStatement()) { |
|||
ResultSet rs = stmt.executeQuery("SELECT COUNT(*) FROM movies"); |
|||
if (rs.next()) { |
|||
totalCount = rs.getInt(1); |
|||
} |
|||
rs = stmt.executeQuery("SELECT COUNT(*) FROM books"); |
|||
if (rs.next()) { |
|||
totalCount += rs.getInt(1); |
|||
} |
|||
rs = stmt.executeQuery("SELECT COUNT(DISTINCT source) FROM movies"); |
|||
if (rs.next()) { |
|||
sourceCount = rs.getInt(1); |
|||
} |
|||
rs = stmt.executeQuery("SELECT COUNT(DISTINCT source) FROM books"); |
|||
if (rs.next()) { |
|||
sourceCount += rs.getInt(1); |
|||
} |
|||
} catch (SQLException e) { |
|||
System.err.println("统计失败: " + e.getMessage()); |
|||
} |
|||
return new StorageStats(totalCount, sourceCount); |
|||
} |
|||
|
|||
@Override |
|||
public void close() { |
|||
try { |
|||
if (connection != null && !connection.isClosed()) { |
|||
connection.close(); |
|||
} |
|||
} catch (SQLException e) { |
|||
System.err.println("关闭连接失败: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
private Movie mapResultSetToMovie(ResultSet rs) throws SQLException { |
|||
Movie movie = new Movie(); |
|||
movie.setId(rs.getInt("id")); |
|||
movie.setSource(rs.getString("source")); |
|||
movie.setRank(rs.getInt("rank")); |
|||
movie.setName(rs.getString("name")); |
|||
movie.setDirector(rs.getString("director")); |
|||
movie.setActors(rs.getString("actors")); |
|||
movie.setYear(rs.getString("year")); |
|||
movie.setRating(rs.getDouble("rating")); |
|||
movie.setRatingCount(rs.getInt("rating_count")); |
|||
movie.setDescription(rs.getString("description")); |
|||
movie.setUrl(rs.getString("url")); |
|||
movie.setImageUrl(rs.getString("image_url")); |
|||
Timestamp ts = rs.getTimestamp("crawl_time"); |
|||
if (ts != null) { |
|||
movie.setCrawlTime(ts.toLocalDateTime()); |
|||
} |
|||
return movie; |
|||
} |
|||
|
|||
private Book mapResultSetToBook(ResultSet rs) throws SQLException { |
|||
Book book = new Book(); |
|||
book.setId(rs.getInt("id")); |
|||
book.setSource(rs.getString("source")); |
|||
book.setRank(rs.getInt("rank")); |
|||
book.setTitle(rs.getString("title")); |
|||
book.setAuthor(rs.getString("author")); |
|||
book.setPublisher(rs.getString("publisher")); |
|||
book.setYear(rs.getString("year")); |
|||
book.setPrice(rs.getDouble("price")); |
|||
book.setRating(rs.getDouble("rating")); |
|||
book.setRatingCount(rs.getInt("rating_count")); |
|||
book.setCategory(rs.getString("category")); |
|||
book.setDescription(rs.getString("description")); |
|||
book.setUrl(rs.getString("url")); |
|||
book.setImageUrl(rs.getString("image_url")); |
|||
book.setIsbn(rs.getString("isbn")); |
|||
Timestamp ts = rs.getTimestamp("crawl_time"); |
|||
if (ts != null) { |
|||
book.setCrawlTime(ts.toLocalDateTime()); |
|||
} |
|||
return book; |
|||
} |
|||
} |
|||
@ -0,0 +1,23 @@ |
|||
package storage; |
|||
|
|||
/**
 * Immutable storage statistics: a total record count and a source count.
 */
public class StorageStats {
    // Both counters are fixed at construction; the class exposes no setters.
    private final int totalCount;
    private final int sourceCount;

    /**
     * @param totalCount  number of stored records
     * @param sourceCount number of sources
     */
    public StorageStats(int totalCount, int sourceCount) {
        this.totalCount = totalCount;
        this.sourceCount = sourceCount;
    }

    /** @return the total record count passed to the constructor */
    public int getTotalCount() {
        return totalCount;
    }

    /** @return the source count passed to the constructor */
    public int getSourceCount() {
        return sourceCount;
    }

    /** @return a string of the form {@code StorageStats{totalCount=N, sourceCount=M}} */
    @Override
    public String toString() {
        return String.format("StorageStats{totalCount=%d, sourceCount=%d}",
                totalCount, sourceCount);
    }
}
|||
@ -0,0 +1,115 @@ |
|||
package strategy; |
|||
|
|||
import exception.CrawlerResult; |
|||
import exception.NetworkException; |
|||
import exception.ParseException; |
|||
import model.Book; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import storage.DataStorage; |
|||
import util.Logger; |
|||
|
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public abstract class AbstractBookCrawlerStrategy implements BookCrawlerStrategy { |
|||
protected DataStorage storage; |
|||
protected int delayMs = 1500; |
|||
|
|||
@Override |
|||
public void setStorage(DataStorage storage) { |
|||
this.storage = storage; |
|||
} |
|||
|
|||
@Override |
|||
public int getDelayMs() { |
|||
return delayMs; |
|||
} |
|||
|
|||
protected Document fetchDocument(String url) throws IOException { |
|||
return Jsoup.connect(url) |
|||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + |
|||
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") |
|||
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") |
|||
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8") |
|||
.timeout(15000) |
|||
.get(); |
|||
} |
|||
|
|||
protected void delay() { |
|||
try { |
|||
Thread.sleep(delayMs); |
|||
} catch (InterruptedException e) { |
|||
Thread.currentThread().interrupt(); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public CrawlerResult execute() { |
|||
long startTime = System.currentTimeMillis(); |
|||
String sourceName = getName(); |
|||
List<Book> allBooks = new ArrayList<>(); |
|||
|
|||
try { |
|||
List<String> urls = getPageUrls(); |
|||
Logger.info(String.format("[%s] 开始爬取,共 %d 个页面", sourceName, urls.size())); |
|||
|
|||
for (int i = 0; i < urls.size(); i++) { |
|||
String url = urls.get(i); |
|||
Logger.info(String.format("[%s] 爬取第 %d/%d 页: %s", sourceName, i + 1, urls.size(), url)); |
|||
|
|||
try { |
|||
Document doc = fetchDocument(url); |
|||
List<Book> books = parseBooks(doc.html()); |
|||
|
|||
for (Book book : books) { |
|||
book.setSource(sourceName); |
|||
} |
|||
|
|||
allBooks.addAll(books); |
|||
Logger.info(String.format("[%s] 第 %d 页获取 %d 条数据", sourceName, i + 1, books.size())); |
|||
|
|||
} catch (IOException e) { |
|||
Logger.error(String.format("[%s] 网络请求失败: %s", sourceName, url), e); |
|||
throw new NetworkException("网络请求失败: " + url, sourceName, e); |
|||
} catch (Exception e) { |
|||
Logger.error(String.format("[%s] 解析页面失败: %s", sourceName, url), e); |
|||
throw new ParseException("解析页面失败: " + url, sourceName, e); |
|||
} |
|||
|
|||
if (i < urls.size() - 1) { |
|||
delay(); |
|||
} |
|||
} |
|||
|
|||
if (storage != null && !allBooks.isEmpty()) { |
|||
storage.saveBookBatch(allBooks); |
|||
Logger.info(String.format("[%s] 数据已保存到存储", sourceName)); |
|||
} |
|||
|
|||
long elapsedTime = System.currentTimeMillis() - startTime; |
|||
return CrawlerResult.success(sourceName) |
|||
.message("爬取成功") |
|||
.dataCount(allBooks.size()) |
|||
.elapsedTime(elapsedTime) |
|||
.build(); |
|||
|
|||
} catch (NetworkException e) { |
|||
long elapsedTime = System.currentTimeMillis() - startTime; |
|||
return CrawlerResult.failure(sourceName, "NETWORK_ERROR", e.getMessage()) |
|||
.elapsedTime(elapsedTime) |
|||
.build(); |
|||
} catch (ParseException e) { |
|||
long elapsedTime = System.currentTimeMillis() - startTime; |
|||
return CrawlerResult.failure(sourceName, "PARSE_ERROR", e.getMessage()) |
|||
.elapsedTime(elapsedTime) |
|||
.build(); |
|||
} catch (Exception e) { |
|||
long elapsedTime = System.currentTimeMillis() - startTime; |
|||
return CrawlerResult.failure(sourceName, "UNKNOWN_ERROR", e.getMessage()) |
|||
.elapsedTime(elapsedTime) |
|||
.build(); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,114 @@ |
|||
package strategy; |
|||
|
|||
import exception.CrawlerResult; |
|||
import exception.NetworkException; |
|||
import exception.ParseException; |
|||
import model.Movie; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import storage.DataStorage; |
|||
import util.Logger; |
|||
|
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public abstract class AbstractCrawlerStrategy implements CrawlerStrategy { |
|||
protected DataStorage storage; |
|||
protected int delayMs = 1500; |
|||
|
|||
public void setStorage(DataStorage storage) { |
|||
this.storage = storage; |
|||
} |
|||
|
|||
@Override |
|||
public int getDelayMs() { |
|||
return delayMs; |
|||
} |
|||
|
|||
protected Document fetchDocument(String url) throws IOException { |
|||
return Jsoup.connect(url) |
|||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + |
|||
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") |
|||
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") |
|||
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8") |
|||
.timeout(15000) |
|||
.get(); |
|||
} |
|||
|
|||
protected void delay() { |
|||
try { |
|||
Thread.sleep(delayMs); |
|||
} catch (InterruptedException e) { |
|||
Thread.currentThread().interrupt(); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public CrawlerResult execute() { |
|||
long startTime = System.currentTimeMillis(); |
|||
String sourceName = getName(); |
|||
List<Movie> allMovies = new ArrayList<>(); |
|||
|
|||
try { |
|||
List<String> urls = getPageUrls(); |
|||
Logger.info(String.format("[%s] 开始爬取,共 %d 个页面", sourceName, urls.size())); |
|||
|
|||
for (int i = 0; i < urls.size(); i++) { |
|||
String url = urls.get(i); |
|||
Logger.info(String.format("[%s] 爬取第 %d/%d 页: %s", sourceName, i + 1, urls.size(), url)); |
|||
|
|||
try { |
|||
Document doc = fetchDocument(url); |
|||
List<Movie> movies = parseMovies(doc.html()); |
|||
|
|||
for (Movie movie : movies) { |
|||
movie.setSource(sourceName); |
|||
} |
|||
|
|||
allMovies.addAll(movies); |
|||
Logger.info(String.format("[%s] 第 %d 页获取 %d 条数据", sourceName, i + 1, movies.size())); |
|||
|
|||
} catch (IOException e) { |
|||
Logger.error(String.format("[%s] 网络请求失败: %s", sourceName, url), e); |
|||
throw new NetworkException("网络请求失败: " + url, sourceName, e); |
|||
} catch (Exception e) { |
|||
Logger.error(String.format("[%s] 解析页面失败: %s", sourceName, url), e); |
|||
throw new ParseException("解析页面失败: " + url, sourceName, e); |
|||
} |
|||
|
|||
if (i < urls.size() - 1) { |
|||
delay(); |
|||
} |
|||
} |
|||
|
|||
if (storage != null && !allMovies.isEmpty()) { |
|||
storage.saveBatch(allMovies); |
|||
Logger.info(String.format("[%s] 数据已保存到存储", sourceName)); |
|||
} |
|||
|
|||
long elapsedTime = System.currentTimeMillis() - startTime; |
|||
return CrawlerResult.success(sourceName) |
|||
.message("爬取成功") |
|||
.dataCount(allMovies.size()) |
|||
.elapsedTime(elapsedTime) |
|||
.build(); |
|||
|
|||
} catch (NetworkException e) { |
|||
long elapsedTime = System.currentTimeMillis() - startTime; |
|||
return CrawlerResult.failure(sourceName, "NETWORK_ERROR", e.getMessage()) |
|||
.elapsedTime(elapsedTime) |
|||
.build(); |
|||
} catch (ParseException e) { |
|||
long elapsedTime = System.currentTimeMillis() - startTime; |
|||
return CrawlerResult.failure(sourceName, "PARSE_ERROR", e.getMessage()) |
|||
.elapsedTime(elapsedTime) |
|||
.build(); |
|||
} catch (Exception e) { |
|||
long elapsedTime = System.currentTimeMillis() - startTime; |
|||
return CrawlerResult.failure(sourceName, "UNKNOWN_ERROR", e.getMessage()) |
|||
.elapsedTime(elapsedTime) |
|||
.build(); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,15 @@ |
|||
package strategy; |
|||
|
|||
import exception.CrawlerResult; |
|||
import storage.DataStorage; |
|||
import java.util.List; |
|||
|
|||
public interface BookCrawlerStrategy { |
|||
String getName(); |
|||
String getBaseUrl(); |
|||
List<String> getPageUrls(); |
|||
List<model.Book> parseBooks(String htmlContent); |
|||
CrawlerResult execute(); |
|||
int getDelayMs(); |
|||
void setStorage(DataStorage storage); |
|||
} |
|||
@ -0,0 +1,16 @@ |
|||
package strategy; |
|||
|
|||
import exception.CrawlerResult; |
|||
import model.Movie; |
|||
import storage.DataStorage; |
|||
import java.util.List; |
|||
|
|||
public interface CrawlerStrategy { |
|||
String getName(); |
|||
String getBaseUrl(); |
|||
List<String> getPageUrls(); |
|||
List<Movie> parseMovies(String htmlContent); |
|||
CrawlerResult execute(); |
|||
int getDelayMs(); |
|||
void setStorage(DataStorage storage); |
|||
} |
|||
@ -0,0 +1,116 @@ |
|||
package strategy.impl; |
|||
|
|||
import model.Book; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import strategy.AbstractBookCrawlerStrategy; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class BooksToScrapeStrategy extends AbstractBookCrawlerStrategy { |
|||
private static final String NAME = "BooksToScrape"; |
|||
private static final String BASE_URL = "https://books.toscrape.com"; |
|||
|
|||
public BooksToScrapeStrategy() { |
|||
this.delayMs = 1000; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return NAME; |
|||
} |
|||
|
|||
@Override |
|||
public String getBaseUrl() { |
|||
return BASE_URL; |
|||
} |
|||
|
|||
@Override |
|||
public List<String> getPageUrls() { |
|||
List<String> urls = new ArrayList<>(); |
|||
urls.add(BASE_URL); |
|||
for (int i = 2; i <= 50; i++) { |
|||
urls.add(BASE_URL + "/catalogue/page-" + i + ".html"); |
|||
} |
|||
return urls; |
|||
} |
|||
|
|||
@Override |
|||
public List<Book> parseBooks(String htmlContent) { |
|||
List<Book> books = new ArrayList<>(); |
|||
Document doc = Jsoup.parse(htmlContent); |
|||
Elements items = doc.select("article.product_pod"); |
|||
|
|||
int rank = 1; |
|||
for (Element item : items) { |
|||
try { |
|||
Book book = new Book(); |
|||
|
|||
Element titleElement = item.select("h3 a").first(); |
|||
if (titleElement != null) { |
|||
book.setTitle(titleElement.attr("title")); |
|||
String href = titleElement.attr("href"); |
|||
if (href.startsWith("../")) { |
|||
book.setUrl(BASE_URL + "/catalogue/" + href.substring(3)); |
|||
} else { |
|||
book.setUrl(BASE_URL + "/" + href); |
|||
} |
|||
} |
|||
|
|||
Element priceElement = item.select("p.price_color").first(); |
|||
if (priceElement != null) { |
|||
String priceStr = priceElement.text().replace("£", "").replace("Â", "").trim(); |
|||
try { |
|||
book.setPrice(Double.parseDouble(priceStr)); |
|||
} catch (NumberFormatException e) { |
|||
// ignore
|
|||
} |
|||
} |
|||
|
|||
Element ratingElement = item.select("p.star-rating").first(); |
|||
if (ratingElement != null) { |
|||
String ratingClass = ratingElement.className(); |
|||
int rating = parseRating(ratingClass); |
|||
book.setRating((double) rating); |
|||
} |
|||
|
|||
Element imgElement = item.select("img").first(); |
|||
if (imgElement != null) { |
|||
String src = imgElement.attr("src"); |
|||
if (src.startsWith("../")) { |
|||
book.setImageUrl(BASE_URL + "/" + src.substring(3)); |
|||
} else { |
|||
book.setImageUrl(BASE_URL + "/" + src); |
|||
} |
|||
} |
|||
|
|||
Element availabilityElement = item.select("p.instock.availability").first(); |
|||
if (availabilityElement != null) { |
|||
String availability = availabilityElement.text().trim(); |
|||
} |
|||
|
|||
book.setRank(rank++); |
|||
|
|||
if (book.getTitle() != null && !book.getTitle().isEmpty()) { |
|||
books.add(book); |
|||
} |
|||
} catch (Exception e) { |
|||
// skip invalid item
|
|||
} |
|||
} |
|||
|
|||
return books; |
|||
} |
|||
|
|||
private int parseRating(String ratingClass) { |
|||
if (ratingClass.contains("One")) return 1; |
|||
if (ratingClass.contains("Two")) return 2; |
|||
if (ratingClass.contains("Three")) return 3; |
|||
if (ratingClass.contains("Four")) return 4; |
|||
if (ratingClass.contains("Five")) return 5; |
|||
return 0; |
|||
} |
|||
} |
|||
@ -0,0 +1,159 @@ |
|||
package strategy.impl; |
|||
|
|||
import model.Book; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import strategy.AbstractBookCrawlerStrategy; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class DoubanBookStrategy extends AbstractBookCrawlerStrategy { |
|||
private static final String NAME = "豆瓣读书Top250"; |
|||
private static final String BASE_URL = "https://book.douban.com/top250"; |
|||
|
|||
public DoubanBookStrategy() { |
|||
this.delayMs = 2000; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return NAME; |
|||
} |
|||
|
|||
@Override |
|||
public String getBaseUrl() { |
|||
return BASE_URL; |
|||
} |
|||
|
|||
@Override |
|||
public List<String> getPageUrls() { |
|||
List<String> urls = new ArrayList<>(); |
|||
for (int i = 0; i < 10; i++) { |
|||
urls.add(BASE_URL + "?start=" + (i * 25)); |
|||
} |
|||
return urls; |
|||
} |
|||
|
|||
@Override |
|||
public List<Book> parseBooks(String htmlContent) { |
|||
List<Book> books = new ArrayList<>(); |
|||
Document doc = Jsoup.parse(htmlContent); |
|||
Elements items = doc.select("tr.item"); |
|||
|
|||
for (Element item : items) { |
|||
try { |
|||
Book book = new Book(); |
|||
|
|||
Element indent = item.select("td.indent").first(); |
|||
if (indent != null) { |
|||
String rankStr = indent.select("div.starcount").text(); |
|||
if (!rankStr.isEmpty()) { |
|||
book.setRank(parseNumber(rankStr)); |
|||
} |
|||
} |
|||
|
|||
if (book.getRank() == null) { |
|||
Element order = item.select("div.starcount").first(); |
|||
if (order != null) { |
|||
book.setRank(parseNumber(order.text())); |
|||
} |
|||
} |
|||
|
|||
Element titleElement = item.select("div.pl2 a").first(); |
|||
if (titleElement != null) { |
|||
String title = titleElement.attr("title"); |
|||
if (title.isEmpty()) { |
|||
title = titleElement.text().split("\\s")[0]; |
|||
} |
|||
book.setTitle(title.trim()); |
|||
book.setUrl(titleElement.attr("href")); |
|||
} |
|||
|
|||
Element ratingElement = item.select("span.rating_nums").first(); |
|||
if (ratingElement != null) { |
|||
String ratingStr = ratingElement.text(); |
|||
if (!ratingStr.isEmpty()) { |
|||
book.setRating(Double.parseDouble(ratingStr)); |
|||
} |
|||
} |
|||
|
|||
Element countElement = item.select("span.pl").first(); |
|||
if (countElement != null) { |
|||
String countText = countElement.text(); |
|||
if (countText.contains("人评价")) { |
|||
String num = countText.replace("人评价", "").replace("(", "").replace(")", "").trim(); |
|||
book.setRatingCount(parseNumber(num)); |
|||
} |
|||
} |
|||
|
|||
Element infoElement = item.select("p.pl").first(); |
|||
if (infoElement != null) { |
|||
String info = infoElement.text(); |
|||
parseBookInfo(book, info); |
|||
} |
|||
|
|||
Element imgElement = item.select("img").first(); |
|||
if (imgElement != null) { |
|||
book.setImageUrl(imgElement.attr("src")); |
|||
} |
|||
|
|||
if (book.getTitle() != null && !book.getTitle().isEmpty()) { |
|||
books.add(book); |
|||
} |
|||
} catch (Exception e) { |
|||
// skip invalid item
|
|||
} |
|||
} |
|||
|
|||
return books; |
|||
} |
|||
|
|||
private void parseBookInfo(Book book, String info) { |
|||
String[] parts = info.split(" / "); |
|||
for (int i = 0; i < parts.length; i++) { |
|||
String part = parts[i].trim(); |
|||
|
|||
if (i == 0 && !part.matches("\\d{4}.*") && !part.matches(".*\\d+\\.\\d+.*")) { |
|||
book.setAuthor(part); |
|||
} |
|||
|
|||
if (part.matches("\\d{4}")) { |
|||
book.setYear(part); |
|||
} |
|||
|
|||
if (part.contains("出版社")) { |
|||
book.setPublisher(part.replace("出版社", "").trim()); |
|||
} |
|||
|
|||
if (part.matches(".*\\d+\\.\\d+元")) { |
|||
String priceStr = part.replace("元", "").trim(); |
|||
try { |
|||
book.setPrice(Double.parseDouble(priceStr)); |
|||
} catch (NumberFormatException e) { |
|||
// ignore
|
|||
} |
|||
} |
|||
|
|||
if (part.matches("ISBN.*")) { |
|||
book.setIsbn(part.replace("ISBN", "").trim()); |
|||
} |
|||
} |
|||
} |
|||
|
|||
private Integer parseNumber(String str) { |
|||
try { |
|||
if (str == null || str.isEmpty()) return null; |
|||
str = str.replaceAll("[^0-9.]", ""); |
|||
if (str.isEmpty()) return null; |
|||
if (str.contains(".")) { |
|||
return (int) Double.parseDouble(str); |
|||
} |
|||
return Integer.parseInt(str); |
|||
} catch (NumberFormatException e) { |
|||
return null; |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,111 @@ |
|||
package strategy.impl; |
|||
|
|||
import model.Movie; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import strategy.AbstractCrawlerStrategy; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class DoubanStrategy extends AbstractCrawlerStrategy { |
|||
private static final String NAME = "豆瓣电影Top250"; |
|||
private static final String BASE_URL = "https://movie.douban.com/top250"; |
|||
|
|||
public DoubanStrategy() { |
|||
this.delayMs = 1500; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return NAME; |
|||
} |
|||
|
|||
@Override |
|||
public String getBaseUrl() { |
|||
return BASE_URL; |
|||
} |
|||
|
|||
@Override |
|||
public List<String> getPageUrls() { |
|||
List<String> urls = new ArrayList<>(); |
|||
for (int i = 0; i < 10; i++) { |
|||
urls.add(BASE_URL + "?start=" + (i * 25)); |
|||
} |
|||
return urls; |
|||
} |
|||
|
|||
@Override |
|||
public List<Movie> parseMovies(String htmlContent) { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
Document doc = Jsoup.parse(htmlContent); |
|||
Elements items = doc.select("div.item"); |
|||
|
|||
for (Element item : items) { |
|||
try { |
|||
Movie movie = new Movie(); |
|||
|
|||
String rankStr = item.select("em").text(); |
|||
movie.setRank(Integer.parseInt(rankStr)); |
|||
|
|||
Element titleElement = item.select("span.title").first(); |
|||
if (titleElement != null) { |
|||
movie.setName(titleElement.text()); |
|||
} |
|||
|
|||
String ratingStr = item.select("span.rating_num").text(); |
|||
if (!ratingStr.isEmpty()) { |
|||
movie.setRating(Double.parseDouble(ratingStr)); |
|||
} |
|||
|
|||
String ratingCountStr = item.select("div.star span").last().text(); |
|||
if (ratingCountStr != null && ratingCountStr.contains("人评价")) { |
|||
String num = ratingCountStr.replace("人评价", "").trim(); |
|||
movie.setRatingCount(parseNumber(num)); |
|||
} |
|||
|
|||
String info = item.select("div.bd p").first().text(); |
|||
if (info != null) { |
|||
String[] parts = info.split(" / "); |
|||
if (parts.length > 0) { |
|||
String firstPart = parts[0]; |
|||
if (firstPart.contains("导演: ")) { |
|||
movie.setDirector(firstPart.replace("导演: ", "").trim()); |
|||
} |
|||
for (String part : parts) { |
|||
if (part.matches("\\d{4}") || part.matches("\\d{4}.*")) { |
|||
movie.setYear(part.trim().split("\\s+")[0]); |
|||
break; |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
String link = item.select("div.hd a").attr("href"); |
|||
movie.setUrl(link); |
|||
|
|||
String imgUrl = item.select("div.pic img").attr("src"); |
|||
movie.setImageUrl(imgUrl); |
|||
|
|||
movies.add(movie); |
|||
} catch (Exception e) { |
|||
// skip invalid item
|
|||
} |
|||
} |
|||
|
|||
return movies; |
|||
} |
|||
|
|||
private Integer parseNumber(String str) { |
|||
try { |
|||
if (str.contains("万")) { |
|||
return (int) (Double.parseDouble(str.replace("万", "")) * 10000); |
|||
} |
|||
return Integer.parseInt(str.replace(",", "")); |
|||
} catch (NumberFormatException e) { |
|||
return null; |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,94 @@ |
|||
package strategy.impl; |
|||
|
|||
import model.Movie; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import strategy.AbstractCrawlerStrategy; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class MaoyanStrategy extends AbstractCrawlerStrategy { |
|||
private static final String NAME = "猫眼电影Top100"; |
|||
private static final String BASE_URL = "https://maoyan.com/board/4"; |
|||
|
|||
public MaoyanStrategy() { |
|||
this.delayMs = 1500; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return NAME; |
|||
} |
|||
|
|||
@Override |
|||
public String getBaseUrl() { |
|||
return BASE_URL; |
|||
} |
|||
|
|||
@Override |
|||
public List<String> getPageUrls() { |
|||
List<String> urls = new ArrayList<>(); |
|||
for (int i = 0; i < 10; i++) { |
|||
urls.add(BASE_URL + "?offset=" + (i * 10)); |
|||
} |
|||
return urls; |
|||
} |
|||
|
|||
@Override |
|||
public List<Movie> parseMovies(String htmlContent) { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
Document doc = Jsoup.parse(htmlContent); |
|||
Elements items = doc.select("dl.board-wrapper dd"); |
|||
|
|||
for (Element item : items) { |
|||
try { |
|||
Movie movie = new Movie(); |
|||
|
|||
String rankStr = item.select("i.board-index").text(); |
|||
movie.setRank(Integer.parseInt(rankStr)); |
|||
|
|||
String name = item.select("p.name a").text(); |
|||
movie.setName(name); |
|||
|
|||
String ratingStr = item.select("i.integer").text() + |
|||
item.select("i.fraction").text(); |
|||
if (!ratingStr.isEmpty()) { |
|||
movie.setRating(Double.parseDouble(ratingStr)); |
|||
} |
|||
|
|||
String actors = item.select("p.star").text(); |
|||
if (actors != null && actors.contains("主演:")) { |
|||
movie.setActors(actors.replace("主演:", "").trim()); |
|||
} |
|||
|
|||
String releaseTime = item.select("p.releasetime").text(); |
|||
if (releaseTime != null && releaseTime.contains("上映时间:")) { |
|||
String timeStr = releaseTime.replace("上映时间:", "").trim(); |
|||
if (timeStr.matches("\\d{4}.*")) { |
|||
movie.setYear(timeStr.substring(0, 4)); |
|||
} |
|||
} |
|||
|
|||
String link = item.select("p.name a").attr("href"); |
|||
if (!link.isEmpty()) { |
|||
movie.setUrl("https://maoyan.com" + link); |
|||
} |
|||
|
|||
String imgUrl = item.select("img.board-img").attr("data-src"); |
|||
if (imgUrl.isEmpty()) { |
|||
imgUrl = item.select("img.board-img").attr("src"); |
|||
} |
|||
movie.setImageUrl(imgUrl); |
|||
|
|||
movies.add(movie); |
|||
} catch (Exception e) { |
|||
// skip invalid item
|
|||
} |
|||
} |
|||
|
|||
return movies; |
|||
} |
|||
} |
|||
@ -0,0 +1,105 @@ |
|||
package strategy.impl; |
|||
|
|||
import model.Movie; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import strategy.AbstractCrawlerStrategy; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class RottenTomatoesStrategy extends AbstractCrawlerStrategy { |
|||
private static final String NAME = "烂番茄Top100"; |
|||
private static final String BASE_URL = "https://www.rottentomatoes.com/top/bestofrt/"; |
|||
|
|||
public RottenTomatoesStrategy() { |
|||
this.delayMs = 2000; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return NAME; |
|||
} |
|||
|
|||
@Override |
|||
public String getBaseUrl() { |
|||
return BASE_URL; |
|||
} |
|||
|
|||
@Override |
|||
public List<String> getPageUrls() { |
|||
List<String> urls = new ArrayList<>(); |
|||
urls.add(BASE_URL); |
|||
return urls; |
|||
} |
|||
|
|||
@Override |
|||
public List<Movie> parseMovies(String htmlContent) { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
Document doc = Jsoup.parse(htmlContent); |
|||
Elements items = doc.select("table.table tr"); |
|||
|
|||
int rank = 0; |
|||
for (Element item : items) { |
|||
try { |
|||
Element rankElement = item.selectFirst("td.rank"); |
|||
if (rankElement == null) continue; |
|||
|
|||
Movie movie = new Movie(); |
|||
|
|||
String rankStr = rankElement.text(); |
|||
if (!rankStr.isEmpty()) { |
|||
movie.setRank(Integer.parseInt(rankStr)); |
|||
} else { |
|||
movie.setRank(++rank); |
|||
} |
|||
|
|||
Element titleElement = item.selectFirst("td.title a"); |
|||
if (titleElement != null) { |
|||
String fullTitle = titleElement.text(); |
|||
if (fullTitle.contains("(") && fullTitle.contains(")")) { |
|||
int start = fullTitle.lastIndexOf("("); |
|||
int end = fullTitle.lastIndexOf(")"); |
|||
if (start > 0 && end > start) { |
|||
String yearStr = fullTitle.substring(start + 1, end); |
|||
if (yearStr.matches("\\d{4}")) { |
|||
movie.setYear(yearStr); |
|||
} |
|||
movie.setName(fullTitle.substring(0, start).trim()); |
|||
} else { |
|||
movie.setName(fullTitle); |
|||
} |
|||
} else { |
|||
movie.setName(fullTitle); |
|||
} |
|||
|
|||
String link = titleElement.attr("href"); |
|||
if (!link.isEmpty()) { |
|||
if (link.startsWith("/")) { |
|||
movie.setUrl("https://www.rottentomatoes.com" + link); |
|||
} else { |
|||
movie.setUrl(link); |
|||
} |
|||
} |
|||
} |
|||
|
|||
Element scoreElement = item.selectFirst("td.score span.tMeterScore"); |
|||
if (scoreElement != null) { |
|||
String scoreStr = scoreElement.text(); |
|||
if (scoreStr.matches("\\d+%")) { |
|||
double rating = Double.parseDouble(scoreStr.replace("%", "")) / 10; |
|||
movie.setRating(Math.round(rating * 10) / 10.0); |
|||
} |
|||
} |
|||
|
|||
movies.add(movie); |
|||
} catch (Exception e) { |
|||
// skip invalid item
|
|||
} |
|||
} |
|||
|
|||
return movies; |
|||
} |
|||
} |
|||
@ -0,0 +1,54 @@ |
|||
package util; |
|||
|
|||
import java.time.LocalDateTime; |
|||
import java.time.format.DateTimeFormatter; |
|||
|
|||
/**
 * Minimal console logger.
 *
 * <p>Messages at or above the current level are written to {@code System.out} as
 * {@code [timestamp] [LEVEL] [thread name] message}. The level starts at
 * {@link Level#INFO} and is shared by all callers.
 */
public class Logger {
    private static final DateTimeFormatter TIMESTAMP_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    /** Severity levels, declared from least to most severe. */
    public enum Level {
        DEBUG, INFO, WARN, ERROR
    }

    // volatile: the level is static state read on every log call, and log lines
    // include the calling thread's name, so several threads may read it.
    private static volatile Level currentLevel = Level.INFO;

    /**
     * Sets the minimum level that is printed.
     *
     * @param level the new threshold
     * @throws NullPointerException if {@code level} is null (a null threshold would
     *     otherwise make every later log call fail)
     */
    public static void setLevel(Level level) {
        currentLevel = java.util.Objects.requireNonNull(level, "level");
    }

    /** Prints one formatted line when {@code level} is at or above the threshold. */
    private static void log(Level level, String message) {
        // Enum.compareTo follows declaration order, i.e. severity order.
        if (level.compareTo(currentLevel) < 0) {
            return;
        }
        String timestamp = LocalDateTime.now().format(TIMESTAMP_FORMAT);
        String threadName = Thread.currentThread().getName();
        System.out.printf("[%s] [%s] [%s] %s%n",
                timestamp, level, threadName, message);
    }

    /** Logs {@code message} at DEBUG level. */
    public static void debug(String message) {
        log(Level.DEBUG, message);
    }

    /** Logs {@code message} at INFO level. */
    public static void info(String message) {
        log(Level.INFO, message);
    }

    /** Logs {@code message} at WARN level. */
    public static void warn(String message) {
        log(Level.WARN, message);
    }

    /** Logs {@code message} at ERROR level. */
    public static void error(String message) {
        log(Level.ERROR, message);
    }

    /**
     * Logs {@code message} at ERROR level together with a description of {@code e}.
     *
     * <p>The description is the throwable's message, or its class name when it has
     * no message. When the current level is DEBUG the stack trace is printed too.
     *
     * @param message text to log
     * @param e the cause; when null only {@code message} is logged
     */
    public static void error(String message, Throwable e) {
        if (e == null) {
            log(Level.ERROR, message);
            return;
        }
        String detail = e.getMessage() != null ? e.getMessage() : e.getClass().getName();
        log(Level.ERROR, message + " - " + detail);
        if (currentLevel == Level.DEBUG) {
            e.printStackTrace();
        }
    }
}
|||
@ -0,0 +1,109 @@ |
|||
package view; |
|||
|
|||
import exception.CrawlerResult; |
|||
import java.util.List; |
|||
|
|||
public class CrawlerView { |
|||
|
|||
public void showWelcome() { |
|||
System.out.println(); |
|||
System.out.println("╔════════════════════════════════════════════════╗"); |
|||
System.out.println("║ Java 爬虫管理系统 v3.0 (电影+图书) ║"); |
|||
System.out.println("╚════════════════════════════════════════════════╝"); |
|||
System.out.println(); |
|||
} |
|||
|
|||
public void showHelp() { |
|||
System.out.println(); |
|||
System.out.println("═══════════════ 使用帮助 ═══════════════"); |
|||
System.out.println(" java -jar crawler.jar <命令> [参数]"); |
|||
System.out.println(); |
|||
System.out.println("可用命令:"); |
|||
System.out.println(" list - 列出所有爬虫"); |
|||
System.out.println(" run <爬虫名> - 运行指定爬虫"); |
|||
System.out.println(" run-all - 运行所有爬虫"); |
|||
System.out.println(" stats - 显示统计信息"); |
|||
System.out.println(" clear - 清空所有数据"); |
|||
System.out.println(" help - 显示帮助信息"); |
|||
System.out.println(); |
|||
System.out.println("电影爬虫:"); |
|||
System.out.println(" - 豆瓣电影Top250"); |
|||
System.out.println(" - 猫眼电影Top100"); |
|||
System.out.println(" - RottenTomatoes"); |
|||
System.out.println(); |
|||
System.out.println("图书爬虫:"); |
|||
System.out.println(" - 豆瓣读书Top250"); |
|||
System.out.println(" - BooksToScrape"); |
|||
System.out.println(); |
|||
System.out.println("示例:"); |
|||
System.out.println(" java -jar crawler.jar list"); |
|||
System.out.println(" java -jar crawler.jar run 豆瓣电影Top250"); |
|||
System.out.println(" java -jar crawler.jar run 豆瓣读书Top250"); |
|||
System.out.println(" java -jar crawler.jar run BooksToScrape"); |
|||
System.out.println(" java -jar crawler.jar run-all"); |
|||
System.out.println("═══════════════════════════════════════════"); |
|||
System.out.println(); |
|||
} |
|||
|
|||
public void showCrawlerList(List<String> crawlers) { |
|||
System.out.println(); |
|||
System.out.println("═══════════════ 爬虫列表 ═══════════════"); |
|||
for (int i = 0; i < crawlers.size(); i++) { |
|||
System.out.println(" " + (i + 1) + ". " + crawlers.get(i)); |
|||
} |
|||
System.out.println("═══════════════════════════════════════════"); |
|||
System.out.println(); |
|||
} |
|||
|
|||
public void showResult(CrawlerResult result) { |
|||
System.out.println(); |
|||
if (result.isSuccess()) { |
|||
System.out.println("╔════════════════════════════════════════╗"); |
|||
System.out.printf("║ SUCCESS: %-30s ║%n", result.getSource()); |
|||
System.out.printf("║ 数据条数: %-28d ║%n", result.getDataCount()); |
|||
System.out.printf("║ 耗时: %-30dms ║%n", result.getElapsedTime()); |
|||
System.out.println("╚════════════════════════════════════════╝"); |
|||
} else { |
|||
System.out.println("╔════════════════════════════════════════╗"); |
|||
System.out.printf("║ FAILURE: [%s] %-20s ║%n", result.getErrorCode(), result.getSource()); |
|||
System.out.printf("║ 错误信息: %-28s ║%n", result.getMessage()); |
|||
System.out.println("╚════════════════════════════════════════╝"); |
|||
} |
|||
System.out.println(); |
|||
} |
|||
|
|||
public void showResults(List<CrawlerResult> results) { |
|||
System.out.println(); |
|||
System.out.println("═══════════════ 执行结果 ═══════════════"); |
|||
|
|||
int successCount = 0; |
|||
int totalData = 0; |
|||
|
|||
for (CrawlerResult result : results) { |
|||
System.out.println(result.toString()); |
|||
if (result.isSuccess()) { |
|||
successCount++; |
|||
totalData += result.getDataCount(); |
|||
} |
|||
} |
|||
|
|||
System.out.println("─────────────────────────────────────────"); |
|||
System.out.printf(" 成功: %d/%d | 总数据: %d 条%n", |
|||
successCount, results.size(), totalData); |
|||
System.out.println("═══════════════════════════════════════════"); |
|||
System.out.println(); |
|||
} |
|||
|
|||
public void showError(String message) { |
|||
System.out.println(); |
|||
System.out.println("╔════════════════════════════════════════╗"); |
|||
System.out.println("║ 错误信息 ║"); |
|||
System.out.printf("║ %-36s ║%n", message); |
|||
System.out.println("╚════════════════════════════════════════╝"); |
|||
System.out.println(); |
|||
} |
|||
|
|||
public void showMessage(String message) { |
|||
System.out.println(message); |
|||
} |
|||
} |
|||
@ -0,0 +1,88 @@ |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import java.io.FileWriter; |
|||
import java.io.IOException; |
|||
|
|||
// 抽象父类:封装通用爬虫逻辑
|
|||
abstract class BaseCrawler { |
|||
// 封装属性
|
|||
private String baseUrl; |
|||
|
|||
public BaseCrawler(String baseUrl) { |
|||
this.baseUrl = baseUrl; |
|||
} |
|||
|
|||
// 封装:获取网页文档
|
|||
protected Document getDoc(String url) throws IOException { |
|||
return Jsoup.connect(url) |
|||
.userAgent("Mozilla/5.0") |
|||
.timeout(8000) |
|||
.get(); |
|||
} |
|||
|
|||
// 抽象方法:交给子类实现(多态基础)
|
|||
public abstract void parse(Document doc, FileWriter writer) throws IOException; |
|||
|
|||
// 封装:统一执行入口
|
|||
public void start(FileWriter writer) { |
|||
try { |
|||
for (int i = 0; i < 10; i++) { |
|||
int start = i * 25; |
|||
String url = baseUrl + "?start=" + start; |
|||
System.out.println("正在爬取第 " + (i + 1) + " 页"); |
|||
|
|||
Document doc = getDoc(url); |
|||
parse(doc, writer); // 多态:调用子类的parse
|
|||
Thread.sleep(1000); |
|||
} |
|||
} catch (Exception e) { |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
} |
|||
|
|||
// 子类:继承父类,实现豆瓣电影解析
|
|||
class DoubanCrawler extends BaseCrawler { |
|||
|
|||
public DoubanCrawler() { |
|||
super("https://movie.douban.com/top250"); |
|||
} |
|||
|
|||
// 重写方法 → 多态
|
|||
@Override |
|||
public void parse(Document doc, FileWriter writer) throws IOException { |
|||
Elements items = doc.select("div.item"); |
|||
|
|||
for (Element item : items) { |
|||
String rank = item.select("em").text(); |
|||
String name = item.select("span.title").first().text(); |
|||
String score = item.select("span.rating_num").text(); |
|||
|
|||
String line = "排名:" + rank + " 电影:" + name + " 评分:" + score; |
|||
System.out.println(line); |
|||
writer.write(line + "\r\n"); // 写入文件
|
|||
} |
|||
} |
|||
} |
|||
|
|||
// 主类
|
|||
public class TestMain { |
|||
public static void main(String[] args) { |
|||
try { |
|||
// 直接写入桌面,好找!
|
|||
FileWriter writer = new FileWriter("douban_top250.txt"); |
|||
|
|||
// 多态:父类引用 指向 子类对象
|
|||
BaseCrawler crawler = new DoubanCrawler(); |
|||
|
|||
crawler.start(writer); |
|||
|
|||
writer.close(); |
|||
System.out.println("===== 全部爬完,文件已保存到桌面 ====="); |
|||
} catch (IOException e) { |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
} |
|||
Binary file not shown.
@ -0,0 +1,27 @@ |
|||
import java.util.Scanner; |
|||
/**
 * Console program that reads a height in metres and a weight in kilograms from
 * standard input and prints the body mass index with its category.
 */
public class BMICalculator {
    /** Returned by {@link #readPositive(Scanner)} when the input is unusable. */
    private static final double INVALID = Double.NaN;

    /**
     * Prompts for height and weight, then prints the BMI (two decimals) and one of
     * four categories. When a value is missing, not a number, or not a positive
     * finite number, an error line is printed instead and nothing is calculated.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try (Scanner scanner = new Scanner(System.in)) {
            System.out.print("请输入身高(米):");
            double height = readPositive(scanner);
            if (Double.isNaN(height)) {
                reportInvalidInput();
                return;
            }

            System.out.print("请输入体重(千克):");
            double weight = readPositive(scanner);
            if (Double.isNaN(weight)) {
                reportInvalidInput();
                return;
            }

            double bmi = weight / (height * height);
            System.out.printf("你的 BMI 值为:%.2f%n", bmi);
            System.out.println(classify(bmi));
        }
    }

    /** Reads the next token as a positive finite number, or returns {@link #INVALID}. */
    private static double readPositive(Scanner scanner) {
        if (!scanner.hasNextDouble()) {
            return INVALID;
        }
        double value = scanner.nextDouble();
        // Rejects zero, negatives, NaN and infinities, which would otherwise
        // produce a meaningless BMI (division by zero gives Infinity).
        return (value > 0 && !Double.isInfinite(value)) ? value : INVALID;
    }

    /** Prints the error line used for unusable input. */
    private static void reportInvalidInput() {
        System.out.println("输入无效:身高和体重必须为正数");
    }

    /** Maps a BMI value to its category label using the thresholds 18.5, 24 and 28. */
    private static String classify(double bmi) {
        if (bmi < 18.5) {
            return "体重过轻";
        }
        if (bmi < 24) {
            return "正常范围";
        }
        if (bmi < 28) {
            return "超重";
        }
        return "肥胖";
    }
}
|||
@ -0,0 +1,4 @@ |
|||
package PACKAGE_NAME; |
|||
|
|||
// Empty placeholder class: it declares no fields, constructors or methods.
// NOTE(review): the package declaration above this class reads PACKAGE_NAME,
// which looks like an unexpanded IDE template variable - confirm the intended package.
public class ShapeTest { |
|||
} |
|||
@ -0,0 +1,4 @@ |
|||
package PACKAGE_NAME; |
|||
|
|||
// Empty placeholder class: it declares no fields, constructors or methods.
// NOTE(review): the package declaration above this class reads PACKAGE_NAME,
// which looks like an unexpanded IDE template variable - confirm the intended package.
public class AnimalTest { |
|||
} |
|||
@ -0,0 +1,4 @@ |
|||
package PACKAGE_NAME; |
|||
|
|||
// Empty placeholder class: it declares no fields, constructors or methods.
// NOTE(review): the package declaration above this class reads PACKAGE_NAME,
// which looks like an unexpanded IDE template variable - confirm the intended package.
public class DoubanMovieCrawler { |
|||
} |
|||
Loading…
Reference in new issue