84 changed files with 19633 additions and 0 deletions
@ -0,0 +1,39 @@ |
|||||
|
target/ |
||||
|
!.mvn/wrapper/maven-wrapper.jar |
||||
|
!**/src/main/**/target/ |
||||
|
!**/src/test/**/target/ |
||||
|
.kotlin |
||||
|
|
||||
|
### IntelliJ IDEA ### |
||||
|
.idea/modules.xml |
||||
|
.idea/jarRepositories.xml |
||||
|
.idea/compiler.xml |
||||
|
.idea/libraries/ |
||||
|
*.iws |
||||
|
*.iml |
||||
|
*.ipr |
||||
|
|
||||
|
### Eclipse ### |
||||
|
.apt_generated |
||||
|
.classpath |
||||
|
.factorypath |
||||
|
.project |
||||
|
.settings |
||||
|
.springBeans |
||||
|
.sts4-cache |
||||
|
|
||||
|
### NetBeans ### |
||||
|
/nbproject/private/ |
||||
|
/nbbuild/ |
||||
|
/dist/ |
||||
|
/nbdist/ |
||||
|
/.nb-gradle/ |
||||
|
build/ |
||||
|
!**/src/main/**/build/ |
||||
|
!**/src/test/**/build/ |
||||
|
|
||||
|
### VS Code ### |
||||
|
.vscode/ |
||||
|
|
||||
|
### Mac OS ### |
||||
|
.DS_Store |
||||
@ -0,0 +1,10 @@ |
|||||
|
# 默认忽略的文件 |
||||
|
/shelf/ |
||||
|
/workspace.xml |
||||
|
# 已忽略包含查询文件的默认文件夹 |
||||
|
/queries/ |
||||
|
# Datasource local storage ignored files |
||||
|
/dataSources/ |
||||
|
/dataSources.local.xml |
||||
|
# 基于编辑器的 HTTP 客户端请求 |
||||
|
/httpRequests/ |
||||
@ -0,0 +1,7 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<project version="4"> |
||||
|
<component name="Encoding"> |
||||
|
<file url="file://$PROJECT_DIR$/src/main/Douban.java" charset="UTF-8" /> |
||||
|
<file url="file://$PROJECT_DIR$/src/main/resources" charset="UTF-8" /> |
||||
|
</component> |
||||
|
</project> |
||||
@ -0,0 +1,14 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<project version="4"> |
||||
|
<component name="ExternalStorageConfigurationManager" enabled="true" /> |
||||
|
<component name="MavenProjectsManager"> |
||||
|
<option name="originalFiles"> |
||||
|
<list> |
||||
|
<option value="$PROJECT_DIR$/pom.xml" /> |
||||
|
</list> |
||||
|
</option> |
||||
|
</component> |
||||
|
<component name="ProjectRootManager" version="2" project-jdk-name="25" project-jdk-type="JavaSDK"> |
||||
|
<output url="file://$PROJECT_DIR$/out" /> |
||||
|
</component> |
||||
|
</project> |
||||
@ -0,0 +1,87 @@ |
|||||
|
# Web Crawler Application |
||||
|
|
||||
|
基于 Java 的多网站爬虫应用，采用 MVC 模式、命令模式、策略模式和完整的异常体系设计。 |
||||
|
|
||||
|
## 功能特性 |
||||
|
|
||||
|
- 爬取豆瓣电影排行榜 |
||||
|
- 爬取豆瓣读书排行榜 |
||||
|
- 爬取 Books to Scrape 网站 |
||||
|
- 数据保存为 JSON 格式文件 |
||||
|
- 支持交互式和命令行模式 |
||||
|
|
||||
|
## 项目架构 |
||||
|
|
||||
|
### 设计模式 |
||||
|
|
||||
|
1. **MVC 模式** |
||||
|
- Model: `Movie`, `Book`, `ScrapeBook` |
||||
|
- View: `ConsoleView` |
||||
|
- Controller: `CrawlerController` |
||||
|
|
||||
|
2. **策略模式 (Strategy Pattern)** |
||||
|
- `CrawlerStrategy` 接口 |
||||
|
- `DoubanMovieStrategy` - 豆瓣电影策略 |
||||
|
- `DoubanBookStrategy` - 豆瓣读书策略 |
||||
|
- `BooksToScrapeStrategy` - Books to Scrape 策略 |
||||
|
|
||||
|
3. **命令模式 (Command Pattern)** |
||||
|
- `Command` 接口 |
||||
|
- `CrawlCommand` - 单个爬虫命令 |
||||
|
- `CrawlAllCommand` - 组合命令,执行所有爬虫 |
||||
|
|
||||
|
4. **异常体系** |
||||
|
- `CrawlerException` - 基类异常 |
||||
|
- `NetworkException` - 网络异常 |
||||
|
- `ParseException` - 解析异常 |
||||
|
- `FileException` - 文件操作异常 |
||||
|
|
||||
|
## 使用方法 |
||||
|
|
||||
|
### 编译项目 |
||||
|
|
||||
|
```bash |
||||
|
mvn clean package |
||||
|
``` |
||||
|
|
||||
|
### 运行方式 |
||||
|
|
||||
|
#### 1. 交互式模式 |
||||
|
|
||||
|
```bash |
||||
|
java -jar target/web-crawler-1.0-SNAPSHOT.jar -i |
||||
|
``` |
||||
|
|
||||
|
#### 2. 命令行模式 |
||||
|
|
||||
|
爬取所有网站: |
||||
|
```bash |
||||
|
java -jar target/web-crawler-1.0-SNAPSHOT.jar |
||||
|
``` |
||||
|
|
||||
|
爬取指定网站: |
||||
|
```bash |
||||
|
java -jar target/web-crawler-1.0-SNAPSHOT.jar -s douban-movie |
||||
|
java -jar target/web-crawler-1.0-SNAPSHOT.jar -s douban-book |
||||
|
java -jar target/web-crawler-1.0-SNAPSHOT.jar -s books-to-scrape |
||||
|
``` |
||||
|
|
||||
|
查看帮助: |
||||
|
```bash |
||||
|
java -jar target/web-crawler-1.0-SNAPSHOT.jar --help |
||||
|
``` |
||||
|
|
||||
|
## 输出文件 |
||||
|
|
||||
|
爬取的数据将保存到 `data/` 目录下: |
||||
|
|
||||
|
- `douban_movies.json` - 豆瓣电影数据 |
||||
|
- `douban_books.json` - 豆瓣读书数据 |
||||
|
- `books_to_scrape.json` - Books to Scrape 数据 |
||||
|
|
||||
|
## 依赖项 |
||||
|
|
||||
|
- Jsoup - HTML 解析 |
||||
|
- Gson - JSON 处理 |
||||
|
- Picocli - 命令行解析 |
||||
|
- SLF4J - 日志框架 |
||||
@ -0,0 +1,96 @@ |
|||||
|
mvn : WARNING: A restricted method in java.lang.System has been called |
||||
|
所在位置 行:1 字符: 72 |
||||
|
+ ... 嘻哈哈\Git\java爬虫\TestMaven"; mvn clean package -DskipTests 2>&1 | Out-F ... |
||||
|
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
||||
|
+ CategoryInfo : NotSpecified: (WARNING: A rest...has been called |
||||
|
:String) [], RemoteException |
||||
|
+ FullyQualifiedErrorId : NativeCommandError |
||||
|
|
||||
|
WARNING: java.lang.System::load has been called by org.fusesource.jansi.interna |
||||
|
l.JansiLoader in an unnamed module (file:/D:/嘻嘻哈哈/Git/java/apache-maven-3.9.6/l |
||||
|
ib/jansi-2.4.0.jar) |
||||
|
WARNING: Use --enable-native-access=ALL-UNNAMED to avoid a warning for callers |
||||
|
in this module |
||||
|
WARNING: Restricted methods will be blocked in a future release unless native a |
||||
|
ccess is enabled |
||||
|
|
||||
|
WARNING: A terminally deprecated method in sun.misc.Unsafe has been called |
||||
|
WARNING: sun.misc.Unsafe::objectFieldOffset has been called by com.google.commo |
||||
|
n.util.concurrent.AbstractFuture$UnsafeAtomicHelper (file:/D:/嘻嘻哈哈/Git/java/apa |
||||
|
che-maven-3.9.6/lib/guava-32.0.1-jre.jar) |
||||
|
WARNING: Please consider reporting this to the maintainers of class com.google. |
||||
|
common.util.concurrent.AbstractFuture$UnsafeAtomicHelper |
||||
|
WARNING: sun.misc.Unsafe::objectFieldOffset will be removed in a future release |
||||
|
[INFO] Scanning for projects... |
||||
|
[INFO] |
||||
|
[INFO] -----------------------< com.example:TestMaven >------------------------ |
||||
|
[INFO] Building TestMaven 1.0-SNAPSHOT |
||||
|
[INFO] from pom.xml |
||||
|
[INFO] --------------------------------[ jar ]--------------------------------- |
||||
|
[INFO] |
||||
|
[INFO] --- clean:3.2.0:clean (default-clean) @ TestMaven --- |
||||
|
[INFO] Deleting D:\嘻嘻哈哈\Git\java爬虫\TestMaven\target |
||||
|
[INFO] |
||||
|
[INFO] --- resources:3.3.1:resources (default-resources) @ TestMaven --- |
||||
|
[INFO] Copying 0 resource from src\main\resources to target\classes |
||||
|
[INFO] |
||||
|
[INFO] --- compiler:3.11.0:compile (default-compile) @ TestMaven --- |
||||
|
[INFO] Changes detected - recompiling the module! :source |
||||
|
[INFO] Compiling 41 source files with javac [debug target 8] to target\classes |
||||
|
[INFO] ------------------------------------------------------------- |
||||
|
[WARNING] COMPILATION WARNING : |
||||
|
[INFO] ------------------------------------------------------------- |
||||
|
[WARNING] 未与 -source 8 一起设置引导类路径 |
||||
|
不设置引导类路径可能会导致类文件无法在 JDK 8 上运行, 建议使用 --release 8 而不是 -source 8 -target 8，因为它会自动设置引导类路径 |
||||
|
[WARNING] 源值 8 已过时，将在未来发行版中删除 |
||||
|
[WARNING] 目标值 8 已过时，将在未来发行版中删除 |
||||
|
[WARNING] 要隐藏有关已过时选项的警告, 请使用 -Xlint:-options。 |
||||
|
[WARNING] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[106,13] 从发行版 10 开始，'var' 是受限类型名称，不能用于类型声明，也不能用作数组的元素类型 |
||||
|
[WARNING] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[112,13] 浠庡彂琛岀増 10 寮€濮嬶紝'var' 鏄彈闄愮被鍨嬪悕绉帮紝涓嶈兘鐢ㄤ簬绫诲瀷澹版槑锛屼篃涓嶈兘鐢ㄤ綔鏁扮粍鐨勫厓绱犵被鍨? |
||||
|
[WARNING] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[118,13] 浠庡彂琛岀増 10 寮€濮嬶紝'var' 鏄彈闄愮被鍨嬪悕绉帮紝涓嶈兘鐢ㄤ簬绫诲瀷澹版槑锛屼篃涓嶈兘鐢ㄤ綔鏁扮粍鐨勫厓绱犵被鍨? |
||||
|
[WARNING] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[124,13] 浠庡彂琛岀増 10 寮€濮嬶紝'var' 鏄彈闄愮被鍨嬪悕绉帮紝涓嶈兘鐢ㄤ簬绫诲瀷澹版槑锛屼篃涓嶈兘鐢ㄤ綔鏁扮粍鐨勫厓绱犵被鍨? |
||||
|
[INFO] 8 warnings |
||||
|
[INFO] ------------------------------------------------------------- |
||||
|
[INFO] ------------------------------------------------------------- |
||||
|
[ERROR] COMPILATION ERROR : |
||||
|
[INFO] ------------------------------------------------------------- |
||||
|
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/main/CrawlerManager.java:[178,20] main.CrawlerManager.MultiStorage不是抽象的, 并且未覆盖storage.DataStorage中的抽象方法findBooksBySource(java.lang.String) |
||||
|
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/cli/CrawlerCLI.java:[106,9] 找不到符号 符号: 类 var |
||||
|
位置: 类 cli.CrawlerCLI |
||||
|
[ERROR] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[112,9] 鎵句笉鍒扮鍙? 绗﹀彿: 绫?var |
||||
|
浣嶇疆: 绫?cli.CrawlerCLI |
||||
|
[ERROR] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[118,9] 鎵句笉鍒扮鍙? 绗﹀彿: 绫?var |
||||
|
浣嶇疆: 绫?cli.CrawlerCLI |
||||
|
[ERROR] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[124,9] 鎵句笉鍒扮鍙? 绗﹀彿: 绫?var |
||||
|
浣嶇疆: 绫?cli.CrawlerCLI |
||||
|
[ERROR] /D:/嘻嘻哈哈/Git/java爬虫/TestMaven/src/main/java/storage/SQLiteStorage.java:[12,8] storage.SQLiteStorage不是抽象的, 并且未覆盖storage.DataStorage中的抽象方法findBooksBySource(java.lang.String) |
||||
|
[INFO] 6 errors |
||||
|
[INFO] ------------------------------------------------------------- |
||||
|
[INFO] ------------------------------------------------------------------------ |
||||
|
[INFO] BUILD FAILURE |
||||
|
[INFO] ------------------------------------------------------------------------ |
||||
|
[INFO] Total time: 15.493 s |
||||
|
[INFO] Finished at: 2026-05-31T23:13:59+08:00 |
||||
|
[INFO] ------------------------------------------------------------------------ |
||||
|
[ERROR] Failed to execute goal org.apache.maven.plugins:maven-compiler-plugin:3.11.0:compile (default-compile) on project TestMaven: Compilation failure: Compilation failure: |
||||
|
[ERROR] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/main/CrawlerManager.java:[178,20] main.CrawlerManager.MultiStorage涓嶆槸鎶借薄鐨? 骞朵笖鏈鐩杝torage.DataStorage涓殑鎶借薄鏂规硶findBooksBySource(java.lang.String) |
||||
|
[ERROR] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[106,9] 鎵句笉鍒扮鍙? |
||||
|
[ERROR] 绗﹀彿: 绫?var |
||||
|
[ERROR] 浣嶇疆: 绫?cli.CrawlerCLI |
||||
|
[ERROR] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[112,9] 鎵句笉鍒扮鍙? |
||||
|
[ERROR] 绗﹀彿: 绫?var |
||||
|
[ERROR] 浣嶇疆: 绫?cli.CrawlerCLI |
||||
|
[ERROR] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[118,9] 鎵句笉鍒扮鍙? |
||||
|
[ERROR] 绗﹀彿: 绫?var |
||||
|
[ERROR] 浣嶇疆: 绫?cli.CrawlerCLI |
||||
|
[ERROR] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[124,9] 鎵句笉鍒扮鍙? |
||||
|
[ERROR] 绗﹀彿: 绫?var |
||||
|
[ERROR] 浣嶇疆: 绫?cli.CrawlerCLI |
||||
|
[ERROR] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/storage/SQLiteStorage.java:[12,8] storage.SQLiteStorage涓嶆槸鎶借薄鐨? 骞朵笖鏈鐩杝torage.DataStorage涓殑鎶借薄鏂规硶findBooksBySource(java.lang.String) |
||||
|
[ERROR] -> [Help 1] |
||||
|
[ERROR] |
||||
|
[ERROR] To see the full stack trace of the errors, re-run Maven with the -e switch. |
||||
|
[ERROR] Re-run Maven using the -X switch to enable full debug logging. |
||||
|
[ERROR] |
||||
|
[ERROR] For more information about the errors and possible solutions, please read the following articles: |
||||
|
[ERROR] [Help 1] http://cwiki.apache.org/confluence/display/MAVEN/MojoFailureException |
||||
@ -0,0 +1,92 @@ |
|||||
|
mvn : WARNING: A restricted method in java.lang.System has been called |
||||
|
所在位置 行:1 字符: 72 |
||||
|
+ ... JAVA_HOME = "D:\嘻嘻哈哈\Git"; mvn clean package -DskipTests 2>&1 | Out-F ... |
||||
|
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
||||
|
+ CategoryInfo : NotSpecified: (WARNING: A rest...has been called |
||||
|
:String) [], RemoteException |
||||
|
+ FullyQualifiedErrorId : NativeCommandError |
||||
|
|
||||
|
WARNING: java.lang.System::load has been called by org.fusesource.jansi.interna |
||||
|
l.JansiLoader in an unnamed module (file:/D:/嘻嘻哈哈/Git/java/apache-maven-3.9.6/l |
||||
|
ib/jansi-2.4.0.jar) |
||||
|
WARNING: Use --enable-native-access=ALL-UNNAMED to avoid a warning for callers |
||||
|
in this module |
||||
|
WARNING: Restricted methods will be blocked in a future release unless native a |
||||
|
ccess is enabled |
||||
|
|
||||
|
WARNING: A terminally deprecated method in sun.misc.Unsafe has been called |
||||
|
WARNING: sun.misc.Unsafe::objectFieldOffset has been called by com.google.commo |
||||
|
n.util.concurrent.AbstractFuture$UnsafeAtomicHelper (file:/D:/嘻嘻哈哈/Git/java/apa |
||||
|
che-maven-3.9.6/lib/guava-32.0.1-jre.jar) |
||||
|
WARNING: Please consider reporting this to the maintainers of class com.google. |
||||
|
common.util.concurrent.AbstractFuture$UnsafeAtomicHelper |
||||
|
WARNING: sun.misc.Unsafe::objectFieldOffset will be removed in a future release |
||||
|
[INFO] Scanning for projects... |
||||
|
[INFO] |
||||
|
[INFO] -----------------------< com.example:TestMaven >------------------------ |
||||
|
[INFO] Building TestMaven 1.0-SNAPSHOT |
||||
|
[INFO] from pom.xml |
||||
|
[INFO] --------------------------------[ jar ]--------------------------------- |
||||
|
[INFO] |
||||
|
[INFO] --- clean:3.2.0:clean (default-clean) @ TestMaven --- |
||||
|
[INFO] Deleting D:\嘻嘻哈哈\Git\java爬虫\TestMaven\target |
||||
|
[INFO] |
||||
|
[INFO] --- resources:3.3.1:resources (default-resources) @ TestMaven --- |
||||
|
[INFO] Copying 0 resource from src\main\resources to target\classes |
||||
|
[INFO] |
||||
|
[INFO] --- compiler:3.11.0:compile (default-compile) @ TestMaven --- |
||||
|
[INFO] Changes detected - recompiling the module! :source |
||||
|
[INFO] Compiling 36 source files with javac [debug target 8] to target\classes |
||||
|
[INFO] ------------------------------------------------------------- |
||||
|
[WARNING] COMPILATION WARNING : |
||||
|
[INFO] ------------------------------------------------------------- |
||||
|
[WARNING] 鏈笌 -source 8 涓€璧疯缃紩瀵肩被璺緞 |
||||
|
涓嶈缃紩瀵肩被璺緞鍙兘浼氬鑷寸被鏂囦欢鏃犳硶鍦?JDK 8 涓婅繍琛? 寤鸿浣跨敤 --release 8 鑰屼笉鏄?-source 8 -target 8锛屽洜涓哄畠浼氳嚜鍔ㄨ缃紩瀵肩被璺緞 |
||||
|
[WARNING] 婧愬€?8 宸茶繃鏃讹紝灏嗗湪鏈潵鍙戣鐗堜腑鍒犻櫎 |
||||
|
[WARNING] 鐩爣鍊?8 宸茶繃鏃讹紝灏嗗湪鏈潵鍙戣鐗堜腑鍒犻櫎 |
||||
|
[WARNING] 瑕侀殣钘忔湁鍏冲凡杩囨椂閫夐」鐨勮鍛? 璇蜂娇鐢?-Xlint:-options銆? |
||||
|
[WARNING] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[106,13] 浠庡彂琛岀増 10 寮€濮嬶紝'var' 鏄彈闄愮被鍨嬪悕绉帮紝涓嶈兘鐢ㄤ簬绫诲瀷澹版槑锛屼篃涓嶈兘鐢ㄤ綔鏁扮粍鐨勫厓绱犵被鍨? |
||||
|
[WARNING] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[112,13] 浠庡彂琛岀増 10 寮€濮嬶紝'var' 鏄彈闄愮被鍨嬪悕绉帮紝涓嶈兘鐢ㄤ簬绫诲瀷澹版槑锛屼篃涓嶈兘鐢ㄤ綔鏁扮粍鐨勫厓绱犵被鍨? |
||||
|
[WARNING] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[118,13] 浠庡彂琛岀増 10 寮€濮嬶紝'var' 鏄彈闄愮被鍨嬪悕绉帮紝涓嶈兘鐢ㄤ簬绫诲瀷澹版槑锛屼篃涓嶈兘鐢ㄤ綔鏁扮粍鐨勫厓绱犵被鍨? |
||||
|
[WARNING] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[124,13] 浠庡彂琛岀増 10 寮€濮嬶紝'var' 鏄彈闄愮被鍨嬪悕绉帮紝涓嶈兘鐢ㄤ簬绫诲瀷澹版槑锛屼篃涓嶈兘鐢ㄤ綔鏁扮粍鐨勫厓绱犵被鍨? |
||||
|
[INFO] 8 warnings |
||||
|
[INFO] ------------------------------------------------------------- |
||||
|
[INFO] ------------------------------------------------------------- |
||||
|
[ERROR] COMPILATION ERROR : |
||||
|
[INFO] ------------------------------------------------------------- |
||||
|
[ERROR] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[106,9] 鎵句笉鍒扮鍙? 绗﹀彿: 绫?var |
||||
|
浣嶇疆: 绫?cli.CrawlerCLI |
||||
|
[ERROR] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[112,9] 鎵句笉鍒扮鍙? 绗﹀彿: 绫?var |
||||
|
浣嶇疆: 绫?cli.CrawlerCLI |
||||
|
[ERROR] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[118,9] 鎵句笉鍒扮鍙? 绗﹀彿: 绫?var |
||||
|
浣嶇疆: 绫?cli.CrawlerCLI |
||||
|
[ERROR] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[124,9] 鎵句笉鍒扮鍙? 绗﹀彿: 绫?var |
||||
|
浣嶇疆: 绫?cli.CrawlerCLI |
||||
|
[INFO] 4 errors |
||||
|
[INFO] ------------------------------------------------------------- |
||||
|
[INFO] ------------------------------------------------------------------------ |
||||
|
[INFO] BUILD FAILURE |
||||
|
[INFO] ------------------------------------------------------------------------ |
||||
|
[INFO] Total time: 15.527 s |
||||
|
[INFO] Finished at: 2026-05-31T22:16:51+08:00 |
||||
|
[INFO] ------------------------------------------------------------------------ |
||||
|
[ERROR] Failed to execute goal org.apache.maven.plugins:maven-compiler-plugin:3.11.0:compile (default-compile) on project TestMaven: Compilation failure: Compilation failure: |
||||
|
[ERROR] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[106,9] 鎵句笉鍒扮鍙? |
||||
|
[ERROR] 绗﹀彿: 绫?var |
||||
|
[ERROR] 浣嶇疆: 绫?cli.CrawlerCLI |
||||
|
[ERROR] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[112,9] 鎵句笉鍒扮鍙? |
||||
|
[ERROR] 绗﹀彿: 绫?var |
||||
|
[ERROR] 浣嶇疆: 绫?cli.CrawlerCLI |
||||
|
[ERROR] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[118,9] 鎵句笉鍒扮鍙? |
||||
|
[ERROR] 绗﹀彿: 绫?var |
||||
|
[ERROR] 浣嶇疆: 绫?cli.CrawlerCLI |
||||
|
[ERROR] /D:/鍢诲樆鍝堝搱/Git/java鐖櫕/TestMaven/src/main/java/cli/CrawlerCLI.java:[124,9] 鎵句笉鍒扮鍙? |
||||
|
[ERROR] 绗﹀彿: 绫?var |
||||
|
[ERROR] 浣嶇疆: 绫?cli.CrawlerCLI |
||||
|
[ERROR] -> [Help 1] |
||||
|
[ERROR] |
||||
|
[ERROR] To see the full stack trace of the errors, re-run Maven with the -e switch. |
||||
|
[ERROR] Re-run Maven using the -X switch to enable full debug logging. |
||||
|
[ERROR] |
||||
|
[ERROR] For more information about the errors and possible solutions, please read the following articles: |
||||
|
[ERROR] [Help 1] http://cwiki.apache.org/confluence/display/MAVEN/MojoFailureException |
||||
Binary file not shown.
@ -0,0 +1,22 @@ |
|||||
|
# 爬虫配置文件 |
||||
|
|
||||
|
# 请求延迟(毫秒)- 避免请求过快被封 |
||||
|
delay.ms=1500 |
||||
|
|
||||
|
# 请求超时时间(毫秒) |
||||
|
timeout.ms=15000 |
||||
|
|
||||
|
# User-Agent |
||||
|
user.agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 |
||||
|
|
||||
|
# 数据库路径 |
||||
|
db.path=crawler.db |
||||
|
|
||||
|
# 输出目录 |
||||
|
output.dir=output |
||||
|
|
||||
|
# 是否启用数据库存储 |
||||
|
enable.database=true |
||||
|
|
||||
|
# 是否启用文件输出 |
||||
|
enable.file=true |
||||
@ -0,0 +1,162 @@ |
|||||
|
[ |
||||
|
{ |
||||
|
"title": "A Light in the Attic", |
||||
|
"price": "£51.77", |
||||
|
"rating": "3", |
||||
|
"availability": "In stock", |
||||
|
"imageUrl": "http://books.toscrape.com/media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg", |
||||
|
"productUrl": "http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "Tipping the Velvet", |
||||
|
"price": "£53.74", |
||||
|
"rating": "1", |
||||
|
"availability": "In stock", |
||||
|
"imageUrl": "http://books.toscrape.com/media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", |
||||
|
"productUrl": "http://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "Soumission", |
||||
|
"price": "£50.10", |
||||
|
"rating": "1", |
||||
|
"availability": "In stock", |
||||
|
"imageUrl": "http://books.toscrape.com/media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", |
||||
|
"productUrl": "http://books.toscrape.com/catalogue/soumission_998/index.html" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "Sharp Objects", |
||||
|
"price": "£47.82", |
||||
|
"rating": "4", |
||||
|
"availability": "In stock", |
||||
|
"imageUrl": "http://books.toscrape.com/media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", |
||||
|
"productUrl": "http://books.toscrape.com/catalogue/sharp-objects_997/index.html" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "Sapiens: A Brief History of Humankind", |
||||
|
"price": "£54.23", |
||||
|
"rating": "5", |
||||
|
"availability": "In stock", |
||||
|
"imageUrl": "http://books.toscrape.com/media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", |
||||
|
"productUrl": "http://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996/index.html" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "The Requiem Red", |
||||
|
"price": "£22.65", |
||||
|
"rating": "1", |
||||
|
"availability": "In stock", |
||||
|
"imageUrl": "http://books.toscrape.com/media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", |
||||
|
"productUrl": "http://books.toscrape.com/catalogue/the-requiem-red_995/index.html" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "The Dirty Little Secrets of Getting Your Dream Job", |
||||
|
"price": "£33.34", |
||||
|
"rating": "4", |
||||
|
"availability": "In stock", |
||||
|
"imageUrl": "http://books.toscrape.com/media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", |
||||
|
"productUrl": "http://books.toscrape.com/catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994/index.html" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull", |
||||
|
"price": "£17.93", |
||||
|
"rating": "3", |
||||
|
"availability": "In stock", |
||||
|
"imageUrl": "http://books.toscrape.com/media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", |
||||
|
"productUrl": "http://books.toscrape.com/catalogue/the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993/index.html" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics", |
||||
|
"price": "£22.60", |
||||
|
"rating": "4", |
||||
|
"availability": "In stock", |
||||
|
"imageUrl": "http://books.toscrape.com/media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", |
||||
|
"productUrl": "http://books.toscrape.com/catalogue/the-boys-in-the-boat-nine-americans-and-their-epic-quest-for-gold-at-the-1936-berlin-olympics_992/index.html" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "The Black Maria", |
||||
|
"price": "£52.15", |
||||
|
"rating": "1", |
||||
|
"availability": "In stock", |
||||
|
"imageUrl": "http://books.toscrape.com/media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", |
||||
|
"productUrl": "http://books.toscrape.com/catalogue/the-black-maria_991/index.html" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "Starving Hearts (Triangular Trade Trilogy, #1)", |
||||
|
"price": "£13.99", |
||||
|
"rating": "2", |
||||
|
"availability": "In stock", |
||||
|
"imageUrl": "http://books.toscrape.com/media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", |
||||
|
"productUrl": "http://books.toscrape.com/catalogue/starving-hearts-triangular-trade-trilogy-1_990/index.html" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "Shakespeare\u0027s Sonnets", |
||||
|
"price": "£20.66", |
||||
|
"rating": "4", |
||||
|
"availability": "In stock", |
||||
|
"imageUrl": "http://books.toscrape.com/media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", |
||||
|
"productUrl": "http://books.toscrape.com/catalogue/shakespeares-sonnets_989/index.html" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "Set Me Free", |
||||
|
"price": "£17.46", |
||||
|
"rating": "5", |
||||
|
"availability": "In stock", |
||||
|
"imageUrl": "http://books.toscrape.com/media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", |
||||
|
"productUrl": "http://books.toscrape.com/catalogue/set-me-free_988/index.html" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "Scott Pilgrim\u0027s Precious Little Life (Scott Pilgrim #1)", |
||||
|
"price": "£52.29", |
||||
|
"rating": "5", |
||||
|
"availability": "In stock", |
||||
|
"imageUrl": "http://books.toscrape.com/media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", |
||||
|
"productUrl": "http://books.toscrape.com/catalogue/scott-pilgrims-precious-little-life-scott-pilgrim-1_987/index.html" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "Rip it Up and Start Again", |
||||
|
"price": "£35.02", |
||||
|
"rating": "5", |
||||
|
"availability": "In stock", |
||||
|
"imageUrl": "http://books.toscrape.com/media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", |
||||
|
"productUrl": "http://books.toscrape.com/catalogue/rip-it-up-and-start-again_986/index.html" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991", |
||||
|
"price": "£57.25", |
||||
|
"rating": "3", |
||||
|
"availability": "In stock", |
||||
|
"imageUrl": "http://books.toscrape.com/media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", |
||||
|
"productUrl": "http://books.toscrape.com/catalogue/our-band-could-be-your-life-scenes-from-the-american-indie-underground-1981-1991_985/index.html" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "Olio", |
||||
|
"price": "£23.88", |
||||
|
"rating": "1", |
||||
|
"availability": "In stock", |
||||
|
"imageUrl": "http://books.toscrape.com/media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg", |
||||
|
"productUrl": "http://books.toscrape.com/catalogue/olio_984/index.html" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "Mesaerion: The Best Science Fiction Stories 1800-1849", |
||||
|
"price": "£37.59", |
||||
|
"rating": "1", |
||||
|
"availability": "In stock", |
||||
|
"imageUrl": "http://books.toscrape.com/media/cache/09/a3/09a3aef48557576e1a85ba7efea8ecb7.jpg", |
||||
|
"productUrl": "http://books.toscrape.com/catalogue/mesaerion-the-best-science-fiction-stories-1800-1849_983/index.html" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "Libertarianism for Beginners", |
||||
|
"price": "£51.33", |
||||
|
"rating": "2", |
||||
|
"availability": "In stock", |
||||
|
"imageUrl": "http://books.toscrape.com/media/cache/0b/bc/0bbcd0a6f4bcd81ccb1049a52736406e.jpg", |
||||
|
"productUrl": "http://books.toscrape.com/catalogue/libertarianism-for-beginners_982/index.html" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "It\u0027s Only the Himalayas", |
||||
|
"price": "£45.17", |
||||
|
"rating": "2", |
||||
|
"availability": "In stock", |
||||
|
"imageUrl": "http://books.toscrape.com/media/cache/27/a5/27a53d0bb95bdd88288eaf66c9230d7e.jpg", |
||||
|
"productUrl": "http://books.toscrape.com/catalogue/its-only-the-himalayas_981/index.html" |
||||
|
} |
||||
|
] |
||||
@ -0,0 +1,82 @@ |
|||||
|
[ |
||||
|
{ |
||||
|
"title": "安定此心:我当精神科医生的12000天", |
||||
|
"url": "https://book.douban.com/subject/37502923/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "挽救计划", |
||||
|
"url": "https://book.douban.com/subject/38210508/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "咸的玩笑", |
||||
|
"url": "https://book.douban.com/subject/37833272/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "真事隐:康熙废储与正史虚构", |
||||
|
"url": "https://book.douban.com/subject/37920184/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "大厂小民:我在互联网公司的1480天", |
||||
|
"url": "https://book.douban.com/subject/38208793/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "天色已晚", |
||||
|
"url": "https://book.douban.com/subject/37890167/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "她和她的决心", |
||||
|
"url": "https://book.douban.com/subject/38178826/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "凯罗斯", |
||||
|
"url": "https://book.douban.com/subject/37825000/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "幸福蒙太奇", |
||||
|
"url": "https://book.douban.com/subject/37841159/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "螃蟹的邀请", |
||||
|
"url": "https://book.douban.com/subject/37496217/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "抄写员巴托比", |
||||
|
"url": "https://book.douban.com/subject/38392174/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "我收养了一个朋友", |
||||
|
"url": "https://book.douban.com/subject/37938861/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "哈萨比斯:谷歌AI之脑", |
||||
|
"url": "https://book.douban.com/subject/38357884/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "像女孩那样丢球", |
||||
|
"url": "https://book.douban.com/subject/37126780/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "刚刚离开的世界", |
||||
|
"url": "https://book.douban.com/subject/37447242/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "故纸浮生.1-2", |
||||
|
"url": "https://book.douban.com/subject/37648813/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "在世与认知", |
||||
|
"url": "https://book.douban.com/subject/37112076/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "呼啸山庄", |
||||
|
"url": "https://book.douban.com/subject/30471282/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "我们如何理解这个世界:与齐格蒙特·鲍曼对谈", |
||||
|
"url": "https://book.douban.com/subject/37930972/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "刮风下雨", |
||||
|
"url": "https://book.douban.com/subject/38240709/" |
||||
|
} |
||||
|
] |
||||
@ -0,0 +1,62 @@ |
|||||
|
[ |
||||
|
{ |
||||
|
"title": "爱情抓马", |
||||
|
"rating": "6.9", |
||||
|
"ratingCount": "(34363人评价)", |
||||
|
"url": "https://movie.douban.com/subject/36995126/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "世界的主人", |
||||
|
"rating": "9.1", |
||||
|
"ratingCount": "(116736人评价)", |
||||
|
"url": "https://movie.douban.com/subject/37116612/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "木乃伊", |
||||
|
"rating": "6.2", |
||||
|
"ratingCount": "(13705人评价)", |
||||
|
"url": "https://movie.douban.com/subject/36929221/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "蜂蜜的针", |
||||
|
"rating": "6.7", |
||||
|
"ratingCount": "(48214人评价)", |
||||
|
"url": "https://movie.douban.com/subject/26022233/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "杀的就是你", |
||||
|
"rating": "6.9", |
||||
|
"ratingCount": "(21794人评价)", |
||||
|
"url": "https://movie.douban.com/subject/36926954/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "惩罚者:最后一击", |
||||
|
"rating": "6.8", |
||||
|
"ratingCount": "(5478人评价)", |
||||
|
"url": "https://movie.douban.com/subject/37259325/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "蒙特利尔,我的美人", |
||||
|
"rating": "7.6", |
||||
|
"ratingCount": "(14162人评价)", |
||||
|
"url": "https://movie.douban.com/subject/37019075/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "与王生活的男人", |
||||
|
"rating": "7.4", |
||||
|
"ratingCount": "(10007人评价)", |
||||
|
"url": "https://movie.douban.com/subject/36978169/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "挽救计划", |
||||
|
"rating": "8.6", |
||||
|
"ratingCount": "(463129人评价)", |
||||
|
"url": "https://movie.douban.com/subject/35010610/" |
||||
|
}, |
||||
|
{ |
||||
|
"title": "长夜将尽", |
||||
|
"rating": "6.5", |
||||
|
"ratingCount": "(10878人评价)", |
||||
|
"url": "https://movie.douban.com/subject/35590993/" |
||||
|
} |
||||
|
] |
||||
@ -0,0 +1,250 @@ |
|||||
|
排名:1 电影:肖申克的救赎 评分:9.7 |
||||
|
排名:2 电影:霸王别姬 评分:9.6 |
||||
|
排名:3 电影:泰坦尼克号 评分:9.5 |
||||
|
排名:4 电影:阿甘正传 评分:9.5 |
||||
|
排名:5 电影:千与千寻 评分:9.4 |
||||
|
排名:6 电影:美丽人生 评分:9.5 |
||||
|
排名:7 电影:星际穿越 评分:9.4 |
||||
|
排名:8 电影:这个杀手不太冷 评分:9.4 |
||||
|
排名:9 电影:盗梦空间 评分:9.4 |
||||
|
排名:10 电影:楚门的世界 评分:9.4 |
||||
|
排名:11 电影:辛德勒的名单 评分:9.5 |
||||
|
排名:12 电影:忠犬八公的故事 评分:9.4 |
||||
|
排名:13 电影:海上钢琴师 评分:9.3 |
||||
|
排名:14 电影:疯狂动物城 评分:9.3 |
||||
|
排名:15 电影:三傻大闹宝莱坞 评分:9.2 |
||||
|
排名:16 电影:机器人总动员 评分:9.3 |
||||
|
排名:17 电影:放牛班的春天 评分:9.3 |
||||
|
排名:18 电影:无间道 评分:9.3 |
||||
|
排名:19 电影:控方证人 评分:9.6 |
||||
|
排名:20 电影:寻梦环游记 评分:9.1 |
||||
|
排名:21 电影:大话西游之大圣娶亲 评分:9.2 |
||||
|
排名:22 电影:熔炉 评分:9.3 |
||||
|
排名:23 电影:触不可及 评分:9.3 |
||||
|
排名:24 电影:教父 评分:9.3 |
||||
|
排名:25 电影:末代皇帝 评分:9.3 |
||||
|
排名:26 电影:哈利·波特与魔法石 评分:9.2 |
||||
|
排名:27 电影:当幸福来敲门 评分:9.1 |
||||
|
排名:28 电影:龙猫 评分:9.2 |
||||
|
排名:29 电影:活着 评分:9.3 |
||||
|
排名:30 电影:怦然心动 评分:9.1 |
||||
|
排名:31 电影:蝙蝠侠:黑暗骑士 评分:9.2 |
||||
|
排名:32 电影:指环王3:王者无敌 评分:9.3 |
||||
|
排名:33 电影:我不是药神 评分:9.0 |
||||
|
排名:34 电影:乱世佳人 评分:9.3 |
||||
|
排名:35 电影:飞屋环游记 评分:9.1 |
||||
|
排名:36 电影:让子弹飞 评分:9.0 |
||||
|
排名:37 电影:哈尔的移动城堡 评分:9.1 |
||||
|
排名:38 电影:十二怒汉 评分:9.4 |
||||
|
排名:39 电影:海蒂和爷爷 评分:9.3 |
||||
|
排名:40 电影:素媛 评分:9.3 |
||||
|
排名:41 电影:猫鼠游戏 评分:9.1 |
||||
|
排名:42 电影:天空之城 评分:9.2 |
||||
|
排名:43 电影:鬼子来了 评分:9.3 |
||||
|
排名:44 电影:摔跤吧!爸爸 评分:9.0 |
||||
|
排名:45 电影:少年派的奇幻漂流 评分:9.1 |
||||
|
排名:46 电影:钢琴家 评分:9.3 |
||||
|
排名:47 电影:指环王2:双塔奇兵 评分:9.2 |
||||
|
排名:48 电影:死亡诗社 评分:9.2 |
||||
|
排名:49 电影:大话西游之月光宝盒 评分:9.0 |
||||
|
排名:50 电影:绿皮书 评分:8.9 |
||||
|
排名:51 电影:何以为家 评分:9.1 |
||||
|
排名:52 电影:闻香识女人 评分:9.1 |
||||
|
排名:53 电影:大闹天宫 评分:9.4 |
||||
|
排名:54 电影:黑客帝国 评分:9.1 |
||||
|
排名:55 电影:指环王1:护戒使者 评分:9.1 |
||||
|
排名:56 电影:罗马假日 评分:9.1 |
||||
|
排名:57 电影:教父2 评分:9.3 |
||||
|
排名:58 电影:狮子王 评分:9.1 |
||||
|
排名:59 电影:天堂电影院 评分:9.2 |
||||
|
排名:60 电影:饮食男女 评分:9.2 |
||||
|
排名:61 电影:辩护人 评分:9.2 |
||||
|
排名:62 电影:本杰明·巴顿奇事 评分:9.0 |
||||
|
排名:63 电影:搏击俱乐部 评分:9.0 |
||||
|
排名:64 电影:美丽心灵 评分:9.1 |
||||
|
排名:65 电影:穿条纹睡衣的男孩 评分:9.2 |
||||
|
排名:66 电影:哈利·波特与死亡圣器(下) 评分:9.0 |
||||
|
排名:67 电影:情书 评分:8.9 |
||||
|
排名:68 电影:两杆大烟枪 评分:9.1 |
||||
|
排名:69 电影:窃听风暴 评分:9.2 |
||||
|
排名:70 电影:音乐之声 评分:9.1 |
||||
|
排名:71 电影:功夫 评分:8.9 |
||||
|
排名:72 电影:哈利·波特与阿兹卡班的囚徒 评分:9.0 |
||||
|
排名:73 电影:阿凡达 评分:8.8 |
||||
|
排名:74 电影:西西里的美丽传说 评分:8.9 |
||||
|
排名:75 电影:看不见的客人 评分:8.8 |
||||
|
排名:76 电影:拯救大兵瑞恩 评分:9.1 |
||||
|
排名:77 电影:沉默的羔羊 评分:8.9 |
||||
|
排名:78 电影:小鞋子 评分:9.2 |
||||
|
排名:79 电影:布达佩斯大饭店 评分:8.9 |
||||
|
排名:80 电影:蝴蝶效应 评分:8.9 |
||||
|
排名:81 电影:飞越疯人院 评分:9.1 |
||||
|
排名:82 电影:还有明天 评分:9.3 |
||||
|
排名:83 电影:禁闭岛 评分:8.9 |
||||
|
排名:84 电影:心灵捕手 评分:9.0 |
||||
|
排名:85 电影:致命魔术 评分:8.9 |
||||
|
排名:86 电影:低俗小说 评分:8.9 |
||||
|
排名:87 电影:哈利·波特与密室 评分:8.9 |
||||
|
排名:88 电影:超脱 评分:9.0 |
||||
|
排名:89 电影:一一 评分:9.1 |
||||
|
排名:90 电影:喜剧之王 评分:8.8 |
||||
|
排名:91 电影:杀人回忆 评分:8.9 |
||||
|
排名:92 电影:致命ID 评分:8.9 |
||||
|
排名:93 电影:摩登时代 评分:9.3 |
||||
|
排名:94 电影:春光乍泄 评分:9.0 |
||||
|
排名:95 电影:加勒比海盗 评分:8.8 |
||||
|
排名:96 电影:海豚湾 评分:9.3 |
||||
|
排名:97 电影:美国往事 评分:9.1 |
||||
|
排名:98 电影:红辣椒 评分:9.0 |
||||
|
排名:99 电影:七宗罪 评分:8.8 |
||||
|
排名:100 电影:唐伯虎点秋香 评分:8.8 |
||||
|
排名:101 电影:狩猎 评分:9.1 |
||||
|
排名:102 电影:幽灵公主 评分:8.9 |
||||
|
排名:103 电影:甜蜜蜜 评分:8.9 |
||||
|
排名:104 电影:寄生虫 评分:8.8 |
||||
|
排名:105 电影:天书奇谭 评分:9.2 |
||||
|
排名:106 电影:蝙蝠侠:黑暗骑士崛起 评分:8.9 |
||||
|
排名:107 电影:超能陆战队 评分:8.8 |
||||
|
排名:108 电影:7号房的礼物 评分:8.9 |
||||
|
排名:109 电影:茶馆 评分:9.5 |
||||
|
排名:110 电影:第六感 评分:8.9 |
||||
|
排名:111 电影:爱在黎明破晓前 评分:8.8 |
||||
|
排名:112 电影:爱在日落黄昏时 评分:8.9 |
||||
|
排名:113 电影:被嫌弃的松子的一生 评分:8.8 |
||||
|
排名:114 电影:头脑特工队 评分:8.8 |
||||
|
排名:115 电影:哈利·波特与火焰杯 评分:8.8 |
||||
|
排名:116 电影:未麻的部屋 评分:9.1 |
||||
|
排名:117 电影:重庆森林 评分:8.8 |
||||
|
排名:118 电影:借东西的小人阿莉埃蒂 评分:8.9 |
||||
|
排名:119 电影:菊次郎的夏天 评分:8.9 |
||||
|
排名:120 电影:入殓师 评分:8.9 |
||||
|
排名:121 电影:断背山 评分:8.8 |
||||
|
排名:122 电影:剪刀手爱德华 评分:8.7 |
||||
|
排名:123 电影:勇敢的心 评分:8.9 |
||||
|
排名:124 电影:时空恋旅人 评分:8.8 |
||||
|
排名:125 电影:驯龙高手 评分:8.8 |
||||
|
排名:126 电影:消失的爱人 评分:8.7 |
||||
|
排名:127 电影:无人知晓 评分:9.1 |
||||
|
排名:128 电影:傲慢与偏见 评分:8.7 |
||||
|
排名:129 电影:倩女幽魂 评分:8.8 |
||||
|
排名:130 电影:新世界 评分:8.9 |
||||
|
排名:131 电影:花样年华 评分:8.8 |
||||
|
排名:132 电影:玩具总动员3 评分:8.9 |
||||
|
排名:133 电影:一个叫欧维的男人决定去死 评分:8.9 |
||||
|
排名:134 电影:色,戒 评分:8.7 |
||||
|
排名:135 电影:完美的世界 评分:9.1 |
||||
|
排名:136 电影:阳光灿烂的日子 评分:8.8 |
||||
|
排名:137 电影:怪兽电力公司 评分:8.8 |
||||
|
排名:138 电影:教父3 评分:9.0 |
||||
|
排名:139 电影:小森林 夏秋篇 评分:9.0 |
||||
|
排名:140 电影:天使爱美丽 评分:8.7 |
||||
|
排名:141 电影:侧耳倾听 评分:8.9 |
||||
|
排名:142 电影:哪吒闹海 评分:9.2 |
||||
|
排名:143 电影:九品芝麻官 评分:8.8 |
||||
|
排名:144 电影:被解救的姜戈 评分:8.8 |
||||
|
排名:145 电影:请以你的名字呼唤我 评分:8.8 |
||||
|
排名:146 电影:幸福终点站 评分:8.8 |
||||
|
排名:147 电影:釜山行 评分:8.6 |
||||
|
排名:148 电影:神偷奶爸 评分:8.7 |
||||
|
排名:149 电影:小森林 冬春篇 评分:9.0 |
||||
|
排名:150 电影:喜宴 评分:9.0 |
||||
|
排名:151 电影:萤火之森 评分:8.8 |
||||
|
排名:152 电影:告白 评分:8.8 |
||||
|
排名:153 电影:玛丽和麦克斯 评分:9.0 |
||||
|
排名:154 电影:七武士 评分:9.3 |
||||
|
排名:155 电影:头号玩家 评分:8.6 |
||||
|
排名:156 电影:模仿游戏 评分:8.8 |
||||
|
排名:157 电影:惊魂记 评分:9.0 |
||||
|
排名:158 电影:大鱼 评分:8.8 |
||||
|
排名:159 电影:机器人之梦 评分:9.1 |
||||
|
排名:160 电影:心灵奇旅 评分:8.7 |
||||
|
排名:161 电影:背靠背,脸对脸 评分:9.5 |
||||
|
排名:162 电影:射雕英雄传之东成西就 评分:8.7 |
||||
|
排名:163 电影:血战钢锯岭 评分:8.7 |
||||
|
排名:164 电影:你的名字。 评分:8.5 |
||||
|
排名:165 电影:我是山姆 评分:9.0 |
||||
|
排名:166 电影:阳光姐妹淘 评分:8.8 |
||||
|
排名:167 电影:恐怖直播 评分:8.7 |
||||
|
排名:168 电影:黑客帝国3:矩阵革命 评分:8.8 |
||||
|
排名:169 电影:末路狂花 评分:9.0 |
||||
|
排名:170 电影:高山下的花环 评分:9.5 |
||||
|
排名:171 电影:小丑 评分:8.7 |
||||
|
排名:172 电影:谍影重重3 评分:8.9 |
||||
|
排名:173 电影:三块广告牌 评分:8.7 |
||||
|
排名:174 电影:电锯惊魂 评分:8.7 |
||||
|
排名:175 电影:无间道2 评分:8.8 |
||||
|
排名:176 电影:达拉斯买家俱乐部 评分:8.8 |
||||
|
排名:177 电影:疯狂原始人 评分:8.7 |
||||
|
排名:178 电影:绿里奇迹 评分:8.9 |
||||
|
排名:179 电影:爱在午夜降临前 评分:8.9 |
||||
|
排名:180 电影:疯狂的石头 评分:8.6 |
||||
|
排名:181 电影:雨中曲 评分:9.1 |
||||
|
排名:182 电影:2001太空漫游 评分:8.9 |
||||
|
排名:183 电影:海街日记 评分:8.8 |
||||
|
排名:184 电影:风之谷 评分:8.9 |
||||
|
排名:185 电影:上帝之城 评分:9.0 |
||||
|
排名:186 电影:心迷宫 评分:8.7 |
||||
|
排名:187 电影:英雄本色 评分:8.6 |
||||
|
排名:188 电影:记忆碎片 评分:8.7 |
||||
|
排名:189 电影:纵横四海 评分:8.8 |
||||
|
排名:190 电影:无敌破坏王 评分:8.7 |
||||
|
排名:191 电影:卢旺达饭店 评分:8.9 |
||||
|
排名:192 电影:牯岭街少年杀人事件 评分:8.9 |
||||
|
排名:193 电影:恐怖游轮 评分:8.5 |
||||
|
排名:194 电影:东京教父 评分:9.0 |
||||
|
排名:195 电影:小偷家族 评分:8.7 |
||||
|
排名:196 电影:魔女宅急便 评分:8.7 |
||||
|
排名:197 电影:冰川时代 评分:8.7 |
||||
|
排名:198 电影:芙蓉镇 评分:9.3 |
||||
|
排名:199 电影:忠犬八公物语 评分:9.2 |
||||
|
排名:200 电影:岁月神偷 评分:8.7 |
||||
|
排名:201 电影:遗愿清单 评分:8.7 |
||||
|
排名:202 电影:荒蛮故事 评分:8.7 |
||||
|
排名:203 电影:大佛普拉斯 评分:8.7 |
||||
|
排名:204 电影:源代码 评分:8.6 |
||||
|
排名:205 电影:花束般的恋爱 评分:8.6 |
||||
|
排名:206 电影:白日梦想家 评分:8.6 |
||||
|
排名:207 电影:爱乐之城 评分:8.4 |
||||
|
排名:208 电影:疯狂的麦克斯4:狂暴之路 评分:8.7 |
||||
|
排名:209 电影:可可西里 评分:8.9 |
||||
|
排名:210 电影:你看起来好像很好吃 评分:8.9 |
||||
|
排名:211 电影:贫民窟的百万富翁 评分:8.6 |
||||
|
排名:212 电影:波西米亚狂想曲 评分:8.6 |
||||
|
排名:213 电影:城市之光 评分:9.3 |
||||
|
排名:214 电影:爆裂鼓手 评分:8.6 |
||||
|
排名:215 电影:青蛇 评分:8.6 |
||||
|
排名:216 电影:哈利·波特与死亡圣器(上) 评分:8.6 |
||||
|
排名:217 电影:无耻混蛋 评分:8.7 |
||||
|
排名:218 电影:东邪西毒 评分:8.6 |
||||
|
排名:219 电影:终结者2:审判日 评分:8.8 |
||||
|
排名:220 电影:大红灯笼高高挂 评分:8.8 |
||||
|
排名:221 电影:黑天鹅 评分:8.6 |
||||
|
排名:222 电影:新龙门客栈 评分:8.7 |
||||
|
排名:223 电影:初恋这件小事 评分:8.5 |
||||
|
排名:224 电影:千钧一发 评分:8.8 |
||||
|
排名:225 电影:人工智能 评分:8.7 |
||||
|
排名:226 电影:崖上的波妞 评分:8.6 |
||||
|
排名:227 电影:雨人 评分:8.7 |
||||
|
排名:228 电影:虎口脱险 评分:8.9 |
||||
|
排名:229 电影:哈利·波特与凤凰社 评分:8.6 |
||||
|
排名:230 电影:彗星来的那一夜 评分:8.6 |
||||
|
排名:231 电影:罗生门 评分:8.8 |
||||
|
排名:232 电影:海边的曼彻斯特 评分:8.6 |
||||
|
排名:233 电影:恋恋笔记本 评分:8.5 |
||||
|
排名:234 电影:火星救援 评分:8.5 |
||||
|
排名:235 电影:真爱至上 评分:8.5 |
||||
|
排名:236 电影:黑客帝国2:重装上阵 评分:8.7 |
||||
|
排名:237 电影:冰雪奇缘 评分:8.5 |
||||
|
排名:238 电影:步履不停 评分:8.8 |
||||
|
排名:239 电影:奇迹男孩 评分:8.6 |
||||
|
排名:240 电影:千年女优 评分:8.8 |
||||
|
排名:241 电影:谍影重重2 评分:8.7 |
||||
|
排名:242 电影:战争之王 评分:8.7 |
||||
|
排名:243 电影:蜘蛛侠:平行宇宙 评分:8.6 |
||||
|
排名:244 电影:攻壳机动队 评分:9.0 |
||||
|
排名:245 电影:血钻 评分:8.7 |
||||
|
排名:246 电影:小姐 评分:8.5 |
||||
|
排名:247 电影:隐藏人物 评分:8.9 |
||||
|
排名:248 电影:血观音 评分:8.6 |
||||
|
排名:249 电影:魂断蓝桥 评分:8.8 |
||||
|
排名:250 电影:房间 评分:8.7 |
||||
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -0,0 +1,74 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Build descriptor for the web-crawler application.

  The launcher script starts the application with
  `java -jar target/web-crawler-1.0-SNAPSHOT.jar`, so the packaged jar must be
  self-contained: the shade plugin below bundles all runtime dependencies into
  the main artifact and writes the Main-Class manifest entry.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.crawler</groupId>
    <artifactId>web-crawler</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>Web Crawler</name>
    <description>Multi-site web crawler with CLI, MVC, Command pattern and Strategy pattern</description>

    <properties>
        <maven.compiler.source>11</maven.compiler.source>
        <maven.compiler.target>11</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- HTML fetching and parsing -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.16.1</version>
        </dependency>
        <!-- JSON serialization of crawl results -->
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.10.1</version>
        </dependency>
        <!-- Logging facade and its simple console binding -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>2.0.9</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-simple</artifactId>
            <version>2.0.9</version>
        </dependency>
        <!-- Command-line argument parsing -->
        <dependency>
            <groupId>info.picocli</groupId>
            <artifactId>picocli</artifactId>
            <version>4.7.5</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
                <version>3.3.0</version>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.crawler.Main</mainClass>
                        </manifest>
                    </archive>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.11.0</version>
                <configuration>
                    <source>11</source>
                    <target>11</target>
                </configuration>
            </plugin>
            <!--
              Without this plugin the jar contains only the project classes and
              `java -jar` fails with NoClassDefFoundError for picocli/jsoup/gson.
              Shade replaces the main artifact with a jar that also contains
              the dependencies.
            -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.5.1</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <createDependencyReducedPom>false</createDependencyReducedPom>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>com.crawler.Main</mainClass>
                                </transformer>
                                <!-- Keeps META-INF/services entries (used by SLF4J 2.x to find its provider). -->
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
||||
@ -0,0 +1,24 @@ |
|||||
|
@echo off
rem Launcher for the web crawler: builds the jar with Maven when it is missing,
rem then runs it, forwarding all command-line arguments to the application.
setlocal
set "JAR=target\web-crawler-1.0-SNAPSHOT.jar"

echo ========================================
echo Web Crawler Application
echo ========================================
echo.

if not exist "%JAR%" (
    echo Compiling project...
    rem On Windows "mvn" is itself a batch script (mvn.cmd). Without CALL,
    rem control is transferred to it and never comes back to this script,
    rem so nothing after the build would run.
    call mvn clean package
    if errorlevel 1 (
        echo Compilation failed!
        pause
        exit /b 1
    )
    echo.
)

echo Running crawler...
echo.

java -jar "%JAR%" %*
rem Remember the application's exit code; PAUSE below would otherwise mask it.
set "EXIT_CODE=%ERRORLEVEL%"

echo.
pause
exit /b %EXIT_CODE%
||||
@ -0,0 +1 @@ |
|||||
|
// 此文件已废弃,请使用 main.CrawlerManager
|
||||
@ -0,0 +1,133 @@ |
|||||
|
package cli; |
||||
|
|
||||
|
import command.*; |
||||
|
import controller.CrawlerController; |
||||
|
import exception.CrawlerException; |
||||
|
import exception.CrawlerResult; |
||||
|
import exception.ValidationException; |
||||
|
import view.CrawlerView; |
||||
|
|
||||
|
import java.util.Arrays; |
||||
|
|
||||
|
public class CrawlerCLI { |
||||
|
private final CrawlerController controller; |
||||
|
private final CrawlerView view; |
||||
|
private final CommandRegistry commandRegistry; |
||||
|
|
||||
|
public CrawlerCLI() { |
||||
|
this.controller = new CrawlerController(); |
||||
|
this.view = new CrawlerView(); |
||||
|
this.commandRegistry = new CommandRegistry(); |
||||
|
initCommands(); |
||||
|
} |
||||
|
|
||||
|
public CrawlerCLI(String outputDir) { |
||||
|
this.controller = new CrawlerController(outputDir); |
||||
|
this.view = new CrawlerView(); |
||||
|
this.commandRegistry = new CommandRegistry(); |
||||
|
initCommands(); |
||||
|
} |
||||
|
|
||||
|
private void initCommands() { |
||||
|
commandRegistry.register(new RunAllCommand(controller)); |
||||
|
commandRegistry.register(new ListCrawlersCommand(controller)); |
||||
|
commandRegistry.register(new StatsCommand(controller)); |
||||
|
commandRegistry.register(new ClearCommand(controller)); |
||||
|
} |
||||
|
|
||||
|
public void run(String[] args) { |
||||
|
view.showWelcome(); |
||||
|
|
||||
|
if (args == null || args.length == 0) { |
||||
|
view.showHelp(); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
String commandName = args[0].toLowerCase().trim(); |
||||
|
|
||||
|
try { |
||||
|
switch (commandName) { |
||||
|
case "help": |
||||
|
case "-h": |
||||
|
case "--help": |
||||
|
view.showHelp(); |
||||
|
break; |
||||
|
|
||||
|
case "list": |
||||
|
case "ls": |
||||
|
handleList(); |
||||
|
break; |
||||
|
|
||||
|
case "run": |
||||
|
handleRun(args); |
||||
|
break; |
||||
|
|
||||
|
case "run-all": |
||||
|
case "all": |
||||
|
handleRunAll(); |
||||
|
break; |
||||
|
|
||||
|
case "stats": |
||||
|
handleStats(); |
||||
|
break; |
||||
|
|
||||
|
case "clear": |
||||
|
handleClear(); |
||||
|
break; |
||||
|
|
||||
|
default: |
||||
|
view.showError("未知命令: " + commandName + "\n使用 'help' 查看可用命令"); |
||||
|
} |
||||
|
} catch (ValidationException e) { |
||||
|
view.showError(e.getMessage()); |
||||
|
view.showHelp(); |
||||
|
} catch (CrawlerException e) { |
||||
|
view.showError("爬虫错误 [" + e.getErrorCode() + "]: " + e.getMessage()); |
||||
|
} catch (Exception e) { |
||||
|
view.showError("系统错误: " + e.getMessage()); |
||||
|
e.printStackTrace(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void handleList() { |
||||
|
Command cmd = commandRegistry.getCommand("list"); |
||||
|
cmd.execute(); |
||||
|
view.showCrawlerList(controller.getAllCrawlerNames()); |
||||
|
} |
||||
|
|
||||
|
private void handleRun(String[] args) { |
||||
|
if (args.length < 2) { |
||||
|
view.showError("请指定爬虫名称\n示例: run 豆瓣电影Top250"); |
||||
|
view.showCrawlerList(controller.getAllCrawlerNames()); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
String crawlerName = args[1]; |
||||
|
Command cmd = new RunSingleCommand(controller, crawlerName); |
||||
|
CrawlerResult result = cmd.execute(); |
||||
|
view.showResult(result); |
||||
|
} |
||||
|
|
||||
|
private void handleRunAll() { |
||||
|
Command cmd = commandRegistry.getCommand("run-all"); |
||||
|
CrawlerResult result = cmd.execute(); |
||||
|
view.showResult(result); |
||||
|
} |
||||
|
|
||||
|
private void handleStats() { |
||||
|
Command cmd = commandRegistry.getCommand("stats"); |
||||
|
CrawlerResult result = cmd.execute(); |
||||
|
view.showMessage(result.getMessage()); |
||||
|
} |
||||
|
|
||||
|
private void handleClear() { |
||||
|
Command cmd = commandRegistry.getCommand("clear"); |
||||
|
CrawlerResult result = cmd.execute(); |
||||
|
view.showResult(result); |
||||
|
} |
||||
|
|
||||
|
public static void main(String[] args) { |
||||
|
CrawlerCLI cli = new CrawlerCLI(); |
||||
|
cli.run(args); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,34 @@ |
|||||
|
package com.crawler; |
||||
|
|
||||
|
import com.crawler.controller.CrawlerController; |
||||
|
import picocli.CommandLine; |
||||
|
import picocli.CommandLine.Command; |
||||
|
import picocli.CommandLine.Option; |
||||
|
|
||||
|
@Command(name = "crawler", mixinStandardHelpOptions = true, version = "1.0", |
||||
|
description = "Web Crawler - Crawl Douban Movies, Douban Books, and Books to Scrape") |
||||
|
public class Main implements Runnable { |
||||
|
|
||||
|
@Option(names = {"-s", "--site"}, description = "Site to crawl: douban-movie, douban-book, books-to-scrape, all", |
||||
|
defaultValue = "all") |
||||
|
private String site; |
||||
|
|
||||
|
@Option(names = {"-i", "--interactive"}, description = "Run in interactive mode") |
||||
|
private boolean interactive; |
||||
|
|
||||
|
public static void main(String[] args) { |
||||
|
int exitCode = new CommandLine(new Main()).execute(args); |
||||
|
System.exit(exitCode); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void run() { |
||||
|
CrawlerController controller = new CrawlerController(); |
||||
|
|
||||
|
if (interactive) { |
||||
|
controller.runInteractive(); |
||||
|
} else { |
||||
|
controller.crawlBySite(site); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,8 @@ |
|||||
|
package com.crawler.command; |
||||
|
|
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
|
||||
|
public interface Command { |
||||
|
void execute() throws CrawlerException; |
||||
|
String getDescription(); |
||||
|
} |
||||
@ -0,0 +1,37 @@ |
|||||
|
package com.crawler.command; |
||||
|
|
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class CrawlAllCommand implements Command { |
||||
|
private final List<Command> commands; |
||||
|
private final String description; |
||||
|
|
||||
|
public CrawlAllCommand(String description) { |
||||
|
this.commands = new ArrayList<>(); |
||||
|
this.description = description; |
||||
|
} |
||||
|
|
||||
|
public void addCommand(Command command) { |
||||
|
commands.add(command); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute() throws CrawlerException { |
||||
|
for (Command command : commands) { |
||||
|
try { |
||||
|
command.execute(); |
||||
|
} catch (CrawlerException e) { |
||||
|
System.err.println("Error executing command: " + command.getDescription()); |
||||
|
System.err.println("Error: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getDescription() { |
||||
|
return description; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,28 @@ |
|||||
|
package com.crawler.command; |
||||
|
|
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
import com.crawler.strategy.CrawlerStrategy; |
||||
|
import com.crawler.util.FileUtil; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
public class CrawlCommand<T> implements Command { |
||||
|
private final CrawlerStrategy<T> strategy; |
||||
|
private final String description; |
||||
|
|
||||
|
public CrawlCommand(CrawlerStrategy<T> strategy, String description) { |
||||
|
this.strategy = strategy; |
||||
|
this.description = description; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute() throws CrawlerException { |
||||
|
List<T> data = strategy.crawl(); |
||||
|
FileUtil.saveToJsonFile(data, strategy.getOutputFileName()); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getDescription() { |
||||
|
return description; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,119 @@ |
|||||
|
package com.crawler.controller; |
||||
|
|
||||
|
import com.crawler.command.Command; |
||||
|
import com.crawler.command.CrawlAllCommand; |
||||
|
import com.crawler.command.CrawlCommand; |
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
import com.crawler.strategy.BooksToScrapeStrategy; |
||||
|
import com.crawler.strategy.CrawlerStrategy; |
||||
|
import com.crawler.strategy.DoubanBookStrategy; |
||||
|
import com.crawler.strategy.DoubanMovieStrategy; |
||||
|
import com.crawler.view.ConsoleView; |
||||
|
|
||||
|
import java.util.Scanner; |
||||
|
|
||||
|
public class CrawlerController { |
||||
|
private final ConsoleView view; |
||||
|
private final Scanner scanner; |
||||
|
|
||||
|
public CrawlerController() { |
||||
|
this.view = new ConsoleView(); |
||||
|
this.scanner = new Scanner(System.in); |
||||
|
} |
||||
|
|
||||
|
public void runInteractive() { |
||||
|
view.displayWelcome(); |
||||
|
|
||||
|
while (true) { |
||||
|
view.displayMenu(); |
||||
|
String input = scanner.nextLine().trim(); |
||||
|
|
||||
|
try { |
||||
|
int choice = Integer.parseInt(input); |
||||
|
switch (choice) { |
||||
|
case 1: |
||||
|
crawlDoubanMovies(); |
||||
|
break; |
||||
|
case 2: |
||||
|
crawlDoubanBooks(); |
||||
|
break; |
||||
|
case 3: |
||||
|
crawlBooksToScrape(); |
||||
|
break; |
||||
|
case 4: |
||||
|
crawlAll(); |
||||
|
break; |
||||
|
case 0: |
||||
|
view.displayGoodbye(); |
||||
|
return; |
||||
|
default: |
||||
|
view.displayInvalidChoice(); |
||||
|
} |
||||
|
} catch (NumberFormatException e) { |
||||
|
view.displayInvalidChoice(); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public void crawlDoubanMovies() { |
||||
|
CrawlerStrategy<?> strategy = new DoubanMovieStrategy(); |
||||
|
Command command = new CrawlCommand<>(strategy, "Douban Movies"); |
||||
|
executeCommand(command, strategy.getOutputFileName()); |
||||
|
} |
||||
|
|
||||
|
public void crawlDoubanBooks() { |
||||
|
CrawlerStrategy<?> strategy = new DoubanBookStrategy(); |
||||
|
Command command = new CrawlCommand<>(strategy, "Douban Books"); |
||||
|
executeCommand(command, strategy.getOutputFileName()); |
||||
|
} |
||||
|
|
||||
|
public void crawlBooksToScrape() { |
||||
|
CrawlerStrategy<?> strategy = new BooksToScrapeStrategy(); |
||||
|
Command command = new CrawlCommand<>(strategy, "Books to Scrape"); |
||||
|
executeCommand(command, strategy.getOutputFileName()); |
||||
|
} |
||||
|
|
||||
|
public void crawlAll() { |
||||
|
CrawlAllCommand allCommand = new CrawlAllCommand("Crawl All"); |
||||
|
allCommand.addCommand(new CrawlCommand<>(new DoubanMovieStrategy(), "Douban Movies")); |
||||
|
allCommand.addCommand(new CrawlCommand<>(new DoubanBookStrategy(), "Douban Books")); |
||||
|
allCommand.addCommand(new CrawlCommand<>(new BooksToScrapeStrategy(), "Books to Scrape")); |
||||
|
|
||||
|
try { |
||||
|
view.displayCrawling("All Websites"); |
||||
|
allCommand.execute(); |
||||
|
view.displaySuccess("data/ (all files)"); |
||||
|
} catch (CrawlerException e) { |
||||
|
view.displayError(e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public void crawlBySite(String site) { |
||||
|
switch (site.toLowerCase()) { |
||||
|
case "douban-movie": |
||||
|
crawlDoubanMovies(); |
||||
|
break; |
||||
|
case "douban-book": |
||||
|
crawlDoubanBooks(); |
||||
|
break; |
||||
|
case "books-to-scrape": |
||||
|
crawlBooksToScrape(); |
||||
|
break; |
||||
|
case "all": |
||||
|
crawlAll(); |
||||
|
break; |
||||
|
default: |
||||
|
view.displayError("Unknown site: " + site); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void executeCommand(Command command, String fileName) { |
||||
|
try { |
||||
|
view.displayCrawling(command.getDescription()); |
||||
|
command.execute(); |
||||
|
view.displaySuccess(fileName); |
||||
|
} catch (CrawlerException e) { |
||||
|
view.displayError(e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,11 @@ |
|||||
|
package com.crawler.exception; |
||||
|
|
||||
|
/**
 * Checked base exception for failures raised by the crawler.
 */
public class CrawlerException extends Exception {

    /**
     * @param message description of the failure
     * @param cause   underlying exception that triggered this one
     */
    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param message description of the failure
     */
    public CrawlerException(String message) {
        super(message);
    }
}
||||
@ -0,0 +1,11 @@ |
|||||
|
package com.crawler.exception; |
||||
|
|
||||
|
public class FileException extends CrawlerException { |
||||
|
public FileException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
|
||||
|
public FileException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,11 @@ |
|||||
|
package com.crawler.exception; |
||||
|
|
||||
|
public class NetworkException extends CrawlerException { |
||||
|
public NetworkException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
|
||||
|
public NetworkException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,11 @@ |
|||||
|
package com.crawler.exception; |
||||
|
|
||||
|
public class ParseException extends CrawlerException { |
||||
|
public ParseException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
|
||||
|
public ParseException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,105 @@ |
|||||
|
package com.crawler.model; |
||||
|
|
||||
|
/**
 * Mutable data holder for one book record. Every attribute is a plain string
 * and stays {@code null} until its setter is called.
 */
public class Book {
    // NOTE(review): field declaration order is kept as-is; a reflective JSON
    // serializer would presumably emit the fields in this order - confirm.
    private String title;
    private String author;
    private String rating;
    private String ratingCount;
    private String publisher;
    private String publishDate;
    private String price;
    private String isbn;
    private String summary;
    private String url;

    /** Creates a book with all attributes unset. */
    public Book() {
    }

    // --- title / author ---

    public String getTitle() { return this.title; }

    public void setTitle(String title) { this.title = title; }

    public String getAuthor() { return this.author; }

    public void setAuthor(String author) { this.author = author; }

    // --- rating ---

    public String getRating() { return this.rating; }

    public void setRating(String rating) { this.rating = rating; }

    public String getRatingCount() { return this.ratingCount; }

    public void setRatingCount(String ratingCount) { this.ratingCount = ratingCount; }

    // --- publication details ---

    public String getPublisher() { return this.publisher; }

    public void setPublisher(String publisher) { this.publisher = publisher; }

    public String getPublishDate() { return this.publishDate; }

    public void setPublishDate(String publishDate) { this.publishDate = publishDate; }

    public String getPrice() { return this.price; }

    public void setPrice(String price) { this.price = price; }

    public String getIsbn() { return this.isbn; }

    public void setIsbn(String isbn) { this.isbn = isbn; }

    // --- summary / link ---

    public String getSummary() { return this.summary; }

    public void setSummary(String summary) { this.summary = summary; }

    public String getUrl() { return this.url; }

    public void setUrl(String url) { this.url = url; }

    /** Returns a short form containing only title, author and rating. */
    @Override
    public String toString() {
        return String.format("Book{title='%s', author='%s', rating='%s'}", title, author, rating);
    }
}
||||
@ -0,0 +1,96 @@ |
|||||
|
package com.crawler.model; |
||||
|
|
||||
|
/**
 * Mutable data holder for one movie record. Every attribute is a plain string
 * and stays {@code null} until its setter is called.
 */
public class Movie {
    // NOTE(review): field declaration order is kept as-is; a reflective JSON
    // serializer would presumably emit the fields in this order - confirm.
    private String title;
    private String rating;
    private String ratingCount;
    private String year;
    private String director;
    private String actors;
    private String genre;
    private String summary;
    private String url;

    /** Creates a movie with all attributes unset. */
    public Movie() {
    }

    // --- title / rating ---

    public String getTitle() { return this.title; }

    public void setTitle(String title) { this.title = title; }

    public String getRating() { return this.rating; }

    public void setRating(String rating) { this.rating = rating; }

    public String getRatingCount() { return this.ratingCount; }

    public void setRatingCount(String ratingCount) { this.ratingCount = ratingCount; }

    // --- production details ---

    public String getYear() { return this.year; }

    public void setYear(String year) { this.year = year; }

    public String getDirector() { return this.director; }

    public void setDirector(String director) { this.director = director; }

    public String getActors() { return this.actors; }

    public void setActors(String actors) { this.actors = actors; }

    public String getGenre() { return this.genre; }

    public void setGenre(String genre) { this.genre = genre; }

    // --- summary / link ---

    public String getSummary() { return this.summary; }

    public void setSummary(String summary) { this.summary = summary; }

    public String getUrl() { return this.url; }

    public void setUrl(String url) { this.url = url; }

    /** Returns a short form containing only title, rating and year. */
    @Override
    public String toString() {
        return String.format("Movie{title='%s', rating='%s', year='%s'}", title, rating, year);
    }
}
||||
@ -0,0 +1,69 @@ |
|||||
|
package com.crawler.model; |
||||
|
|
||||
|
/**
 * Mutable data holder for one book listing. Every attribute is a plain string
 * and stays {@code null} until its setter is called.
 */
public class ScrapeBook {
    // NOTE(review): field declaration order is kept as-is; a reflective JSON
    // serializer would presumably emit the fields in this order - confirm.
    private String title;
    private String price;
    private String rating;
    private String availability;
    private String imageUrl;
    private String productUrl;

    /** Creates a listing with all attributes unset. */
    public ScrapeBook() {
    }

    // --- listing data ---

    public String getTitle() { return this.title; }

    public void setTitle(String title) { this.title = title; }

    public String getPrice() { return this.price; }

    public void setPrice(String price) { this.price = price; }

    public String getRating() { return this.rating; }

    public void setRating(String rating) { this.rating = rating; }

    public String getAvailability() { return this.availability; }

    public void setAvailability(String availability) { this.availability = availability; }

    // --- links ---

    public String getImageUrl() { return this.imageUrl; }

    public void setImageUrl(String imageUrl) { this.imageUrl = imageUrl; }

    public String getProductUrl() { return this.productUrl; }

    public void setProductUrl(String productUrl) { this.productUrl = productUrl; }

    /** Returns a short form containing only title, price and rating. */
    @Override
    public String toString() {
        return String.format("ScrapeBook{title='%s', price='%s', rating='%s'}", title, price, rating);
    }
}
||||
@ -0,0 +1,72 @@ |
|||||
|
package com.crawler.strategy; |
||||
|
|
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
import com.crawler.exception.NetworkException; |
||||
|
import com.crawler.exception.ParseException; |
||||
|
import com.crawler.model.ScrapeBook; |
||||
|
import com.crawler.util.HttpUtil; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class BooksToScrapeStrategy implements CrawlerStrategy<ScrapeBook> { |
||||
|
private static final String URL = "http://books.toscrape.com/"; |
||||
|
|
||||
|
@Override |
||||
|
public List<ScrapeBook> crawl() throws CrawlerException { |
||||
|
List<ScrapeBook> books = new ArrayList<>(); |
||||
|
try { |
||||
|
Document doc = HttpUtil.getDocument(URL); |
||||
|
Elements items = doc.select("article.product_pod"); |
||||
|
|
||||
|
for (Element item : items) { |
||||
|
ScrapeBook book = new ScrapeBook(); |
||||
|
Element titleEl = item.selectFirst("h3 a"); |
||||
|
if (titleEl != null) { |
||||
|
book.setTitle(titleEl.attr("title")); |
||||
|
book.setProductUrl(URL + titleEl.attr("href")); |
||||
|
} |
||||
|
|
||||
|
Element priceEl = item.selectFirst("p.price_color"); |
||||
|
if (priceEl != null) { |
||||
|
book.setPrice(priceEl.text()); |
||||
|
} |
||||
|
|
||||
|
Element availabilityEl = item.selectFirst("p.instock"); |
||||
|
if (availabilityEl != null) { |
||||
|
book.setAvailability(availabilityEl.text().trim()); |
||||
|
} |
||||
|
|
||||
|
Element starRatingEl = item.selectFirst("p.star-rating"); |
||||
|
if (starRatingEl != null) { |
||||
|
String classes = starRatingEl.className(); |
||||
|
if (classes.contains("One")) book.setRating("1"); |
||||
|
else if (classes.contains("Two")) book.setRating("2"); |
||||
|
else if (classes.contains("Three")) book.setRating("3"); |
||||
|
else if (classes.contains("Four")) book.setRating("4"); |
||||
|
else if (classes.contains("Five")) book.setRating("5"); |
||||
|
} |
||||
|
|
||||
|
Element imgEl = item.selectFirst("img"); |
||||
|
if (imgEl != null) { |
||||
|
book.setImageUrl(URL + imgEl.attr("src")); |
||||
|
} |
||||
|
|
||||
|
books.add(book); |
||||
|
} |
||||
|
} catch (NetworkException e) { |
||||
|
throw e; |
||||
|
} catch (Exception e) { |
||||
|
throw new ParseException("Failed to parse Books to Scrape page", e); |
||||
|
} |
||||
|
return books; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getOutputFileName() { |
||||
|
return "data/books_to_scrape.json"; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,9 @@ |
|||||
|
package com.crawler.strategy; |
||||
|
|
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public interface CrawlerStrategy<T> { |
||||
|
List<T> crawl() throws CrawlerException; |
||||
|
String getOutputFileName(); |
||||
|
} |
||||
@ -0,0 +1,69 @@ |
|||||
|
package com.crawler.strategy; |
||||
|
|
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
import com.crawler.exception.NetworkException; |
||||
|
import com.crawler.exception.ParseException; |
||||
|
import com.crawler.model.Book; |
||||
|
import com.crawler.util.HttpUtil; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class DoubanBookStrategy implements CrawlerStrategy<Book> { |
||||
|
private static final String URL = "https://book.douban.com/chart"; |
||||
|
|
||||
|
@Override |
||||
|
public List<Book> crawl() throws CrawlerException { |
||||
|
List<Book> books = new ArrayList<>(); |
||||
|
try { |
||||
|
Document doc = HttpUtil.getDocument(URL); |
||||
|
Elements items = doc.select("li.media"); |
||||
|
|
||||
|
for (Element item : items) { |
||||
|
Book book = new Book(); |
||||
|
Element titleEl = item.selectFirst("h2 a"); |
||||
|
if (titleEl != null) { |
||||
|
book.setTitle(titleEl.text().trim()); |
||||
|
book.setUrl(titleEl.attr("href")); |
||||
|
} |
||||
|
|
||||
|
Element ratingEl = item.selectFirst("span.rating_nums"); |
||||
|
if (ratingEl != null) { |
||||
|
book.setRating(ratingEl.text()); |
||||
|
} |
||||
|
|
||||
|
Element ratingCountEl = item.selectFirst("span.pl"); |
||||
|
if (ratingCountEl != null) { |
||||
|
book.setRatingCount(ratingCountEl.text()); |
||||
|
} |
||||
|
|
||||
|
Element infoEl = item.selectFirst("div.pub"); |
||||
|
if (infoEl != null) { |
||||
|
String info = infoEl.text(); |
||||
|
String[] parts = info.split("/"); |
||||
|
if (parts.length >= 3) { |
||||
|
book.setAuthor(parts[0].trim()); |
||||
|
book.setPublisher(parts[parts.length - 3].trim()); |
||||
|
book.setPublishDate(parts[parts.length - 2].trim()); |
||||
|
book.setPrice(parts[parts.length - 1].trim()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
books.add(book); |
||||
|
} |
||||
|
} catch (NetworkException e) { |
||||
|
throw e; |
||||
|
} catch (Exception e) { |
||||
|
throw new ParseException("Failed to parse Douban book page", e); |
||||
|
} |
||||
|
return books; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getOutputFileName() { |
||||
|
return "data/douban_books.json"; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,74 @@ |
|||||
|
package com.crawler.strategy; |
||||
|
|
||||
|
import com.crawler.exception.CrawlerException; |
||||
|
import com.crawler.exception.NetworkException; |
||||
|
import com.crawler.exception.ParseException; |
||||
|
import com.crawler.model.Movie; |
||||
|
import com.crawler.util.HttpUtil; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class DoubanMovieStrategy implements CrawlerStrategy<Movie> { |
||||
|
private static final String URL = "https://movie.douban.com/chart"; |
||||
|
|
||||
|
@Override |
||||
|
public List<Movie> crawl() throws CrawlerException { |
||||
|
List<Movie> movies = new ArrayList<>(); |
||||
|
try { |
||||
|
Document doc = HttpUtil.getDocument(URL); |
||||
|
Elements items = doc.select("tr.item"); |
||||
|
|
||||
|
for (Element item : items) { |
||||
|
Movie movie = new Movie(); |
||||
|
Element titleEl = item.selectFirst("div.pl2 a"); |
||||
|
if (titleEl != null) { |
||||
|
movie.setTitle(titleEl.text().split("/")[0].trim()); |
||||
|
movie.setUrl(titleEl.attr("href")); |
||||
|
} |
||||
|
|
||||
|
Element ratingEl = item.selectFirst("span.rating_nums"); |
||||
|
if (ratingEl != null) { |
||||
|
movie.setRating(ratingEl.text()); |
||||
|
} |
||||
|
|
||||
|
Element ratingCountEl = item.selectFirst("span.pl"); |
||||
|
if (ratingCountEl != null) { |
||||
|
movie.setRatingCount(ratingCountEl.text()); |
||||
|
} |
||||
|
|
||||
|
Element infoEl = item.selectFirst("p.pl"); |
||||
|
if (infoEl != null) { |
||||
|
String info = infoEl.text(); |
||||
|
movie.setYear(extractYear(info)); |
||||
|
} |
||||
|
|
||||
|
movies.add(movie); |
||||
|
} |
||||
|
} catch (NetworkException e) { |
||||
|
throw e; |
||||
|
} catch (Exception e) { |
||||
|
throw new ParseException("Failed to parse Douban movie page", e); |
||||
|
} |
||||
|
return movies; |
||||
|
} |
||||
|
|
||||
|
private String extractYear(String info) { |
||||
|
String[] parts = info.split("/"); |
||||
|
for (String part : parts) { |
||||
|
part = part.trim(); |
||||
|
if (part.matches("\\d{4}.*")) { |
||||
|
return part; |
||||
|
} |
||||
|
} |
||||
|
return ""; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getOutputFileName() { |
||||
|
return "data/douban_movies.json"; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,60 @@ |
|||||
|
package com.crawler.util; |
||||
|
|
||||
|
import com.crawler.exception.FileException; |
||||
|
import com.google.gson.Gson; |
||||
|
import com.google.gson.GsonBuilder; |
||||
|
import com.google.gson.reflect.TypeToken; |
||||
|
|
||||
|
import java.io.BufferedWriter; |
||||
|
import java.io.FileWriter; |
||||
|
import java.io.IOException; |
||||
|
import java.lang.reflect.Type; |
||||
|
import java.nio.file.Files; |
||||
|
import java.nio.file.Path; |
||||
|
import java.nio.file.Paths; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class FileUtil { |
||||
|
private static final Gson GSON = new GsonBuilder().setPrettyPrinting().create(); |
||||
|
|
||||
|
public static <T> void saveToJsonFile(List<T> data, String filePath) throws FileException { |
||||
|
try { |
||||
|
Path path = Paths.get(filePath); |
||||
|
Path parentDir = path.getParent(); |
||||
|
if (parentDir != null && !Files.exists(parentDir)) { |
||||
|
Files.createDirectories(parentDir); |
||||
|
} |
||||
|
|
||||
|
try (BufferedWriter writer = new BufferedWriter(new FileWriter(filePath))) { |
||||
|
GSON.toJson(data, writer); |
||||
|
} |
||||
|
} catch (IOException e) { |
||||
|
throw new FileException("Failed to save data to file: " + filePath, e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static <T> void saveToCsvFile(List<T> data, String filePath, String[] headers, CsvRowMapper<T> rowMapper) throws FileException { |
||||
|
try { |
||||
|
Path path = Paths.get(filePath); |
||||
|
Path parentDir = path.getParent(); |
||||
|
if (parentDir != null && !Files.exists(parentDir)) { |
||||
|
Files.createDirectories(parentDir); |
||||
|
} |
||||
|
|
||||
|
try (BufferedWriter writer = new BufferedWriter(new FileWriter(filePath))) { |
||||
|
writer.write(String.join(",", headers)); |
||||
|
writer.newLine(); |
||||
|
for (T item : data) { |
||||
|
writer.write(rowMapper.mapToCsvRow(item)); |
||||
|
writer.newLine(); |
||||
|
} |
||||
|
} |
||||
|
} catch (IOException e) { |
||||
|
throw new FileException("Failed to save data to CSV file: " + filePath, e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public interface CsvRowMapper<T> { |
||||
|
String mapToCsvRow(T item); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,24 @@ |
|||||
|
package com.crawler.util; |
||||
|
|
||||
|
import com.crawler.exception.NetworkException; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
|
||||
|
public class HttpUtil { |
||||
|
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"; |
||||
|
private static final int TIMEOUT = 10000; |
||||
|
|
||||
|
public static Document getDocument(String url) throws NetworkException { |
||||
|
try { |
||||
|
return Jsoup.connect(url) |
||||
|
.userAgent(USER_AGENT) |
||||
|
.timeout(TIMEOUT) |
||||
|
.ignoreHttpErrors(true) |
||||
|
.get(); |
||||
|
} catch (IOException e) { |
||||
|
throw new NetworkException("Failed to fetch URL: " + url, e); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,59 @@ |
|||||
|
package com.crawler.view; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
/**
 * Console presentation layer: prints banners, the menu, progress and results.
 * Everything goes to standard output except {@link #displayError(String)},
 * which writes to standard error.
 */
public class ConsoleView {
    private static final String DOUBLE_RULE = "========================================";
    private static final String SINGLE_RULE = "----------------------------------------";

    /** Prints the application banner followed by a blank line. */
    public void displayWelcome() {
        printLines(DOUBLE_RULE, " Web Crawler Application", DOUBLE_RULE, "");
    }

    /** Prints the numbered menu and leaves the cursor after the input prompt. */
    public void displayMenu() {
        printLines(
                "Please select an option:",
                "1. Crawl Douban Movies",
                "2. Crawl Douban Books",
                "3. Crawl Books to Scrape",
                "4. Crawl All",
                "0. Exit",
                "");
        System.out.print("Enter your choice: ");
    }

    /**
     * Prints a framed "Crawling" header preceded by a blank line.
     *
     * @param description text shown after the "Crawling: " label
     */
    public void displayCrawling(String description) {
        printLines("", SINGLE_RULE, "Crawling: " + description, SINGLE_RULE);
    }

    /**
     * Prints the save confirmation followed by a blank line.
     *
     * @param fileName the name shown in the confirmation
     */
    public void displaySuccess(String fileName) {
        printLines("✓ Data saved to: " + fileName, "");
    }

    /**
     * Prints an error line followed by a blank line, both on standard error.
     *
     * @param message the text shown after the error label
     */
    public void displayError(String message) {
        System.err.println("✗ Error: " + message);
        System.err.println();
    }

    /**
     * Prints the item count, then one "- item" line per element, then a blank line.
     *
     * @param data the items to list; each is rendered through string concatenation
     */
    public void displayResults(List<?> data) {
        System.out.println("Found " + data.size() + " items:");
        data.forEach(item -> System.out.println("- " + item));
        System.out.println();
    }

    /** Prints the closing banner. */
    public void displayGoodbye() {
        printLines(DOUBLE_RULE, " Goodbye!", DOUBLE_RULE);
    }

    /** Prints the invalid-choice notice followed by a blank line. */
    public void displayInvalidChoice() {
        printLines("Invalid choice. Please try again.", "");
    }

    /** Writes each argument as its own line on standard output. */
    private static void printLines(String... lines) {
        for (String line : lines) {
            System.out.println(line);
        }
    }
}
||||
@ -0,0 +1,38 @@ |
|||||
|
package command; |
||||
|
|
||||
|
import controller.CrawlerController; |
||||
|
import exception.CrawlerResult; |
||||
|
import java.util.Collections; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class ClearCommand implements Command { |
||||
|
private final CrawlerController controller; |
||||
|
|
||||
|
public ClearCommand(CrawlerController controller) { |
||||
|
this.controller = controller; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "clear"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getDescription() { |
||||
|
return "清空所有数据"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public CrawlerResult execute() { |
||||
|
controller.clearAllData(); |
||||
|
return CrawlerResult.success("SYSTEM") |
||||
|
.message("数据已清空") |
||||
|
.dataCount(0) |
||||
|
.build(); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<String> getRequiredSources() { |
||||
|
return Collections.emptyList(); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,11 @@ |
|||||
|
package command; |
||||
|
|
||||
|
import exception.CrawlerResult; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public interface Command { |
||||
|
String getName(); |
||||
|
String getDescription(); |
||||
|
CrawlerResult execute(); |
||||
|
List<String> getRequiredSources(); |
||||
|
} |
||||
@ -0,0 +1,41 @@ |
|||||
|
package command; |
||||
|
|
||||
|
import exception.CrawlerResult; |
||||
|
import exception.ValidationException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.HashMap; |
||||
|
import java.util.List; |
||||
|
import java.util.Map; |
||||
|
|
||||
|
public class CommandRegistry { |
||||
|
private final Map<String, Command> commands; |
||||
|
|
||||
|
public CommandRegistry() { |
||||
|
this.commands = new HashMap<>(); |
||||
|
} |
||||
|
|
||||
|
public void register(Command command) { |
||||
|
commands.put(command.getName(), command); |
||||
|
} |
||||
|
|
||||
|
public Command getCommand(String name) { |
||||
|
Command command = commands.get(name); |
||||
|
if (command == null) { |
||||
|
throw new ValidationException("未知命令: " + name); |
||||
|
} |
||||
|
return command; |
||||
|
} |
||||
|
|
||||
|
public List<Command> getAllCommands() { |
||||
|
return new ArrayList<>(commands.values()); |
||||
|
} |
||||
|
|
||||
|
public String getHelpText() { |
||||
|
StringBuilder sb = new StringBuilder(); |
||||
|
sb.append("可用命令:\n"); |
||||
|
for (Command cmd : commands.values()) { |
||||
|
sb.append(String.format(" %-15s - %s\n", cmd.getName(), cmd.getDescription())); |
||||
|
} |
||||
|
return sb.toString(); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,37 @@ |
|||||
|
package command; |
||||
|
|
||||
|
import controller.CrawlerController; |
||||
|
import exception.CrawlerResult; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class ListCrawlersCommand implements Command { |
||||
|
private final CrawlerController controller; |
||||
|
|
||||
|
public ListCrawlersCommand(CrawlerController controller) { |
||||
|
this.controller = controller; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "list"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getDescription() { |
||||
|
return "列出所有可用爬虫"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public CrawlerResult execute() { |
||||
|
List<String> crawlers = controller.getAllCrawlerNames(); |
||||
|
return CrawlerResult.success("SYSTEM") |
||||
|
.message("获取爬虫列表成功") |
||||
|
.dataCount(crawlers.size()) |
||||
|
.build(); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<String> getRequiredSources() { |
||||
|
return controller.getAllCrawlerNames(); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,59 @@ |
|||||
|
package command; |
||||
|
|
||||
|
import controller.CrawlerController; |
||||
|
import exception.CrawlerResult; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class RunAllCommand implements Command { |
||||
|
private final CrawlerController controller; |
||||
|
|
||||
|
public RunAllCommand(CrawlerController controller) { |
||||
|
this.controller = controller; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "run-all"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getDescription() { |
||||
|
return "运行所有爬虫"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public CrawlerResult execute() { |
||||
|
long startTime = System.currentTimeMillis(); |
||||
|
List<CrawlerResult> results = controller.runAllCrawlers(); |
||||
|
long elapsedTime = System.currentTimeMillis() - startTime; |
||||
|
|
||||
|
int successCount = 0; |
||||
|
int totalCount = results.size(); |
||||
|
int totalData = 0; |
||||
|
|
||||
|
for (CrawlerResult result : results) { |
||||
|
if (result.isSuccess()) { |
||||
|
successCount++; |
||||
|
totalData += result.getDataCount(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (successCount == totalCount) { |
||||
|
return CrawlerResult.success("ALL") |
||||
|
.message("所有爬虫执行成功") |
||||
|
.dataCount(totalData) |
||||
|
.elapsedTime(elapsedTime) |
||||
|
.build(); |
||||
|
} else { |
||||
|
return CrawlerResult.failure("ALL", "PARTIAL_FAIL", |
||||
|
String.format("执行完成: %d/%d 成功, 获取 %d 条数据", successCount, totalCount, totalData)) |
||||
|
.elapsedTime(elapsedTime) |
||||
|
.build(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<String> getRequiredSources() { |
||||
|
return controller.getAllCrawlerNames(); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,37 @@ |
|||||
|
package command; |
||||
|
|
||||
|
import controller.CrawlerController; |
||||
|
import exception.CrawlerResult; |
||||
|
import exception.ValidationException; |
||||
|
import java.util.Collections; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class RunSingleCommand implements Command { |
||||
|
private final CrawlerController controller; |
||||
|
private final String crawlerName; |
||||
|
|
||||
|
public RunSingleCommand(CrawlerController controller, String crawlerName) { |
||||
|
this.controller = controller; |
||||
|
this.crawlerName = crawlerName; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "run"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getDescription() { |
||||
|
return "运行指定爬虫: " + crawlerName; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public CrawlerResult execute() { |
||||
|
return controller.runCrawler(crawlerName); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<String> getRequiredSources() { |
||||
|
return Collections.singletonList(crawlerName); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,38 @@ |
|||||
|
package command; |
||||
|
|
||||
|
import controller.CrawlerController; |
||||
|
import exception.CrawlerResult; |
||||
|
import java.util.Collections; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class StatsCommand implements Command { |
||||
|
private final CrawlerController controller; |
||||
|
|
||||
|
public StatsCommand(CrawlerController controller) { |
||||
|
this.controller = controller; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "stats"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getDescription() { |
||||
|
return "显示统计信息"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public CrawlerResult execute() { |
||||
|
String stats = controller.getStats(); |
||||
|
return CrawlerResult.success("STATS") |
||||
|
.message(stats) |
||||
|
.dataCount(0) |
||||
|
.build(); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<String> getRequiredSources() { |
||||
|
return Collections.emptyList(); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,73 @@ |
|||||
|
package config; |
||||
|
|
||||
|
import java.io.FileInputStream; |
||||
|
import java.io.IOException; |
||||
|
import java.io.InputStream; |
||||
|
import java.util.Properties; |
||||
|
|
||||
|
/**
 * Crawler configuration.
 *
 * <p>Built-in defaults are installed when the class is initialized; {@link #load()}
 * overlays them with the entries of {@code crawler.properties} from the working
 * directory. Numeric and boolean getters tolerate surrounding whitespace, and the
 * numeric getters fall back to their default when the stored value is malformed
 * or negative instead of throwing.
 */
public class CrawlerConfig {
    private static final String CONFIG_FILE = "crawler.properties";
    private static final Properties props = new Properties();

    // Default configuration
    static {
        props.setProperty("delay.ms", "1000");
        props.setProperty("timeout.ms", "15000");
        props.setProperty("user.agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
        props.setProperty("db.path", "crawler.db");
        props.setProperty("output.dir", "output");
        props.setProperty("enable.database", "true");
        props.setProperty("enable.file", "true");
    }

    /**
     * Loads the configuration file on top of the current values.
     * When the file cannot be read the current values are kept and a notice is printed.
     */
    public static void load() {
        try (InputStream is = new FileInputStream(CONFIG_FILE)) {
            props.load(is);
            System.out.println("配置文件加载成功: " + CONFIG_FILE);
        } catch (IOException e) {
            System.out.println("使用默认配置(未找到配置文件: " + CONFIG_FILE + ")");
        }
    }

    /** @return the {@code delay.ms} value, or 1000 when it is missing, malformed or negative */
    public static int getDelayMs() {
        return getNonNegativeInt("delay.ms", 1000);
    }

    /** @return the {@code timeout.ms} value, or 15000 when it is missing, malformed or negative */
    public static int getTimeoutMs() {
        return getNonNegativeInt("timeout.ms", 15000);
    }

    /** @return the {@code user.agent} value */
    public static String getUserAgent() {
        return props.getProperty("user.agent");
    }

    /** @return the {@code db.path} value, or {@code "crawler.db"} when it is missing */
    public static String getDbPath() {
        return props.getProperty("db.path", "crawler.db");
    }

    /** @return the {@code output.dir} value, or {@code "output"} when it is missing */
    public static String getOutputDir() {
        return props.getProperty("output.dir", "output");
    }

    /** @return whether {@code enable.database} is "true" (case-insensitive); true when it is missing */
    public static boolean isDatabaseEnabled() {
        return getBoolean("enable.database", "true");
    }

    /** @return whether {@code enable.file} is "true" (case-insensitive); true when it is missing */
    public static boolean isFileOutputEnabled() {
        return getBoolean("enable.file", "true");
    }

    /**
     * @param key the property name
     * @return the raw value, or {@code null} when the key is not set
     */
    public static String getProperty(String key) {
        return props.getProperty(key);
    }

    /**
     * @param key          the property name
     * @param defaultValue the value returned when the key is not set
     * @return the raw value or {@code defaultValue}
     */
    public static String getProperty(String key, String defaultValue) {
        return props.getProperty(key, defaultValue);
    }

    /**
     * Reads an integer property. Properties files keep trailing whitespace in values,
     * so the text is trimmed before parsing; malformed or negative values produce a
     * warning on standard error and the default is returned.
     */
    private static int getNonNegativeInt(String key, int defaultValue) {
        String raw = props.getProperty(key);
        if (raw == null) {
            return defaultValue;
        }
        try {
            int value = Integer.parseInt(raw.trim());
            if (value >= 0) {
                return value;
            }
        } catch (NumberFormatException ignored) {
            // Handled below together with the negative case.
        }
        System.err.println("配置项 " + key + " 的值无效: " + raw + ",使用默认值 " + defaultValue);
        return defaultValue;
    }

    /** Reads a boolean property, trimming the text so that "true " is still recognized. */
    private static boolean getBoolean(String key, String defaultValue) {
        return Boolean.parseBoolean(props.getProperty(key, defaultValue).trim());
    }
}
||||
@ -0,0 +1,177 @@ |
|||||
|
package controller; |
||||
|
|
||||
|
import exception.CrawlerResult; |
||||
|
import exception.ValidationException; |
||||
|
import model.Movie; |
||||
|
import storage.DataStorage; |
||||
|
import storage.FileStorage; |
||||
|
import storage.StorageStats; |
||||
|
import strategy.CrawlerStrategy; |
||||
|
import strategy.BookCrawlerStrategy; |
||||
|
import strategy.impl.DoubanStrategy; |
||||
|
import strategy.impl.MaoyanStrategy; |
||||
|
import strategy.impl.RottenTomatoesStrategy; |
||||
|
import strategy.impl.DoubanBookStrategy; |
||||
|
import strategy.impl.BooksToScrapeStrategy; |
||||
|
import util.Logger; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.HashMap; |
||||
|
import java.util.List; |
||||
|
import java.util.Map; |
||||
|
|
||||
|
public class CrawlerController { |
||||
|
private Map<String, CrawlerStrategy> movieCrawlers; |
||||
|
private Map<String, BookCrawlerStrategy> bookCrawlers; |
||||
|
private DataStorage storage; |
||||
|
private String outputDir; |
||||
|
|
||||
|
public CrawlerController() { |
||||
|
this.movieCrawlers = new HashMap<>(); |
||||
|
this.bookCrawlers = new HashMap<>(); |
||||
|
this.outputDir = "output"; |
||||
|
initStorage(); |
||||
|
registerDefaultCrawlers(); |
||||
|
} |
||||
|
|
||||
|
public CrawlerController(String outputDir) { |
||||
|
this.movieCrawlers = new HashMap<>(); |
||||
|
this.bookCrawlers = new HashMap<>(); |
||||
|
this.outputDir = outputDir; |
||||
|
initStorage(); |
||||
|
registerDefaultCrawlers(); |
||||
|
} |
||||
|
|
||||
|
private void initStorage() { |
||||
|
this.storage = new FileStorage(outputDir); |
||||
|
Logger.info("文件存储初始化完成,输出目录: " + outputDir); |
||||
|
} |
||||
|
|
||||
|
private void registerDefaultCrawlers() { |
||||
|
registerMovieCrawler(new DoubanStrategy()); |
||||
|
registerMovieCrawler(new MaoyanStrategy()); |
||||
|
registerMovieCrawler(new RottenTomatoesStrategy()); |
||||
|
registerBookCrawler(new DoubanBookStrategy()); |
||||
|
registerBookCrawler(new BooksToScrapeStrategy()); |
||||
|
} |
||||
|
|
||||
|
public void registerMovieCrawler(CrawlerStrategy strategy) { |
||||
|
strategy.setStorage(storage); |
||||
|
movieCrawlers.put(strategy.getName(), strategy); |
||||
|
Logger.info("已注册电影爬虫: " + strategy.getName()); |
||||
|
} |
||||
|
|
||||
|
public void registerBookCrawler(BookCrawlerStrategy strategy) { |
||||
|
strategy.setStorage(storage); |
||||
|
bookCrawlers.put(strategy.getName(), strategy); |
||||
|
Logger.info("已注册图书爬虫: " + strategy.getName()); |
||||
|
} |
||||
|
|
||||
|
public void registerCrawler(CrawlerStrategy strategy) { |
||||
|
registerMovieCrawler(strategy); |
||||
|
} |
||||
|
|
||||
|
public void registerCrawler(CrawlerStrategy strategy, DataStorage customStorage) { |
||||
|
strategy.setStorage(customStorage); |
||||
|
movieCrawlers.put(strategy.getName(), strategy); |
||||
|
Logger.info("已注册爬虫: " + strategy.getName()); |
||||
|
} |
||||
|
|
||||
|
public List<String> getAllCrawlerNames() { |
||||
|
List<String> names = new ArrayList<>(); |
||||
|
names.addAll(movieCrawlers.keySet()); |
||||
|
names.addAll(bookCrawlers.keySet()); |
||||
|
return names; |
||||
|
} |
||||
|
|
||||
|
public List<String> getMovieCrawlerNames() { |
||||
|
return new ArrayList<>(movieCrawlers.keySet()); |
||||
|
} |
||||
|
|
||||
|
public List<String> getBookCrawlerNames() { |
||||
|
return new ArrayList<>(bookCrawlers.keySet()); |
||||
|
} |
||||
|
|
||||
|
public CrawlerResult runCrawler(String name) { |
||||
|
if (movieCrawlers.containsKey(name)) { |
||||
|
CrawlerStrategy strategy = movieCrawlers.get(name); |
||||
|
Logger.info("开始执行电影爬虫: " + name); |
||||
|
CrawlerResult result = strategy.execute(); |
||||
|
Logger.info("爬虫执行完成: " + result); |
||||
|
return result; |
||||
|
} else if (bookCrawlers.containsKey(name)) { |
||||
|
BookCrawlerStrategy strategy = bookCrawlers.get(name); |
||||
|
Logger.info("开始执行图书爬虫: " + name); |
||||
|
CrawlerResult result = strategy.execute(); |
||||
|
Logger.info("爬虫执行完成: " + result); |
||||
|
return result; |
||||
|
} else { |
||||
|
throw new ValidationException("未找到爬虫: " + name); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public List<CrawlerResult> runAllCrawlers() { |
||||
|
List<CrawlerResult> results = new ArrayList<>(); |
||||
|
int total = movieCrawlers.size() + bookCrawlers.size(); |
||||
|
Logger.info("开始执行所有爬虫,共 " + total + " 个"); |
||||
|
|
||||
|
for (CrawlerStrategy strategy : movieCrawlers.values()) { |
||||
|
try { |
||||
|
CrawlerResult result = strategy.execute(); |
||||
|
results.add(result); |
||||
|
} catch (Exception e) { |
||||
|
Logger.error("爬虫执行失败: " + strategy.getName(), e); |
||||
|
results.add(CrawlerResult.failure(strategy.getName(), "EXEC_ERROR", e.getMessage()).build()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
for (BookCrawlerStrategy strategy : bookCrawlers.values()) { |
||||
|
try { |
||||
|
CrawlerResult result = strategy.execute(); |
||||
|
results.add(result); |
||||
|
} catch (Exception e) { |
||||
|
Logger.error("爬虫执行失败: " + strategy.getName(), e); |
||||
|
results.add(CrawlerResult.failure(strategy.getName(), "EXEC_ERROR", e.getMessage()).build()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return results; |
||||
|
} |
||||
|
|
||||
|
public String getStats() { |
||||
|
StringBuilder sb = new StringBuilder(); |
||||
|
sb.append("========== 爬虫统计 ==========\n"); |
||||
|
sb.append("电影爬虫数量: ").append(movieCrawlers.size()).append("\n"); |
||||
|
sb.append("图书爬虫数量: ").append(bookCrawlers.size()).append("\n"); |
||||
|
sb.append("总爬虫数量: ").append(movieCrawlers.size() + bookCrawlers.size()).append("\n"); |
||||
|
sb.append("\n电影爬虫列表:\n"); |
||||
|
for (String name : movieCrawlers.keySet()) { |
||||
|
sb.append(" - ").append(name).append("\n"); |
||||
|
} |
||||
|
sb.append("\n图书爬虫列表:\n"); |
||||
|
for (String name : bookCrawlers.keySet()) { |
||||
|
sb.append(" - ").append(name).append("\n"); |
||||
|
} |
||||
|
sb.append("============================="); |
||||
|
return sb.toString(); |
||||
|
} |
||||
|
|
||||
|
public void clearAllData() { |
||||
|
if (storage != null) { |
||||
|
storage.clearAll(); |
||||
|
Logger.info("所有数据已清空"); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public DataStorage getStorage() { |
||||
|
return storage; |
||||
|
} |
||||
|
|
||||
|
public Map<String, CrawlerStrategy> getCrawlers() { |
||||
|
return movieCrawlers; |
||||
|
} |
||||
|
|
||||
|
public Map<String, BookCrawlerStrategy> getBookCrawlers() { |
||||
|
return bookCrawlers; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,139 @@ |
|||||
|
package crawler; |
||||
|
|
||||
|
import model.Movie; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import storage.DataStorage; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
/** |
||||
|
* 爬虫抽象基类 |
||||
|
*/ |
||||
|
public abstract class BaseCrawler { |
||||
|
protected String name; // 爬虫名称
|
||||
|
protected String baseUrl; // 基础URL
|
||||
|
protected int delayMs; // 请求延迟(毫秒)
|
||||
|
protected DataStorage storage; // 数据存储
|
||||
|
|
||||
|
public BaseCrawler(String name, String baseUrl) { |
||||
|
this(name, baseUrl, 1000); |
||||
|
} |
||||
|
|
||||
|
public BaseCrawler(String name, String baseUrl, int delayMs) { |
||||
|
this.name = name; |
||||
|
this.baseUrl = baseUrl; |
||||
|
this.delayMs = delayMs; |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 设置数据存储 |
||||
|
*/ |
||||
|
public void setStorage(DataStorage storage) { |
||||
|
this.storage = storage; |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 获取爬虫名称 |
||||
|
*/ |
||||
|
public String getName() { |
||||
|
return name; |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 获取网页文档 |
||||
|
*/ |
||||
|
protected Document fetchDocument(String url) throws IOException { |
||||
|
return Jsoup.connect(url) |
||||
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + |
||||
|
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") |
||||
|
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") |
||||
|
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8") |
||||
|
.timeout(15000) |
||||
|
.get(); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 延迟等待 |
||||
|
*/ |
||||
|
protected void delay() { |
||||
|
try { |
||||
|
Thread.sleep(delayMs); |
||||
|
} catch (InterruptedException e) { |
||||
|
Thread.currentThread().interrupt(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 开始爬取(模板方法模式) |
||||
|
*/ |
||||
|
public final void crawl() { |
||||
|
System.out.println("========================================"); |
||||
|
System.out.println("开始爬取: " + name); |
||||
|
System.out.println("目标URL: " + baseUrl); |
||||
|
System.out.println("========================================"); |
||||
|
|
||||
|
long startTime = System.currentTimeMillis(); |
||||
|
List<Movie> allMovies = new ArrayList<>(); |
||||
|
|
||||
|
try { |
||||
|
// 获取所有需要爬取的URL列表
|
||||
|
List<String> urls = getUrls(); |
||||
|
System.out.println("共 " + urls.size() + " 个页面需要爬取"); |
||||
|
|
||||
|
for (int i = 0; i < urls.size(); i++) { |
||||
|
String url = urls.get(i); |
||||
|
System.out.println("\n正在爬取第 " + (i + 1) + "/" + urls.size() + " 页: " + url); |
||||
|
|
||||
|
try { |
||||
|
Document doc = fetchDocument(url); |
||||
|
List<Movie> movies = parsePage(doc); |
||||
|
|
||||
|
// 设置数据来源
|
||||
|
for (Movie movie : movies) { |
||||
|
movie.setSource(name); |
||||
|
} |
||||
|
|
||||
|
allMovies.addAll(movies); |
||||
|
System.out.println("本页获取 " + movies.size() + " 条数据"); |
||||
|
|
||||
|
// 延迟,避免被封
|
||||
|
if (i < urls.size() - 1) { |
||||
|
delay(); |
||||
|
} |
||||
|
|
||||
|
} catch (IOException e) { |
||||
|
System.err.println("爬取页面失败: " + url + " - " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 保存数据
|
||||
|
if (!allMovies.isEmpty() && storage != null) { |
||||
|
storage.saveBatch(allMovies); |
||||
|
} |
||||
|
|
||||
|
long endTime = System.currentTimeMillis(); |
||||
|
System.out.println("\n========================================"); |
||||
|
System.out.println("爬取完成!"); |
||||
|
System.out.println("总数据量: " + allMovies.size()); |
||||
|
System.out.println("耗时: " + (endTime - startTime) / 1000 + " 秒"); |
||||
|
System.out.println("========================================"); |
||||
|
|
||||
|
} catch (Exception e) { |
||||
|
System.err.println("爬取过程出错: " + e.getMessage()); |
||||
|
e.printStackTrace(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 获取所有需要爬取的URL列表(子类实现) |
||||
|
*/ |
||||
|
protected abstract List<String> getUrls(); |
||||
|
|
||||
|
/** |
||||
|
* 解析单个页面(子类实现) |
||||
|
*/ |
||||
|
protected abstract List<Movie> parsePage(Document doc); |
||||
|
} |
||||
@ -0,0 +1,113 @@ |
|||||
|
package crawler; |
||||
|
|
||||
|
import model.Movie; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
/** |
||||
|
* 豆瓣电影Top250爬虫 |
||||
|
*/ |
||||
|
public class DoubanCrawler extends BaseCrawler { |
||||
|
|
||||
|
public DoubanCrawler() { |
||||
|
super("豆瓣电影Top250", "https://movie.douban.com/top250", 1500); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<String> getUrls() { |
||||
|
List<String> urls = new ArrayList<>(); |
||||
|
// 豆瓣Top250共10页,每页25部
|
||||
|
for (int i = 0; i < 10; i++) { |
||||
|
urls.add(baseUrl + "?start=" + (i * 25)); |
||||
|
} |
||||
|
return urls; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<Movie> parsePage(Document doc) { |
||||
|
List<Movie> movies = new ArrayList<>(); |
||||
|
Elements items = doc.select("div.item"); |
||||
|
|
||||
|
for (Element item : items) { |
||||
|
try { |
||||
|
Movie movie = new Movie(); |
||||
|
|
||||
|
// 排名
|
||||
|
String rankStr = item.select("em").text(); |
||||
|
movie.setRank(Integer.parseInt(rankStr)); |
||||
|
|
||||
|
// 电影名称(取第一个标题)
|
||||
|
Element titleElement = item.select("span.title").first(); |
||||
|
if (titleElement != null) { |
||||
|
movie.setName(titleElement.text()); |
||||
|
} |
||||
|
|
||||
|
// 评分
|
||||
|
String ratingStr = item.select("span.rating_num").text(); |
||||
|
if (!ratingStr.isEmpty()) { |
||||
|
movie.setRating(Double.parseDouble(ratingStr)); |
||||
|
} |
||||
|
|
||||
|
// 评分人数
|
||||
|
String ratingCountStr = item.select("div.star span").last().text(); |
||||
|
if (ratingCountStr != null && ratingCountStr.contains("人评价")) { |
||||
|
String num = ratingCountStr.replace("人评价", "").trim(); |
||||
|
movie.setRatingCount(parseNumber(num)); |
||||
|
} |
||||
|
|
||||
|
// 其他信息(导演、年份等)
|
||||
|
String info = item.select("div.bd p").first().text(); |
||||
|
if (info != null) { |
||||
|
// 提取年份
|
||||
|
String[] parts = info.split(" / "); |
||||
|
if (parts.length > 0) { |
||||
|
String firstPart = parts[0]; |
||||
|
if (firstPart.contains("导演: ")) { |
||||
|
movie.setDirector(firstPart.replace("导演: ", "").trim()); |
||||
|
} |
||||
|
// 提取年份(通常是最后一个数字部分)
|
||||
|
for (String part : parts) { |
||||
|
if (part.matches("\\d{4}") || part.matches("\\d{4}.*")) { |
||||
|
movie.setYear(part.trim().split("\\s+")[0]); |
||||
|
break; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 详情链接
|
||||
|
String link = item.select("div.hd a").attr("href"); |
||||
|
movie.setUrl(link); |
||||
|
|
||||
|
// 海报图片
|
||||
|
String imgUrl = item.select("div.pic img").attr("src"); |
||||
|
movie.setImageUrl(imgUrl); |
||||
|
|
||||
|
movies.add(movie); |
||||
|
|
||||
|
} catch (Exception e) { |
||||
|
System.err.println("解析电影数据出错: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return movies; |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 解析数字(处理中文数字如"万") |
||||
|
*/ |
||||
|
private Integer parseNumber(String str) { |
||||
|
try { |
||||
|
if (str.contains("万")) { |
||||
|
return (int) (Double.parseDouble(str.replace("万", "")) * 10000); |
||||
|
} |
||||
|
return Integer.parseInt(str.replace(",", "")); |
||||
|
} catch (NumberFormatException e) { |
||||
|
return null; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,100 @@ |
|||||
|
package crawler; |
||||
|
|
||||
|
import model.Movie; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
/** |
||||
|
* IMDB Top250 爬虫示例 |
||||
|
*/ |
||||
|
public class ImdbCrawler extends BaseCrawler { |
||||
|
|
||||
|
public ImdbCrawler() { |
||||
|
super("IMDB电影Top250", "https://www.imdb.com/chart/top/", 2000); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<String> getUrls() { |
||||
|
List<String> urls = new ArrayList<>(); |
||||
|
urls.add(baseUrl); |
||||
|
return urls; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<Movie> parsePage(Document doc) { |
||||
|
List<Movie> movies = new ArrayList<>(); |
||||
|
Elements items = doc.select("li.ipc-metadata-list-summary-item"); |
||||
|
|
||||
|
int rank = 1; |
||||
|
for (Element item : items) { |
||||
|
try { |
||||
|
Movie movie = new Movie(); |
||||
|
movie.setRank(rank++); |
||||
|
|
||||
|
// 电影名称
|
||||
|
Element titleElement = item.select("h3.ipc-title__text").first(); |
||||
|
if (titleElement != null) { |
||||
|
String fullTitle = titleElement.text(); |
||||
|
// 移除排名前缀如 "1. "
|
||||
|
if (fullTitle.matches("\\d+\\..*")) { |
||||
|
fullTitle = fullTitle.substring(fullTitle.indexOf(".") + 1).trim(); |
||||
|
} |
||||
|
movie.setName(fullTitle); |
||||
|
} |
||||
|
|
||||
|
// 评分
|
||||
|
String ratingStr = item.select("span.ipc-rating-star--rating").text(); |
||||
|
if (!ratingStr.isEmpty()) { |
||||
|
movie.setRating(Double.parseDouble(ratingStr)); |
||||
|
} |
||||
|
|
||||
|
// 评分人数
|
||||
|
String countStr = item.select("span.ipc-rating-star--voteCount").text(); |
||||
|
if (!countStr.isEmpty()) { |
||||
|
movie.setRatingCount(parseNumber(countStr.replaceAll("[()\\s]", ""))); |
||||
|
} |
||||
|
|
||||
|
// 年份
|
||||
|
String yearStr = item.select("span.cli-title-metadata-item").first().text(); |
||||
|
if (yearStr != null && yearStr.matches("\\d{4}")) { |
||||
|
movie.setYear(yearStr); |
||||
|
} |
||||
|
|
||||
|
// 详情链接
|
||||
|
String link = item.select("a.ipc-title-link-wrapper").attr("href"); |
||||
|
if (!link.isEmpty()) { |
||||
|
movie.setUrl("https://www.imdb.com" + link); |
||||
|
} |
||||
|
|
||||
|
// 海报图片
|
||||
|
String imgUrl = item.select("img.ipc-image").attr("src"); |
||||
|
movie.setImageUrl(imgUrl); |
||||
|
|
||||
|
movies.add(movie); |
||||
|
|
||||
|
} catch (Exception e) { |
||||
|
System.err.println("解析电影数据出错: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return movies; |
||||
|
} |
||||
|
|
||||
|
private Integer parseNumber(String str) { |
||||
|
try { |
||||
|
if (str.contains("M")) { |
||||
|
return (int) (Double.parseDouble(str.replace("M", "")) * 1000000); |
||||
|
} |
||||
|
if (str.contains("K")) { |
||||
|
return (int) (Double.parseDouble(str.replace("K", "")) * 1000); |
||||
|
} |
||||
|
return Integer.parseInt(str.replace(",", "")); |
||||
|
} catch (NumberFormatException e) { |
||||
|
return null; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,92 @@ |
|||||
|
package crawler; |
||||
|
|
||||
|
import model.Movie; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
/** |
||||
|
* 猫眼电影 Top100 爬虫 |
||||
|
*/ |
||||
|
public class MaoyanCrawler extends BaseCrawler { |
||||
|
|
||||
|
public MaoyanCrawler() { |
||||
|
super("猫眼电影Top100", "https://maoyan.com/board/4", 1500); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<String> getUrls() { |
||||
|
List<String> urls = new ArrayList<>(); |
||||
|
// 猫眼Top100共10页,每页10部
|
||||
|
for (int i = 0; i < 10; i++) { |
||||
|
urls.add(baseUrl + "?offset=" + (i * 10)); |
||||
|
} |
||||
|
return urls; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<Movie> parsePage(Document doc) { |
||||
|
List<Movie> movies = new ArrayList<>(); |
||||
|
Elements items = doc.select("dl.board-wrapper dd"); |
||||
|
|
||||
|
for (Element item : items) { |
||||
|
try { |
||||
|
Movie movie = new Movie(); |
||||
|
|
||||
|
// 排名
|
||||
|
String rankStr = item.select("i.board-index").text(); |
||||
|
movie.setRank(Integer.parseInt(rankStr)); |
||||
|
|
||||
|
// 电影名称
|
||||
|
String name = item.select("p.name a").text(); |
||||
|
movie.setName(name); |
||||
|
|
||||
|
// 评分
|
||||
|
String ratingStr = item.select("i.integer").text() + |
||||
|
item.select("i.fraction").text(); |
||||
|
if (!ratingStr.isEmpty()) { |
||||
|
movie.setRating(Double.parseDouble(ratingStr)); |
||||
|
} |
||||
|
|
||||
|
// 主演
|
||||
|
String actors = item.select("p.star").text(); |
||||
|
if (actors != null && actors.contains("主演:")) { |
||||
|
movie.setActors(actors.replace("主演:", "").trim()); |
||||
|
} |
||||
|
|
||||
|
// 上映时间
|
||||
|
String releaseTime = item.select("p.releasetime").text(); |
||||
|
if (releaseTime != null && releaseTime.contains("上映时间:")) { |
||||
|
String timeStr = releaseTime.replace("上映时间:", "").trim(); |
||||
|
// 提取年份
|
||||
|
if (timeStr.matches("\\d{4}.*")) { |
||||
|
movie.setYear(timeStr.substring(0, 4)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 详情链接
|
||||
|
String link = item.select("p.name a").attr("href"); |
||||
|
if (!link.isEmpty()) { |
||||
|
movie.setUrl("https://maoyan.com" + link); |
||||
|
} |
||||
|
|
||||
|
// 海报图片
|
||||
|
String imgUrl = item.select("img.board-img").attr("data-src"); |
||||
|
if (imgUrl.isEmpty()) { |
||||
|
imgUrl = item.select("img.board-img").attr("src"); |
||||
|
} |
||||
|
movie.setImageUrl(imgUrl); |
||||
|
|
||||
|
movies.add(movie); |
||||
|
|
||||
|
} catch (Exception e) { |
||||
|
System.err.println("解析猫眼电影数据出错: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return movies; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,102 @@ |
|||||
|
package crawler; |
||||
|
|
||||
|
import model.Movie; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
/** |
||||
|
* 烂番茄 (Rotten Tomatoes) Top100 爬虫 |
||||
|
*/ |
||||
|
public class RottenTomatoesCrawler extends BaseCrawler { |
||||
|
|
||||
|
public RottenTomatoesCrawler() { |
||||
|
super("烂番茄Top100", "https://www.rottentomatoes.com/top/bestofrt/", 2000); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<String> getUrls() { |
||||
|
List<String> urls = new ArrayList<>(); |
||||
|
urls.add(baseUrl); |
||||
|
return urls; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<Movie> parsePage(Document doc) { |
||||
|
List<Movie> movies = new ArrayList<>(); |
||||
|
Elements items = doc.select("table.table tr"); |
||||
|
|
||||
|
// 跳过表头
|
||||
|
int rank = 0; |
||||
|
for (Element item : items) { |
||||
|
try { |
||||
|
// 跳过表头行
|
||||
|
Element rankElement = item.selectFirst("td.rank"); |
||||
|
if (rankElement == null) continue; |
||||
|
|
||||
|
Movie movie = new Movie(); |
||||
|
|
||||
|
// 排名
|
||||
|
String rankStr = rankElement.text(); |
||||
|
if (!rankStr.isEmpty()) { |
||||
|
movie.setRank(Integer.parseInt(rankStr)); |
||||
|
} else { |
||||
|
movie.setRank(++rank); |
||||
|
} |
||||
|
|
||||
|
// 电影名称和年份
|
||||
|
Element titleElement = item.selectFirst("td.title a"); |
||||
|
if (titleElement != null) { |
||||
|
String fullTitle = titleElement.text(); |
||||
|
// 提取年份(通常在括号里)
|
||||
|
if (fullTitle.contains("(") && fullTitle.contains(")")) { |
||||
|
int start = fullTitle.lastIndexOf("("); |
||||
|
int end = fullTitle.lastIndexOf(")"); |
||||
|
if (start > 0 && end > start) { |
||||
|
String yearStr = fullTitle.substring(start + 1, end); |
||||
|
if (yearStr.matches("\\d{4}")) { |
||||
|
movie.setYear(yearStr); |
||||
|
} |
||||
|
movie.setName(fullTitle.substring(0, start).trim()); |
||||
|
} else { |
||||
|
movie.setName(fullTitle); |
||||
|
} |
||||
|
} else { |
||||
|
movie.setName(fullTitle); |
||||
|
} |
||||
|
|
||||
|
// 详情链接
|
||||
|
String link = titleElement.attr("href"); |
||||
|
if (!link.isEmpty()) { |
||||
|
if (link.startsWith("/")) { |
||||
|
movie.setUrl("https://www.rottentomatoes.com" + link); |
||||
|
} else { |
||||
|
movie.setUrl(link); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 新鲜度评分(烂番茄特有)
|
||||
|
Element scoreElement = item.selectFirst("td.score span.tMeterScore"); |
||||
|
if (scoreElement != null) { |
||||
|
String scoreStr = scoreElement.text(); |
||||
|
if (scoreStr.matches("\\d+%")) { |
||||
|
// 转换为10分制
|
||||
|
double rating = Double.parseDouble(scoreStr.replace("%", "")) / 10; |
||||
|
movie.setRating(Math.round(rating * 10) / 10.0); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
movies.add(movie); |
||||
|
|
||||
|
} catch (Exception e) { |
||||
|
System.err.println("解析烂番茄数据出错: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return movies; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,55 @@ |
|||||
|
package exception; |
||||
|
|
||||
|
/**
 * Base unchecked exception of the crawler, carrying the data source it relates to
 * and an error code.
 *
 * <p>Constructors without an explicit source use {@code "UNKNOWN"}. Constructors
 * without an explicit code use {@code "CRAWLER_001"} when no cause is given and
 * {@code "CRAWLER_002"} when a cause is given.
 */
public class CrawlerException extends RuntimeException {

    private static final String UNKNOWN_SOURCE = "UNKNOWN";
    private static final String CODE_WITHOUT_CAUSE = "CRAWLER_001";
    private static final String CODE_WITH_CAUSE = "CRAWLER_002";

    private final String source;
    private final String errorCode;

    /**
     * @param message   detail message
     * @param source    data source the failure relates to
     * @param errorCode error code to report
     */
    public CrawlerException(String message, String source, String errorCode) {
        super(message);
        this.source = source;
        this.errorCode = errorCode;
    }

    /**
     * @param message   detail message
     * @param source    data source the failure relates to
     * @param errorCode error code to report
     * @param cause     underlying failure
     */
    public CrawlerException(String message, String source, String errorCode, Throwable cause) {
        super(message, cause);
        this.source = source;
        this.errorCode = errorCode;
    }

    /** Uses source {@code "UNKNOWN"} and code {@code "CRAWLER_001"}. */
    public CrawlerException(String message) {
        this(message, UNKNOWN_SOURCE, CODE_WITHOUT_CAUSE);
    }

    /** Uses code {@code "CRAWLER_001"}. */
    public CrawlerException(String message, String source) {
        this(message, source, CODE_WITHOUT_CAUSE);
    }

    /** Uses source {@code "UNKNOWN"} and code {@code "CRAWLER_002"}. */
    public CrawlerException(String message, Throwable cause) {
        this(message, UNKNOWN_SOURCE, CODE_WITH_CAUSE, cause);
    }

    /** Uses code {@code "CRAWLER_002"}. */
    public CrawlerException(String message, String source, Throwable cause) {
        this(message, source, CODE_WITH_CAUSE, cause);
    }

    /** @return the data source this exception relates to */
    public String getSource() {
        return this.source;
    }

    /** @return the error code of this exception */
    public String getErrorCode() {
        return this.errorCode;
    }

    /**
     * @return {@code "[code] [SimpleClassName] message (source: source)"}
     */
    @Override
    public String toString() {
        String typeName = getClass().getSimpleName();
        String detail = getMessage();
        return String.format("[%s] [%s] %s (source: %s)", errorCode, typeName, detail, source);
    }
}
||||
@ -0,0 +1,103 @@ |
|||||
|
package exception; |
||||
|
|
||||
|
/**
 * Immutable summary of one crawler run: whether it succeeded, which source it
 * belongs to, how many records were obtained, how long it took and, for failures,
 * an error code and message.
 *
 * <p>Instances are created through {@link #success(String)} or
 * {@link #failure(String, String, String)}, which return a pre-filled {@link Builder}.
 */
public class CrawlerResult {

    private final boolean success;
    private final String source;
    private final String message;
    private final int dataCount;
    private final long elapsedTime;
    private final String errorCode;

    /** Copies every value from the builder. */
    private CrawlerResult(Builder from) {
        success = from.success;
        source = from.source;
        message = from.message;
        dataCount = from.dataCount;
        elapsedTime = from.elapsedTime;
        errorCode = from.errorCode;
    }

    /**
     * Starts building a successful result.
     *
     * @param source data source the result belongs to
     * @return a builder with the success flag set and the source filled in
     */
    public static Builder success(String source) {
        Builder builder = new Builder();
        builder.success(true);
        builder.source(source);
        return builder;
    }

    /**
     * Starts building a failed result.
     *
     * @param source    data source the result belongs to
     * @param errorCode error code of the failure
     * @param message   description of the failure
     * @return a builder with the success flag cleared and the given values filled in
     */
    public static Builder failure(String source, String errorCode, String message) {
        Builder builder = new Builder();
        builder.success(false);
        builder.source(source);
        builder.errorCode(errorCode);
        builder.message(message);
        return builder;
    }

    /** @return {@code true} when the run succeeded */
    public boolean isSuccess() {
        return this.success;
    }

    /** @return the data source this result belongs to */
    public String getSource() {
        return this.source;
    }

    /** @return the message, or {@code null} when none was set */
    public String getMessage() {
        return this.message;
    }

    /** @return the number of records obtained */
    public int getDataCount() {
        return this.dataCount;
    }

    /** @return the elapsed time; {@code toString()} prints it with an {@code ms} suffix */
    public long getElapsedTime() {
        return this.elapsedTime;
    }

    /** @return the error code, or {@code null} when none was set */
    public String getErrorCode() {
        return this.errorCode;
    }

    /**
     * @return a one-line summary starting with {@code [SUCCESS]} or {@code [FAILURE]}
     */
    @Override
    public String toString() {
        if (!success) {
            return String.format("[FAILURE] [%s] %s - %s", errorCode, source, message);
        }
        return String.format("[SUCCESS] %s - 获取 %d 条数据 (耗时: %dms)", source, dataCount, elapsedTime);
    }

    /** Fluent builder for {@link CrawlerResult}; every setter returns the builder itself. */
    public static class Builder {
        private boolean success;
        private String source;
        private String message;
        private int dataCount;
        private long elapsedTime;
        private String errorCode;

        public Builder success(boolean success) {
            this.success = success;
            return this;
        }

        public Builder source(String source) {
            this.source = source;
            return this;
        }

        public Builder message(String message) {
            this.message = message;
            return this;
        }

        public Builder dataCount(int dataCount) {
            this.dataCount = dataCount;
            return this;
        }

        public Builder elapsedTime(long elapsedTime) {
            this.elapsedTime = elapsedTime;
            return this;
        }

        public Builder errorCode(String errorCode) {
            this.errorCode = errorCode;
            return this;
        }

        /** @return a result holding the values set so far */
        public CrawlerResult build() {
            return new CrawlerResult(this);
        }
    }
}
||||
@ -0,0 +1,20 @@ |
|||||
|
package exception; |
||||
|
|
||||
|
public class NetworkException extends CrawlerException { |
||||
|
|
||||
|
public NetworkException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
|
||||
|
public NetworkException(String message, String source) { |
||||
|
super(message, source); |
||||
|
} |
||||
|
|
||||
|
public NetworkException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
|
||||
|
public NetworkException(String message, String source, Throwable cause) { |
||||
|
super(message, source, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,20 @@ |
|||||
|
package exception; |
||||
|
|
||||
|
public class ParseException extends CrawlerException { |
||||
|
|
||||
|
public ParseException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
|
||||
|
public ParseException(String message, String source) { |
||||
|
super(message, source); |
||||
|
} |
||||
|
|
||||
|
public ParseException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
|
||||
|
public ParseException(String message, String source, Throwable cause) { |
||||
|
super(message, source, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,20 @@ |
|||||
|
package exception; |
||||
|
|
||||
|
public class StorageException extends CrawlerException { |
||||
|
|
||||
|
public StorageException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
|
||||
|
public StorageException(String message, String source) { |
||||
|
super(message, source); |
||||
|
} |
||||
|
|
||||
|
public StorageException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
|
||||
|
public StorageException(String message, String source, Throwable cause) { |
||||
|
super(message, source, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,20 @@ |
|||||
|
package exception; |
||||
|
|
||||
|
public class ValidationException extends CrawlerException { |
||||
|
|
||||
|
public ValidationException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
|
||||
|
public ValidationException(String message, String source) { |
||||
|
super(message, source); |
||||
|
} |
||||
|
|
||||
|
public ValidationException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
|
||||
|
public ValidationException(String message, String source, Throwable cause) { |
||||
|
super(message, source, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,236 @@ |
|||||
|
package main; |
||||
|
|
||||
|
import config.CrawlerConfig; |
||||
|
import crawler.BaseCrawler; |
||||
|
import crawler.DoubanCrawler; |
||||
|
import crawler.ImdbCrawler; |
||||
|
import crawler.MaoyanCrawler; |
||||
|
import crawler.RottenTomatoesCrawler; |
||||
|
import model.Book; |
||||
|
import model.Movie; |
||||
|
import storage.DataStorage; |
||||
|
import storage.FileStorage; |
||||
|
import storage.SQLiteStorage; |
||||
|
import storage.StorageStats; |
||||
|
import util.Logger; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
import java.util.Scanner; |
||||
|
|
||||
|
public class CrawlerManager { |
||||
|
private List<BaseCrawler> crawlers; |
||||
|
private DataStorage databaseStorage; |
||||
|
private DataStorage fileStorage; |
||||
|
|
||||
|
public CrawlerManager() { |
||||
|
crawlers = new ArrayList<>(); |
||||
|
|
||||
|
CrawlerConfig.load(); |
||||
|
|
||||
|
if (CrawlerConfig.isDatabaseEnabled()) { |
||||
|
databaseStorage = new SQLiteStorage(); |
||||
|
Logger.info("数据库存储已启用"); |
||||
|
} |
||||
|
if (CrawlerConfig.isFileOutputEnabled()) { |
||||
|
fileStorage = new FileStorage(CrawlerConfig.getOutputDir()); |
||||
|
Logger.info("文件输出已启用"); |
||||
|
} |
||||
|
|
||||
|
registerCrawler(new DoubanCrawler()); |
||||
|
registerCrawler(new MaoyanCrawler()); |
||||
|
registerCrawler(new RottenTomatoesCrawler()); |
||||
|
} |
||||
|
|
||||
|
public void registerCrawler(BaseCrawler crawler) { |
||||
|
if (databaseStorage != null) { |
||||
|
crawler.setStorage(new MultiStorage(databaseStorage, fileStorage)); |
||||
|
} else { |
||||
|
crawler.setStorage(fileStorage); |
||||
|
} |
||||
|
crawlers.add(crawler); |
||||
|
Logger.info("已注册爬虫: " + crawler.getName()); |
||||
|
} |
||||
|
|
||||
|
public void runAll() { |
||||
|
Logger.info("开始运行所有爬虫,共 " + crawlers.size() + " 个"); |
||||
|
for (BaseCrawler crawler : crawlers) { |
||||
|
crawler.crawl(); |
||||
|
System.out.println(); |
||||
|
} |
||||
|
showStats(); |
||||
|
} |
||||
|
|
||||
|
public void runCrawler(String name) { |
||||
|
for (BaseCrawler crawler : crawlers) { |
||||
|
if (crawler.getName().equals(name)) { |
||||
|
crawler.crawl(); |
||||
|
showStats(); |
||||
|
return; |
||||
|
} |
||||
|
} |
||||
|
Logger.error("未找到爬虫: " + name); |
||||
|
} |
||||
|
|
||||
|
public void showStats() { |
||||
|
if (databaseStorage != null) { |
||||
|
StorageStats stats = databaseStorage.getStats(); |
||||
|
System.out.println("\n========== 数据库统计 =========="); |
||||
|
System.out.println("总记录数: " + stats.getTotalCount()); |
||||
|
System.out.println("数据源数量: " + stats.getSourceCount()); |
||||
|
System.out.println("================================\n"); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public void showMenu() { |
||||
|
System.out.println("\n========== 爬虫管理系统 =========="); |
||||
|
System.out.println("1. 运行所有爬虫"); |
||||
|
System.out.println("2. 运行指定爬虫"); |
||||
|
System.out.println("3. 查看统计信息"); |
||||
|
System.out.println("4. 清空数据库"); |
||||
|
System.out.println("5. 退出"); |
||||
|
System.out.println("=================================="); |
||||
|
System.out.print("请选择操作: "); |
||||
|
} |
||||
|
|
||||
|
public void interactive() { |
||||
|
Scanner scanner = new Scanner(System.in); |
||||
|
|
||||
|
while (true) { |
||||
|
showMenu(); |
||||
|
String choice = scanner.nextLine().trim(); |
||||
|
|
||||
|
switch (choice) { |
||||
|
case "1": |
||||
|
runAll(); |
||||
|
break; |
||||
|
|
||||
|
case "2": |
||||
|
System.out.println("\n可用爬虫:"); |
||||
|
for (int i = 0; i < crawlers.size(); i++) { |
||||
|
System.out.println((i + 1) + ". " + crawlers.get(i).getName()); |
||||
|
} |
||||
|
System.out.print("请输入爬虫名称: "); |
||||
|
String crawlerName = scanner.nextLine().trim(); |
||||
|
runCrawler(crawlerName); |
||||
|
break; |
||||
|
|
||||
|
case "3": |
||||
|
showStats(); |
||||
|
break; |
||||
|
|
||||
|
case "4": |
||||
|
System.out.print("确定要清空所有数据吗?(yes/no): "); |
||||
|
String confirm = scanner.nextLine().trim(); |
||||
|
if ("yes".equalsIgnoreCase(confirm) && databaseStorage != null) { |
||||
|
databaseStorage.clearAll(); |
||||
|
} |
||||
|
break; |
||||
|
|
||||
|
case "5": |
||||
|
System.out.println("再见!"); |
||||
|
close(); |
||||
|
return; |
||||
|
|
||||
|
default: |
||||
|
System.out.println("无效选择,请重试"); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public void close() { |
||||
|
if (databaseStorage != null) { |
||||
|
databaseStorage.close(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private static class MultiStorage implements DataStorage { |
||||
|
private DataStorage primary; |
||||
|
private DataStorage secondary; |
||||
|
|
||||
|
public MultiStorage(DataStorage primary, DataStorage secondary) { |
||||
|
this.primary = primary; |
||||
|
this.secondary = secondary; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void save(Movie movie) { |
||||
|
primary.save(movie); |
||||
|
if (secondary != null) secondary.save(movie); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void saveBatch(List<Movie> movies) { |
||||
|
primary.saveBatch(movies); |
||||
|
if (secondary != null) secondary.saveBatch(movies); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Movie> findAll() { |
||||
|
return primary.findAll(); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Movie> findBySource(String source) { |
||||
|
return primary.findBySource(source); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Movie> findByRankRange(int start, int end) { |
||||
|
return primary.findByRankRange(start, end); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void saveBook(Book book) { |
||||
|
primary.saveBook(book); |
||||
|
if (secondary != null) secondary.saveBook(book); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void saveBookBatch(List<Book> books) { |
||||
|
primary.saveBookBatch(books); |
||||
|
if (secondary != null) secondary.saveBookBatch(books); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Book> findAllBooks() { |
||||
|
return primary.findAllBooks(); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Book> findBooksBySource(String source) { |
||||
|
return primary.findBooksBySource(source); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void deleteBySource(String source) { |
||||
|
primary.deleteBySource(source); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void clearAll() { |
||||
|
primary.clearAll(); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public StorageStats getStats() { |
||||
|
return primary.getStats(); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void close() { |
||||
|
primary.close(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static void main(String[] args) { |
||||
|
CrawlerManager manager = new CrawlerManager(); |
||||
|
|
||||
|
if (args.length > 0 && args[0].equals("--auto")) { |
||||
|
manager.runAll(); |
||||
|
manager.close(); |
||||
|
} else { |
||||
|
manager.interactive(); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,86 @@ |
|||||
|
package model; |
||||
|
|
||||
|
import java.time.LocalDateTime; |
||||
|
|
||||
|
/**
 * Mutable data holder for one crawled book.
 *
 * <p>All properties are optional and start out as {@code null}; only the
 * four-argument constructor stamps the crawl time with the current time.
 */
public class Book {
    private Integer id;
    private String source;
    private Integer rank;
    private String title;
    private String author;
    private String publisher;
    private String year;
    private Double price;
    private Double rating;
    private Integer ratingCount;
    private String category;
    private String description;
    private String url;
    private String imageUrl;
    private String isbn;
    private LocalDateTime crawlTime;

    /** Creates an empty book; every property, including the crawl time, is {@code null}. */
    public Book() {
    }

    /**
     * Creates a book with its core properties and sets the crawl time to now.
     *
     * @param source data source the book was crawled from
     * @param rank   rank of the book
     * @param title  book title
     * @param rating rating of the book
     */
    public Book(String source, Integer rank, String title, Double rating) {
        this.crawlTime = LocalDateTime.now();
        this.source = source;
        this.rank = rank;
        this.title = title;
        this.rating = rating;
    }

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getSource() {
        return source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public Integer getRank() {
        return rank;
    }

    public void setRank(Integer rank) {
        this.rank = rank;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getPublisher() {
        return publisher;
    }

    public void setPublisher(String publisher) {
        this.publisher = publisher;
    }

    public String getYear() {
        return year;
    }

    public void setYear(String year) {
        this.year = year;
    }

    public Double getPrice() {
        return price;
    }

    public void setPrice(Double price) {
        this.price = price;
    }

    public Double getRating() {
        return rating;
    }

    public void setRating(Double rating) {
        this.rating = rating;
    }

    public Integer getRatingCount() {
        return ratingCount;
    }

    public void setRatingCount(Integer ratingCount) {
        this.ratingCount = ratingCount;
    }

    public String getCategory() {
        return category;
    }

    public void setCategory(String category) {
        this.category = category;
    }

    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getImageUrl() {
        return imageUrl;
    }

    public void setImageUrl(String imageUrl) {
        this.imageUrl = imageUrl;
    }

    public String getIsbn() {
        return isbn;
    }

    public void setIsbn(String isbn) {
        this.isbn = isbn;
    }

    public LocalDateTime getCrawlTime() {
        return crawlTime;
    }

    public void setCrawlTime(LocalDateTime crawlTime) {
        this.crawlTime = crawlTime;
    }

    /**
     * @return a short summary with source, rank, title and rating; a missing rating
     *         is shown as {@code 0.0}
     */
    @Override
    public String toString() {
        double shownRating = (rating == null) ? 0.0 : rating;
        return String.format("Book{source='%s', rank=%d, title='%s', rating=%.1f}",
                source, rank, title, shownRating);
    }
}
||||
@ -0,0 +1,78 @@ |
|||||
|
package model; |
||||
|
|
||||
|
import java.time.LocalDateTime; |
||||
|
|
||||
|
/**
 * Data model for a single movie record collected by a crawler.
 *
 * <p>A plain mutable bean: every field has a getter and a setter, nothing is validated,
 * and every field is {@code null} until it is set.
 */
public class Movie {
    private Integer id;
    private String source;            // website the data came from
    private Integer rank;             // ranking position
    private String name;              // movie title
    private String director;          // director
    private String actors;            // cast
    private String year;              // year
    private Double rating;            // rating score
    private Integer ratingCount;      // number of people who rated
    private String description;       // synopsis
    private String url;               // detail page link
    private String imageUrl;          // poster image
    private LocalDateTime crawlTime;  // time the record was crawled

    /** Creates an empty movie; all fields are left {@code null}. */
    public Movie() {}

    /**
     * Creates a movie from its core fields and stamps {@code crawlTime} with the current time.
     *
     * @param source website the data came from
     * @param rank   ranking position
     * @param name   movie title
     * @param rating rating score
     */
    public Movie(String source, Integer rank, String name, Double rating) {
        this.source = source;
        this.rank = rank;
        this.name = name;
        this.rating = rating;
        this.crawlTime = LocalDateTime.now();
    }

    // Getters and Setters
    public Integer getId() { return id; }
    public void setId(Integer id) { this.id = id; }

    public String getSource() { return source; }
    public void setSource(String source) { this.source = source; }

    public Integer getRank() { return rank; }
    public void setRank(Integer rank) { this.rank = rank; }

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }

    public String getDirector() { return director; }
    public void setDirector(String director) { this.director = director; }

    public String getActors() { return actors; }
    public void setActors(String actors) { this.actors = actors; }

    public String getYear() { return year; }
    public void setYear(String year) { this.year = year; }

    public Double getRating() { return rating; }
    public void setRating(Double rating) { this.rating = rating; }

    public Integer getRatingCount() { return ratingCount; }
    public void setRatingCount(Integer ratingCount) { this.ratingCount = ratingCount; }

    public String getDescription() { return description; }
    public void setDescription(String description) { this.description = description; }

    public String getUrl() { return url; }
    public void setUrl(String url) { this.url = url; }

    public String getImageUrl() { return imageUrl; }
    public void setImageUrl(String imageUrl) { this.imageUrl = imageUrl; }

    public LocalDateTime getCrawlTime() { return crawlTime; }
    public void setCrawlTime(LocalDateTime crawlTime) { this.crawlTime = crawlTime; }

    /**
     * Returns a short summary containing the source, rank, name and rating.
     *
     * @return the summary text; a missing rating is shown as {@code 0.0}
     */
    @Override
    public String toString() {
        // "%.1f" applied to a null argument prints the text "null" truncated to the
        // precision (i.e. "n"), so fall back to 0.0 the same way Book.toString() does.
        return String.format("Movie{source='%s', rank=%d, name='%s', rating=%.1f}",
                source, rank, name, rating != null ? rating : 0.0);
    }
}
||||
@ -0,0 +1,34 @@ |
|||||
|
package storage; |
||||
|
|
||||
|
import model.Movie; |
||||
|
import model.Book; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public interface DataStorage { |
||||
|
|
||||
|
void save(Movie movie); |
||||
|
|
||||
|
void saveBatch(List<Movie> movies); |
||||
|
|
||||
|
List<Movie> findAll(); |
||||
|
|
||||
|
List<Movie> findBySource(String source); |
||||
|
|
||||
|
List<Movie> findByRankRange(int start, int end); |
||||
|
|
||||
|
void deleteBySource(String source); |
||||
|
|
||||
|
void clearAll(); |
||||
|
|
||||
|
StorageStats getStats(); |
||||
|
|
||||
|
void close(); |
||||
|
|
||||
|
void saveBook(model.Book book); |
||||
|
|
||||
|
void saveBookBatch(List<Book> books); |
||||
|
|
||||
|
List<Book> findAllBooks(); |
||||
|
|
||||
|
List<Book> findBooksBySource(String source); |
||||
|
} |
||||
@ -0,0 +1,237 @@ |
|||||
|
package storage; |
||||
|
|
||||
|
import com.google.gson.Gson; |
||||
|
import com.google.gson.GsonBuilder; |
||||
|
import model.Book; |
||||
|
import model.Movie; |
||||
|
|
||||
|
import java.io.*; |
||||
|
import java.nio.charset.StandardCharsets; |
||||
|
import java.time.LocalDateTime; |
||||
|
import java.time.format.DateTimeFormatter; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class FileStorage implements DataStorage { |
||||
|
private static final Gson gson = new GsonBuilder() |
||||
|
.setPrettyPrinting() |
||||
|
.registerTypeAdapter(LocalDateTime.class, new LocalDateTimeAdapter()) |
||||
|
.create(); |
||||
|
|
||||
|
private final String outputDir; |
||||
|
|
||||
|
public FileStorage() { |
||||
|
this("output"); |
||||
|
} |
||||
|
|
||||
|
public FileStorage(String outputDir) { |
||||
|
this.outputDir = outputDir; |
||||
|
File dir = new File(outputDir); |
||||
|
if (!dir.exists()) { |
||||
|
dir.mkdirs(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void save(Movie movie) { |
||||
|
List<Movie> list = new ArrayList<>(); |
||||
|
list.add(movie); |
||||
|
saveBatch(list); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void saveBatch(List<Movie> movies) { |
||||
|
if (movies.isEmpty()) return; |
||||
|
|
||||
|
String source = movies.get(0).getSource(); |
||||
|
|
||||
|
saveMoviesAsJson(movies, source); |
||||
|
saveMoviesAsTxt(movies, source); |
||||
|
} |
||||
|
|
||||
|
private void saveMoviesAsJson(List<Movie> movies, String source) { |
||||
|
String filename = outputDir + "/" + sanitizeFilename(source) + "_" + |
||||
|
LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")) + ".json"; |
||||
|
|
||||
|
try (Writer writer = new OutputStreamWriter( |
||||
|
new FileOutputStream(filename), StandardCharsets.UTF_8)) { |
||||
|
gson.toJson(movies, writer); |
||||
|
System.out.println("JSON文件已保存: " + filename); |
||||
|
} catch (IOException e) { |
||||
|
System.err.println("保存JSON失败: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void saveMoviesAsTxt(List<Movie> movies, String source) { |
||||
|
String filename = outputDir + "/" + sanitizeFilename(source) + "_" + |
||||
|
LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")) + ".txt"; |
||||
|
|
||||
|
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter( |
||||
|
new FileOutputStream(filename), StandardCharsets.UTF_8))) { |
||||
|
|
||||
|
writer.write("=========================================="); |
||||
|
writer.newLine(); |
||||
|
writer.write(" 数据来源: " + source); |
||||
|
writer.newLine(); |
||||
|
writer.write(" 爬取时间: " + LocalDateTime.now()); |
||||
|
writer.newLine(); |
||||
|
writer.write(" 电影数量: " + movies.size()); |
||||
|
writer.newLine(); |
||||
|
writer.write("=========================================="); |
||||
|
writer.newLine(); |
||||
|
writer.newLine(); |
||||
|
|
||||
|
for (Movie movie : movies) { |
||||
|
writer.write(String.format("排名: %d", movie.getRank())); |
||||
|
writer.newLine(); |
||||
|
writer.write(String.format("电影: %s", movie.getName())); |
||||
|
writer.newLine(); |
||||
|
writer.write(String.format("评分: %.1f", movie.getRating())); |
||||
|
writer.newLine(); |
||||
|
if (movie.getDirector() != null) { |
||||
|
writer.write(String.format("导演: %s", movie.getDirector())); |
||||
|
writer.newLine(); |
||||
|
} |
||||
|
if (movie.getYear() != null) { |
||||
|
writer.write(String.format("年份: %s", movie.getYear())); |
||||
|
writer.newLine(); |
||||
|
} |
||||
|
writer.write("------------------------------------------"); |
||||
|
writer.newLine(); |
||||
|
} |
||||
|
|
||||
|
System.out.println("TXT文件已保存: " + filename); |
||||
|
} catch (IOException e) { |
||||
|
System.err.println("保存TXT失败: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void saveBook(Book book) { |
||||
|
List<Book> list = new ArrayList<>(); |
||||
|
list.add(book); |
||||
|
saveBookBatch(list); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void saveBookBatch(List<Book> books) { |
||||
|
if (books.isEmpty()) return; |
||||
|
|
||||
|
String source = books.get(0).getSource(); |
||||
|
|
||||
|
saveBooksAsJson(books, source); |
||||
|
saveBooksAsTxt(books, source); |
||||
|
} |
||||
|
|
||||
|
private void saveBooksAsJson(List<Book> books, String source) { |
||||
|
String filename = outputDir + "/" + sanitizeFilename(source) + "_" + |
||||
|
LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")) + ".json"; |
||||
|
|
||||
|
try (Writer writer = new OutputStreamWriter( |
||||
|
new FileOutputStream(filename), StandardCharsets.UTF_8)) { |
||||
|
gson.toJson(books, writer); |
||||
|
System.out.println("JSON文件已保存: " + filename); |
||||
|
} catch (IOException e) { |
||||
|
System.err.println("保存JSON失败: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void saveBooksAsTxt(List<Book> books, String source) { |
||||
|
String filename = outputDir + "/" + sanitizeFilename(source) + "_" + |
||||
|
LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")) + ".txt"; |
||||
|
|
||||
|
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter( |
||||
|
new FileOutputStream(filename), StandardCharsets.UTF_8))) { |
||||
|
|
||||
|
writer.write("=========================================="); |
||||
|
writer.newLine(); |
||||
|
writer.write(" 数据来源: " + source); |
||||
|
writer.newLine(); |
||||
|
writer.write(" 爬取时间: " + LocalDateTime.now()); |
||||
|
writer.newLine(); |
||||
|
writer.write(" 图书数量: " + books.size()); |
||||
|
writer.newLine(); |
||||
|
writer.write("=========================================="); |
||||
|
writer.newLine(); |
||||
|
writer.newLine(); |
||||
|
|
||||
|
for (Book book : books) { |
||||
|
if (book.getRank() != null) { |
||||
|
writer.write(String.format("排名: %d", book.getRank())); |
||||
|
writer.newLine(); |
||||
|
} |
||||
|
writer.write(String.format("书名: %s", book.getTitle())); |
||||
|
writer.newLine(); |
||||
|
if (book.getRating() != null) { |
||||
|
writer.write(String.format("评分: %.1f", book.getRating())); |
||||
|
writer.newLine(); |
||||
|
} |
||||
|
if (book.getAuthor() != null) { |
||||
|
writer.write(String.format("作者: %s", book.getAuthor())); |
||||
|
writer.newLine(); |
||||
|
} |
||||
|
if (book.getPublisher() != null) { |
||||
|
writer.write(String.format("出版社: %s", book.getPublisher())); |
||||
|
writer.newLine(); |
||||
|
} |
||||
|
if (book.getPrice() != null) { |
||||
|
writer.write(String.format("价格: %.2f", book.getPrice())); |
||||
|
writer.newLine(); |
||||
|
} |
||||
|
if (book.getYear() != null) { |
||||
|
writer.write(String.format("年份: %s", book.getYear())); |
||||
|
writer.newLine(); |
||||
|
} |
||||
|
writer.write("------------------------------------------"); |
||||
|
writer.newLine(); |
||||
|
} |
||||
|
|
||||
|
System.out.println("TXT文件已保存: " + filename); |
||||
|
} catch (IOException e) { |
||||
|
System.err.println("保存TXT失败: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private String sanitizeFilename(String filename) { |
||||
|
return filename.replaceAll("[\\\\/:*?\"<>|]", "_"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Movie> findAll() { |
||||
|
return new ArrayList<>(); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Movie> findBySource(String source) { |
||||
|
return new ArrayList<>(); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Movie> findByRankRange(int start, int end) { |
||||
|
return new ArrayList<>(); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Book> findAllBooks() { |
||||
|
return new ArrayList<>(); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Book> findBooksBySource(String source) { |
||||
|
return new ArrayList<>(); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void deleteBySource(String source) {} |
||||
|
|
||||
|
@Override |
||||
|
public void clearAll() {} |
||||
|
|
||||
|
@Override |
||||
|
public StorageStats getStats() { |
||||
|
return new StorageStats(0, 0); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void close() {} |
||||
|
} |
||||
@ -0,0 +1,25 @@ |
|||||
|
package storage; |
||||
|
|
||||
|
import com.google.gson.*; |
||||
|
|
||||
|
import java.lang.reflect.Type; |
||||
|
import java.time.LocalDateTime; |
||||
|
import java.time.format.DateTimeFormatter; |
||||
|
|
||||
|
/** |
||||
|
* Gson LocalDateTime 适配器 |
||||
|
*/ |
||||
|
public class LocalDateTimeAdapter implements JsonSerializer<LocalDateTime>, JsonDeserializer<LocalDateTime> { |
||||
|
private static final DateTimeFormatter formatter = DateTimeFormatter.ISO_LOCAL_DATE_TIME; |
||||
|
|
||||
|
@Override |
||||
|
public JsonElement serialize(LocalDateTime src, Type typeOfSrc, JsonSerializationContext context) { |
||||
|
return new JsonPrimitive(formatter.format(src)); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public LocalDateTime deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context) |
||||
|
throws JsonParseException { |
||||
|
return LocalDateTime.parse(json.getAsString(), formatter); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,414 @@ |
|||||
|
package storage; |
||||
|
|
||||
|
import model.Book; |
||||
|
import model.Movie; |
||||
|
|
||||
|
import java.sql.*; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class SQLiteStorage implements DataStorage { |
||||
|
private static final String DB_URL = "jdbc:sqlite:crawler.db"; |
||||
|
private Connection connection; |
||||
|
|
||||
|
public SQLiteStorage() { |
||||
|
try { |
||||
|
connection = DriverManager.getConnection(DB_URL); |
||||
|
initTable(); |
||||
|
} catch (SQLException e) { |
||||
|
throw new RuntimeException("数据库连接失败: " + e.getMessage(), e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void initTable() throws SQLException { |
||||
|
String movieSql = "CREATE TABLE IF NOT EXISTS movies (" + |
||||
|
"id INTEGER PRIMARY KEY AUTOINCREMENT," + |
||||
|
"source TEXT NOT NULL," + |
||||
|
"rank INTEGER," + |
||||
|
"name TEXT NOT NULL," + |
||||
|
"director TEXT," + |
||||
|
"actors TEXT," + |
||||
|
"year TEXT," + |
||||
|
"rating REAL," + |
||||
|
"rating_count INTEGER," + |
||||
|
"description TEXT," + |
||||
|
"url TEXT," + |
||||
|
"image_url TEXT," + |
||||
|
"crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP" + |
||||
|
")"; |
||||
|
|
||||
|
String bookSql = "CREATE TABLE IF NOT EXISTS books (" + |
||||
|
"id INTEGER PRIMARY KEY AUTOINCREMENT," + |
||||
|
"source TEXT NOT NULL," + |
||||
|
"rank INTEGER," + |
||||
|
"title TEXT NOT NULL," + |
||||
|
"author TEXT," + |
||||
|
"publisher TEXT," + |
||||
|
"year TEXT," + |
||||
|
"price REAL," + |
||||
|
"rating REAL," + |
||||
|
"rating_count INTEGER," + |
||||
|
"category TEXT," + |
||||
|
"description TEXT," + |
||||
|
"url TEXT," + |
||||
|
"image_url TEXT," + |
||||
|
"isbn TEXT," + |
||||
|
"crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP" + |
||||
|
")"; |
||||
|
|
||||
|
try (Statement stmt = connection.createStatement()) { |
||||
|
stmt.execute(movieSql); |
||||
|
stmt.execute(bookSql); |
||||
|
} |
||||
|
|
||||
|
String indexSql1 = "CREATE INDEX IF NOT EXISTS idx_movie_source ON movies(source)"; |
||||
|
String indexSql2 = "CREATE INDEX IF NOT EXISTS idx_book_source ON books(source)"; |
||||
|
try (Statement stmt = connection.createStatement()) { |
||||
|
stmt.execute(indexSql1); |
||||
|
stmt.execute(indexSql2); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void save(Movie movie) { |
||||
|
String sql = "INSERT INTO movies (source, rank, name, director, actors, year, " + |
||||
|
"rating, rating_count, description, url, image_url, crawl_time) " + |
||||
|
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"; |
||||
|
|
||||
|
try (PreparedStatement pstmt = connection.prepareStatement(sql)) { |
||||
|
pstmt.setString(1, movie.getSource()); |
||||
|
pstmt.setObject(2, movie.getRank()); |
||||
|
pstmt.setString(3, movie.getName()); |
||||
|
pstmt.setString(4, movie.getDirector()); |
||||
|
pstmt.setString(5, movie.getActors()); |
||||
|
pstmt.setString(6, movie.getYear()); |
||||
|
pstmt.setObject(7, movie.getRating()); |
||||
|
pstmt.setObject(8, movie.getRatingCount()); |
||||
|
pstmt.setString(9, movie.getDescription()); |
||||
|
pstmt.setString(10, movie.getUrl()); |
||||
|
pstmt.setString(11, movie.getImageUrl()); |
||||
|
pstmt.setTimestamp(12, movie.getCrawlTime() != null ? |
||||
|
Timestamp.valueOf(movie.getCrawlTime()) : null); |
||||
|
pstmt.executeUpdate(); |
||||
|
} catch (SQLException e) { |
||||
|
System.err.println("保存电影失败: " + movie.getName() + " - " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void saveBatch(List<Movie> movies) { |
||||
|
String sql = "INSERT INTO movies (source, rank, name, director, actors, year, " + |
||||
|
"rating, rating_count, description, url, image_url, crawl_time) " + |
||||
|
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"; |
||||
|
|
||||
|
try (PreparedStatement pstmt = connection.prepareStatement(sql)) { |
||||
|
connection.setAutoCommit(false); |
||||
|
|
||||
|
for (Movie movie : movies) { |
||||
|
pstmt.setString(1, movie.getSource()); |
||||
|
pstmt.setObject(2, movie.getRank()); |
||||
|
pstmt.setString(3, movie.getName()); |
||||
|
pstmt.setString(4, movie.getDirector()); |
||||
|
pstmt.setString(5, movie.getActors()); |
||||
|
pstmt.setString(6, movie.getYear()); |
||||
|
pstmt.setObject(7, movie.getRating()); |
||||
|
pstmt.setObject(8, movie.getRatingCount()); |
||||
|
pstmt.setString(9, movie.getDescription()); |
||||
|
pstmt.setString(10, movie.getUrl()); |
||||
|
pstmt.setString(11, movie.getImageUrl()); |
||||
|
pstmt.setTimestamp(12, movie.getCrawlTime() != null ? |
||||
|
Timestamp.valueOf(movie.getCrawlTime()) : null); |
||||
|
pstmt.addBatch(); |
||||
|
} |
||||
|
|
||||
|
pstmt.executeBatch(); |
||||
|
connection.commit(); |
||||
|
System.out.println("批量保存 " + movies.size() + " 条数据成功"); |
||||
|
} catch (SQLException e) { |
||||
|
try { |
||||
|
connection.rollback(); |
||||
|
} catch (SQLException ex) { |
||||
|
ex.printStackTrace(); |
||||
|
} |
||||
|
System.err.println("批量保存失败: " + e.getMessage()); |
||||
|
} finally { |
||||
|
try { |
||||
|
connection.setAutoCommit(true); |
||||
|
} catch (SQLException e) { |
||||
|
e.printStackTrace(); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void saveBook(Book book) { |
||||
|
String sql = "INSERT INTO books (source, rank, title, author, publisher, year, " + |
||||
|
"price, rating, rating_count, category, description, url, image_url, isbn, crawl_time) " + |
||||
|
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"; |
||||
|
|
||||
|
try (PreparedStatement pstmt = connection.prepareStatement(sql)) { |
||||
|
pstmt.setString(1, book.getSource()); |
||||
|
pstmt.setObject(2, book.getRank()); |
||||
|
pstmt.setString(3, book.getTitle()); |
||||
|
pstmt.setString(4, book.getAuthor()); |
||||
|
pstmt.setString(5, book.getPublisher()); |
||||
|
pstmt.setString(6, book.getYear()); |
||||
|
pstmt.setObject(7, book.getPrice()); |
||||
|
pstmt.setObject(8, book.getRating()); |
||||
|
pstmt.setObject(9, book.getRatingCount()); |
||||
|
pstmt.setString(10, book.getCategory()); |
||||
|
pstmt.setString(11, book.getDescription()); |
||||
|
pstmt.setString(12, book.getUrl()); |
||||
|
pstmt.setString(13, book.getImageUrl()); |
||||
|
pstmt.setString(14, book.getIsbn()); |
||||
|
pstmt.setTimestamp(15, book.getCrawlTime() != null ? |
||||
|
Timestamp.valueOf(book.getCrawlTime()) : null); |
||||
|
pstmt.executeUpdate(); |
||||
|
} catch (SQLException e) { |
||||
|
System.err.println("保存图书失败: " + book.getTitle() + " - " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void saveBookBatch(List<Book> books) { |
||||
|
String sql = "INSERT INTO books (source, rank, title, author, publisher, year, " + |
||||
|
"price, rating, rating_count, category, description, url, image_url, isbn, crawl_time) " + |
||||
|
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"; |
||||
|
|
||||
|
try (PreparedStatement pstmt = connection.prepareStatement(sql)) { |
||||
|
connection.setAutoCommit(false); |
||||
|
|
||||
|
for (Book book : books) { |
||||
|
pstmt.setString(1, book.getSource()); |
||||
|
pstmt.setObject(2, book.getRank()); |
||||
|
pstmt.setString(3, book.getTitle()); |
||||
|
pstmt.setString(4, book.getAuthor()); |
||||
|
pstmt.setString(5, book.getPublisher()); |
||||
|
pstmt.setString(6, book.getYear()); |
||||
|
pstmt.setObject(7, book.getPrice()); |
||||
|
pstmt.setObject(8, book.getRating()); |
||||
|
pstmt.setObject(9, book.getRatingCount()); |
||||
|
pstmt.setString(10, book.getCategory()); |
||||
|
pstmt.setString(11, book.getDescription()); |
||||
|
pstmt.setString(12, book.getUrl()); |
||||
|
pstmt.setString(13, book.getImageUrl()); |
||||
|
pstmt.setString(14, book.getIsbn()); |
||||
|
pstmt.setTimestamp(15, book.getCrawlTime() != null ? |
||||
|
Timestamp.valueOf(book.getCrawlTime()) : null); |
||||
|
pstmt.addBatch(); |
||||
|
} |
||||
|
|
||||
|
pstmt.executeBatch(); |
||||
|
connection.commit(); |
||||
|
System.out.println("批量保存 " + books.size() + " 条图书数据成功"); |
||||
|
} catch (SQLException e) { |
||||
|
try { |
||||
|
connection.rollback(); |
||||
|
} catch (SQLException ex) { |
||||
|
ex.printStackTrace(); |
||||
|
} |
||||
|
System.err.println("批量保存图书失败: " + e.getMessage()); |
||||
|
} finally { |
||||
|
try { |
||||
|
connection.setAutoCommit(true); |
||||
|
} catch (SQLException e) { |
||||
|
e.printStackTrace(); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Movie> findAll() { |
||||
|
List<Movie> movies = new ArrayList<>(); |
||||
|
String sql = "SELECT * FROM movies ORDER BY source, rank"; |
||||
|
|
||||
|
try (Statement stmt = connection.createStatement(); |
||||
|
ResultSet rs = stmt.executeQuery(sql)) { |
||||
|
while (rs.next()) { |
||||
|
movies.add(mapResultSetToMovie(rs)); |
||||
|
} |
||||
|
} catch (SQLException e) { |
||||
|
System.err.println("查询失败: " + e.getMessage()); |
||||
|
} |
||||
|
return movies; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Movie> findBySource(String source) { |
||||
|
List<Movie> movies = new ArrayList<>(); |
||||
|
String sql = "SELECT * FROM movies WHERE source = ? ORDER BY rank"; |
||||
|
|
||||
|
try (PreparedStatement pstmt = connection.prepareStatement(sql)) { |
||||
|
pstmt.setString(1, source); |
||||
|
ResultSet rs = pstmt.executeQuery(); |
||||
|
while (rs.next()) { |
||||
|
movies.add(mapResultSetToMovie(rs)); |
||||
|
} |
||||
|
} catch (SQLException e) { |
||||
|
System.err.println("查询失败: " + e.getMessage()); |
||||
|
} |
||||
|
return movies; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Movie> findByRankRange(int start, int end) { |
||||
|
List<Movie> movies = new ArrayList<>(); |
||||
|
String sql = "SELECT * FROM movies WHERE rank BETWEEN ? AND ? ORDER BY rank"; |
||||
|
|
||||
|
try (PreparedStatement pstmt = connection.prepareStatement(sql)) { |
||||
|
pstmt.setInt(1, start); |
||||
|
pstmt.setInt(2, end); |
||||
|
ResultSet rs = pstmt.executeQuery(); |
||||
|
while (rs.next()) { |
||||
|
movies.add(mapResultSetToMovie(rs)); |
||||
|
} |
||||
|
} catch (SQLException e) { |
||||
|
System.err.println("查询失败: " + e.getMessage()); |
||||
|
} |
||||
|
return movies; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Book> findAllBooks() { |
||||
|
List<Book> books = new ArrayList<>(); |
||||
|
String sql = "SELECT * FROM books ORDER BY source, rank"; |
||||
|
|
||||
|
try (Statement stmt = connection.createStatement(); |
||||
|
ResultSet rs = stmt.executeQuery(sql)) { |
||||
|
while (rs.next()) { |
||||
|
books.add(mapResultSetToBook(rs)); |
||||
|
} |
||||
|
} catch (SQLException e) { |
||||
|
System.err.println("查询失败: " + e.getMessage()); |
||||
|
} |
||||
|
return books; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Book> findBooksBySource(String source) { |
||||
|
List<Book> books = new ArrayList<>(); |
||||
|
String sql = "SELECT * FROM books WHERE source = ? ORDER BY rank"; |
||||
|
|
||||
|
try (PreparedStatement pstmt = connection.prepareStatement(sql)) { |
||||
|
pstmt.setString(1, source); |
||||
|
ResultSet rs = pstmt.executeQuery(); |
||||
|
while (rs.next()) { |
||||
|
books.add(mapResultSetToBook(rs)); |
||||
|
} |
||||
|
} catch (SQLException e) { |
||||
|
System.err.println("查询失败: " + e.getMessage()); |
||||
|
} |
||||
|
return books; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void deleteBySource(String source) { |
||||
|
String sql1 = "DELETE FROM movies WHERE source = ?"; |
||||
|
String sql2 = "DELETE FROM books WHERE source = ?"; |
||||
|
try (PreparedStatement pstmt1 = connection.prepareStatement(sql1); |
||||
|
PreparedStatement pstmt2 = connection.prepareStatement(sql2)) { |
||||
|
pstmt1.setString(1, source); |
||||
|
pstmt2.setString(1, source); |
||||
|
int count1 = pstmt1.executeUpdate(); |
||||
|
int count2 = pstmt2.executeUpdate(); |
||||
|
System.out.println("删除 " + source + " 的 " + (count1 + count2) + " 条数据"); |
||||
|
} catch (SQLException e) { |
||||
|
System.err.println("删除失败: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void clearAll() { |
||||
|
try (Statement stmt = connection.createStatement()) { |
||||
|
stmt.execute("DELETE FROM movies"); |
||||
|
stmt.execute("DELETE FROM books"); |
||||
|
System.out.println("清空所有数据"); |
||||
|
} catch (SQLException e) { |
||||
|
System.err.println("清空失败: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public StorageStats getStats() { |
||||
|
int totalCount = 0; |
||||
|
int sourceCount = 0; |
||||
|
|
||||
|
try (Statement stmt = connection.createStatement()) { |
||||
|
ResultSet rs = stmt.executeQuery("SELECT COUNT(*) FROM movies"); |
||||
|
if (rs.next()) { |
||||
|
totalCount = rs.getInt(1); |
||||
|
} |
||||
|
rs = stmt.executeQuery("SELECT COUNT(*) FROM books"); |
||||
|
if (rs.next()) { |
||||
|
totalCount += rs.getInt(1); |
||||
|
} |
||||
|
rs = stmt.executeQuery("SELECT COUNT(DISTINCT source) FROM movies"); |
||||
|
if (rs.next()) { |
||||
|
sourceCount = rs.getInt(1); |
||||
|
} |
||||
|
rs = stmt.executeQuery("SELECT COUNT(DISTINCT source) FROM books"); |
||||
|
if (rs.next()) { |
||||
|
sourceCount += rs.getInt(1); |
||||
|
} |
||||
|
} catch (SQLException e) { |
||||
|
System.err.println("统计失败: " + e.getMessage()); |
||||
|
} |
||||
|
return new StorageStats(totalCount, sourceCount); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void close() { |
||||
|
try { |
||||
|
if (connection != null && !connection.isClosed()) { |
||||
|
connection.close(); |
||||
|
} |
||||
|
} catch (SQLException e) { |
||||
|
System.err.println("关闭连接失败: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private Movie mapResultSetToMovie(ResultSet rs) throws SQLException { |
||||
|
Movie movie = new Movie(); |
||||
|
movie.setId(rs.getInt("id")); |
||||
|
movie.setSource(rs.getString("source")); |
||||
|
movie.setRank(rs.getInt("rank")); |
||||
|
movie.setName(rs.getString("name")); |
||||
|
movie.setDirector(rs.getString("director")); |
||||
|
movie.setActors(rs.getString("actors")); |
||||
|
movie.setYear(rs.getString("year")); |
||||
|
movie.setRating(rs.getDouble("rating")); |
||||
|
movie.setRatingCount(rs.getInt("rating_count")); |
||||
|
movie.setDescription(rs.getString("description")); |
||||
|
movie.setUrl(rs.getString("url")); |
||||
|
movie.setImageUrl(rs.getString("image_url")); |
||||
|
Timestamp ts = rs.getTimestamp("crawl_time"); |
||||
|
if (ts != null) { |
||||
|
movie.setCrawlTime(ts.toLocalDateTime()); |
||||
|
} |
||||
|
return movie; |
||||
|
} |
||||
|
|
||||
|
private Book mapResultSetToBook(ResultSet rs) throws SQLException { |
||||
|
Book book = new Book(); |
||||
|
book.setId(rs.getInt("id")); |
||||
|
book.setSource(rs.getString("source")); |
||||
|
book.setRank(rs.getInt("rank")); |
||||
|
book.setTitle(rs.getString("title")); |
||||
|
book.setAuthor(rs.getString("author")); |
||||
|
book.setPublisher(rs.getString("publisher")); |
||||
|
book.setYear(rs.getString("year")); |
||||
|
book.setPrice(rs.getDouble("price")); |
||||
|
book.setRating(rs.getDouble("rating")); |
||||
|
book.setRatingCount(rs.getInt("rating_count")); |
||||
|
book.setCategory(rs.getString("category")); |
||||
|
book.setDescription(rs.getString("description")); |
||||
|
book.setUrl(rs.getString("url")); |
||||
|
book.setImageUrl(rs.getString("image_url")); |
||||
|
book.setIsbn(rs.getString("isbn")); |
||||
|
Timestamp ts = rs.getTimestamp("crawl_time"); |
||||
|
if (ts != null) { |
||||
|
book.setCrawlTime(ts.toLocalDateTime()); |
||||
|
} |
||||
|
return book; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,23 @@ |
|||||
|
package storage; |
||||
|
|
||||
|
/**
 * Immutable pair of counters describing the contents of a storage backend: a total
 * record count and a source count, exactly as supplied to the constructor.
 */
public class StorageStats {
    private final int totalCount;
    private final int sourceCount;

    /**
     * @param totalCount  value reported by {@link #getTotalCount()}
     * @param sourceCount value reported by {@link #getSourceCount()}
     */
    public StorageStats(int totalCount, int sourceCount) {
        this.totalCount = totalCount;
        this.sourceCount = sourceCount;
    }

    /** Returns the total count given at construction. */
    public int getTotalCount() {
        return totalCount;
    }

    /** Returns the source count given at construction. */
    public int getSourceCount() {
        return sourceCount;
    }

    /** Returns both counters in the form {@code StorageStats{totalCount=…, sourceCount=…}}. */
    @Override
    public String toString() {
        final String template = "StorageStats{totalCount=%d, sourceCount=%d}";
        return String.format(template, totalCount, sourceCount);
    }
}
||||
@ -0,0 +1,115 @@ |
|||||
|
package strategy; |
||||
|
|
||||
|
import exception.CrawlerResult; |
||||
|
import exception.NetworkException; |
||||
|
import exception.ParseException; |
||||
|
import model.Book; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import storage.DataStorage; |
||||
|
import util.Logger; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public abstract class AbstractBookCrawlerStrategy implements BookCrawlerStrategy { |
||||
|
protected DataStorage storage; |
||||
|
protected int delayMs = 1500; |
||||
|
|
||||
|
@Override |
||||
|
public void setStorage(DataStorage storage) { |
||||
|
this.storage = storage; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public int getDelayMs() { |
||||
|
return delayMs; |
||||
|
} |
||||
|
|
||||
|
protected Document fetchDocument(String url) throws IOException { |
||||
|
return Jsoup.connect(url) |
||||
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + |
||||
|
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") |
||||
|
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") |
||||
|
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8") |
||||
|
.timeout(15000) |
||||
|
.get(); |
||||
|
} |
||||
|
|
||||
|
protected void delay() { |
||||
|
try { |
||||
|
Thread.sleep(delayMs); |
||||
|
} catch (InterruptedException e) { |
||||
|
Thread.currentThread().interrupt(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public CrawlerResult execute() { |
||||
|
long startTime = System.currentTimeMillis(); |
||||
|
String sourceName = getName(); |
||||
|
List<Book> allBooks = new ArrayList<>(); |
||||
|
|
||||
|
try { |
||||
|
List<String> urls = getPageUrls(); |
||||
|
Logger.info(String.format("[%s] 开始爬取,共 %d 个页面", sourceName, urls.size())); |
||||
|
|
||||
|
for (int i = 0; i < urls.size(); i++) { |
||||
|
String url = urls.get(i); |
||||
|
Logger.info(String.format("[%s] 爬取第 %d/%d 页: %s", sourceName, i + 1, urls.size(), url)); |
||||
|
|
||||
|
try { |
||||
|
Document doc = fetchDocument(url); |
||||
|
List<Book> books = parseBooks(doc.html()); |
||||
|
|
||||
|
for (Book book : books) { |
||||
|
book.setSource(sourceName); |
||||
|
} |
||||
|
|
||||
|
allBooks.addAll(books); |
||||
|
Logger.info(String.format("[%s] 第 %d 页获取 %d 条数据", sourceName, i + 1, books.size())); |
||||
|
|
||||
|
} catch (IOException e) { |
||||
|
Logger.error(String.format("[%s] 网络请求失败: %s", sourceName, url), e); |
||||
|
throw new NetworkException("网络请求失败: " + url, sourceName, e); |
||||
|
} catch (Exception e) { |
||||
|
Logger.error(String.format("[%s] 解析页面失败: %s", sourceName, url), e); |
||||
|
throw new ParseException("解析页面失败: " + url, sourceName, e); |
||||
|
} |
||||
|
|
||||
|
if (i < urls.size() - 1) { |
||||
|
delay(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (storage != null && !allBooks.isEmpty()) { |
||||
|
storage.saveBookBatch(allBooks); |
||||
|
Logger.info(String.format("[%s] 数据已保存到存储", sourceName)); |
||||
|
} |
||||
|
|
||||
|
long elapsedTime = System.currentTimeMillis() - startTime; |
||||
|
return CrawlerResult.success(sourceName) |
||||
|
.message("爬取成功") |
||||
|
.dataCount(allBooks.size()) |
||||
|
.elapsedTime(elapsedTime) |
||||
|
.build(); |
||||
|
|
||||
|
} catch (NetworkException e) { |
||||
|
long elapsedTime = System.currentTimeMillis() - startTime; |
||||
|
return CrawlerResult.failure(sourceName, "NETWORK_ERROR", e.getMessage()) |
||||
|
.elapsedTime(elapsedTime) |
||||
|
.build(); |
||||
|
} catch (ParseException e) { |
||||
|
long elapsedTime = System.currentTimeMillis() - startTime; |
||||
|
return CrawlerResult.failure(sourceName, "PARSE_ERROR", e.getMessage()) |
||||
|
.elapsedTime(elapsedTime) |
||||
|
.build(); |
||||
|
} catch (Exception e) { |
||||
|
long elapsedTime = System.currentTimeMillis() - startTime; |
||||
|
return CrawlerResult.failure(sourceName, "UNKNOWN_ERROR", e.getMessage()) |
||||
|
.elapsedTime(elapsedTime) |
||||
|
.build(); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,114 @@ |
|||||
|
package strategy; |
||||
|
|
||||
|
import exception.CrawlerResult; |
||||
|
import exception.NetworkException; |
||||
|
import exception.ParseException; |
||||
|
import model.Movie; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import storage.DataStorage; |
||||
|
import util.Logger; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public abstract class AbstractCrawlerStrategy implements CrawlerStrategy { |
||||
|
protected DataStorage storage; |
||||
|
protected int delayMs = 1500; |
||||
|
|
||||
|
public void setStorage(DataStorage storage) { |
||||
|
this.storage = storage; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public int getDelayMs() { |
||||
|
return delayMs; |
||||
|
} |
||||
|
|
||||
|
protected Document fetchDocument(String url) throws IOException { |
||||
|
return Jsoup.connect(url) |
||||
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + |
||||
|
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") |
||||
|
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") |
||||
|
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8") |
||||
|
.timeout(15000) |
||||
|
.get(); |
||||
|
} |
||||
|
|
||||
|
protected void delay() { |
||||
|
try { |
||||
|
Thread.sleep(delayMs); |
||||
|
} catch (InterruptedException e) { |
||||
|
Thread.currentThread().interrupt(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public CrawlerResult execute() { |
||||
|
long startTime = System.currentTimeMillis(); |
||||
|
String sourceName = getName(); |
||||
|
List<Movie> allMovies = new ArrayList<>(); |
||||
|
|
||||
|
try { |
||||
|
List<String> urls = getPageUrls(); |
||||
|
Logger.info(String.format("[%s] 开始爬取,共 %d 个页面", sourceName, urls.size())); |
||||
|
|
||||
|
for (int i = 0; i < urls.size(); i++) { |
||||
|
String url = urls.get(i); |
||||
|
Logger.info(String.format("[%s] 爬取第 %d/%d 页: %s", sourceName, i + 1, urls.size(), url)); |
||||
|
|
||||
|
try { |
||||
|
Document doc = fetchDocument(url); |
||||
|
List<Movie> movies = parseMovies(doc.html()); |
||||
|
|
||||
|
for (Movie movie : movies) { |
||||
|
movie.setSource(sourceName); |
||||
|
} |
||||
|
|
||||
|
allMovies.addAll(movies); |
||||
|
Logger.info(String.format("[%s] 第 %d 页获取 %d 条数据", sourceName, i + 1, movies.size())); |
||||
|
|
||||
|
} catch (IOException e) { |
||||
|
Logger.error(String.format("[%s] 网络请求失败: %s", sourceName, url), e); |
||||
|
throw new NetworkException("网络请求失败: " + url, sourceName, e); |
||||
|
} catch (Exception e) { |
||||
|
Logger.error(String.format("[%s] 解析页面失败: %s", sourceName, url), e); |
||||
|
throw new ParseException("解析页面失败: " + url, sourceName, e); |
||||
|
} |
||||
|
|
||||
|
if (i < urls.size() - 1) { |
||||
|
delay(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (storage != null && !allMovies.isEmpty()) { |
||||
|
storage.saveBatch(allMovies); |
||||
|
Logger.info(String.format("[%s] 数据已保存到存储", sourceName)); |
||||
|
} |
||||
|
|
||||
|
long elapsedTime = System.currentTimeMillis() - startTime; |
||||
|
return CrawlerResult.success(sourceName) |
||||
|
.message("爬取成功") |
||||
|
.dataCount(allMovies.size()) |
||||
|
.elapsedTime(elapsedTime) |
||||
|
.build(); |
||||
|
|
||||
|
} catch (NetworkException e) { |
||||
|
long elapsedTime = System.currentTimeMillis() - startTime; |
||||
|
return CrawlerResult.failure(sourceName, "NETWORK_ERROR", e.getMessage()) |
||||
|
.elapsedTime(elapsedTime) |
||||
|
.build(); |
||||
|
} catch (ParseException e) { |
||||
|
long elapsedTime = System.currentTimeMillis() - startTime; |
||||
|
return CrawlerResult.failure(sourceName, "PARSE_ERROR", e.getMessage()) |
||||
|
.elapsedTime(elapsedTime) |
||||
|
.build(); |
||||
|
} catch (Exception e) { |
||||
|
long elapsedTime = System.currentTimeMillis() - startTime; |
||||
|
return CrawlerResult.failure(sourceName, "UNKNOWN_ERROR", e.getMessage()) |
||||
|
.elapsedTime(elapsedTime) |
||||
|
.build(); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,15 @@ |
|||||
|
package strategy; |
||||
|
|
||||
|
import exception.CrawlerResult; |
||||
|
import storage.DataStorage; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public interface BookCrawlerStrategy { |
||||
|
String getName(); |
||||
|
String getBaseUrl(); |
||||
|
List<String> getPageUrls(); |
||||
|
List<model.Book> parseBooks(String htmlContent); |
||||
|
CrawlerResult execute(); |
||||
|
int getDelayMs(); |
||||
|
void setStorage(DataStorage storage); |
||||
|
} |
||||
@ -0,0 +1,16 @@ |
|||||
|
package strategy; |
||||
|
|
||||
|
import exception.CrawlerResult; |
||||
|
import model.Movie; |
||||
|
import storage.DataStorage; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public interface CrawlerStrategy { |
||||
|
String getName(); |
||||
|
String getBaseUrl(); |
||||
|
List<String> getPageUrls(); |
||||
|
List<Movie> parseMovies(String htmlContent); |
||||
|
CrawlerResult execute(); |
||||
|
int getDelayMs(); |
||||
|
void setStorage(DataStorage storage); |
||||
|
} |
||||
@ -0,0 +1,116 @@ |
|||||
|
package strategy.impl; |
||||
|
|
||||
|
import model.Book; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import strategy.AbstractBookCrawlerStrategy; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class BooksToScrapeStrategy extends AbstractBookCrawlerStrategy { |
||||
|
private static final String NAME = "BooksToScrape"; |
||||
|
private static final String BASE_URL = "https://books.toscrape.com"; |
||||
|
|
||||
|
public BooksToScrapeStrategy() { |
||||
|
this.delayMs = 1000; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return NAME; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getBaseUrl() { |
||||
|
return BASE_URL; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<String> getPageUrls() { |
||||
|
List<String> urls = new ArrayList<>(); |
||||
|
urls.add(BASE_URL); |
||||
|
for (int i = 2; i <= 50; i++) { |
||||
|
urls.add(BASE_URL + "/catalogue/page-" + i + ".html"); |
||||
|
} |
||||
|
return urls; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Book> parseBooks(String htmlContent) { |
||||
|
List<Book> books = new ArrayList<>(); |
||||
|
Document doc = Jsoup.parse(htmlContent); |
||||
|
Elements items = doc.select("article.product_pod"); |
||||
|
|
||||
|
int rank = 1; |
||||
|
for (Element item : items) { |
||||
|
try { |
||||
|
Book book = new Book(); |
||||
|
|
||||
|
Element titleElement = item.select("h3 a").first(); |
||||
|
if (titleElement != null) { |
||||
|
book.setTitle(titleElement.attr("title")); |
||||
|
String href = titleElement.attr("href"); |
||||
|
if (href.startsWith("../")) { |
||||
|
book.setUrl(BASE_URL + "/catalogue/" + href.substring(3)); |
||||
|
} else { |
||||
|
book.setUrl(BASE_URL + "/" + href); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
Element priceElement = item.select("p.price_color").first(); |
||||
|
if (priceElement != null) { |
||||
|
String priceStr = priceElement.text().replace("£", "").replace("Â", "").trim(); |
||||
|
try { |
||||
|
book.setPrice(Double.parseDouble(priceStr)); |
||||
|
} catch (NumberFormatException e) { |
||||
|
// ignore
|
||||
|
} |
||||
|
} |
||||
|
|
||||
|
Element ratingElement = item.select("p.star-rating").first(); |
||||
|
if (ratingElement != null) { |
||||
|
String ratingClass = ratingElement.className(); |
||||
|
int rating = parseRating(ratingClass); |
||||
|
book.setRating((double) rating); |
||||
|
} |
||||
|
|
||||
|
Element imgElement = item.select("img").first(); |
||||
|
if (imgElement != null) { |
||||
|
String src = imgElement.attr("src"); |
||||
|
if (src.startsWith("../")) { |
||||
|
book.setImageUrl(BASE_URL + "/" + src.substring(3)); |
||||
|
} else { |
||||
|
book.setImageUrl(BASE_URL + "/" + src); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
Element availabilityElement = item.select("p.instock.availability").first(); |
||||
|
if (availabilityElement != null) { |
||||
|
String availability = availabilityElement.text().trim(); |
||||
|
} |
||||
|
|
||||
|
book.setRank(rank++); |
||||
|
|
||||
|
if (book.getTitle() != null && !book.getTitle().isEmpty()) { |
||||
|
books.add(book); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
// skip invalid item
|
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return books; |
||||
|
} |
||||
|
|
||||
|
private int parseRating(String ratingClass) { |
||||
|
if (ratingClass.contains("One")) return 1; |
||||
|
if (ratingClass.contains("Two")) return 2; |
||||
|
if (ratingClass.contains("Three")) return 3; |
||||
|
if (ratingClass.contains("Four")) return 4; |
||||
|
if (ratingClass.contains("Five")) return 5; |
||||
|
return 0; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,159 @@ |
|||||
|
package strategy.impl; |
||||
|
|
||||
|
import model.Book; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import strategy.AbstractBookCrawlerStrategy; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class DoubanBookStrategy extends AbstractBookCrawlerStrategy { |
||||
|
private static final String NAME = "豆瓣读书Top250"; |
||||
|
private static final String BASE_URL = "https://book.douban.com/top250"; |
||||
|
|
||||
|
public DoubanBookStrategy() { |
||||
|
this.delayMs = 2000; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return NAME; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getBaseUrl() { |
||||
|
return BASE_URL; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<String> getPageUrls() { |
||||
|
List<String> urls = new ArrayList<>(); |
||||
|
for (int i = 0; i < 10; i++) { |
||||
|
urls.add(BASE_URL + "?start=" + (i * 25)); |
||||
|
} |
||||
|
return urls; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Book> parseBooks(String htmlContent) { |
||||
|
List<Book> books = new ArrayList<>(); |
||||
|
Document doc = Jsoup.parse(htmlContent); |
||||
|
Elements items = doc.select("tr.item"); |
||||
|
|
||||
|
for (Element item : items) { |
||||
|
try { |
||||
|
Book book = new Book(); |
||||
|
|
||||
|
Element indent = item.select("td.indent").first(); |
||||
|
if (indent != null) { |
||||
|
String rankStr = indent.select("div.starcount").text(); |
||||
|
if (!rankStr.isEmpty()) { |
||||
|
book.setRank(parseNumber(rankStr)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (book.getRank() == null) { |
||||
|
Element order = item.select("div.starcount").first(); |
||||
|
if (order != null) { |
||||
|
book.setRank(parseNumber(order.text())); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
Element titleElement = item.select("div.pl2 a").first(); |
||||
|
if (titleElement != null) { |
||||
|
String title = titleElement.attr("title"); |
||||
|
if (title.isEmpty()) { |
||||
|
title = titleElement.text().split("\\s")[0]; |
||||
|
} |
||||
|
book.setTitle(title.trim()); |
||||
|
book.setUrl(titleElement.attr("href")); |
||||
|
} |
||||
|
|
||||
|
Element ratingElement = item.select("span.rating_nums").first(); |
||||
|
if (ratingElement != null) { |
||||
|
String ratingStr = ratingElement.text(); |
||||
|
if (!ratingStr.isEmpty()) { |
||||
|
book.setRating(Double.parseDouble(ratingStr)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
Element countElement = item.select("span.pl").first(); |
||||
|
if (countElement != null) { |
||||
|
String countText = countElement.text(); |
||||
|
if (countText.contains("人评价")) { |
||||
|
String num = countText.replace("人评价", "").replace("(", "").replace(")", "").trim(); |
||||
|
book.setRatingCount(parseNumber(num)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
Element infoElement = item.select("p.pl").first(); |
||||
|
if (infoElement != null) { |
||||
|
String info = infoElement.text(); |
||||
|
parseBookInfo(book, info); |
||||
|
} |
||||
|
|
||||
|
Element imgElement = item.select("img").first(); |
||||
|
if (imgElement != null) { |
||||
|
book.setImageUrl(imgElement.attr("src")); |
||||
|
} |
||||
|
|
||||
|
if (book.getTitle() != null && !book.getTitle().isEmpty()) { |
||||
|
books.add(book); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
// skip invalid item
|
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return books; |
||||
|
} |
||||
|
|
||||
|
private void parseBookInfo(Book book, String info) { |
||||
|
String[] parts = info.split(" / "); |
||||
|
for (int i = 0; i < parts.length; i++) { |
||||
|
String part = parts[i].trim(); |
||||
|
|
||||
|
if (i == 0 && !part.matches("\\d{4}.*") && !part.matches(".*\\d+\\.\\d+.*")) { |
||||
|
book.setAuthor(part); |
||||
|
} |
||||
|
|
||||
|
if (part.matches("\\d{4}")) { |
||||
|
book.setYear(part); |
||||
|
} |
||||
|
|
||||
|
if (part.contains("出版社")) { |
||||
|
book.setPublisher(part.replace("出版社", "").trim()); |
||||
|
} |
||||
|
|
||||
|
if (part.matches(".*\\d+\\.\\d+元")) { |
||||
|
String priceStr = part.replace("元", "").trim(); |
||||
|
try { |
||||
|
book.setPrice(Double.parseDouble(priceStr)); |
||||
|
} catch (NumberFormatException e) { |
||||
|
// ignore
|
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (part.matches("ISBN.*")) { |
||||
|
book.setIsbn(part.replace("ISBN", "").trim()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private Integer parseNumber(String str) { |
||||
|
try { |
||||
|
if (str == null || str.isEmpty()) return null; |
||||
|
str = str.replaceAll("[^0-9.]", ""); |
||||
|
if (str.isEmpty()) return null; |
||||
|
if (str.contains(".")) { |
||||
|
return (int) Double.parseDouble(str); |
||||
|
} |
||||
|
return Integer.parseInt(str); |
||||
|
} catch (NumberFormatException e) { |
||||
|
return null; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,111 @@ |
|||||
|
package strategy.impl; |
||||
|
|
||||
|
import model.Movie; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import strategy.AbstractCrawlerStrategy; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class DoubanStrategy extends AbstractCrawlerStrategy { |
||||
|
private static final String NAME = "豆瓣电影Top250"; |
||||
|
private static final String BASE_URL = "https://movie.douban.com/top250"; |
||||
|
|
||||
|
public DoubanStrategy() { |
||||
|
this.delayMs = 1500; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return NAME; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getBaseUrl() { |
||||
|
return BASE_URL; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<String> getPageUrls() { |
||||
|
List<String> urls = new ArrayList<>(); |
||||
|
for (int i = 0; i < 10; i++) { |
||||
|
urls.add(BASE_URL + "?start=" + (i * 25)); |
||||
|
} |
||||
|
return urls; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Movie> parseMovies(String htmlContent) { |
||||
|
List<Movie> movies = new ArrayList<>(); |
||||
|
Document doc = Jsoup.parse(htmlContent); |
||||
|
Elements items = doc.select("div.item"); |
||||
|
|
||||
|
for (Element item : items) { |
||||
|
try { |
||||
|
Movie movie = new Movie(); |
||||
|
|
||||
|
String rankStr = item.select("em").text(); |
||||
|
movie.setRank(Integer.parseInt(rankStr)); |
||||
|
|
||||
|
Element titleElement = item.select("span.title").first(); |
||||
|
if (titleElement != null) { |
||||
|
movie.setName(titleElement.text()); |
||||
|
} |
||||
|
|
||||
|
String ratingStr = item.select("span.rating_num").text(); |
||||
|
if (!ratingStr.isEmpty()) { |
||||
|
movie.setRating(Double.parseDouble(ratingStr)); |
||||
|
} |
||||
|
|
||||
|
String ratingCountStr = item.select("div.star span").last().text(); |
||||
|
if (ratingCountStr != null && ratingCountStr.contains("人评价")) { |
||||
|
String num = ratingCountStr.replace("人评价", "").trim(); |
||||
|
movie.setRatingCount(parseNumber(num)); |
||||
|
} |
||||
|
|
||||
|
String info = item.select("div.bd p").first().text(); |
||||
|
if (info != null) { |
||||
|
String[] parts = info.split(" / "); |
||||
|
if (parts.length > 0) { |
||||
|
String firstPart = parts[0]; |
||||
|
if (firstPart.contains("导演: ")) { |
||||
|
movie.setDirector(firstPart.replace("导演: ", "").trim()); |
||||
|
} |
||||
|
for (String part : parts) { |
||||
|
if (part.matches("\\d{4}") || part.matches("\\d{4}.*")) { |
||||
|
movie.setYear(part.trim().split("\\s+")[0]); |
||||
|
break; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
String link = item.select("div.hd a").attr("href"); |
||||
|
movie.setUrl(link); |
||||
|
|
||||
|
String imgUrl = item.select("div.pic img").attr("src"); |
||||
|
movie.setImageUrl(imgUrl); |
||||
|
|
||||
|
movies.add(movie); |
||||
|
} catch (Exception e) { |
||||
|
// skip invalid item
|
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return movies; |
||||
|
} |
||||
|
|
||||
|
private Integer parseNumber(String str) { |
||||
|
try { |
||||
|
if (str.contains("万")) { |
||||
|
return (int) (Double.parseDouble(str.replace("万", "")) * 10000); |
||||
|
} |
||||
|
return Integer.parseInt(str.replace(",", "")); |
||||
|
} catch (NumberFormatException e) { |
||||
|
return null; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,94 @@ |
|||||
|
package strategy.impl; |
||||
|
|
||||
|
import model.Movie; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import strategy.AbstractCrawlerStrategy; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class MaoyanStrategy extends AbstractCrawlerStrategy { |
||||
|
private static final String NAME = "猫眼电影Top100"; |
||||
|
private static final String BASE_URL = "https://maoyan.com/board/4"; |
||||
|
|
||||
|
public MaoyanStrategy() { |
||||
|
this.delayMs = 1500; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return NAME; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getBaseUrl() { |
||||
|
return BASE_URL; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<String> getPageUrls() { |
||||
|
List<String> urls = new ArrayList<>(); |
||||
|
for (int i = 0; i < 10; i++) { |
||||
|
urls.add(BASE_URL + "?offset=" + (i * 10)); |
||||
|
} |
||||
|
return urls; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Movie> parseMovies(String htmlContent) { |
||||
|
List<Movie> movies = new ArrayList<>(); |
||||
|
Document doc = Jsoup.parse(htmlContent); |
||||
|
Elements items = doc.select("dl.board-wrapper dd"); |
||||
|
|
||||
|
for (Element item : items) { |
||||
|
try { |
||||
|
Movie movie = new Movie(); |
||||
|
|
||||
|
String rankStr = item.select("i.board-index").text(); |
||||
|
movie.setRank(Integer.parseInt(rankStr)); |
||||
|
|
||||
|
String name = item.select("p.name a").text(); |
||||
|
movie.setName(name); |
||||
|
|
||||
|
String ratingStr = item.select("i.integer").text() + |
||||
|
item.select("i.fraction").text(); |
||||
|
if (!ratingStr.isEmpty()) { |
||||
|
movie.setRating(Double.parseDouble(ratingStr)); |
||||
|
} |
||||
|
|
||||
|
String actors = item.select("p.star").text(); |
||||
|
if (actors != null && actors.contains("主演:")) { |
||||
|
movie.setActors(actors.replace("主演:", "").trim()); |
||||
|
} |
||||
|
|
||||
|
String releaseTime = item.select("p.releasetime").text(); |
||||
|
if (releaseTime != null && releaseTime.contains("上映时间:")) { |
||||
|
String timeStr = releaseTime.replace("上映时间:", "").trim(); |
||||
|
if (timeStr.matches("\\d{4}.*")) { |
||||
|
movie.setYear(timeStr.substring(0, 4)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
String link = item.select("p.name a").attr("href"); |
||||
|
if (!link.isEmpty()) { |
||||
|
movie.setUrl("https://maoyan.com" + link); |
||||
|
} |
||||
|
|
||||
|
String imgUrl = item.select("img.board-img").attr("data-src"); |
||||
|
if (imgUrl.isEmpty()) { |
||||
|
imgUrl = item.select("img.board-img").attr("src"); |
||||
|
} |
||||
|
movie.setImageUrl(imgUrl); |
||||
|
|
||||
|
movies.add(movie); |
||||
|
} catch (Exception e) { |
||||
|
// skip invalid item
|
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return movies; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,105 @@ |
|||||
|
package strategy.impl; |
||||
|
|
||||
|
import model.Movie; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import strategy.AbstractCrawlerStrategy; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class RottenTomatoesStrategy extends AbstractCrawlerStrategy { |
||||
|
private static final String NAME = "烂番茄Top100"; |
||||
|
private static final String BASE_URL = "https://www.rottentomatoes.com/top/bestofrt/"; |
||||
|
|
||||
|
public RottenTomatoesStrategy() { |
||||
|
this.delayMs = 2000; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return NAME; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getBaseUrl() { |
||||
|
return BASE_URL; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<String> getPageUrls() { |
||||
|
List<String> urls = new ArrayList<>(); |
||||
|
urls.add(BASE_URL); |
||||
|
return urls; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Movie> parseMovies(String htmlContent) { |
||||
|
List<Movie> movies = new ArrayList<>(); |
||||
|
Document doc = Jsoup.parse(htmlContent); |
||||
|
Elements items = doc.select("table.table tr"); |
||||
|
|
||||
|
int rank = 0; |
||||
|
for (Element item : items) { |
||||
|
try { |
||||
|
Element rankElement = item.selectFirst("td.rank"); |
||||
|
if (rankElement == null) continue; |
||||
|
|
||||
|
Movie movie = new Movie(); |
||||
|
|
||||
|
String rankStr = rankElement.text(); |
||||
|
if (!rankStr.isEmpty()) { |
||||
|
movie.setRank(Integer.parseInt(rankStr)); |
||||
|
} else { |
||||
|
movie.setRank(++rank); |
||||
|
} |
||||
|
|
||||
|
Element titleElement = item.selectFirst("td.title a"); |
||||
|
if (titleElement != null) { |
||||
|
String fullTitle = titleElement.text(); |
||||
|
if (fullTitle.contains("(") && fullTitle.contains(")")) { |
||||
|
int start = fullTitle.lastIndexOf("("); |
||||
|
int end = fullTitle.lastIndexOf(")"); |
||||
|
if (start > 0 && end > start) { |
||||
|
String yearStr = fullTitle.substring(start + 1, end); |
||||
|
if (yearStr.matches("\\d{4}")) { |
||||
|
movie.setYear(yearStr); |
||||
|
} |
||||
|
movie.setName(fullTitle.substring(0, start).trim()); |
||||
|
} else { |
||||
|
movie.setName(fullTitle); |
||||
|
} |
||||
|
} else { |
||||
|
movie.setName(fullTitle); |
||||
|
} |
||||
|
|
||||
|
String link = titleElement.attr("href"); |
||||
|
if (!link.isEmpty()) { |
||||
|
if (link.startsWith("/")) { |
||||
|
movie.setUrl("https://www.rottentomatoes.com" + link); |
||||
|
} else { |
||||
|
movie.setUrl(link); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
Element scoreElement = item.selectFirst("td.score span.tMeterScore"); |
||||
|
if (scoreElement != null) { |
||||
|
String scoreStr = scoreElement.text(); |
||||
|
if (scoreStr.matches("\\d+%")) { |
||||
|
double rating = Double.parseDouble(scoreStr.replace("%", "")) / 10; |
||||
|
movie.setRating(Math.round(rating * 10) / 10.0); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
movies.add(movie); |
||||
|
} catch (Exception e) { |
||||
|
// skip invalid item
|
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return movies; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,54 @@ |
|||||
|
package util; |
||||
|
|
||||
|
import java.time.LocalDateTime; |
||||
|
import java.time.format.DateTimeFormatter; |
||||
|
|
||||
|
/**
 * Minimal console logger. Each entry is written to {@code System.out} as
 * {@code [timestamp] [LEVEL] [thread] message}; entries below the configured
 * level are dropped.
 */
public class Logger {

    private static final DateTimeFormatter TIMESTAMP_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    /** Severities in ascending order. */
    public enum Level {
        DEBUG, INFO, WARN, ERROR
    }

    private static Level currentLevel = Level.INFO;

    /**
     * Sets the minimum severity that is printed.
     *
     * @param level new threshold; must not be {@code null}
     */
    public static void setLevel(Level level) {
        currentLevel = level;
    }

    private static void log(Level level, String message) {
        if (level.compareTo(currentLevel) >= 0) {
            String timestamp = LocalDateTime.now().format(TIMESTAMP_FORMAT);
            String threadName = Thread.currentThread().getName();
            System.out.printf("[%s] [%s] [%s] %s%n",
                    timestamp, level, threadName, message);
        }
    }

    public static void debug(String message) {
        log(Level.DEBUG, message);
    }

    public static void info(String message) {
        log(Level.INFO, message);
    }

    public static void warn(String message) {
        log(Level.WARN, message);
    }

    public static void error(String message) {
        log(Level.ERROR, message);
    }

    /**
     * Logs {@code message} followed by the message of {@code e}. The stack
     * trace is printed only while the level is {@link Level#DEBUG}.
     *
     * @param message description of the failure
     * @param e       cause; when {@code null} only {@code message} is logged
     */
    public static void error(String message, Throwable e) {
        if (e == null) {
            log(Level.ERROR, message);
            return;
        }
        log(Level.ERROR, message + " - " + e.getMessage());
        if (currentLevel == Level.DEBUG) {
            e.printStackTrace();
        }
    }
}
||||
@ -0,0 +1,109 @@ |
|||||
|
package view; |
||||
|
|
||||
|
import exception.CrawlerResult; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class CrawlerView { |
||||
|
|
||||
|
public void showWelcome() { |
||||
|
System.out.println(); |
||||
|
System.out.println("╔════════════════════════════════════════════════╗"); |
||||
|
System.out.println("║ Java 爬虫管理系统 v3.0 (电影+图书) ║"); |
||||
|
System.out.println("╚════════════════════════════════════════════════╝"); |
||||
|
System.out.println(); |
||||
|
} |
||||
|
|
||||
|
public void showHelp() { |
||||
|
System.out.println(); |
||||
|
System.out.println("═══════════════ 使用帮助 ═══════════════"); |
||||
|
System.out.println(" java -jar crawler.jar <命令> [参数]"); |
||||
|
System.out.println(); |
||||
|
System.out.println("可用命令:"); |
||||
|
System.out.println(" list - 列出所有爬虫"); |
||||
|
System.out.println(" run <爬虫名> - 运行指定爬虫"); |
||||
|
System.out.println(" run-all - 运行所有爬虫"); |
||||
|
System.out.println(" stats - 显示统计信息"); |
||||
|
System.out.println(" clear - 清空所有数据"); |
||||
|
System.out.println(" help - 显示帮助信息"); |
||||
|
System.out.println(); |
||||
|
System.out.println("电影爬虫:"); |
||||
|
System.out.println(" - 豆瓣电影Top250"); |
||||
|
System.out.println(" - 猫眼电影Top100"); |
||||
|
System.out.println(" - RottenTomatoes"); |
||||
|
System.out.println(); |
||||
|
System.out.println("图书爬虫:"); |
||||
|
System.out.println(" - 豆瓣读书Top250"); |
||||
|
System.out.println(" - BooksToScrape"); |
||||
|
System.out.println(); |
||||
|
System.out.println("示例:"); |
||||
|
System.out.println(" java -jar crawler.jar list"); |
||||
|
System.out.println(" java -jar crawler.jar run 豆瓣电影Top250"); |
||||
|
System.out.println(" java -jar crawler.jar run 豆瓣读书Top250"); |
||||
|
System.out.println(" java -jar crawler.jar run BooksToScrape"); |
||||
|
System.out.println(" java -jar crawler.jar run-all"); |
||||
|
System.out.println("═══════════════════════════════════════════"); |
||||
|
System.out.println(); |
||||
|
} |
||||
|
|
||||
|
public void showCrawlerList(List<String> crawlers) { |
||||
|
System.out.println(); |
||||
|
System.out.println("═══════════════ 爬虫列表 ═══════════════"); |
||||
|
for (int i = 0; i < crawlers.size(); i++) { |
||||
|
System.out.println(" " + (i + 1) + ". " + crawlers.get(i)); |
||||
|
} |
||||
|
System.out.println("═══════════════════════════════════════════"); |
||||
|
System.out.println(); |
||||
|
} |
||||
|
|
||||
|
public void showResult(CrawlerResult result) { |
||||
|
System.out.println(); |
||||
|
if (result.isSuccess()) { |
||||
|
System.out.println("╔════════════════════════════════════════╗"); |
||||
|
System.out.printf("║ SUCCESS: %-30s ║%n", result.getSource()); |
||||
|
System.out.printf("║ 数据条数: %-28d ║%n", result.getDataCount()); |
||||
|
System.out.printf("║ 耗时: %-30dms ║%n", result.getElapsedTime()); |
||||
|
System.out.println("╚════════════════════════════════════════╝"); |
||||
|
} else { |
||||
|
System.out.println("╔════════════════════════════════════════╗"); |
||||
|
System.out.printf("║ FAILURE: [%s] %-20s ║%n", result.getErrorCode(), result.getSource()); |
||||
|
System.out.printf("║ 错误信息: %-28s ║%n", result.getMessage()); |
||||
|
System.out.println("╚════════════════════════════════════════╝"); |
||||
|
} |
||||
|
System.out.println(); |
||||
|
} |
||||
|
|
||||
|
public void showResults(List<CrawlerResult> results) { |
||||
|
System.out.println(); |
||||
|
System.out.println("═══════════════ 执行结果 ═══════════════"); |
||||
|
|
||||
|
int successCount = 0; |
||||
|
int totalData = 0; |
||||
|
|
||||
|
for (CrawlerResult result : results) { |
||||
|
System.out.println(result.toString()); |
||||
|
if (result.isSuccess()) { |
||||
|
successCount++; |
||||
|
totalData += result.getDataCount(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
System.out.println("─────────────────────────────────────────"); |
||||
|
System.out.printf(" 成功: %d/%d | 总数据: %d 条%n", |
||||
|
successCount, results.size(), totalData); |
||||
|
System.out.println("═══════════════════════════════════════════"); |
||||
|
System.out.println(); |
||||
|
} |
||||
|
|
||||
|
public void showError(String message) { |
||||
|
System.out.println(); |
||||
|
System.out.println("╔════════════════════════════════════════╗"); |
||||
|
System.out.println("║ 错误信息 ║"); |
||||
|
System.out.printf("║ %-36s ║%n", message); |
||||
|
System.out.println("╚════════════════════════════════════════╝"); |
||||
|
System.out.println(); |
||||
|
} |
||||
|
|
||||
|
public void showMessage(String message) { |
||||
|
System.out.println(message); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,88 @@ |
|||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import java.io.FileWriter; |
||||
|
import java.io.IOException; |
||||
|
|
||||
|
// 抽象父类:封装通用爬虫逻辑
|
||||
|
abstract class BaseCrawler { |
||||
|
// 封装属性
|
||||
|
private String baseUrl; |
||||
|
|
||||
|
public BaseCrawler(String baseUrl) { |
||||
|
this.baseUrl = baseUrl; |
||||
|
} |
||||
|
|
||||
|
// 封装:获取网页文档
|
||||
|
protected Document getDoc(String url) throws IOException { |
||||
|
return Jsoup.connect(url) |
||||
|
.userAgent("Mozilla/5.0") |
||||
|
.timeout(8000) |
||||
|
.get(); |
||||
|
} |
||||
|
|
||||
|
// 抽象方法:交给子类实现(多态基础)
|
||||
|
public abstract void parse(Document doc, FileWriter writer) throws IOException; |
||||
|
|
||||
|
// 封装:统一执行入口
|
||||
|
public void start(FileWriter writer) { |
||||
|
try { |
||||
|
for (int i = 0; i < 10; i++) { |
||||
|
int start = i * 25; |
||||
|
String url = baseUrl + "?start=" + start; |
||||
|
System.out.println("正在爬取第 " + (i + 1) + " 页"); |
||||
|
|
||||
|
Document doc = getDoc(url); |
||||
|
parse(doc, writer); // 多态:调用子类的parse
|
||||
|
Thread.sleep(1000); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
e.printStackTrace(); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 子类:继承父类,实现豆瓣电影解析
|
||||
|
class DoubanCrawler extends BaseCrawler { |
||||
|
|
||||
|
public DoubanCrawler() { |
||||
|
super("https://movie.douban.com/top250"); |
||||
|
} |
||||
|
|
||||
|
// 重写方法 → 多态
|
||||
|
@Override |
||||
|
public void parse(Document doc, FileWriter writer) throws IOException { |
||||
|
Elements items = doc.select("div.item"); |
||||
|
|
||||
|
for (Element item : items) { |
||||
|
String rank = item.select("em").text(); |
||||
|
String name = item.select("span.title").first().text(); |
||||
|
String score = item.select("span.rating_num").text(); |
||||
|
|
||||
|
String line = "排名:" + rank + " 电影:" + name + " 评分:" + score; |
||||
|
System.out.println(line); |
||||
|
writer.write(line + "\r\n"); // 写入文件
|
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 主类
|
||||
|
public class TestMain { |
||||
|
public static void main(String[] args) { |
||||
|
try { |
||||
|
// 直接写入桌面,好找!
|
||||
|
FileWriter writer = new FileWriter("douban_top250.txt"); |
||||
|
|
||||
|
// 多态:父类引用 指向 子类对象
|
||||
|
BaseCrawler crawler = new DoubanCrawler(); |
||||
|
|
||||
|
crawler.start(writer); |
||||
|
|
||||
|
writer.close(); |
||||
|
System.out.println("===== 全部爬完,文件已保存到桌面 ====="); |
||||
|
} catch (IOException e) { |
||||
|
e.printStackTrace(); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
Binary file not shown.
@ -0,0 +1,27 @@ |
|||||
|
import java.util.Scanner; |
||||
|
/**
 * Console BMI calculator: reads height (metres) and weight (kilograms) from
 * standard input and prints the BMI together with its category.
 */
public class BMICalculator {

    /**
     * Reads the two values, rejects non-positive input and prints the result.
     * Categories: below 18.5 underweight, below 24 normal, below 28
     * overweight, otherwise obese.
     */
    public static void main(String[] args) {
        try (Scanner scanner = new Scanner(System.in)) {
            System.out.print("请输入身高(米):");
            double height = scanner.nextDouble();

            System.out.print("请输入体重(千克):");
            double weight = scanner.nextDouble();

            // Written as !(x > 0) so that NaN is rejected too; a zero height
            // would otherwise yield an infinite BMI.
            if (!(height > 0) || !(weight > 0)) {
                System.out.println("身高和体重必须为正数");
                return;
            }

            double bmi = weight / (height * height);
            System.out.printf("你的 BMI 值为:%.2f%n", bmi);

            if (bmi < 18.5) {
                System.out.println("体重过轻");
            } else if (bmi < 24) {
                System.out.println("正常范围");
            } else if (bmi < 28) {
                System.out.println("超重");
            } else {
                System.out.println("肥胖");
            }
        }
    }
}
||||
@ -0,0 +1,4 @@ |
|||||
|
package PACKAGE_NAME; |
||||
|
|
||||
|
/** Empty placeholder class; it declares no members. */
public class ShapeTest {
    // intentionally empty
}
||||
@ -0,0 +1,4 @@ |
|||||
|
package PACKAGE_NAME; |
||||
|
|
||||
|
/** Empty placeholder class; it declares no members. */
public class AnimalTest {
    // intentionally empty
}
||||
@ -0,0 +1,4 @@ |
|||||
|
package PACKAGE_NAME; |
||||
|
|
||||
|
/** Empty placeholder class; it declares no members. */
public class DoubanMovieCrawler {
    // intentionally empty
}
||||
Loading…
Reference in new issue