Browse Source

202506010204-孟鑫垚-期末实验报告

main
Mengxinyao 3 weeks ago
parent
commit
00c1146fcb
  1. BIN
      project/.DS_Store
  2. BIN
      project/my-crawler/.DS_Store
  3. 10
      project/my-crawler/.idea/.gitignore
  4. 13
      project/my-crawler/.idea/compiler.xml
  5. 7
      project/my-crawler/.idea/encodings.xml
  6. 20
      project/my-crawler/.idea/jarRepositories.xml
  7. 12
      project/my-crawler/.idea/misc.xml
  8. BIN
      project/my-crawler/202506010204-孟鑫垚-期末实验报告.docx
  9. BIN
      project/my-crawler/data/.DS_Store
  10. 20
      project/my-crawler/data/articles_20260531_003057.txt
  11. 21
      project/my-crawler/data/articles_20260531_004841.txt
  12. 21
      project/my-crawler/data/articles_20260531_010148.txt
  13. 20
      project/my-crawler/data/articles_20260531_105816.txt
  14. 31
      project/my-crawler/data/index.txt
  15. 50
      project/my-crawler/pom.xml
  16. BIN
      project/my-crawler/src/.DS_Store
  17. BIN
      project/my-crawler/src/main/.DS_Store
  18. BIN
      project/my-crawler/src/main/java/.DS_Store
  19. BIN
      project/my-crawler/src/main/java/com/.DS_Store
  20. 63
      project/my-crawler/src/main/java/com/crawler/App.java
  21. 7
      project/my-crawler/src/main/java/com/crawler/command/Command.java
  22. 41
      project/my-crawler/src/main/java/com/crawler/command/CrawlCommand.java
  23. 31
      project/my-crawler/src/main/java/com/crawler/command/ExitCommand.java
  24. 26
      project/my-crawler/src/main/java/com/crawler/command/HelpCommand.java
  25. 26
      project/my-crawler/src/main/java/com/crawler/command/ListCommand.java
  26. 26
      project/my-crawler/src/main/java/com/crawler/command/LoadCommand.java
  27. 26
      project/my-crawler/src/main/java/com/crawler/command/SaveCommand.java
  28. 67
      project/my-crawler/src/main/java/com/crawler/controller/CrawlerController.java
  29. 11
      project/my-crawler/src/main/java/com/crawler/exception/CrawlerException.java
  30. 11
      project/my-crawler/src/main/java/com/crawler/exception/NetworkException.java
  31. 11
      project/my-crawler/src/main/java/com/crawler/exception/ParseException.java
  32. 11
      project/my-crawler/src/main/java/com/crawler/exception/UrlFormatException.java
  33. 27
      project/my-crawler/src/main/java/com/crawler/factory/StrategyFactory.java
  34. 104
      project/my-crawler/src/main/java/com/crawler/model/Article.java
  35. 18
      project/my-crawler/src/main/java/com/crawler/repository/ArticleRepository.java
  36. 78
      project/my-crawler/src/main/java/com/crawler/repository/InMemoryArticleRepository.java
  37. 76
      project/my-crawler/src/main/java/com/crawler/strategy/BlogCrawlStrategy.java
  38. 9
      project/my-crawler/src/main/java/com/crawler/strategy/CrawlStrategy.java
  39. 170
      project/my-crawler/src/main/java/com/crawler/strategy/DoubanTop250Strategy.java
  40. 75
      project/my-crawler/src/main/java/com/crawler/strategy/JsoupCrawlStrategy.java
  41. 76
      project/my-crawler/src/main/java/com/crawler/strategy/NewsCrawlStrategy.java
  42. 54
      project/my-crawler/src/main/java/com/crawler/util/ColorUtil.java
  43. 193
      project/my-crawler/src/main/java/com/crawler/util/DataPersistence.java
  44. 101
      project/my-crawler/src/main/java/com/crawler/view/ConsoleView.java
  45. BIN
      project/my-crawler/target/classes/.DS_Store
  46. BIN
      project/my-crawler/target/classes/com/.DS_Store
  47. BIN
      project/my-crawler/target/classes/com/crawler/App.class
  48. BIN
      project/my-crawler/target/classes/com/crawler/command/Command.class
  49. BIN
      project/my-crawler/target/classes/com/crawler/command/CrawlCommand.class
  50. BIN
      project/my-crawler/target/classes/com/crawler/command/ExitCommand.class
  51. BIN
      project/my-crawler/target/classes/com/crawler/command/HelpCommand.class
  52. BIN
      project/my-crawler/target/classes/com/crawler/command/ListCommand.class
  53. BIN
      project/my-crawler/target/classes/com/crawler/command/LoadCommand.class
  54. BIN
      project/my-crawler/target/classes/com/crawler/command/SaveCommand.class
  55. BIN
      project/my-crawler/target/classes/com/crawler/controller/CrawlerController.class
  56. BIN
      project/my-crawler/target/classes/com/crawler/exception/CrawlerException.class
  57. BIN
      project/my-crawler/target/classes/com/crawler/exception/NetworkException.class
  58. BIN
      project/my-crawler/target/classes/com/crawler/exception/ParseException.class
  59. BIN
      project/my-crawler/target/classes/com/crawler/exception/UrlFormatException.class
  60. BIN
      project/my-crawler/target/classes/com/crawler/factory/StrategyFactory.class
  61. BIN
      project/my-crawler/target/classes/com/crawler/model/Article.class
  62. BIN
      project/my-crawler/target/classes/com/crawler/repository/ArticleRepository.class
  63. BIN
      project/my-crawler/target/classes/com/crawler/repository/InMemoryArticleRepository.class
  64. BIN
      project/my-crawler/target/classes/com/crawler/strategy/BlogCrawlStrategy.class
  65. BIN
      project/my-crawler/target/classes/com/crawler/strategy/CrawlStrategy.class
  66. BIN
      project/my-crawler/target/classes/com/crawler/strategy/DoubanTop250Strategy.class
  67. BIN
      project/my-crawler/target/classes/com/crawler/strategy/JsoupCrawlStrategy.class
  68. BIN
      project/my-crawler/target/classes/com/crawler/strategy/NewsCrawlStrategy.class
  69. BIN
      project/my-crawler/target/classes/com/crawler/util/ColorUtil.class
  70. BIN
      project/my-crawler/target/classes/com/crawler/util/DataPersistence.class
  71. BIN
      project/my-crawler/target/classes/com/crawler/view/ConsoleView.class

BIN
project/.DS_Store

Binary file not shown.

BIN
project/my-crawler/.DS_Store

Binary file not shown.

10
project/my-crawler/.idea/.gitignore

@ -0,0 +1,10 @@
# 默认忽略的文件
/shelf/
/workspace.xml
# 已忽略包含查询文件的默认文件夹
/queries/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# 基于编辑器的 HTTP 客户端请求
/httpRequests/

13
project/my-crawler/.idea/compiler.xml

@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CompilerConfiguration">
<annotationProcessing>
<profile name="Maven default annotation processors profile" enabled="true">
<sourceOutputDir name="target/generated-sources/annotations" />
<sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
<outputRelativeToContentRoot value="true" />
<module name="my-crawler" />
</profile>
</annotationProcessing>
</component>
</project>

7
project/my-crawler/.idea/encodings.xml

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding">
<file url="file://$PROJECT_DIR$/src/main/java" charset="UTF-8" />
<file url="file://$PROJECT_DIR$/src/main/resources" charset="UTF-8" />
</component>
</project>

20
project/my-crawler/.idea/jarRepositories.xml

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="RemoteRepositoriesConfiguration">
<remote-repository>
<option name="id" value="central" />
<option name="name" value="Central Repository" />
<option name="url" value="https://repo.maven.apache.org/maven2" />
</remote-repository>
<remote-repository>
<option name="id" value="central" />
<option name="name" value="Maven Central repository" />
<option name="url" value="https://repo1.maven.org/maven2" />
</remote-repository>
<remote-repository>
<option name="id" value="jboss.community" />
<option name="name" value="JBoss Community repository" />
<option name="url" value="https://repository.jboss.org/nexus/content/repositories/public/" />
</remote-repository>
</component>
</project>

12
project/my-crawler/.idea/misc.xml

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ExternalStorageConfigurationManager" enabled="true" />
<component name="MavenProjectsManager">
<option name="originalFiles">
<list>
<option value="$PROJECT_DIR$/pom.xml" />
</list>
</option>
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_21" default="true" project-jdk-name="21" project-jdk-type="JavaSDK" />
</project>

BIN
project/my-crawler/202506010204-孟鑫垚-期末实验报告.docx

Binary file not shown.

BIN
project/my-crawler/data/.DS_Store

Binary file not shown.

20
project/my-crawler/data/articles_20260531_003057.txt

@ -0,0 +1,20 @@
========================================
文章数据批次保存
========================================
保存时间: 2026-05-31 00:30:57
文章数量: 1
========================================
----------------------------------------
文章 1
----------------------------------------
ID: 1
标题: 豆瓣音乐 Top 250
URL: https://music.douban.com/top250
来源: https://music.douban.com/top250
爬取时间: 2026-05-31 00:30:52
内容:
2025年度榜单 豆瓣音乐 Top 250 We Sing. We Dance. We Steal Things. Jason Mraz / 2008-05-13 / Import / Audio CD / 民谣 9.1 ( 117008人评价 ) Viva La Vida Death And All His Friends Coldplay / 2008-06-17 / 专辑 / CD / 摇滚 9.0 ( 121109人评价 ) 华丽的冒险 華麗的冒險 陈绮贞 / 2005-09-23 / 专辑 / CD / 流行 9.0 ( 93952人评价 ) 范特西 Fantasy 周杰伦 / 2001-09-14 / 专辑 / CD / 流行 9.5 ( 190826人评价 ) 后青春期的诗 後。青春期的詩 五月天 / 2008-10-23 / 专辑 / CD / 摇滚 9.0 ( 97666人评价 ) 是时候 It&#39;s Time 孙燕姿 / 2011-03-08 / 专辑 / CD / 流行 8.7 ( 84411人评价 ) Lenka Lenka / 2008-09-23 / 专辑 / Audio CD / 流行 8.6 ( 83946人评价 ) Start from Here 从这里开始 王若琳 / 2008-01-11 / 专辑 / CD / 爵士 8.8 ( 77184人评价 ) 旅行的意义 陈绮贞 / 2004-02-02 / 单曲 / CD / 流行 9.1 ( 101953人评价 ) 太阳 Immortal 陈绮贞 / 2009-01-22 / 专辑 / CD / 流行 8.8 ( 79731人评价 ) Once (Soundtrack) Once / 电影《曾经》原声大碟 Glen Hansard,Marketa Irglova / 2007-05-22 / Soundtrack / CD / 原声 9.2 ( 73664人评价 ) Not Going Anywhere 守候 Keren Ann / 2004-08-24 / Import / Audio CD / 民谣 8.9 ( 62733人评价 ) American Idiot Green Day / 2004-09-21 / Explicit Lyrics / Audio CD / 摇滚 9.0 ( 75471人评价 ) 思念是一种病 OK 张震岳 / 2007-07-06 / 专辑 / CD / 流行 8.9 ( 85810人评价 ) 無與倫比的美麗 无与伦比的美丽 苏打绿 / 2007-11-02 / 专辑 / CD / 流行 8.8 ( 92327人评价 ) 亲爱的...我还不知道 親愛的…我還不知道 张悬 / 2007-07-20 / 专辑 / CD / 流行 8.8 ( 69689人评价 ) 城市 The City 张悬 / 2009-05-22 / 专辑 / CD / 摇滚 8.7 ( 69137人评价 ) O Damien Rice / 2002-02-01 / 专辑 / CD / 流行 9.1 ( 53450人评价 ) Wake Me Up When September Ends 九月结束的时候叫醒我 Green Day / 2005-06-13 / 单曲 / CD / 摇滚 9.4 ( 55101人评价 ) 叶惠美 葉惠美 周杰伦 / 2003-07-31 / 专辑 / CD / 流行 9.3 ( 118425人评价 ) 七里香 Common Jasmin Orange 周杰伦 / 2004 / 专辑 / CD / 流行 9.2 ( 179593人评价 ) 21 Adele / 2011-01-24 / 专辑 / CD / 流行 9.3 ( 77434人评价 ) My Life Will... 张悬 / 2006-06-09 / 专辑 / CD / 流行 8.8 ( 61363人评价 ) 寓言 王菲 / 2000 / 专辑 / CD / 流行 9.4 ( 73631人评价 ) 你在煩惱什麼 你在烦恼什么 苏打绿 / 2011-11-11 / 专辑 / CD / 流行 9.0 ( 59752人评价 ) &lt

21
project/my-crawler/data/articles_20260531_004841.txt

File diff suppressed because one or more lines are too long

21
project/my-crawler/data/articles_20260531_010148.txt

@ -0,0 +1,21 @@
========================================
文章数据批次保存
========================================
保存时间: 2026-05-31 01:01:48
文章数量: 1
========================================
----------------------------------------
文章 1
----------------------------------------
ID: 1
标题: 长沙市, 湖南省月度天气预报 - weather.com
URL: https://weather.com/zh-CN/weather/monthly/l/2add4f77b58b85fbed59ae07151489bce5504e7ae027a17fa94ad0149eceef0f
来源: https://weather.com/zh-CN/weather/monthly/l/2add4f77b58b85fbed59ae07151489bce5504e7ae027a17fa94ad0149eceef0f
爬取时间: 2026-05-31 01:01:32
内容:
长沙市, 湖南省月度天气预报 - weather.com Hamburger The Weather Company Today Moon Phase - Day 8 26 Not Available -- -- Moon Phase - Day 9 27 Not Available -- -- Moon Phase - Day 10 28 Not Available -- -- Moon Phase - Day 11 29 Not Available -- -- Moon Phase - Day 12 30 Not Available -- -- Moon Phase - Day 13 1 Not Available -- -- Moon Phase - Day 14 2 Partly Cloudy Day 27 ° 19 ° Moon Phase - Day 15 3 Scattered Showers Day 26 ° 15 ° Moon Phase - Day 16 4 Partly Cloudy Day 25 ° 15 ° Moon Phase - Day 17 5 Partly Cloudy Day 28 ° 16 ° Moon Phase - Day 19 6 Partly Cloudy Day 30 ° 18 ° Moon Phase - Day 20 7 Partly Cloudy Day 32 ° 21 ° Moon Phase - Day 21 8 Showers 22 ° 18 ° Moon Phase - Day 22 9 Showers 21 ° 15 ° Moon Phase - Day 23 10 Partly Cloudy Day 24 ° 16 ° Moon Phase - Day 24 11 Partly Cloudy Day 27 ° 18 ° Moon Phase - Day 25 12 Partly Cloudy Day 30 ° 20 ° Moon Phase - Day 26 13 Showers 27 ° 19 ° Moon Phase - Day 27 14 Showers 23 ° 20 ° Moon Phase - Day 28 15 Cloudy 28 ° 21 ° Moon Phase - Day 29 16 Partly Cloudy Day 31 ° 23 ° Moon Phase - Day 1 17 Partly Cloudy Day 29 ° 22 ° Moon Phase - Day 2 18 Partly Cloudy Day 31 ° 23 ° Moon Phase - Day 3 19 Partly Cloudy Day 30 ° 21 ° Moon Phase - Day 4 20 Showers 23 ° 19 ° Moon Phase - Day 5 21 Showers 23 ° 20 ° Moon Phase - Day 5 22 Showers 26 ° 22 ° Moon Phase - Day 6 23 Scattered Showers Day 29 ° 23 ° Moon Phase - Day 7 24 Scattered Showers Day 30 ° 25 ° Moon Phase - Day 8 25 Windy 33 ° 27 ° Moon Phase - Day 9 26 Partly Cloudy Day 35 ° 26 ° Moon Phase - Day 10 27 Partly Cloudy Day 30 ° 22 ° Moon Phase - Day 11 28 Showers 27 ° 22 ° Moon Phase - Day 12 29 Partly Cloudy Day 29 ° 22 ° Moon Phase - Day 13 30 Showers 24 ° 19 ° Moon Phase - Day 14 31 Partly Cloudy Day 31 ° 22 ° Moon Phase - Day 15 1 Mostly Clear Day 33 ° 23 ° Moon Phase - Day 16 2 Mostly Clear Day 34 ° 25 ° Moon Phase - Day 17 3 Scattered Thunderstorms 30 ° 24 ° Moon Phase - Day 18 4 Scattered Showers Day 30 ° 24 ° Moon Phase - Day 19 5 Showers 31 ° 24 ° Moon Phase - Day 20 
6 Scattered Thunderstorms 31 ° 24 ° Close 白天 31 ° Partly Cloudy Day Rain drop 3% 西 6   公里/小时 少云。 最高 31°C。 微风且风向多变。 Record High 最高纪录 36 ° Average High 平均最高 28 ° Sunrise 日出 5:31 Sunset 日落 19:19 夜间 22 ° Clear Night Rain drop 4% 北 6   公里/小时 大部晴朗。 最低 22°C。 微风且风向多变。 Record Low 最低记录 18 ° Average Low 平均最低 21 ° Moonrise 月出 19:32 Moonset 月落 4:57 Moon Phase - Day 14 满月 Moon Phase - Day 21 7 Showers 30 ° 23 ° Moon Phase - Day 22 8 Showers 29 ° 22 ° Moon Phase - Day 23 9 Scattered Showers Day 27 ° 21 ° Moon Phase - Day 24 10 Scattered Showers Day 28 ° 22 ° Moon Phase - Day 25 11 Partly Cloudy Day 29 ° 23 ° Moon Phase - Day 26 12 Scattered Showers Day 30 ° 24 ° Moon Phase - Day 27 13 Showers 30 ° 24 ° Not Available 14 Not Available -- -- Moon Phase - Day 0 15 平均气温 -- -- Moon Phase - Day 1 16 平均气温 -- -- Moon Phase - Day 2 17 平均气温 -- -- Moon Phase - Day 3 18 平均气温 -- -- Moon Phase - Day 5 19 平均气温 -- -- Moon Phase - Day 6 20 平均气温 -- -- Moon Phase - Day 6 21 平均气温 -- -- Moon Phase - Day 7 22 平均气温 -- -- Moon Phase - Day 8 23 平均气温 -- -- Moon Phase - Day 9 24 平均气温 -- -- Not Available 25 平均气温 -- -- Not Available 26 平均气温 -- -- Not Available 27 平均气温 -- -- Not Available 28 平均气温 -- -- Not Available 29 平均气温 -- -- Not Available 30 平均气温 -- -- Not Available 1 平均气温 -- -- Not Available 2 平均气温 -- -- Not Available 3 平均气温 -- -- Not Available 4 平均气温 -- -- Advertisement Advertisement 历史记录 5月31日 高 低 降水量 平均值 28 ° C 21 ° -- 记录 34 ° ( 2011 ) 17 ° ( 1993 ) -- 历史气温状况 昨日 24 ° 19 ° 0.25 毫米 过去七天 35 ° 19 ° 6.84 当月气温 35 ° 15 ° 61.66 历史月平均气温 五月 27 ° 19 ° 201.68 六月 30 ° 23 ° 224.28 七月 33 ° 26 ° 162.81

20
project/my-crawler/data/articles_20260531_105816.txt

File diff suppressed because one or more lines are too long

31
project/my-crawler/data/index.txt

@ -0,0 +1,31 @@
========================================
文章索引
========================================
共有 1 篇文章
[1] 豆瓣音乐 Top 250
URL: https://music.douban.com/top250
文件名: article_1.txt
爬取时间: 2026-05-31 00:24:38
[保存记录] 2026-05-31 00:30:57
批次文件: articles_20260531_003057.txt
文章数量: 1
[保存记录] 2026-05-31 00:48:41
批次文件: articles_20260531_004841.txt
文章数量: 1
[保存记录] 2026-05-31 01:01:48
批次文件: articles_20260531_010148.txt
文章数量: 1
[保存记录] 2026-05-31 10:58:16
批次文件: articles_20260531_105816.txt
文章数量: 1
[保存记录] 2026-05-31 11:00:37
批次文件: articles_20260531_110037.txt
文章数量: 1

50
project/my-crawler/pom.xml

@ -0,0 +1,50 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.crawler</groupId>
<artifactId>my-crawler</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
<name>My Crawler</name>
<description>A simple web crawler application</description>
<properties>
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.16.1</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.10.1</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.3.0</version>
<configuration>
<archive>
<manifest>
<mainClass>com.crawler.App</mainClass>
</manifest>
</archive>
</configuration>
</plugin>
</plugins>
</build>
</project>

BIN
project/my-crawler/src/.DS_Store

Binary file not shown.

BIN
project/my-crawler/src/main/.DS_Store

Binary file not shown.

BIN
project/my-crawler/src/main/java/.DS_Store

Binary file not shown.

BIN
project/my-crawler/src/main/java/com/.DS_Store

Binary file not shown.

63
project/my-crawler/src/main/java/com/crawler/App.java

@ -0,0 +1,63 @@
package com.crawler;
import com.crawler.command.*;
import com.crawler.controller.CrawlerController;
import com.crawler.repository.ArticleRepository;
import com.crawler.repository.InMemoryArticleRepository;
import com.crawler.view.ConsoleView;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;
/**
 * Console entry point of the crawler.
 *
 * <p>Wires the view, the in-memory repository and the controller together,
 * registers the available commands by name and then runs a
 * read / dispatch loop until the exit command clears the running flag.
 */
public class App {
    /** Registered commands, keyed by their lower-case name. */
    private final Map<String, Command> commands = new HashMap<>();
    private final ConsoleView view;
    /** Cleared by the exit command's callback to stop the main loop. */
    private final AtomicBoolean running = new AtomicBoolean(true);

    /** Creates the collaborators and registers every supported command. */
    public App() {
        view = new ConsoleView();
        ArticleRepository repository = new InMemoryArticleRepository();
        CrawlerController controller = new CrawlerController(repository, view);

        commands.put("crawl", new CrawlCommand(controller, view));
        commands.put("list", new ListCommand(controller));
        commands.put("save", new SaveCommand(controller));
        commands.put("load", new LoadCommand(controller));
        commands.put("help", new HelpCommand(view));
        commands.put("exit", new ExitCommand(view, () -> running.set(false)));
    }

    /**
     * Shows the welcome and help screens, then reads one line at a time and
     * dispatches it to the matching command. The first token is the command
     * name (case-insensitive); at most two further tokens are passed on as
     * arguments. Any exception raised while handling a line is reported through
     * the view and the loop continues.
     */
    public void run() {
        view.displayWelcome();
        view.displayHelp();

        while (running.get()) {
            try {
                String input = view.readInput();
                if (input == null) {
                    // NOTE(review): assumes a null line means the input source is
                    // exhausted; stop here instead of failing on every iteration.
                    running.set(false);
                    break;
                }

                // Trim first: a leading blank would otherwise make split() yield
                // an empty first token and the command would never be recognised.
                input = input.trim();
                if (input.isEmpty()) {
                    continue;
                }

                String[] parts = input.split("\\s+", 3);
                // Locale.ROOT keeps the lookup stable under locales with special
                // casing rules (e.g. the Turkish dotless i).
                String commandName = parts[0].toLowerCase(java.util.Locale.ROOT);
                String[] args = parts.length > 1
                        ? java.util.Arrays.copyOfRange(parts, 1, parts.length)
                        : new String[0];

                Command command = commands.get(commandName);
                if (command != null) {
                    command.execute(args);
                } else {
                    view.displayError("Unknown command: " + commandName);
                    view.displayInfo("Type 'help' for available commands");
                }
            } catch (Exception e) {
                view.displayError("Error: " + e.getMessage());
            }
        }
    }

    /** Starts the interactive console application. */
    public static void main(String[] args) {
        App app = new App();
        app.run();
    }
}

7
project/my-crawler/src/main/java/com/crawler/command/Command.java

@ -0,0 +1,7 @@
package com.crawler.command;
/**
 * A console command that can be looked up by name and executed with the
 * arguments typed after it.
 */
public interface Command {
/**
 * Runs the command.
 *
 * @param args the tokens that followed the command name on the input line
 * @throws Exception if the command cannot complete
 */
void execute(String[] args) throws Exception;
/** @return the name of this command */
String getCommandName();
/** @return a short human-readable description of this command */
String getDescription();
}

41
project/my-crawler/src/main/java/com/crawler/command/CrawlCommand.java

@ -0,0 +1,41 @@
package com.crawler.command;
import com.crawler.controller.CrawlerController;
import com.crawler.view.ConsoleView;
/**
 * The {@code crawl} command: hands a URL and an optional strategy name to the
 * controller and reports any failure through the view.
 */
public class CrawlCommand implements Command {
    private final CrawlerController controller;
    private final ConsoleView view;

    public CrawlCommand(CrawlerController controller, ConsoleView view) {
        this.controller = controller;
        this.view = view;
    }

    @Override
    public String getCommandName() {
        return "crawl";
    }

    @Override
    public String getDescription() {
        return "Crawl a website";
    }

    /**
     * Expects {@code args[0]} to be the URL and, optionally, {@code args[1]} to
     * be the strategy name; the strategy falls back to {@code "jsoup"}.
     * Prints a usage hint when no URL is given.
     */
    @Override
    public void execute(String[] args) {
        if (args.length == 0) {
            view.displayError("Usage: crawl <url> [strategy]");
            return;
        }

        String strategy = "jsoup";
        if (args.length >= 2) {
            strategy = args[1];
        }

        try {
            controller.crawl(args[0], strategy);
        } catch (Exception failure) {
            // Failures are shown to the user rather than propagated to the caller.
            view.displayError("Crawl failed: " + failure.getMessage());
        }
    }
}

31
project/my-crawler/src/main/java/com/crawler/command/ExitCommand.java

@ -0,0 +1,31 @@
package com.crawler.command;
import com.crawler.view.ConsoleView;
/**
 * The {@code exit} command: prints the goodbye message and then invokes the
 * callback supplied at construction time, if there is one.
 */
public class ExitCommand implements Command {
    private final ConsoleView view;
    private Runnable exitCallback;

    public ExitCommand(ConsoleView view, Runnable exitCallback) {
        this.view = view;
        this.exitCallback = exitCallback;
    }

    @Override
    public String getCommandName() {
        return "exit";
    }

    @Override
    public String getDescription() {
        return "Exit the application";
    }

    /** Says goodbye first, then runs the exit callback when one was provided. */
    @Override
    public void execute(String[] args) {
        view.displayGoodbye();
        if (exitCallback == null) {
            return;
        }
        exitCallback.run();
    }
}

26
project/my-crawler/src/main/java/com/crawler/command/HelpCommand.java

@ -0,0 +1,26 @@
package com.crawler.command;
import com.crawler.view.ConsoleView;
/** The {@code help} command: asks the view to print the help text. */
public class HelpCommand implements Command {
    private final ConsoleView view;

    public HelpCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getCommandName() {
        return "help";
    }

    @Override
    public String getDescription() {
        return "Show help message";
    }

    /** Arguments are ignored. */
    @Override
    public void execute(String[] args) {
        view.displayHelp();
    }
}

26
project/my-crawler/src/main/java/com/crawler/command/ListCommand.java

@ -0,0 +1,26 @@
package com.crawler.command;
import com.crawler.controller.CrawlerController;
/** The {@code list} command: delegates to the controller's article listing. */
public class ListCommand implements Command {
    private final CrawlerController controller;

    public ListCommand(CrawlerController controller) {
        this.controller = controller;
    }

    @Override
    public String getCommandName() {
        return "list";
    }

    @Override
    public String getDescription() {
        return "List all crawled articles";
    }

    /** Arguments are ignored. */
    @Override
    public void execute(String[] args) {
        controller.listArticles();
    }
}

26
project/my-crawler/src/main/java/com/crawler/command/LoadCommand.java

@ -0,0 +1,26 @@
package com.crawler.command;
import com.crawler.controller.CrawlerController;
/** The {@code load} command: asks the controller to load the stored articles. */
public class LoadCommand implements Command {
    private final CrawlerController controller;

    public LoadCommand(CrawlerController controller) {
        this.controller = controller;
    }

    @Override
    public String getCommandName() {
        return "load";
    }

    @Override
    public String getDescription() {
        return "Load articles from data file";
    }

    /** Arguments are ignored. */
    @Override
    public void execute(String[] args) {
        controller.loadData();
    }
}

26
project/my-crawler/src/main/java/com/crawler/command/SaveCommand.java

@ -0,0 +1,26 @@
package com.crawler.command;
import com.crawler.controller.CrawlerController;
/** The {@code save} command: asks the controller to persist the current articles. */
public class SaveCommand implements Command {
    private final CrawlerController controller;

    public SaveCommand(CrawlerController controller) {
        this.controller = controller;
    }

    @Override
    public String getCommandName() {
        return "save";
    }

    @Override
    public String getDescription() {
        return "Save articles to data file";
    }

    /** Arguments are ignored. */
    @Override
    public void execute(String[] args) {
        controller.saveData();
    }
}

67
project/my-crawler/src/main/java/com/crawler/controller/CrawlerController.java

@ -0,0 +1,67 @@
package com.crawler.controller;
import java.util.List;
import com.crawler.factory.StrategyFactory;
import com.crawler.model.Article;
import com.crawler.repository.ArticleRepository;
import com.crawler.strategy.CrawlStrategy;
import com.crawler.util.DataPersistence;
import com.crawler.view.ConsoleView;
/**
 * Coordinates crawling, listing and persistence between the repository, the
 * crawl strategies, {@link DataPersistence} and the console view.
 */
public class CrawlerController {
    private final ArticleRepository repository;
    private final ConsoleView view;

    /**
     * Creates the controller and immediately imports any previously saved
     * articles into the repository.
     */
    public CrawlerController(ArticleRepository repository, ConsoleView view) {
        this.repository = repository;
        this.view = view;
        loadSavedData();
    }

    /** Imports saved articles at start-up and reports how many were found. */
    private void loadSavedData() {
        List<Article> savedArticles = DataPersistence.loadArticles();
        if (!savedArticles.isEmpty()) {
            repository.saveAll(savedArticles);
            view.displayInfo("Loaded " + savedArticles.size() + " saved articles");
        }
    }

    /**
     * Crawls {@code url} with the named strategy and stores every article the
     * strategy returns.
     *
     * @param url          address to crawl; surrounding whitespace is ignored and
     *                     {@code https://} is prepended when no http(s) scheme is present
     * @param strategyName name handed to {@link StrategyFactory#getStrategy(String)}
     * @throws IllegalArgumentException if {@code url} is null or blank
     * @throws Exception                whatever the chosen strategy throws
     */
    public void crawl(String url, String strategyName) throws Exception {
        if (url == null || url.trim().isEmpty()) {
            throw new IllegalArgumentException("URL cannot be empty");
        }

        // Work on the trimmed value so stray blanks never end up inside the URL.
        String target = url.trim();
        if (!hasHttpScheme(target)) {
            target = "https://" + target;
        }

        // The duplicate-URL check was removed on purpose: the same URL may be
        // crawled repeatedly.
        view.displayInfo("Crawling: " + target);
        view.displayInfo("Using strategy: " + strategyName);

        CrawlStrategy strategy = StrategyFactory.getStrategy(strategyName);
        List<Article> articles = strategy.crawl(target);

        for (Article article : articles) {
            repository.save(article);
            view.displaySuccess("Crawled: " + article.getTitle());
        }
    }

    /** Returns true when the text starts with http:// or https://, ignoring case. */
    private static boolean hasHttpScheme(String candidate) {
        return candidate.regionMatches(true, 0, "http://", 0, 7)
                || candidate.regionMatches(true, 0, "https://", 0, 8);
    }

    /** Shows every stored article through the view. */
    public void listArticles() {
        List<Article> articles = repository.findAll();
        view.displayArticleList(articles);
    }

    /** Persists every stored article via {@link DataPersistence}. */
    public void saveData() {
        List<Article> articles = repository.findAll();
        DataPersistence.saveArticles(articles);
    }

    /**
     * Replaces the repository content with the articles read from storage.
     * The data is read before the repository is cleared, so an exception
     * during loading leaves the in-memory articles untouched.
     */
    public void loadData() {
        List<Article> savedArticles = DataPersistence.loadArticles();
        repository.deleteAll();
        repository.saveAll(savedArticles);
    }
}

11
project/my-crawler/src/main/java/com/crawler/exception/CrawlerException.java

@ -0,0 +1,11 @@
package com.crawler.exception;
/**
 * Unchecked base type for the crawler's own exceptions.
 */
public class CrawlerException extends RuntimeException {

    /**
     * @param message description of the failure
     * @param cause   underlying exception that triggered this one
     */
    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param message description of the failure
     */
    public CrawlerException(String message) {
        super(message);
    }
}

11
project/my-crawler/src/main/java/com/crawler/exception/NetworkException.java

@ -0,0 +1,11 @@
package com.crawler.exception;
/**
 * {@link CrawlerException} subtype for network-related failures.
 */
public class NetworkException extends CrawlerException {

    /**
     * @param message description of the failure
     * @param cause   underlying exception that triggered this one
     */
    public NetworkException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param message description of the failure
     */
    public NetworkException(String message) {
        super(message);
    }
}

11
project/my-crawler/src/main/java/com/crawler/exception/ParseException.java

@ -0,0 +1,11 @@
package com.crawler.exception;
/**
 * {@link CrawlerException} subtype for parsing failures.
 */
public class ParseException extends CrawlerException {

    /**
     * @param message description of the failure
     * @param cause   underlying exception that triggered this one
     */
    public ParseException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param message description of the failure
     */
    public ParseException(String message) {
        super(message);
    }
}

11
project/my-crawler/src/main/java/com/crawler/exception/UrlFormatException.java

@ -0,0 +1,11 @@
package com.crawler.exception;
/**
 * {@link CrawlerException} subtype for malformed URLs.
 */
public class UrlFormatException extends CrawlerException {

    /**
     * @param message description of the failure
     * @param cause   underlying exception that triggered this one
     */
    public UrlFormatException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param message description of the failure
     */
    public UrlFormatException(String message) {
        super(message);
    }
}

27
project/my-crawler/src/main/java/com/crawler/factory/StrategyFactory.java

@ -0,0 +1,27 @@
package com.crawler.factory;
import com.crawler.strategy.*;
import java.util.HashMap;
import java.util.Map;
/**
 * Static registry that maps a strategy name to a shared {@link CrawlStrategy}
 * instance. Lookups ignore case and surrounding whitespace; unknown or missing
 * names resolve to the {@code "jsoup"} strategy.
 */
public class StrategyFactory {
    /** Name of the strategy used when a lookup finds no match. */
    private static final String DEFAULT_STRATEGY = "jsoup";

    private static final Map<String, CrawlStrategy> strategies = new HashMap<>();

    static {
        strategies.put("blog", new BlogCrawlStrategy());
        strategies.put("news", new NewsCrawlStrategy());
        strategies.put(DEFAULT_STRATEGY, new JsoupCrawlStrategy());
    }

    /**
     * Resolves a strategy by name.
     *
     * @param strategyName requested name; may be {@code null}
     * @return the registered strategy, or the registered jsoup strategy when the
     *         name is null or unknown (no new instance is allocated per call)
     */
    public static CrawlStrategy getStrategy(String strategyName) {
        CrawlStrategy match = strategies.get(normalize(strategyName));
        return match != null ? match : strategies.get(DEFAULT_STRATEGY);
    }

    /**
     * @param strategyName name to test; may be {@code null}
     * @return {@code true} when a strategy is registered under that name
     */
    public static boolean hasStrategy(String strategyName) {
        return strategies.containsKey(normalize(strategyName));
    }

    /** @return the registered strategy names, in no particular order */
    public static String[] getAvailableStrategies() {
        return strategies.keySet().toArray(new String[0]);
    }

    /**
     * Canonical lookup key: trimmed and lower-cased with {@code Locale.ROOT} so
     * the result does not depend on the default locale. Null maps to the empty
     * string, which is never a registered name.
     */
    private static String normalize(String strategyName) {
        return strategyName == null
                ? ""
                : strategyName.trim().toLowerCase(java.util.Locale.ROOT);
    }
}

104
project/my-crawler/src/main/java/com/crawler/model/Article.java

@ -0,0 +1,104 @@
package com.crawler.model;
import java.io.Serializable;
import java.time.LocalDateTime;
public class Article implements Serializable {
private static final long serialVersionUID = 1L;
private String id;
private String title;
private String url;
private String content;
private String author;
private LocalDateTime publishDate;
private LocalDateTime crawlDate;
private String source;
public Article() {
this.crawlDate = LocalDateTime.now();
}
public Article(String title, String url, String content) {
this.title = title;
this.url = url;
this.content = content;
this.crawlDate = LocalDateTime.now();
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}
public String getAuthor() {
return author;
}
public void setAuthor(String author) {
this.author = author;
}
public LocalDateTime getPublishDate() {
return publishDate;
}
public void setPublishDate(LocalDateTime publishDate) {
this.publishDate = publishDate;
}
public LocalDateTime getCrawlDate() {
return crawlDate;
}
public void setCrawlDate(LocalDateTime crawlDate) {
this.crawlDate = crawlDate;
}
public String getSource() {
return source;
}
public void setSource(String source) {
this.source = source;
}
@Override
public String toString() {
return "Article{" +
"id='" + id + '\'' +
", title='" + title + '\'' +
", url='" + url + '\'' +
", author='" + author + '\'' +
", publishDate=" + publishDate +
", crawlDate=" + crawlDate +
", source='" + source + '\'' +
'}';
}
}

18
project/my-crawler/src/main/java/com/crawler/repository/ArticleRepository.java

@ -0,0 +1,18 @@
package com.crawler.repository;
import com.crawler.model.Article;
import java.util.List;
import java.util.Optional;
/**
 * Storage abstraction for {@link Article} objects, with lookups by id, URL
 * and source.
 */
public interface ArticleRepository {
/** Stores a single article. */
void save(Article article);
/** Stores every article in the given list. */
void saveAll(List<Article> articles);
/** Looks up an article by id; the result is empty when nothing matches. */
Optional<Article> findById(String id);
/** Looks up an article by URL; the result is empty when nothing matches. */
Optional<Article> findByUrl(String url);
/** Returns all stored articles. */
List<Article> findAll();
/** Returns the stored articles whose source equals the given value. */
List<Article> findBySource(String source);
/** Removes the article with the given id. */
void deleteById(String id);
/** Removes every stored article. */
void deleteAll();
/** Returns the number of stored articles. */
int count();
/** Reports whether an article with the given URL is stored. */
boolean existsByUrl(String url);
}

78
project/my-crawler/src/main/java/com/crawler/repository/InMemoryArticleRepository.java

@ -0,0 +1,78 @@
package com.crawler.repository;
import com.crawler.model.Article;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
/**
 * {@link ArticleRepository} backed by two maps: articles by id, and a
 * URL-to-id index that points at the most recently saved article for a URL.
 */
public class InMemoryArticleRepository implements ArticleRepository {
    private final Map<String, Article> articles = new ConcurrentHashMap<>();
    private final Map<String, String> urlToIdMap = new ConcurrentHashMap<>();
    private final AtomicInteger idGenerator = new AtomicInteger(1);

    /**
     * Stores the article, assigning a fresh id when it has none. An article that
     * already carries an id replaces whatever was stored under that id.
     */
    @Override
    public void save(Article article) {
        if (article.getId() == null) {
            article.setId(nextFreeId());
        }
        String id = article.getId();

        Article previous = articles.put(id, article);
        // If the replaced entry had a different URL, drop its now-stale index
        // entry, but only while that entry still points at this id.
        if (previous != null
                && previous.getUrl() != null
                && !previous.getUrl().equals(article.getUrl())) {
            urlToIdMap.remove(previous.getUrl(), id);
        }

        if (article.getUrl() != null) {
            urlToIdMap.put(article.getUrl(), id);
        }
    }

    /**
     * Draws ids from the counter until one is found that is not in use.
     * Articles saved with a preset id (e.g. loaded from disk) do not advance the
     * counter, so without this check a generated id could overwrite them.
     */
    private String nextFreeId() {
        String candidate;
        do {
            candidate = String.valueOf(idGenerator.getAndIncrement());
        } while (articles.containsKey(candidate));
        return candidate;
    }

    @Override
    public void saveAll(List<Article> articleList) {
        for (Article article : articleList) {
            save(article);
        }
    }

    /** Returns the article stored under {@code id}; empty for null or unknown ids. */
    @Override
    public Optional<Article> findById(String id) {
        if (id == null) {
            return Optional.empty();
        }
        return Optional.ofNullable(articles.get(id));
    }

    /** Returns the article the URL index points at; empty for null or unknown URLs. */
    @Override
    public Optional<Article> findByUrl(String url) {
        if (url == null) {
            return Optional.empty();
        }
        String id = urlToIdMap.get(url);
        return id != null ? Optional.ofNullable(articles.get(id)) : Optional.empty();
    }

    /** Returns a new list holding the stored articles. */
    @Override
    public List<Article> findAll() {
        return new ArrayList<>(articles.values());
    }

    /** Returns the articles whose source equals {@code source}; null matches articles without a source. */
    @Override
    public List<Article> findBySource(String source) {
        return articles.values().stream()
                .filter(a -> Objects.equals(source, a.getSource()))
                .collect(Collectors.toList());
    }

    /**
     * Removes the article with the given id. Its URL index entry is removed only
     * when it still refers to this id, so another article saved later under the
     * same URL stays reachable by URL.
     */
    @Override
    public void deleteById(String id) {
        if (id == null) {
            return;
        }
        Article article = articles.remove(id);
        if (article != null && article.getUrl() != null) {
            urlToIdMap.remove(article.getUrl(), id);
        }
    }

    /** Removes every article and index entry; the id counter is not reset. */
    @Override
    public void deleteAll() {
        articles.clear();
        urlToIdMap.clear();
    }

    @Override
    public int count() {
        return articles.size();
    }

    /** Reports whether the URL index has an entry for {@code url}; false for null. */
    @Override
    public boolean existsByUrl(String url) {
        return url != null && urlToIdMap.containsKey(url);
    }
}

76
project/my-crawler/src/main/java/com/crawler/strategy/BlogCrawlStrategy.java

@ -0,0 +1,76 @@
package com.crawler.strategy;
import com.crawler.model.Article;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Crawl strategy that downloads a page with {@link HttpURLConnection} and
 * reduces it to a title and plain text using regular expressions.
 */
public class BlogCrawlStrategy implements CrawlStrategy {
    // Compiled once; Pattern instances are immutable and safe to share.
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title[^>]*>([^<]+)</title>", Pattern.CASE_INSENSITIVE);
    // Case-insensitive so upper-case <SCRIPT>/<STYLE> blocks are stripped too.
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script[^>]*>[\\s\\S]*?</script>", Pattern.CASE_INSENSITIVE);
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style[^>]*>[\\s\\S]*?</style>", Pattern.CASE_INSENSITIVE);
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>");
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

    /**
     * Fetches {@code url} and returns a single-element list.
     *
     * @param url address of the page to download
     * @return one article holding the page title and text, or, when anything
     *         goes wrong, one placeholder article describing the error
     */
    @Override
    public List<Article> crawl(String url) {
        List<Article> articles = new ArrayList<>();

        try {
            String html = fetch(url);

            Article article = new Article();
            article.setTitle("Blog: " + extractTitle(html));
            article.setUrl(url);
            article.setSource("blog");
            article.setContent(extractText(html));
            article.setAuthor("Blog Author");
            articles.add(article);
        } catch (Exception e) {
            // Best effort by design: the failure is recorded as an article
            // instead of being propagated to the caller.
            Article errorArticle = new Article();
            errorArticle.setTitle("Error crawling blog: " + url);
            errorArticle.setUrl(url);
            errorArticle.setContent("Error details: " + e.getMessage());
            errorArticle.setSource("blog");
            articles.add(errorArticle);
        }

        return articles;
    }

    /**
     * Downloads the response body as UTF-8 text with 10 second connect and read
     * timeouts, and always disconnects the connection afterwards.
     */
    private String fetch(String url) throws java.io.IOException {
        URL urlObj = new URL(url);
        HttpURLConnection connection = (HttpURLConnection) urlObj.openConnection();
        try {
            connection.setRequestMethod("GET");
            connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
            connection.setConnectTimeout(10000);
            connection.setReadTimeout(10000);

            StringBuilder content = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    content.append(line).append("\n");
                }
            }
            return content.toString();
        } finally {
            connection.disconnect();
        }
    }

    /** Returns the trimmed text of the first title element, or "Untitled Blog". */
    private String extractTitle(String html) {
        Matcher matcher = TITLE_PATTERN.matcher(html);
        if (matcher.find()) {
            return matcher.group(1).trim();
        }
        return "Untitled Blog";
    }

    /** Drops script and style blocks, replaces remaining tags with blanks and collapses whitespace. */
    private String extractText(String html) {
        String text = SCRIPT_PATTERN.matcher(html).replaceAll("");
        text = STYLE_PATTERN.matcher(text).replaceAll("");
        text = TAG_PATTERN.matcher(text).replaceAll(" ");
        text = WHITESPACE_PATTERN.matcher(text).replaceAll(" ");
        return text.trim();
    }

    @Override
    public String getStrategyName() {
        return "blog";
    }
}

9
project/my-crawler/src/main/java/com/crawler/strategy/CrawlStrategy.java

@ -0,0 +1,9 @@
package com.crawler.strategy;
import com.crawler.model.Article;
import java.util.List;
/**
 * A pluggable way of turning a URL into a list of {@link Article}s.
 */
public interface CrawlStrategy {
/**
 * Crawls the given URL.
 *
 * @param url the address to crawl
 * @return the articles produced by the crawl
 * @throws Exception if the implementation chooses to signal a failure by throwing
 */
List<Article> crawl(String url) throws Exception;
/**
 * @return the name identifying this strategy
 */
String getStrategyName();
}

170
project/my-crawler/src/main/java/com/crawler/strategy/DoubanTop250Strategy.java

@ -0,0 +1,170 @@
package com.crawler.strategy;
import com.crawler.model.Article;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Crawl strategy that walks the Douban movie Top 250 list page by page and
 * produces one {@link Article} per movie entry.
 *
 * <p>The list URL is fixed in this class; the {@code url} argument of
 * {@link #crawl(String)} is only recorded on the error placeholder article.
 */
public class DoubanTop250Strategy implements CrawlStrategy {
    private static final int TOTAL_MOVIES = 250;
    private static final int MOVIES_PER_PAGE = 25;
    /** Pause between two page requests, in milliseconds. */
    private static final long PAGE_DELAY_MILLIS = 1000;

    // Compiled once instead of once per page / per movie.
    private static final Pattern ITEM_PATTERN =
            Pattern.compile("<div class=\"item\">[\\s\\S]*?</div>\\s*</div>\\s*</div>", Pattern.DOTALL);
    private static final Pattern TITLE_PATTERN = Pattern.compile("<span class=\"title\">(.*?)</span>");
    private static final Pattern LINK_PATTERN = Pattern.compile("<a href=\"(.*?)\"");
    private static final Pattern RATING_PATTERN = Pattern.compile("<span class=\"rating_num\">(.*?)</span>");
    // Four digits followed by a slash, allowing whitespace or &nbsp; entities in between.
    private static final Pattern YEAR_PATTERN = Pattern.compile("(\\d{4})(?:\\s|&nbsp;)*/");
    private static final Pattern QUOTE_PATTERN = Pattern.compile("<span class=\"inq\">(.*?)</span>");
    private static final Pattern INFO_PATTERN = Pattern.compile("<p class=\"\">(.*?)</p>", Pattern.DOTALL);

    /**
     * Crawls all list pages and returns the movies found.
     * A page that fails to download contributes no movies; an unexpected
     * runtime failure adds one error placeholder article to the result.
     *
     * @param url recorded on the error placeholder only; the pages fetched are fixed
     * @return the collected movie articles (possibly fewer than 250)
     */
    @Override
    public List<Article> crawl(String url) {
        List<Article> allMovies = new ArrayList<>();
        try {
            System.out.println("🎬 开始爬取豆瓣电影 Top 250...");
            System.out.println("⏳ 预计需要爬取 " + (TOTAL_MOVIES / MOVIES_PER_PAGE) + " 页");
            for (int page = 0; page < TOTAL_MOVIES; page += MOVIES_PER_PAGE) {
                int pageNumber = page / MOVIES_PER_PAGE + 1;
                String pageUrl = "https://movie.douban.com/top250?start=" + page + "&filter=";
                System.out.println("📄 正在爬取第 " + pageNumber + " 页...");
                allMovies.addAll(crawlPage(pageUrl, pageNumber));
                System.out.println("✅ 第 " + pageNumber + " 页完成,已获取 " + allMovies.size() + " 部电影");

                // Throttle between requests; no need to wait after the final page.
                boolean hasMorePages = page + MOVIES_PER_PAGE < TOTAL_MOVIES;
                if (hasMorePages) {
                    try {
                        Thread.sleep(PAGE_DELAY_MILLIS);
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                        break;
                    }
                }
            }
            System.out.println("🎉 完成!共爬取 " + allMovies.size() + " 部电影");
        } catch (Exception e) {
            System.err.println("❌ 爬取失败: " + e.getMessage());
            Article errorArticle = new Article();
            errorArticle.setTitle("Error crawling Douban Top 250");
            errorArticle.setUrl(url);
            errorArticle.setContent("Error details: " + e.getMessage());
            errorArticle.setSource("douban");
            allMovies.add(errorArticle);
        }
        return allMovies;
    }

    /**
     * Downloads one list page and parses the movies on it.
     * Failures are reported on standard error and yield an empty list.
     */
    private List<Article> crawlPage(String url, int pageNum) {
        List<Article> movies = new ArrayList<>();
        HttpURLConnection connection = null;
        try {
            connection = (HttpURLConnection) new URL(url).openConnection();
            connection.setRequestMethod("GET");
            connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
            connection.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            connection.setConnectTimeout(15000);
            connection.setReadTimeout(15000);
            StringBuilder html = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    html.append(line).append("\n");
                }
            }
            movies = parseMovies(html.toString());
        } catch (Exception e) {
            System.err.println("⚠️ 第 " + pageNum + " 页爬取失败: " + e.getMessage());
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
        return movies;
    }

    /** Splits a list page into movie item fragments and parses each one, skipping fragments that cannot be parsed. */
    private List<Article> parseMovies(String html) {
        List<Article> movies = new ArrayList<>();
        Matcher matcher = ITEM_PATTERN.matcher(html);
        while (matcher.find()) {
            try {
                Article movie = parseSingleMovie(matcher.group());
                if (movie != null) {
                    movies.add(movie);
                }
            } catch (Exception ignored) {
                // Best effort: one malformed entry must not discard the rest of the page.
            }
        }
        return movies;
    }

    /**
     * Builds an article from one movie item fragment.
     *
     * @return the article, or null when the fragment contains no title
     */
    private Article parseSingleMovie(String movieHtml) {
        String title = firstGroup(TITLE_PATTERN, movieHtml);
        if (title == null) {
            // Without a title the fragment is not a usable movie entry.
            return null;
        }
        Article movie = new Article();
        movie.setSource("douban");
        movie.setTitle(title);

        String link = firstGroup(LINK_PATTERN, movieHtml);
        if (link != null) {
            movie.setUrl(link);
        }

        String rating = orEmpty(firstGroup(RATING_PATTERN, movieHtml));
        String quote = orEmpty(firstGroup(QUOTE_PATTERN, movieHtml));

        String rawInfo = orEmpty(firstGroup(INFO_PATTERN, movieHtml));
        // Look for the year inside the info paragraph only: searching the whole
        // fragment can match digits that precede a slash elsewhere, such as the
        // numeric ID at the end of the movie link.
        String year = orEmpty(firstGroup(YEAR_PATTERN, rawInfo));
        String info = rawInfo
                .replaceAll("<br\\s*/?>", "\n")
                .replaceAll("<[^>]+>", "")
                .replace("&nbsp;", " ")
                .trim();

        StringBuilder content = new StringBuilder();
        content.append("🎬 电影名称: ").append(title).append("\n");
        content.append("⭐ 评分: ").append(rating).append("\n");
        content.append("📅 年份: ").append(year).append("\n");
        if (!quote.isEmpty()) {
            content.append("💬 简介: ").append(quote).append("\n");
        }
        content.append("\n📝 详细信息:\n").append(info);
        movie.setContent(content.toString());
        movie.setAuthor("豆瓣电影");
        return movie;
    }

    /** Returns capture group 1 of the first match of {@code pattern} in {@code text}, or null when there is no match. */
    private static String firstGroup(Pattern pattern, String text) {
        Matcher matcher = pattern.matcher(text);
        return matcher.find() ? matcher.group(1) : null;
    }

    /** Maps null to the empty string. */
    private static String orEmpty(String value) {
        return value != null ? value : "";
    }

    /** @return the name identifying this strategy, {@code "douban"} */
    @Override
    public String getStrategyName() {
        return "douban";
    }
}

75
project/my-crawler/src/main/java/com/crawler/strategy/JsoupCrawlStrategy.java

@ -0,0 +1,75 @@
package com.crawler.strategy;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.crawler.model.Article;
/**
 * Generic crawl strategy: downloads one page over HTTP and turns it into a
 * single {@link Article}. The article's source is set to the crawled URL.
 *
 * <p>Despite its name, this class does not use the Jsoup library; markup is
 * removed with regular expressions.
 */
public class JsoupCrawlStrategy implements CrawlStrategy {
    // Compiled once: Pattern instances are immutable and reusable.
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title[^>]*>([^<]+)</title>", Pattern.CASE_INSENSITIVE);
    // Case-insensitive so that <SCRIPT>/<STYLE> bodies are stripped as well.
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script[^>]*>[\\s\\S]*?</script>", Pattern.CASE_INSENSITIVE);
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style[^>]*>[\\s\\S]*?</style>", Pattern.CASE_INSENSITIVE);
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>");
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

    /**
     * Fetches {@code url} and returns a one-element list describing the page.
     * This method does not throw on failure; instead the list contains a single
     * placeholder article whose title and content describe the error.
     *
     * @param url the page to download
     * @return a list with exactly one article (the page, or an error placeholder)
     */
    @Override
    public List<Article> crawl(String url) {
        List<Article> articles = new ArrayList<>();
        HttpURLConnection connection = null;
        try {
            connection = (HttpURLConnection) new URL(url).openConnection();
            connection.setRequestMethod("GET");
            connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
            connection.setConnectTimeout(10000);
            connection.setReadTimeout(10000);

            StringBuilder content = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    content.append(line).append("\n");
                }
            }
            String html = content.toString();

            Article article = new Article();
            article.setTitle(extractTitle(html));
            article.setUrl(url);
            article.setSource(url);
            article.setContent(extractText(html));
            articles.add(article);
        } catch (Exception e) {
            // Boundary catch: any failure is reported as a placeholder article.
            Article errorArticle = new Article();
            errorArticle.setTitle("Error crawling: " + url);
            errorArticle.setUrl(url);
            errorArticle.setContent("Error details: " + describe(e));
            errorArticle.setSource(url);
            articles.add(errorArticle);
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
        return articles;
    }

    /** Returns the exception message, or the exception class name when there is no message. */
    private static String describe(Exception e) {
        return e.getMessage() != null ? e.getMessage() : e.getClass().getSimpleName();
    }

    /** Returns the trimmed text of the first {@code <title>} element, or a default when none is found. */
    private String extractTitle(String html) {
        Matcher matcher = TITLE_PATTERN.matcher(html);
        if (matcher.find()) {
            return matcher.group(1).trim();
        }
        return "Untitled Page";
    }

    /** Removes script and style elements, replaces remaining tags with spaces and collapses whitespace. */
    private String extractText(String html) {
        String text = SCRIPT_PATTERN.matcher(html).replaceAll("");
        text = STYLE_PATTERN.matcher(text).replaceAll("");
        text = TAG_PATTERN.matcher(text).replaceAll(" ");
        text = WHITESPACE_PATTERN.matcher(text).replaceAll(" ");
        return text.trim();
    }

    /** @return the name identifying this strategy, {@code "jsoup"} */
    @Override
    public String getStrategyName() {
        return "jsoup";
    }
}

76
project/my-crawler/src/main/java/com/crawler/strategy/NewsCrawlStrategy.java

@ -0,0 +1,76 @@
package com.crawler.strategy;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.crawler.model.Article;
/**
 * Crawl strategy for news pages: downloads one page over HTTP and turns it into
 * a single {@link Article} whose title is taken from the HTML {@code <title>}
 * element and whose content is the page text with markup removed.
 */
public class NewsCrawlStrategy implements CrawlStrategy {
    // Compiled once: Pattern instances are immutable and reusable.
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title[^>]*>([^<]+)</title>", Pattern.CASE_INSENSITIVE);
    // Case-insensitive so that <SCRIPT>/<STYLE> bodies are stripped as well.
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script[^>]*>[\\s\\S]*?</script>", Pattern.CASE_INSENSITIVE);
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style[^>]*>[\\s\\S]*?</style>", Pattern.CASE_INSENSITIVE);
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>");
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

    /**
     * Fetches {@code url} and returns a one-element list describing the page.
     * This method does not throw on failure; instead the list contains a single
     * placeholder article whose title and content describe the error.
     *
     * @param url the page to download
     * @return a list with exactly one article (the page, or an error placeholder)
     */
    @Override
    public List<Article> crawl(String url) {
        List<Article> articles = new ArrayList<>();
        HttpURLConnection connection = null;
        try {
            connection = (HttpURLConnection) new URL(url).openConnection();
            connection.setRequestMethod("GET");
            connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
            connection.setConnectTimeout(10000);
            connection.setReadTimeout(10000);

            StringBuilder content = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    content.append(line).append("\n");
                }
            }
            String html = content.toString();

            Article article = new Article();
            article.setTitle("News: " + extractTitle(html));
            article.setUrl(url);
            article.setSource("news");
            article.setContent(extractText(html));
            article.setAuthor("News Reporter");
            articles.add(article);
        } catch (Exception e) {
            // Boundary catch: any failure is reported as a placeholder article.
            Article errorArticle = new Article();
            errorArticle.setTitle("Error crawling news: " + url);
            errorArticle.setUrl(url);
            errorArticle.setContent("Error details: " + describe(e));
            errorArticle.setSource("news");
            articles.add(errorArticle);
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
        return articles;
    }

    /** Returns the exception message, or the exception class name when there is no message. */
    private static String describe(Exception e) {
        return e.getMessage() != null ? e.getMessage() : e.getClass().getSimpleName();
    }

    /** Returns the trimmed text of the first {@code <title>} element, or a default when none is found. */
    private String extractTitle(String html) {
        Matcher matcher = TITLE_PATTERN.matcher(html);
        if (matcher.find()) {
            return matcher.group(1).trim();
        }
        return "Untitled News";
    }

    /** Removes script and style elements, replaces remaining tags with spaces and collapses whitespace. */
    private String extractText(String html) {
        String text = SCRIPT_PATTERN.matcher(html).replaceAll("");
        text = STYLE_PATTERN.matcher(text).replaceAll("");
        text = TAG_PATTERN.matcher(text).replaceAll(" ");
        text = WHITESPACE_PATTERN.matcher(text).replaceAll(" ");
        return text.trim();
    }

    /** @return the name identifying this strategy, {@code "news"} */
    @Override
    public String getStrategyName() {
        return "news";
    }
}

54
project/my-crawler/src/main/java/com/crawler/util/ColorUtil.java

@ -0,0 +1,54 @@
package com.crawler.util;
/**
 * ANSI escape sequences for coloring console output, plus helpers that wrap a
 * text in a sequence and terminate it with {@link #RESET}.
 */
public class ColorUtil {
    /** Ends any previously started styling. */
    public static final String RESET = "\u001B[0m";

    // Foreground colors.
    public static final String BLACK = "\u001B[30m";
    public static final String RED = "\u001B[31m";
    public static final String GREEN = "\u001B[32m";
    public static final String YELLOW = "\u001B[33m";
    public static final String BLUE = "\u001B[34m";
    public static final String PURPLE = "\u001B[35m";
    public static final String CYAN = "\u001B[36m";
    public static final String WHITE = "\u001B[37m";

    // Background colors.
    public static final String BLACK_BG = "\u001B[40m";
    public static final String RED_BG = "\u001B[41m";
    public static final String GREEN_BG = "\u001B[42m";
    public static final String YELLOW_BG = "\u001B[43m";
    public static final String BLUE_BG = "\u001B[44m";
    public static final String PURPLE_BG = "\u001B[45m";
    public static final String CYAN_BG = "\u001B[46m";
    public static final String WHITE_BG = "\u001B[47m";

    /** Sequence that switches on bold rendering. */
    private static final String BOLD = "\u001B[1m";

    /**
     * Surrounds {@code text} with the given escape sequence and a trailing reset.
     *
     * @param text  the text to style
     * @param color the escape sequence to place in front of the text
     * @return {@code color}, then {@code text}, then {@link #RESET}
     */
    public static String colorize(String text, String color) {
        StringBuilder styled = new StringBuilder();
        styled.append(color).append(text).append(RESET);
        return styled.toString();
    }

    /** @return {@code text} rendered in green */
    public static String green(String text) {
        return GREEN + text + RESET;
    }

    /** @return {@code text} rendered in red */
    public static String red(String text) {
        return RED + text + RESET;
    }

    /** @return {@code text} rendered in yellow */
    public static String yellow(String text) {
        return YELLOW + text + RESET;
    }

    /** @return {@code text} rendered in blue */
    public static String blue(String text) {
        return BLUE + text + RESET;
    }

    /** @return {@code text} rendered in cyan */
    public static String cyan(String text) {
        return CYAN + text + RESET;
    }

    /** @return {@code text} rendered in purple */
    public static String purple(String text) {
        return PURPLE + text + RESET;
    }

    /** @return {@code text} rendered in bold */
    public static String bold(String text) {
        return colorize(text, BOLD);
    }
}

193
project/my-crawler/src/main/java/com/crawler/util/DataPersistence.java

@ -0,0 +1,193 @@
package com.crawler.util;
import com.crawler.model.Article;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
/**
 * Saves articles to, and loads them from, plain-text batch files in the
 * {@code data} folder.
 *
 * <p>Every call to {@link #saveArticles(List)} writes a new file named
 * {@code articles_<timestamp>.txt} and appends a record to {@code index.txt}.
 * {@link #loadArticles()} reads all batch files back.
 */
public class DataPersistence {
    private static final String DATA_FOLDER = "data";
    private static final String INDEX_FILE = DATA_FOLDER + File.separator + "index.txt";
    private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
    private static final DateTimeFormatter FILE_TIMESTAMP_FORMATTER = DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss");

    // Line prefixes of the batch file format, shared by the writer and the reader.
    private static final String ID_PREFIX = "ID: ";
    private static final String TITLE_PREFIX = "标题: ";
    private static final String URL_PREFIX = "URL: ";
    private static final String AUTHOR_PREFIX = "作者: ";
    private static final String SOURCE_PREFIX = "来源: ";
    private static final String PUBLISH_DATE_PREFIX = "发布时间: ";
    private static final String CRAWL_DATE_PREFIX = "爬取时间: ";
    private static final String CONTENT_MARKER = "内容:";
    private static final String ARTICLE_HEADER_PREFIX = "文章 ";

    static {
        ensureDataFolder();
    }

    /** Creates the data folder when it does not exist. */
    private static void ensureDataFolder() {
        File folder = new File(DATA_FOLDER);
        if (!folder.exists()) {
            folder.mkdirs();
        }
    }

    /**
     * Writes the articles to a new timestamped batch file and records the batch
     * in the index file. Failures are reported on standard error, not thrown.
     *
     * @param articles the articles to write
     */
    public static void saveArticles(List<Article> articles) {
        try {
            // The folder may have been removed since class initialization.
            ensureDataFolder();
            LocalDateTime now = LocalDateTime.now();
            String timestamp = now.format(FILE_TIMESTAMP_FORMATTER);
            String batchFileName = DATA_FOLDER + File.separator + "articles_" + timestamp + ".txt";
            try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(batchFileName), StandardCharsets.UTF_8))) {
                writer.write("========================================\n");
                writer.write(" 文章数据批次保存\n");
                writer.write("========================================\n\n");
                writer.write("保存时间: " + now.format(DATE_FORMATTER) + "\n");
                writer.write("文章数量: " + articles.size() + "\n\n");
                writer.write("========================================\n\n");
                for (int i = 0; i < articles.size(); i++) {
                    writeArticle(writer, articles.get(i), i + 1);
                }
            }
            updateIndex(timestamp, articles.size());
            System.out.println(ColorUtil.green("✓ Saved " + articles.size() + " articles to '" + batchFileName + "'"));
        } catch (Exception e) {
            System.err.println(ColorUtil.red("✗ Failed to save articles: " + e.getMessage()));
        }
    }

    /** Writes one article section; optional fields are omitted when they are null. */
    private static void writeArticle(BufferedWriter writer, Article article, int number) throws IOException {
        writer.write("----------------------------------------\n");
        writer.write(ARTICLE_HEADER_PREFIX + number + "\n");
        writer.write("----------------------------------------\n");
        writer.write(ID_PREFIX + article.getId() + "\n");
        writer.write(TITLE_PREFIX + article.getTitle() + "\n");
        writer.write(URL_PREFIX + article.getUrl() + "\n");
        if (article.getAuthor() != null) {
            writer.write(AUTHOR_PREFIX + article.getAuthor() + "\n");
        }
        if (article.getSource() != null) {
            writer.write(SOURCE_PREFIX + article.getSource() + "\n");
        }
        if (article.getPublishDate() != null) {
            writer.write(PUBLISH_DATE_PREFIX + article.getPublishDate().format(DATE_FORMATTER) + "\n");
        }
        // The crawl date can be missing, e.g. when it could not be parsed on load.
        if (article.getCrawlDate() != null) {
            writer.write(CRAWL_DATE_PREFIX + article.getCrawlDate().format(DATE_FORMATTER) + "\n");
        }
        writer.write("\n" + CONTENT_MARKER + "\n");
        if (article.getContent() != null) {
            writer.write(article.getContent());
        }
        writer.write("\n\n");
    }

    /** Appends one batch record to the index file, writing the index header first when the file is new. */
    private static void updateIndex(String timestamp, int articleCount) throws IOException {
        boolean fileExists = new File(INDEX_FILE).exists();
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(INDEX_FILE, true), StandardCharsets.UTF_8))) {
            if (!fileExists) {
                writer.write("========================================\n");
                writer.write(" 文章保存历史记录索引\n");
                writer.write("========================================\n\n");
            }
            writer.write("[保存记录] " + LocalDateTime.now().format(DATE_FORMATTER) + "\n");
            writer.write(" 批次文件: articles_" + timestamp + ".txt\n");
            writer.write(" 文章数量: " + articleCount + "\n");
            writer.write("\n");
        }
    }

    /**
     * Loads the articles of every batch file in the data folder.
     * Files are read in file-name order, which for the timestamped names is
     * chronological. A file that cannot be read is reported and skipped.
     *
     * @return all loaded articles; empty when the folder or batch files are missing
     */
    public static List<Article> loadArticles() {
        List<Article> articles = new ArrayList<>();
        File folder = new File(DATA_FOLDER);
        if (!folder.exists()) {
            return articles;
        }
        File[] files = folder.listFiles((dir, name) -> name.startsWith("articles_") && name.endsWith(".txt"));
        if (files != null) {
            // listFiles gives no ordering guarantee; sort for a deterministic result.
            java.util.Arrays.sort(files);
            for (File file : files) {
                try {
                    articles.addAll(loadBatchArticle(file));
                } catch (Exception e) {
                    System.err.println(ColorUtil.yellow("⚠ 无法加载文件: " + file.getName() + " (" + e.getMessage() + ")"));
                }
            }
        }
        System.out.println(ColorUtil.green("✓ Loaded " + articles.size() + " articles from '" + DATA_FOLDER + "' folder"));
        return articles;
    }

    /**
     * Parses one batch file.
     *
     * <p>An article section starts at a line of the form {@code 文章 <number>}
     * that directly follows a dashed separator line. Metadata lines are only
     * interpreted before the content marker; everything after the marker, up to
     * the next section, is content.
     */
    private static List<Article> loadBatchArticle(File file) throws IOException {
        List<Article> articles = new ArrayList<>();
        Article currentArticle = null;
        StringBuilder content = new StringBuilder();
        boolean inContent = false;
        String previousLine = "";
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (isArticleHeader(line, previousLine)) {
                    finishArticle(currentArticle, content, articles);
                    currentArticle = new Article();
                    content = new StringBuilder();
                    inContent = false;
                } else if (currentArticle != null) {
                    // Lines before the first section header (the batch header) are skipped.
                    if (inContent) {
                        if (!line.startsWith("-----") && !line.startsWith("=====")) {
                            if (content.length() > 0) {
                                content.append("\n");
                            }
                            content.append(line);
                        }
                    } else if (line.equals(CONTENT_MARKER)) {
                        inContent = true;
                    } else {
                        applyField(currentArticle, line);
                    }
                }
                previousLine = line;
            }
            finishArticle(currentArticle, content, articles);
        }
        return articles;
    }

    /** Tells whether {@code line} opens a new article section. */
    private static boolean isArticleHeader(String line, String previousLine) {
        return previousLine.startsWith("-----")
                && line.startsWith(ARTICLE_HEADER_PREFIX)
                && line.matches("文章 \\d+");
    }

    /** Stores the collected content on the article and adds it to the result; does nothing for a null article. */
    private static void finishArticle(Article article, StringBuilder content, List<Article> articles) {
        if (article == null) {
            return;
        }
        // The writer terminates every section with blank lines; drop them so
        // content does not grow by a newline on every save/load round trip.
        int end = content.length();
        while (end > 0 && content.charAt(end - 1) == '\n') {
            end--;
        }
        article.setContent(content.substring(0, end));
        articles.add(article);
    }

    /** Interprets one metadata line; lines with an unknown prefix are ignored. */
    private static void applyField(Article article, String line) {
        if (line.startsWith(ID_PREFIX)) {
            String id = line.substring(ID_PREFIX.length());
            // "null" is what the writer emits for an article without an ID.
            if (!"null".equals(id)) {
                article.setId(id);
            }
        } else if (line.startsWith(TITLE_PREFIX)) {
            article.setTitle(line.substring(TITLE_PREFIX.length()));
        } else if (line.startsWith(URL_PREFIX)) {
            article.setUrl(line.substring(URL_PREFIX.length()));
        } else if (line.startsWith(AUTHOR_PREFIX)) {
            article.setAuthor(line.substring(AUTHOR_PREFIX.length()));
        } else if (line.startsWith(SOURCE_PREFIX)) {
            article.setSource(line.substring(SOURCE_PREFIX.length()));
        } else if (line.startsWith(CRAWL_DATE_PREFIX)) {
            String crawlDateStr = line.substring(CRAWL_DATE_PREFIX.length());
            try {
                article.setCrawlDate(LocalDateTime.parse(crawlDateStr, DATE_FORMATTER));
            } catch (RuntimeException ignored) {
                // An unparseable date leaves the article's crawl date untouched.
            }
        }
    }
}

101
project/my-crawler/src/main/java/com/crawler/view/ConsoleView.java

@ -0,0 +1,101 @@
package com.crawler.view;
import java.time.format.DateTimeFormatter;
import java.util.List;
import java.util.Scanner;
import com.crawler.model.Article;
import com.crawler.util.ColorUtil;
public class ConsoleView {
private static final Scanner scanner = new Scanner(System.in);
private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
public void displayWelcome() {
System.out.println(ColorUtil.cyan("========================================"));
System.out.println(ColorUtil.cyan(" Welcome to My Crawler "));
System.out.println(ColorUtil.cyan("========================================"));
System.out.println();
}
public void displayHelp() {
System.out.println(ColorUtil.yellow("Available commands:"));
System.out.println(ColorUtil.green(" crawl <url> [strategy] - Crawl a website"));
System.out.println(ColorUtil.green(" list - List all crawled articles"));
System.out.println(ColorUtil.green(" save - Save articles to data file"));
System.out.println(ColorUtil.green(" load - Load articles from data file"));
System.out.println(ColorUtil.green(" help - Show this help message"));
System.out.println(ColorUtil.green(" exit - Exit the application"));
System.out.println();
System.out.println(ColorUtil.yellow("Available strategies:"));
System.out.println(ColorUtil.cyan(" blog - Blog crawling strategy"));
System.out.println(ColorUtil.cyan(" news - News crawling strategy"));
System.out.println(ColorUtil.cyan(" jsoup - Generic JSoup strategy (default)"));
System.out.println();
}
public void displayArticleList(List<Article> articles) {
if (articles.isEmpty()) {
System.out.println(ColorUtil.yellow("No articles found."));
return;
}
System.out.println(ColorUtil.cyan("=== Crawled Articles (" + articles.size() + ") ==="));
System.out.println();
for (int i = 0; i < articles.size(); i++) {
displayArticleDetail(articles.get(i), i + 1);
}
}
public void displayArticleDetail(Article article, int index) {
System.out.println(ColorUtil.bold(ColorUtil.green("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")));
System.out.println(ColorUtil.bold(ColorUtil.yellow("[" + index + "] " + article.getTitle())));
System.out.println(ColorUtil.bold(ColorUtil.green("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")));
System.out.println(ColorUtil.cyan(" ID: ") + article.getId());
System.out.println(ColorUtil.cyan(" URL: ") + article.getUrl());
if (article.getAuthor() != null) {
System.out.println(ColorUtil.cyan(" Author: ") + article.getAuthor());
}
if (article.getSource() != null) {
System.out.println(ColorUtil.cyan(" Source: ") + article.getSource());
}
if (article.getPublishDate() != null) {
System.out.println(ColorUtil.cyan(" Published: ") + article.getPublishDate().format(DATE_FORMATTER));
}
System.out.println(ColorUtil.cyan(" Crawled: ") + article.getCrawlDate().format(DATE_FORMATTER));
System.out.println(ColorUtil.cyan(" Content: "));
if (article.getContent() != null) {
String[] lines = article.getContent().split("(?<=\\G.{80})");
for (String line : lines) {
System.out.println(" " + line);
}
}
System.out.println();
}
public void displaySuccess(String message) {
System.out.println(ColorUtil.green("✓ " + message));
}
public void displayError(String message) {
System.out.println(ColorUtil.red("✗ " + message));
}
public void displayInfo(String message) {
System.out.println(ColorUtil.blue("ℹ " + message));
}
public void displayWarning(String message) {
System.out.println(ColorUtil.yellow("⚠ " + message));
}
public String readInput() {
System.out.print(ColorUtil.purple("> "));
return scanner.nextLine().trim();
}
public void displayGoodbye() {
System.out.println(ColorUtil.cyan("Goodbye! Thank you for using My Crawler."));
}
}

BIN
project/my-crawler/target/classes/.DS_Store

Binary file not shown.

BIN
project/my-crawler/target/classes/com/.DS_Store

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/App.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/command/Command.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/command/CrawlCommand.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/command/ExitCommand.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/command/HelpCommand.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/command/ListCommand.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/command/LoadCommand.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/command/SaveCommand.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/controller/CrawlerController.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/exception/CrawlerException.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/exception/NetworkException.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/exception/ParseException.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/exception/UrlFormatException.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/factory/StrategyFactory.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/model/Article.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/repository/ArticleRepository.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/repository/InMemoryArticleRepository.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/strategy/BlogCrawlStrategy.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/strategy/CrawlStrategy.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/strategy/DoubanTop250Strategy.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/strategy/JsoupCrawlStrategy.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/strategy/NewsCrawlStrategy.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/util/ColorUtil.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/util/DataPersistence.class

Binary file not shown.

BIN
project/my-crawler/target/classes/com/crawler/view/ConsoleView.class

Binary file not shown.
Loading…
Cancel
Save