71 changed files with 1563 additions and 0 deletions
Binary file not shown.
Binary file not shown.
@ -0,0 +1,10 @@ |
|||
# 默认忽略的文件 |
|||
/shelf/ |
|||
/workspace.xml |
|||
# 已忽略包含查询文件的默认文件夹 |
|||
/queries/ |
|||
# Datasource local storage ignored files |
|||
/dataSources/ |
|||
/dataSources.local.xml |
|||
# 基于编辑器的 HTTP 客户端请求 |
|||
/httpRequests/ |
|||
@ -0,0 +1,13 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project version="4"> |
|||
<component name="CompilerConfiguration"> |
|||
<annotationProcessing> |
|||
<profile name="Maven default annotation processors profile" enabled="true"> |
|||
<sourceOutputDir name="target/generated-sources/annotations" /> |
|||
<sourceTestOutputDir name="target/generated-test-sources/test-annotations" /> |
|||
<outputRelativeToContentRoot value="true" /> |
|||
<module name="my-crawler" /> |
|||
</profile> |
|||
</annotationProcessing> |
|||
</component> |
|||
</project> |
|||
@ -0,0 +1,7 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project version="4"> |
|||
<component name="Encoding"> |
|||
<file url="file://$PROJECT_DIR$/src/main/java" charset="UTF-8" /> |
|||
<file url="file://$PROJECT_DIR$/src/main/resources" charset="UTF-8" /> |
|||
</component> |
|||
</project> |
|||
@ -0,0 +1,20 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project version="4"> |
|||
<component name="RemoteRepositoriesConfiguration"> |
|||
<remote-repository> |
|||
<option name="id" value="central" /> |
|||
<option name="name" value="Central Repository" /> |
|||
<option name="url" value="https://repo.maven.apache.org/maven2" /> |
|||
</remote-repository> |
|||
<remote-repository> |
|||
<option name="id" value="central" /> |
|||
<option name="name" value="Maven Central repository" /> |
|||
<option name="url" value="https://repo1.maven.org/maven2" /> |
|||
</remote-repository> |
|||
<remote-repository> |
|||
<option name="id" value="jboss.community" /> |
|||
<option name="name" value="JBoss Community repository" /> |
|||
<option name="url" value="https://repository.jboss.org/nexus/content/repositories/public/" /> |
|||
</remote-repository> |
|||
</component> |
|||
</project> |
|||
@ -0,0 +1,12 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project version="4"> |
|||
<component name="ExternalStorageConfigurationManager" enabled="true" /> |
|||
<component name="MavenProjectsManager"> |
|||
<option name="originalFiles"> |
|||
<list> |
|||
<option value="$PROJECT_DIR$/pom.xml" /> |
|||
</list> |
|||
</option> |
|||
</component> |
|||
<component name="ProjectRootManager" version="2" languageLevel="JDK_21" default="true" project-jdk-name="21" project-jdk-type="JavaSDK" /> |
|||
</project> |
|||
Binary file not shown.
Binary file not shown.
@ -0,0 +1,20 @@ |
|||
======================================== |
|||
文章数据批次保存 |
|||
======================================== |
|||
|
|||
保存时间: 2026-05-31 00:30:57 |
|||
文章数量: 1 |
|||
|
|||
======================================== |
|||
|
|||
---------------------------------------- |
|||
文章 1 |
|||
---------------------------------------- |
|||
ID: 1 |
|||
标题: 豆瓣音乐 Top 250 |
|||
URL: https://music.douban.com/top250 |
|||
来源: https://music.douban.com/top250 |
|||
爬取时间: 2026-05-31 00:30:52 |
|||
|
|||
内容: |
|||
2025年度榜单 豆瓣音乐 Top 250 We Sing. We Dance. We Steal Things. Jason Mraz / 2008-05-13 / Import / Audio CD / 民谣 9.1 ( 117008人评价 ) Viva La Vida Death And All His Friends Coldplay / 2008-06-17 / 专辑 / CD / 摇滚 9.0 ( 121109人评价 ) 华丽的冒险 華麗的冒險 陈绮贞 / 2005-09-23 / 专辑 / CD / 流行 9.0 ( 93952人评价 ) 范特西 Fantasy 周杰伦 / 2001-09-14 / 专辑 / CD / 流行 9.5 ( 190826人评价 ) 后青春期的诗 後。青春期的詩 五月天 / 2008-10-23 / 专辑 / CD / 摇滚 9.0 ( 97666人评价 ) 是时候 It's Time 孙燕姿 / 2011-03-08 / 专辑 / CD / 流行 8.7 ( 84411人评价 ) Lenka Lenka / 2008-09-23 / 专辑 / Audio CD / 流行 8.6 ( 83946人评价 ) Start from Here 从这里开始 王若琳 / 2008-01-11 / 专辑 / CD / 爵士 8.8 ( 77184人评价 ) 旅行的意义 陈绮贞 / 2004-02-02 / 单曲 / CD / 流行 9.1 ( 101953人评价 ) 太阳 Immortal 陈绮贞 / 2009-01-22 / 专辑 / CD / 流行 8.8 ( 79731人评价 ) Once (Soundtrack) Once / 电影《曾经》原声大碟 Glen Hansard,Marketa Irglova / 2007-05-22 / Soundtrack / CD / 原声 9.2 ( 73664人评价 ) Not Going Anywhere 守候 Keren Ann / 2004-08-24 / Import / Audio CD / 民谣 8.9 ( 62733人评价 ) American Idiot Green Day / 2004-09-21 / Explicit Lyrics / Audio CD / 摇滚 9.0 ( 75471人评价 ) 思念是一种病 OK 张震岳 / 2007-07-06 / 专辑 / CD / 流行 8.9 ( 85810人评价 ) 無與倫比的美麗 无与伦比的美丽 苏打绿 / 2007-11-02 / 专辑 / CD / 流行 8.8 ( 92327人评价 ) 亲爱的...我还不知道 親愛的…我還不知道 张悬 / 2007-07-20 / 专辑 / CD / 流行 8.8 ( 69689人评价 ) 城市 The City 张悬 / 2009-05-22 / 专辑 / CD / 摇滚 8.7 ( 69137人评价 ) O Damien Rice / 2002-02-01 / 专辑 / CD / 流行 9.1 ( 53450人评价 ) Wake Me Up When September Ends 九月结束的时候叫醒我 Green Day / 2005-06-13 / 单曲 / CD / 摇滚 9.4 ( 55101人评价 ) 叶惠美 葉惠美 周杰伦 / 2003-07-31 / 专辑 / CD / 流行 9.3 ( 118425人评价 ) 七里香 Common Jasmin Orange 周杰伦 / 2004 / 专辑 / CD / 流行 9.2 ( 179593人评价 ) 21 Adele / 2011-01-24 / 专辑 / CD / 流行 9.3 ( 77434人评价 ) My Life Will... 张悬 / 2006-06-09 / 专辑 / CD / 流行 8.8 ( 61363人评价 ) 寓言 王菲 / 2000 / 专辑 / CD / 流行 9.4 ( 73631人评价 ) 你在煩惱什麼 你在烦恼什么 苏打绿 / 2011-11-11 / 专辑 / CD / 流行 9.0 ( 59752人评价 ) < |
|||
File diff suppressed because one or more lines are too long
@ -0,0 +1,21 @@ |
|||
======================================== |
|||
文章数据批次保存 |
|||
======================================== |
|||
|
|||
保存时间: 2026-05-31 01:01:48 |
|||
文章数量: 1 |
|||
|
|||
======================================== |
|||
|
|||
---------------------------------------- |
|||
文章 1 |
|||
---------------------------------------- |
|||
ID: 1 |
|||
标题: 长沙市, 湖南省月度天气预报 - weather.com |
|||
URL: https://weather.com/zh-CN/weather/monthly/l/2add4f77b58b85fbed59ae07151489bce5504e7ae027a17fa94ad0149eceef0f |
|||
来源: https://weather.com/zh-CN/weather/monthly/l/2add4f77b58b85fbed59ae07151489bce5504e7ae027a17fa94ad0149eceef0f |
|||
爬取时间: 2026-05-31 01:01:32 |
|||
|
|||
内容: |
|||
长沙市, 湖南省月度天气预报 - weather.com Hamburger The Weather Company Today Moon Phase - Day 8 26 Not Available -- -- Moon Phase - Day 9 27 Not Available -- -- Moon Phase - Day 10 28 Not Available -- -- Moon Phase - Day 11 29 Not Available -- -- Moon Phase - Day 12 30 Not Available -- -- Moon Phase - Day 13 1 Not Available -- -- Moon Phase - Day 14 2 Partly Cloudy Day 27 ° 19 ° Moon Phase - Day 15 3 Scattered Showers Day 26 ° 15 ° Moon Phase - Day 16 4 Partly Cloudy Day 25 ° 15 ° Moon Phase - Day 17 5 Partly Cloudy Day 28 ° 16 ° Moon Phase - Day 19 6 Partly Cloudy Day 30 ° 18 ° Moon Phase - Day 20 7 Partly Cloudy Day 32 ° 21 ° Moon Phase - Day 21 8 Showers 22 ° 18 ° Moon Phase - Day 22 9 Showers 21 ° 15 ° Moon Phase - Day 23 10 Partly Cloudy Day 24 ° 16 ° Moon Phase - Day 24 11 Partly Cloudy Day 27 ° 18 ° Moon Phase - Day 25 12 Partly Cloudy Day 30 ° 20 ° Moon Phase - Day 26 13 Showers 27 ° 19 ° Moon Phase - Day 27 14 Showers 23 ° 20 ° Moon Phase - Day 28 15 Cloudy 28 ° 21 ° Moon Phase - Day 29 16 Partly Cloudy Day 31 ° 23 ° Moon Phase - Day 1 17 Partly Cloudy Day 29 ° 22 ° Moon Phase - Day 2 18 Partly Cloudy Day 31 ° 23 ° Moon Phase - Day 3 19 Partly Cloudy Day 30 ° 21 ° Moon Phase - Day 4 20 Showers 23 ° 19 ° Moon Phase - Day 5 21 Showers 23 ° 20 ° Moon Phase - Day 5 22 Showers 26 ° 22 ° Moon Phase - Day 6 23 Scattered Showers Day 29 ° 23 ° Moon Phase - Day 7 24 Scattered Showers Day 30 ° 25 ° Moon Phase - Day 8 25 Windy 33 ° 27 ° Moon Phase - Day 9 26 Partly Cloudy Day 35 ° 26 ° Moon Phase - Day 10 27 Partly Cloudy Day 30 ° 22 ° Moon Phase - Day 11 28 Showers 27 ° 22 ° Moon Phase - Day 12 29 Partly Cloudy Day 29 ° 22 ° Moon Phase - Day 13 30 Showers 24 ° 19 ° Moon Phase - Day 14 31 Partly Cloudy Day 31 ° 22 ° Moon Phase - Day 15 1 Mostly Clear Day 33 ° 23 ° Moon Phase - Day 16 2 Mostly Clear Day 34 ° 25 ° Moon Phase - Day 17 3 Scattered Thunderstorms 30 ° 24 ° Moon Phase - Day 18 4 Scattered Showers Day 30 ° 24 ° Moon Phase - Day 19 5 Showers 31 ° 24 ° Moon Phase - Day 20 
6 Scattered Thunderstorms 31 ° 24 ° Close 白天 31 ° Partly Cloudy Day Rain drop 3% 西 6 公里/小时 少云。 最高 31°C。 微风且风向多变。 Record High 最高纪录 36 ° Average High 平均最高 28 ° Sunrise 日出 5:31 Sunset 日落 19:19 夜间 22 ° Clear Night Rain drop 4% 北 6 公里/小时 大部晴朗。 最低 22°C。 微风且风向多变。 Record Low 最低记录 18 ° Average Low 平均最低 21 ° Moonrise 月出 19:32 Moonset 月落 4:57 Moon Phase - Day 14 满月 Moon Phase - Day 21 7 Showers 30 ° 23 ° Moon Phase - Day 22 8 Showers 29 ° 22 ° Moon Phase - Day 23 9 Scattered Showers Day 27 ° 21 ° Moon Phase - Day 24 10 Scattered Showers Day 28 ° 22 ° Moon Phase - Day 25 11 Partly Cloudy Day 29 ° 23 ° Moon Phase - Day 26 12 Scattered Showers Day 30 ° 24 ° Moon Phase - Day 27 13 Showers 30 ° 24 ° Not Available 14 Not Available -- -- Moon Phase - Day 0 15 平均气温 -- -- Moon Phase - Day 1 16 平均气温 -- -- Moon Phase - Day 2 17 平均气温 -- -- Moon Phase - Day 3 18 平均气温 -- -- Moon Phase - Day 5 19 平均气温 -- -- Moon Phase - Day 6 20 平均气温 -- -- Moon Phase - Day 6 21 平均气温 -- -- Moon Phase - Day 7 22 平均气温 -- -- Moon Phase - Day 8 23 平均气温 -- -- Moon Phase - Day 9 24 平均气温 -- -- Not Available 25 平均气温 -- -- Not Available 26 平均气温 -- -- Not Available 27 平均气温 -- -- Not Available 28 平均气温 -- -- Not Available 29 平均气温 -- -- Not Available 30 平均气温 -- -- Not Available 1 平均气温 -- -- Not Available 2 平均气温 -- -- Not Available 3 平均气温 -- -- Not Available 4 平均气温 -- -- Advertisement Advertisement 历史记录 5月31日 高 低 降水量 平均值 28 ° C 21 ° -- 记录 34 ° ( 2011 ) 17 ° ( 1993 ) -- 历史气温状况 昨日 24 ° 19 ° 0.25 毫米 过去七天 35 ° 19 ° 6.84 当月气温 35 ° 15 ° 61.66 历史月平均气温 五月 27 ° 19 ° 201.68 六月 30 ° 23 ° 224.28 七月 33 ° 26 ° 162.81 |
|||
|
|||
File diff suppressed because one or more lines are too long
@ -0,0 +1,31 @@ |
|||
======================================== |
|||
文章索引 |
|||
======================================== |
|||
|
|||
共有 1 篇文章 |
|||
|
|||
[1] 豆瓣音乐 Top 250 |
|||
URL: https://music.douban.com/top250 |
|||
文件名: article_1.txt |
|||
爬取时间: 2026-05-31 00:24:38 |
|||
|
|||
[保存记录] 2026-05-31 00:30:57 |
|||
批次文件: articles_20260531_003057.txt |
|||
文章数量: 1 |
|||
|
|||
[保存记录] 2026-05-31 00:48:41 |
|||
批次文件: articles_20260531_004841.txt |
|||
文章数量: 1 |
|||
|
|||
[保存记录] 2026-05-31 01:01:48 |
|||
批次文件: articles_20260531_010148.txt |
|||
文章数量: 1 |
|||
|
|||
[保存记录] 2026-05-31 10:58:16 |
|||
批次文件: articles_20260531_105816.txt |
|||
文章数量: 1 |
|||
|
|||
[保存记录] 2026-05-31 11:00:37 |
|||
批次文件: articles_20260531_110037.txt |
|||
文章数量: 1 |
|||
|
|||
@ -0,0 +1,50 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build descriptor for the console web crawler.
  The namespace URI below is an identifier and must stay exactly as is; only the
  schemaLocation hint (second token) points at a fetchable XSD, so it uses HTTPS.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.crawler</groupId>
    <artifactId>my-crawler</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>My Crawler</name>
    <description>A simple web crawler application</description>

    <properties>
        <!-- Java language level used for both source and bytecode. -->
        <maven.compiler.source>11</maven.compiler.source>
        <maven.compiler.target>11</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- HTML fetching/parsing library. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.16.1</version>
        </dependency>
        <!-- JSON serialization library. -->
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.10.1</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!--
              Records the entry point in the jar manifest.
              NOTE(review): the jar produced here does not bundle or reference the
              dependencies above, so running it with "java -jar" presumably needs
              them supplied on the classpath separately; confirm how it is launched.
            -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
                <version>3.3.0</version>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.crawler.App</mainClass>
                        </manifest>
                    </archive>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
|||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,63 @@ |
|||
package com.crawler; |
|||
|
|||
import com.crawler.command.*; |
|||
import com.crawler.controller.CrawlerController; |
|||
import com.crawler.repository.ArticleRepository; |
|||
import com.crawler.repository.InMemoryArticleRepository; |
|||
import com.crawler.view.ConsoleView; |
|||
|
|||
import java.util.HashMap; |
|||
import java.util.Map; |
|||
import java.util.concurrent.atomic.AtomicBoolean; |
|||
|
|||
public class App { |
|||
private final Map<String, Command> commands = new HashMap<>(); |
|||
private final ConsoleView view; |
|||
private final AtomicBoolean running = new AtomicBoolean(true); |
|||
|
|||
public App() { |
|||
view = new ConsoleView(); |
|||
ArticleRepository repository = new InMemoryArticleRepository(); |
|||
CrawlerController controller = new CrawlerController(repository, view); |
|||
|
|||
commands.put("crawl", new CrawlCommand(controller, view)); |
|||
commands.put("list", new ListCommand(controller)); |
|||
commands.put("save", new SaveCommand(controller)); |
|||
commands.put("load", new LoadCommand(controller)); |
|||
commands.put("help", new HelpCommand(view)); |
|||
commands.put("exit", new ExitCommand(view, () -> running.set(false))); |
|||
} |
|||
|
|||
public void run() { |
|||
view.displayWelcome(); |
|||
view.displayHelp(); |
|||
|
|||
while (running.get()) { |
|||
try { |
|||
String input = view.readInput(); |
|||
if (input.isEmpty()) { |
|||
continue; |
|||
} |
|||
|
|||
String[] parts = input.split("\\s+", 3); |
|||
String commandName = parts[0].toLowerCase(); |
|||
String[] args = parts.length > 1 ? java.util.Arrays.copyOfRange(parts, 1, parts.length) : new String[0]; |
|||
|
|||
Command command = commands.get(commandName); |
|||
if (command != null) { |
|||
command.execute(args); |
|||
} else { |
|||
view.displayError("Unknown command: " + commandName); |
|||
view.displayInfo("Type 'help' for available commands"); |
|||
} |
|||
} catch (Exception e) { |
|||
view.displayError("Error: " + e.getMessage()); |
|||
} |
|||
} |
|||
} |
|||
|
|||
public static void main(String[] args) { |
|||
App app = new App(); |
|||
app.run(); |
|||
} |
|||
} |
|||
@ -0,0 +1,7 @@ |
|||
package com.crawler.command; |
|||
|
|||
/**
 * A single console command that can be looked up by name and executed with
 * the arguments typed after the command word.
 */
public interface Command {

    /**
     * Runs the command.
     *
     * @param args the tokens that followed the command word (may be empty)
     * @throws Exception if the command fails
     */
    void execute(String[] args) throws Exception;

    /** @return the word that identifies this command */
    String getCommandName();

    /** @return a short human-readable summary of the command */
    String getDescription();
}
|||
@ -0,0 +1,41 @@ |
|||
package com.crawler.command; |
|||
|
|||
import com.crawler.controller.CrawlerController; |
|||
import com.crawler.view.ConsoleView; |
|||
|
|||
public class CrawlCommand implements Command { |
|||
private final CrawlerController controller; |
|||
private final ConsoleView view; |
|||
|
|||
public CrawlCommand(CrawlerController controller, ConsoleView view) { |
|||
this.controller = controller; |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args) { |
|||
if (args.length < 1) { |
|||
view.displayError("Usage: crawl <url> [strategy]"); |
|||
return; |
|||
} |
|||
|
|||
String url = args[0]; |
|||
String strategy = args.length > 1 ? args[1] : "jsoup"; |
|||
|
|||
try { |
|||
controller.crawl(url, strategy); |
|||
} catch (Exception e) { |
|||
view.displayError("Crawl failed: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public String getCommandName() { |
|||
return "crawl"; |
|||
} |
|||
|
|||
@Override |
|||
public String getDescription() { |
|||
return "Crawl a website"; |
|||
} |
|||
} |
|||
@ -0,0 +1,31 @@ |
|||
package com.crawler.command; |
|||
|
|||
import com.crawler.view.ConsoleView; |
|||
|
|||
public class ExitCommand implements Command { |
|||
private final ConsoleView view; |
|||
private Runnable exitCallback; |
|||
|
|||
public ExitCommand(ConsoleView view, Runnable exitCallback) { |
|||
this.view = view; |
|||
this.exitCallback = exitCallback; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args) { |
|||
view.displayGoodbye(); |
|||
if (exitCallback != null) { |
|||
exitCallback.run(); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public String getCommandName() { |
|||
return "exit"; |
|||
} |
|||
|
|||
@Override |
|||
public String getDescription() { |
|||
return "Exit the application"; |
|||
} |
|||
} |
|||
@ -0,0 +1,26 @@ |
|||
package com.crawler.command; |
|||
|
|||
import com.crawler.view.ConsoleView; |
|||
|
|||
public class HelpCommand implements Command { |
|||
private final ConsoleView view; |
|||
|
|||
public HelpCommand(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args) { |
|||
view.displayHelp(); |
|||
} |
|||
|
|||
@Override |
|||
public String getCommandName() { |
|||
return "help"; |
|||
} |
|||
|
|||
@Override |
|||
public String getDescription() { |
|||
return "Show help message"; |
|||
} |
|||
} |
|||
@ -0,0 +1,26 @@ |
|||
package com.crawler.command; |
|||
|
|||
import com.crawler.controller.CrawlerController; |
|||
|
|||
public class ListCommand implements Command { |
|||
private final CrawlerController controller; |
|||
|
|||
public ListCommand(CrawlerController controller) { |
|||
this.controller = controller; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args) { |
|||
controller.listArticles(); |
|||
} |
|||
|
|||
@Override |
|||
public String getCommandName() { |
|||
return "list"; |
|||
} |
|||
|
|||
@Override |
|||
public String getDescription() { |
|||
return "List all crawled articles"; |
|||
} |
|||
} |
|||
@ -0,0 +1,26 @@ |
|||
package com.crawler.command; |
|||
|
|||
import com.crawler.controller.CrawlerController; |
|||
|
|||
public class LoadCommand implements Command { |
|||
private final CrawlerController controller; |
|||
|
|||
public LoadCommand(CrawlerController controller) { |
|||
this.controller = controller; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args) { |
|||
controller.loadData(); |
|||
} |
|||
|
|||
@Override |
|||
public String getCommandName() { |
|||
return "load"; |
|||
} |
|||
|
|||
@Override |
|||
public String getDescription() { |
|||
return "Load articles from data file"; |
|||
} |
|||
} |
|||
@ -0,0 +1,26 @@ |
|||
package com.crawler.command; |
|||
|
|||
import com.crawler.controller.CrawlerController; |
|||
|
|||
public class SaveCommand implements Command { |
|||
private final CrawlerController controller; |
|||
|
|||
public SaveCommand(CrawlerController controller) { |
|||
this.controller = controller; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args) { |
|||
controller.saveData(); |
|||
} |
|||
|
|||
@Override |
|||
public String getCommandName() { |
|||
return "save"; |
|||
} |
|||
|
|||
@Override |
|||
public String getDescription() { |
|||
return "Save articles to data file"; |
|||
} |
|||
} |
|||
@ -0,0 +1,67 @@ |
|||
package com.crawler.controller; |
|||
|
|||
import java.util.List; |
|||
|
|||
import com.crawler.factory.StrategyFactory; |
|||
import com.crawler.model.Article; |
|||
import com.crawler.repository.ArticleRepository; |
|||
import com.crawler.strategy.CrawlStrategy; |
|||
import com.crawler.util.DataPersistence; |
|||
import com.crawler.view.ConsoleView; |
|||
|
|||
public class CrawlerController { |
|||
private final ArticleRepository repository; |
|||
private final ConsoleView view; |
|||
|
|||
public CrawlerController(ArticleRepository repository, ConsoleView view) { |
|||
this.repository = repository; |
|||
this.view = view; |
|||
loadSavedData(); |
|||
} |
|||
|
|||
private void loadSavedData() { |
|||
List<Article> savedArticles = DataPersistence.loadArticles(); |
|||
if (!savedArticles.isEmpty()) { |
|||
repository.saveAll(savedArticles); |
|||
view.displayInfo("Loaded " + savedArticles.size() + " saved articles"); |
|||
} |
|||
} |
|||
|
|||
public void crawl(String url, String strategyName) throws Exception { |
|||
if (url == null || url.trim().isEmpty()) { |
|||
throw new IllegalArgumentException("URL cannot be empty"); |
|||
} |
|||
|
|||
if (!url.startsWith("http://") && !url.startsWith("https://")) { |
|||
url = "https://" + url; |
|||
} |
|||
|
|||
// 移除URL重复检查,允许重复爬取同一URL
|
|||
view.displayInfo("Crawling: " + url); |
|||
view.displayInfo("Using strategy: " + strategyName); |
|||
|
|||
CrawlStrategy strategy = StrategyFactory.getStrategy(strategyName); |
|||
List<Article> articles = strategy.crawl(url); |
|||
|
|||
for (Article article : articles) { |
|||
repository.save(article); |
|||
view.displaySuccess("Crawled: " + article.getTitle()); |
|||
} |
|||
} |
|||
|
|||
public void listArticles() { |
|||
List<Article> articles = repository.findAll(); |
|||
view.displayArticleList(articles); |
|||
} |
|||
|
|||
public void saveData() { |
|||
List<Article> articles = repository.findAll(); |
|||
DataPersistence.saveArticles(articles); |
|||
} |
|||
|
|||
public void loadData() { |
|||
repository.deleteAll(); |
|||
List<Article> savedArticles = DataPersistence.loadArticles(); |
|||
repository.saveAll(savedArticles); |
|||
} |
|||
} |
|||
@ -0,0 +1,11 @@ |
|||
package com.crawler.exception; |
|||
|
|||
/** Unchecked base type for the crawler's own exceptions. */
public class CrawlerException extends RuntimeException {

    /**
     * @param message description of the failure
     */
    public CrawlerException(final String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception
     */
    public CrawlerException(final String message, final Throwable cause) {
        super(message, cause);
    }
}
|||
@ -0,0 +1,11 @@ |
|||
package com.crawler.exception; |
|||
|
|||
public class NetworkException extends CrawlerException { |
|||
public NetworkException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public NetworkException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,11 @@ |
|||
package com.crawler.exception; |
|||
|
|||
public class ParseException extends CrawlerException { |
|||
public ParseException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public ParseException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,11 @@ |
|||
package com.crawler.exception; |
|||
|
|||
public class UrlFormatException extends CrawlerException { |
|||
public UrlFormatException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public UrlFormatException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,27 @@ |
|||
package com.crawler.factory; |
|||
|
|||
import com.crawler.strategy.*; |
|||
import java.util.HashMap; |
|||
import java.util.Map; |
|||
|
|||
public class StrategyFactory { |
|||
private static final Map<String, CrawlStrategy> strategies = new HashMap<>(); |
|||
|
|||
static { |
|||
strategies.put("blog", new BlogCrawlStrategy()); |
|||
strategies.put("news", new NewsCrawlStrategy()); |
|||
strategies.put("jsoup", new JsoupCrawlStrategy()); |
|||
} |
|||
|
|||
public static CrawlStrategy getStrategy(String strategyName) { |
|||
return strategies.getOrDefault(strategyName.toLowerCase(), new JsoupCrawlStrategy()); |
|||
} |
|||
|
|||
public static boolean hasStrategy(String strategyName) { |
|||
return strategies.containsKey(strategyName.toLowerCase()); |
|||
} |
|||
|
|||
public static String[] getAvailableStrategies() { |
|||
return strategies.keySet().toArray(new String[0]); |
|||
} |
|||
} |
|||
@ -0,0 +1,104 @@ |
|||
package com.crawler.model; |
|||
|
|||
import java.io.Serializable; |
|||
import java.time.LocalDateTime; |
|||
|
|||
public class Article implements Serializable { |
|||
private static final long serialVersionUID = 1L; |
|||
private String id; |
|||
private String title; |
|||
private String url; |
|||
private String content; |
|||
private String author; |
|||
private LocalDateTime publishDate; |
|||
private LocalDateTime crawlDate; |
|||
private String source; |
|||
|
|||
public Article() { |
|||
this.crawlDate = LocalDateTime.now(); |
|||
} |
|||
|
|||
public Article(String title, String url, String content) { |
|||
this.title = title; |
|||
this.url = url; |
|||
this.content = content; |
|||
this.crawlDate = LocalDateTime.now(); |
|||
} |
|||
|
|||
public String getId() { |
|||
return id; |
|||
} |
|||
|
|||
public void setId(String id) { |
|||
this.id = id; |
|||
} |
|||
|
|||
public String getTitle() { |
|||
return title; |
|||
} |
|||
|
|||
public void setTitle(String title) { |
|||
this.title = title; |
|||
} |
|||
|
|||
public String getUrl() { |
|||
return url; |
|||
} |
|||
|
|||
public void setUrl(String url) { |
|||
this.url = url; |
|||
} |
|||
|
|||
public String getContent() { |
|||
return content; |
|||
} |
|||
|
|||
public void setContent(String content) { |
|||
this.content = content; |
|||
} |
|||
|
|||
public String getAuthor() { |
|||
return author; |
|||
} |
|||
|
|||
public void setAuthor(String author) { |
|||
this.author = author; |
|||
} |
|||
|
|||
public LocalDateTime getPublishDate() { |
|||
return publishDate; |
|||
} |
|||
|
|||
public void setPublishDate(LocalDateTime publishDate) { |
|||
this.publishDate = publishDate; |
|||
} |
|||
|
|||
public LocalDateTime getCrawlDate() { |
|||
return crawlDate; |
|||
} |
|||
|
|||
public void setCrawlDate(LocalDateTime crawlDate) { |
|||
this.crawlDate = crawlDate; |
|||
} |
|||
|
|||
public String getSource() { |
|||
return source; |
|||
} |
|||
|
|||
public void setSource(String source) { |
|||
this.source = source; |
|||
} |
|||
|
|||
@Override |
|||
public String toString() { |
|||
return "Article{" + |
|||
"id='" + id + '\'' + |
|||
", title='" + title + '\'' + |
|||
", url='" + url + '\'' + |
|||
", author='" + author + '\'' + |
|||
", publishDate=" + publishDate + |
|||
", crawlDate=" + crawlDate + |
|||
", source='" + source + '\'' + |
|||
'}'; |
|||
} |
|||
} |
|||
@ -0,0 +1,18 @@ |
|||
package com.crawler.repository; |
|||
|
|||
import com.crawler.model.Article; |
|||
import java.util.List; |
|||
import java.util.Optional; |
|||
|
|||
public interface ArticleRepository { |
|||
void save(Article article); |
|||
void saveAll(List<Article> articles); |
|||
Optional<Article> findById(String id); |
|||
Optional<Article> findByUrl(String url); |
|||
List<Article> findAll(); |
|||
List<Article> findBySource(String source); |
|||
void deleteById(String id); |
|||
void deleteAll(); |
|||
int count(); |
|||
boolean existsByUrl(String url); |
|||
} |
|||
@ -0,0 +1,78 @@ |
|||
package com.crawler.repository; |
|||
|
|||
import com.crawler.model.Article; |
|||
import java.util.*; |
|||
import java.util.concurrent.ConcurrentHashMap; |
|||
import java.util.concurrent.atomic.AtomicInteger; |
|||
import java.util.stream.Collectors; |
|||
|
|||
public class InMemoryArticleRepository implements ArticleRepository { |
|||
private final Map<String, Article> articles = new ConcurrentHashMap<>(); |
|||
private final Map<String, String> urlToIdMap = new ConcurrentHashMap<>(); |
|||
private final AtomicInteger idGenerator = new AtomicInteger(1); |
|||
|
|||
@Override |
|||
public void save(Article article) { |
|||
if (article.getId() == null) { |
|||
article.setId(String.valueOf(idGenerator.getAndIncrement())); |
|||
} |
|||
articles.put(article.getId(), article); |
|||
if (article.getUrl() != null) { |
|||
urlToIdMap.put(article.getUrl(), article.getId()); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public void saveAll(List<Article> articleList) { |
|||
for (Article article : articleList) { |
|||
save(article); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public Optional<Article> findById(String id) { |
|||
return Optional.ofNullable(articles.get(id)); |
|||
} |
|||
|
|||
@Override |
|||
public Optional<Article> findByUrl(String url) { |
|||
String id = urlToIdMap.get(url); |
|||
return id != null ? Optional.ofNullable(articles.get(id)) : Optional.empty(); |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> findAll() { |
|||
return new ArrayList<>(articles.values()); |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> findBySource(String source) { |
|||
return articles.values().stream() |
|||
.filter(a -> source.equals(a.getSource())) |
|||
.collect(Collectors.toList()); |
|||
} |
|||
|
|||
@Override |
|||
public void deleteById(String id) { |
|||
Article article = articles.remove(id); |
|||
if (article != null && article.getUrl() != null) { |
|||
urlToIdMap.remove(article.getUrl()); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
public void deleteAll() { |
|||
articles.clear(); |
|||
urlToIdMap.clear(); |
|||
} |
|||
|
|||
@Override |
|||
public int count() { |
|||
return articles.size(); |
|||
} |
|||
|
|||
@Override |
|||
public boolean existsByUrl(String url) { |
|||
return urlToIdMap.containsKey(url); |
|||
} |
|||
} |
|||
@ -0,0 +1,76 @@ |
|||
package com.crawler.strategy; |
|||
|
|||
import com.crawler.model.Article; |
|||
|
|||
import java.io.BufferedReader; |
|||
import java.io.InputStreamReader; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.URL; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class BlogCrawlStrategy implements CrawlStrategy { |
|||
|
|||
@Override |
|||
public List<Article> crawl(String url) { |
|||
List<Article> articles = new ArrayList<>(); |
|||
try { |
|||
URL urlObj = new URL(url); |
|||
HttpURLConnection connection = (HttpURLConnection) urlObj.openConnection(); |
|||
connection.setRequestMethod("GET"); |
|||
connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"); |
|||
connection.setConnectTimeout(10000); |
|||
connection.setReadTimeout(10000); |
|||
|
|||
StringBuilder content = new StringBuilder(); |
|||
try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"))) { |
|||
String line; |
|||
while ((line = reader.readLine()) != null) { |
|||
content.append(line).append("\n"); |
|||
} |
|||
} |
|||
|
|||
Article article = new Article(); |
|||
article.setTitle("Blog: " + extractTitle(content.toString())); |
|||
article.setUrl(url); |
|||
article.setSource("blog"); |
|||
article.setContent(extractText(content.toString())); |
|||
article.setAuthor("Blog Author"); |
|||
|
|||
articles.add(article); |
|||
|
|||
} catch (Exception e) { |
|||
Article errorArticle = new Article(); |
|||
errorArticle.setTitle("Error crawling blog: " + url); |
|||
errorArticle.setUrl(url); |
|||
errorArticle.setContent("Error details: " + e.getMessage()); |
|||
errorArticle.setSource("blog"); |
|||
articles.add(errorArticle); |
|||
} |
|||
return articles; |
|||
} |
|||
|
|||
private String extractTitle(String html) { |
|||
Pattern pattern = Pattern.compile("<title[^>]*>([^<]+)</title>", Pattern.CASE_INSENSITIVE); |
|||
Matcher matcher = pattern.matcher(html); |
|||
if (matcher.find()) { |
|||
return matcher.group(1).trim(); |
|||
} |
|||
return "Untitled Blog"; |
|||
} |
|||
|
|||
private String extractText(String html) { |
|||
return html.replaceAll("<script[^>]*>[\\s\\S]*?</script>", "") |
|||
.replaceAll("<style[^>]*>[\\s\\S]*?</style>", "") |
|||
.replaceAll("<[^>]+>", " ") |
|||
.replaceAll("\\s+", " ") |
|||
.trim(); |
|||
} |
|||
|
|||
@Override |
|||
public String getStrategyName() { |
|||
return "blog"; |
|||
} |
|||
} |
|||
@ -0,0 +1,9 @@ |
|||
package com.crawler.strategy; |
|||
|
|||
import com.crawler.model.Article; |
|||
import java.util.List; |
|||
|
|||
public interface CrawlStrategy { |
|||
List<Article> crawl(String url) throws Exception; |
|||
String getStrategyName(); |
|||
} |
|||
@ -0,0 +1,170 @@ |
|||
package com.crawler.strategy; |
|||
|
|||
import com.crawler.model.Article; |
|||
|
|||
import java.io.BufferedReader; |
|||
import java.io.InputStreamReader; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.URL; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class DoubanTop250Strategy implements CrawlStrategy { |
|||
|
|||
private static final int TOTAL_MOVIES = 250; |
|||
private static final int MOVIES_PER_PAGE = 25; |
|||
|
|||
@Override |
|||
public List<Article> crawl(String url) { |
|||
List<Article> allMovies = new ArrayList<>(); |
|||
try { |
|||
System.out.println("🎬 开始爬取豆瓣电影 Top 250..."); |
|||
System.out.println("⏳ 预计需要爬取 " + (TOTAL_MOVIES / MOVIES_PER_PAGE) + " 页"); |
|||
|
|||
for (int page = 0; page < TOTAL_MOVIES; page += MOVIES_PER_PAGE) { |
|||
String pageUrl = "https://movie.douban.com/top250?start=" + page + "&filter="; |
|||
System.out.println("📄 正在爬取第 " + (page / MOVIES_PER_PAGE + 1) + " 页..."); |
|||
|
|||
List<Article> pageMovies = crawlPage(pageUrl, page / MOVIES_PER_PAGE + 1); |
|||
allMovies.addAll(pageMovies); |
|||
|
|||
System.out.println("✅ 第 " + (page / MOVIES_PER_PAGE + 1) + " 页完成,已获取 " + allMovies.size() + " 部电影"); |
|||
|
|||
try { |
|||
Thread.sleep(1000); |
|||
} catch (InterruptedException e) { |
|||
Thread.currentThread().interrupt(); |
|||
break; |
|||
} |
|||
} |
|||
|
|||
System.out.println("🎉 完成!共爬取 " + allMovies.size() + " 部电影"); |
|||
} catch (Exception e) { |
|||
System.err.println("❌ 爬取失败: " + e.getMessage()); |
|||
Article errorArticle = new Article(); |
|||
errorArticle.setTitle("Error crawling Douban Top 250"); |
|||
errorArticle.setUrl(url); |
|||
errorArticle.setContent("Error details: " + e.getMessage()); |
|||
errorArticle.setSource("douban"); |
|||
allMovies.add(errorArticle); |
|||
} |
|||
return allMovies; |
|||
} |
|||
|
|||
private List<Article> crawlPage(String url, int pageNum) { |
|||
List<Article> movies = new ArrayList<>(); |
|||
try { |
|||
URL urlObj = new URL(url); |
|||
HttpURLConnection connection = (HttpURLConnection) urlObj.openConnection(); |
|||
connection.setRequestMethod("GET"); |
|||
connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"); |
|||
connection.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); |
|||
connection.setConnectTimeout(15000); |
|||
connection.setReadTimeout(15000); |
|||
|
|||
StringBuilder html = new StringBuilder(); |
|||
try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"))) { |
|||
String line; |
|||
while ((line = reader.readLine()) != null) { |
|||
html.append(line).append("\n"); |
|||
} |
|||
} |
|||
|
|||
movies = parseMovies(html.toString()); |
|||
} catch (Exception e) { |
|||
System.err.println("⚠️ 第 " + pageNum + " 页爬取失败: " + e.getMessage()); |
|||
} |
|||
return movies; |
|||
} |
|||
|
|||
private List<Article> parseMovies(String html) { |
|||
List<Article> movies = new ArrayList<>(); |
|||
|
|||
String moviePattern = "<div class=\"item\">[\\s\\S]*?</div>\\s*</div>\\s*</div>"; |
|||
Pattern pattern = Pattern.compile(moviePattern, Pattern.DOTALL); |
|||
Matcher matcher = pattern.matcher(html); |
|||
|
|||
while (matcher.find()) { |
|||
try { |
|||
Article movie = parseSingleMovie(matcher.group()); |
|||
if (movie != null) { |
|||
movies.add(movie); |
|||
} |
|||
} catch (Exception e) { |
|||
continue; |
|||
} |
|||
} |
|||
return movies; |
|||
} |
|||
|
|||
private Article parseSingleMovie(String movieHtml) { |
|||
Article movie = new Article(); |
|||
movie.setSource("douban"); |
|||
|
|||
try { |
|||
Pattern titlePattern = Pattern.compile("<span class=\"title\">(.*?)</span>"); |
|||
Matcher titleMatcher = titlePattern.matcher(movieHtml); |
|||
if (titleMatcher.find()) { |
|||
movie.setTitle(titleMatcher.group(1)); |
|||
} |
|||
|
|||
Pattern linkPattern = Pattern.compile("<a href=\"(.*?)\""); |
|||
Matcher linkMatcher = linkPattern.matcher(movieHtml); |
|||
if (linkMatcher.find()) { |
|||
movie.setUrl(linkMatcher.group(1)); |
|||
} |
|||
|
|||
Pattern ratingPattern = Pattern.compile("<span class=\"rating_num\">(.*?)</span>"); |
|||
Matcher ratingMatcher = ratingPattern.matcher(movieHtml); |
|||
String rating = ""; |
|||
if (ratingMatcher.find()) { |
|||
rating = ratingMatcher.group(1); |
|||
} |
|||
|
|||
Pattern yearPattern = Pattern.compile("(\\d{4})\\s*/"); |
|||
Matcher yearMatcher = yearPattern.matcher(movieHtml); |
|||
String year = ""; |
|||
if (yearMatcher.find()) { |
|||
year = yearMatcher.group(1); |
|||
} |
|||
|
|||
Pattern quotePattern = Pattern.compile("<span class=\"inq\">(.*?)</span>"); |
|||
Matcher quoteMatcher = quotePattern.matcher(movieHtml); |
|||
String quote = ""; |
|||
if (quoteMatcher.find()) { |
|||
quote = quoteMatcher.group(1); |
|||
} |
|||
|
|||
Pattern infoPattern = Pattern.compile("<p class=\"\">(.*?)</p>", Pattern.DOTALL); |
|||
Matcher infoMatcher = infoPattern.matcher(movieHtml); |
|||
String info = ""; |
|||
if (infoMatcher.find()) { |
|||
info = infoMatcher.group(1).replaceAll("<br\\s*/?>", "\n").replaceAll("<[^>]+>", "").trim(); |
|||
} |
|||
|
|||
StringBuilder content = new StringBuilder(); |
|||
content.append("🎬 电影名称: ").append(movie.getTitle()).append("\n"); |
|||
content.append("⭐ 评分: ").append(rating).append("\n"); |
|||
content.append("📅 年份: ").append(year).append("\n"); |
|||
if (!quote.isEmpty()) { |
|||
content.append("💬 简介: ").append(quote).append("\n"); |
|||
} |
|||
content.append("\n📝 详细信息:\n").append(info); |
|||
|
|||
movie.setContent(content.toString()); |
|||
movie.setAuthor("豆瓣电影"); |
|||
|
|||
} catch (Exception e) { |
|||
return null; |
|||
} |
|||
|
|||
return movie; |
|||
} |
|||
|
|||
@Override |
|||
public String getStrategyName() { |
|||
return "douban"; |
|||
} |
|||
} |
|||
@ -0,0 +1,75 @@ |
|||
package com.crawler.strategy; |
|||
|
|||
import java.io.BufferedReader; |
|||
import java.io.InputStreamReader; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.URL; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
import com.crawler.model.Article; |
|||
|
|||
public class JsoupCrawlStrategy implements CrawlStrategy { |
|||
|
|||
@Override |
|||
public List<Article> crawl(String url) { |
|||
List<Article> articles = new ArrayList<>(); |
|||
try { |
|||
URL urlObj = new URL(url); |
|||
HttpURLConnection connection = (HttpURLConnection) urlObj.openConnection(); |
|||
connection.setRequestMethod("GET"); |
|||
connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"); |
|||
connection.setConnectTimeout(10000); |
|||
connection.setReadTimeout(10000); |
|||
|
|||
StringBuilder content = new StringBuilder(); |
|||
try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"))) { |
|||
String line; |
|||
while ((line = reader.readLine()) != null) { |
|||
content.append(line).append("\n"); |
|||
} |
|||
} |
|||
|
|||
Article article = new Article(); |
|||
article.setTitle(extractTitle(content.toString())); |
|||
article.setUrl(url); |
|||
article.setSource(url); |
|||
article.setContent(extractText(content.toString())); |
|||
|
|||
articles.add(article); |
|||
|
|||
} catch (Exception e) { |
|||
Article errorArticle = new Article(); |
|||
errorArticle.setTitle("Error crawling: " + url); |
|||
errorArticle.setUrl(url); |
|||
errorArticle.setContent("Error details: " + e.getMessage()); |
|||
errorArticle.setSource(url); |
|||
articles.add(errorArticle); |
|||
} |
|||
return articles; |
|||
} |
|||
|
|||
private String extractTitle(String html) { |
|||
Pattern pattern = Pattern.compile("<title[^>]*>([^<]+)</title>", Pattern.CASE_INSENSITIVE); |
|||
Matcher matcher = pattern.matcher(html); |
|||
if (matcher.find()) { |
|||
return matcher.group(1).trim(); |
|||
} |
|||
return "Untitled Page"; |
|||
} |
|||
|
|||
private String extractText(String html) { |
|||
return html.replaceAll("<script[^>]*>[\\s\\S]*?</script>", "") |
|||
.replaceAll("<style[^>]*>[\\s\\S]*?</style>", "") |
|||
.replaceAll("<[^>]+>", " ") |
|||
.replaceAll("\\s+", " ") |
|||
.trim(); |
|||
} |
|||
|
|||
@Override |
|||
public String getStrategyName() { |
|||
return "jsoup"; |
|||
} |
|||
} |
|||
@ -0,0 +1,76 @@ |
|||
package com.crawler.strategy; |
|||
|
|||
import java.io.BufferedReader; |
|||
import java.io.InputStreamReader; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.URL; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
import com.crawler.model.Article; |
|||
|
|||
public class NewsCrawlStrategy implements CrawlStrategy { |
|||
|
|||
@Override |
|||
public List<Article> crawl(String url) { |
|||
List<Article> articles = new ArrayList<>(); |
|||
try { |
|||
URL urlObj = new URL(url); |
|||
HttpURLConnection connection = (HttpURLConnection) urlObj.openConnection(); |
|||
connection.setRequestMethod("GET"); |
|||
connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"); |
|||
connection.setConnectTimeout(10000); |
|||
connection.setReadTimeout(10000); |
|||
|
|||
StringBuilder content = new StringBuilder(); |
|||
try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"))) { |
|||
String line; |
|||
while ((line = reader.readLine()) != null) { |
|||
content.append(line).append("\n"); |
|||
} |
|||
} |
|||
|
|||
Article article = new Article(); |
|||
article.setTitle("News: " + extractTitle(content.toString())); |
|||
article.setUrl(url); |
|||
article.setSource("news"); |
|||
article.setContent(extractText(content.toString())); |
|||
article.setAuthor("News Reporter"); |
|||
|
|||
articles.add(article); |
|||
|
|||
} catch (Exception e) { |
|||
Article errorArticle = new Article(); |
|||
errorArticle.setTitle("Error crawling news: " + url); |
|||
errorArticle.setUrl(url); |
|||
errorArticle.setContent("Error details: " + e.getMessage()); |
|||
errorArticle.setSource("news"); |
|||
articles.add(errorArticle); |
|||
} |
|||
return articles; |
|||
} |
|||
|
|||
private String extractTitle(String html) { |
|||
Pattern pattern = Pattern.compile("<title[^>]*>([^<]+)</title>", Pattern.CASE_INSENSITIVE); |
|||
Matcher matcher = pattern.matcher(html); |
|||
if (matcher.find()) { |
|||
return matcher.group(1).trim(); |
|||
} |
|||
return "Untitled News"; |
|||
} |
|||
|
|||
private String extractText(String html) { |
|||
return html.replaceAll("<script[^>]*>[\\s\\S]*?</script>", "") |
|||
.replaceAll("<style[^>]*>[\\s\\S]*?</style>", "") |
|||
.replaceAll("<[^>]+>", " ") |
|||
.replaceAll("\\s+", " ") |
|||
.trim(); |
|||
} |
|||
|
|||
@Override |
|||
public String getStrategyName() { |
|||
return "news"; |
|||
} |
|||
} |
|||
@ -0,0 +1,54 @@ |
|||
package com.crawler.util; |
|||
|
|||
/**
 * ANSI escape sequences for console output, plus helpers that wrap a text in a
 * sequence and append {@link #RESET}.
 */
public class ColorUtil {

    /** Sequence appended by every helper in this class. */
    public static final String RESET = "\u001B[0m";

    // Foreground colours.
    public static final String BLACK = "\u001B[30m";
    public static final String RED = "\u001B[31m";
    public static final String GREEN = "\u001B[32m";
    public static final String YELLOW = "\u001B[33m";
    public static final String BLUE = "\u001B[34m";
    public static final String PURPLE = "\u001B[35m";
    public static final String CYAN = "\u001B[36m";
    public static final String WHITE = "\u001B[37m";

    // Background colours.
    public static final String BLACK_BG = "\u001B[40m";
    public static final String RED_BG = "\u001B[41m";
    public static final String GREEN_BG = "\u001B[42m";
    public static final String YELLOW_BG = "\u001B[43m";
    public static final String BLUE_BG = "\u001B[44m";
    public static final String PURPLE_BG = "\u001B[45m";
    public static final String CYAN_BG = "\u001B[46m";
    public static final String WHITE_BG = "\u001B[47m";

    /** Sequence used by {@link #bold(String)}. */
    private static final String BOLD = "\u001B[1m";

    /**
     * Surrounds a text with an escape sequence and the reset sequence.
     *
     * @param text  text to wrap
     * @param color escape sequence placed in front of the text
     * @return {@code color}, {@code text} and {@link #RESET} joined in that order
     */
    public static String colorize(String text, String color) {
        StringBuilder styled = new StringBuilder();
        styled.append(color).append(text).append(RESET);
        return styled.toString();
    }

    /** @return {@code text} wrapped in {@link #GREEN} and {@link #RESET} */
    public static String green(String text) {
        return colorize(text, GREEN);
    }

    /** @return {@code text} wrapped in {@link #RED} and {@link #RESET} */
    public static String red(String text) {
        return colorize(text, RED);
    }

    /** @return {@code text} wrapped in {@link #YELLOW} and {@link #RESET} */
    public static String yellow(String text) {
        return colorize(text, YELLOW);
    }

    /** @return {@code text} wrapped in {@link #BLUE} and {@link #RESET} */
    public static String blue(String text) {
        return colorize(text, BLUE);
    }

    /** @return {@code text} wrapped in {@link #CYAN} and {@link #RESET} */
    public static String cyan(String text) {
        return colorize(text, CYAN);
    }

    /** @return {@code text} wrapped in {@link #PURPLE} and {@link #RESET} */
    public static String purple(String text) {
        return colorize(text, PURPLE);
    }

    /** @return {@code text} wrapped in the bold sequence and {@link #RESET} */
    public static String bold(String text) {
        return colorize(text, BOLD);
    }
}
|||
@ -0,0 +1,193 @@ |
|||
package com.crawler.util; |
|||
|
|||
import com.crawler.model.Article; |
|||
|
|||
import java.io.*; |
|||
import java.nio.charset.StandardCharsets; |
|||
import java.time.LocalDateTime; |
|||
import java.time.format.DateTimeFormatter; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class DataPersistence { |
|||
private static final String DATA_FOLDER = "data"; |
|||
private static final String INDEX_FILE = DATA_FOLDER + File.separator + "index.txt"; |
|||
private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
private static final DateTimeFormatter FILE_TIMESTAMP_FORMATTER = DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss"); |
|||
|
|||
static { |
|||
File folder = new File(DATA_FOLDER); |
|||
if (!folder.exists()) { |
|||
folder.mkdirs(); |
|||
} |
|||
} |
|||
|
|||
public static void saveArticles(List<Article> articles) { |
|||
try { |
|||
String timestamp = LocalDateTime.now().format(FILE_TIMESTAMP_FORMATTER); |
|||
String batchFileName = DATA_FOLDER + File.separator + "articles_" + timestamp + ".txt"; |
|||
|
|||
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(batchFileName), StandardCharsets.UTF_8))) { |
|||
writer.write("========================================\n"); |
|||
writer.write(" 文章数据批次保存\n"); |
|||
writer.write("========================================\n\n"); |
|||
writer.write("保存时间: " + LocalDateTime.now().format(DATE_FORMATTER) + "\n"); |
|||
writer.write("文章数量: " + articles.size() + "\n\n"); |
|||
writer.write("========================================\n\n"); |
|||
|
|||
for (int i = 0; i < articles.size(); i++) { |
|||
Article article = articles.get(i); |
|||
writer.write("----------------------------------------\n"); |
|||
writer.write("文章 " + (i + 1) + "\n"); |
|||
writer.write("----------------------------------------\n"); |
|||
writer.write("ID: " + article.getId() + "\n"); |
|||
writer.write("标题: " + article.getTitle() + "\n"); |
|||
writer.write("URL: " + article.getUrl() + "\n"); |
|||
if (article.getAuthor() != null) { |
|||
writer.write("作者: " + article.getAuthor() + "\n"); |
|||
} |
|||
if (article.getSource() != null) { |
|||
writer.write("来源: " + article.getSource() + "\n"); |
|||
} |
|||
if (article.getPublishDate() != null) { |
|||
writer.write("发布时间: " + article.getPublishDate().format(DATE_FORMATTER) + "\n"); |
|||
} |
|||
writer.write("爬取时间: " + article.getCrawlDate().format(DATE_FORMATTER) + "\n"); |
|||
writer.write("\n内容:\n"); |
|||
if (article.getContent() != null) { |
|||
writer.write(article.getContent()); |
|||
} |
|||
writer.write("\n\n"); |
|||
} |
|||
} |
|||
|
|||
updateIndex(timestamp, articles.size()); |
|||
System.out.println(ColorUtil.green("✓ Saved " + articles.size() + " articles to '" + batchFileName + "'")); |
|||
} catch (Exception e) { |
|||
System.err.println(ColorUtil.red("✗ Failed to save articles: " + e.getMessage())); |
|||
} |
|||
} |
|||
|
|||
private static void updateIndex(String timestamp, int articleCount) throws IOException { |
|||
boolean fileExists = new File(INDEX_FILE).exists(); |
|||
|
|||
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(INDEX_FILE, true), StandardCharsets.UTF_8))) { |
|||
if (!fileExists) { |
|||
writer.write("========================================\n"); |
|||
writer.write(" 文章保存历史记录索引\n"); |
|||
writer.write("========================================\n\n"); |
|||
} |
|||
|
|||
writer.write("[保存记录] " + LocalDateTime.now().format(DATE_FORMATTER) + "\n"); |
|||
writer.write(" 批次文件: articles_" + timestamp + ".txt\n"); |
|||
writer.write(" 文章数量: " + articleCount + "\n"); |
|||
writer.write("\n"); |
|||
} |
|||
} |
|||
|
|||
private static void saveIndex(List<Article> articles) throws IOException { |
|||
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(INDEX_FILE), StandardCharsets.UTF_8))) { |
|||
writer.write("========================================\n"); |
|||
writer.write(" 文章索引\n"); |
|||
writer.write("========================================\n\n"); |
|||
writer.write("共有 " + articles.size() + " 篇文章\n\n"); |
|||
|
|||
for (Article article : articles) { |
|||
writer.write("[" + article.getId() + "] " + article.getTitle() + "\n"); |
|||
writer.write(" URL: " + article.getUrl() + "\n"); |
|||
writer.write(" 文件名: article_" + article.getId() + ".txt\n"); |
|||
if (article.getCrawlDate() != null) { |
|||
writer.write(" 爬取时间: " + article.getCrawlDate().format(DATE_FORMATTER) + "\n"); |
|||
} |
|||
writer.write("\n"); |
|||
} |
|||
} |
|||
} |
|||
|
|||
public static List<Article> loadArticles() { |
|||
List<Article> articles = new ArrayList<>(); |
|||
File folder = new File(DATA_FOLDER); |
|||
|
|||
if (!folder.exists()) { |
|||
return articles; |
|||
} |
|||
|
|||
File[] files = folder.listFiles((dir, name) -> name.startsWith("articles_") && name.endsWith(".txt")); |
|||
|
|||
if (files != null) { |
|||
for (File file : files) { |
|||
if (file.getName().equals("index.txt")) { |
|||
continue; |
|||
} |
|||
try { |
|||
List<Article> batchArticles = loadBatchArticle(file); |
|||
if (batchArticles != null) { |
|||
articles.addAll(batchArticles); |
|||
} |
|||
} catch (Exception e) { |
|||
System.err.println(ColorUtil.yellow("⚠ 无法加载文件: " + file.getName())); |
|||
} |
|||
} |
|||
} |
|||
|
|||
System.out.println(ColorUtil.green("✓ Loaded " + articles.size() + " articles from '" + DATA_FOLDER + "' folder")); |
|||
return articles; |
|||
} |
|||
|
|||
private static List<Article> loadBatchArticle(File file) throws IOException { |
|||
List<Article> articles = new ArrayList<>(); |
|||
Article currentArticle = null; |
|||
StringBuilder content = new StringBuilder(); |
|||
boolean inContent = false; |
|||
|
|||
try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) { |
|||
String line; |
|||
|
|||
while ((line = reader.readLine()) != null) { |
|||
if (line.contains("文章 ")) { |
|||
if (currentArticle != null) { |
|||
currentArticle.setContent(content.toString()); |
|||
articles.add(currentArticle); |
|||
} |
|||
currentArticle = new Article(); |
|||
content = new StringBuilder(); |
|||
inContent = false; |
|||
continue; |
|||
} |
|||
|
|||
if (line.startsWith("ID: ")) { |
|||
currentArticle.setId(line.substring(4)); |
|||
} else if (line.startsWith("标题: ")) { |
|||
currentArticle.setTitle(line.substring(4)); |
|||
} else if (line.startsWith("URL: ")) { |
|||
currentArticle.setUrl(line.substring(5)); |
|||
} else if (line.startsWith("作者: ")) { |
|||
currentArticle.setAuthor(line.substring(4)); |
|||
} else if (line.startsWith("来源: ")) { |
|||
currentArticle.setSource(line.substring(4)); |
|||
} else if (line.startsWith("爬取时间: ")) { |
|||
String crawlDateStr = line.substring(6); |
|||
try { |
|||
currentArticle.setCrawlDate(LocalDateTime.parse(crawlDateStr, DATE_FORMATTER)); |
|||
} catch (Exception e) { |
|||
// Ignore parse errors
|
|||
} |
|||
} else if (line.equals("内容:")) { |
|||
inContent = true; |
|||
} else if (inContent && !line.startsWith("-----") && !line.startsWith("=====")) { |
|||
if (content.length() > 0) { |
|||
content.append("\n"); |
|||
} |
|||
content.append(line); |
|||
} |
|||
} |
|||
|
|||
if (currentArticle != null) { |
|||
currentArticle.setContent(content.toString()); |
|||
articles.add(currentArticle); |
|||
} |
|||
} |
|||
|
|||
return articles; |
|||
} |
|||
} |
|||
@ -0,0 +1,101 @@ |
|||
package com.crawler.view; |
|||
|
|||
import java.time.format.DateTimeFormatter; |
|||
import java.util.List; |
|||
import java.util.Scanner; |
|||
|
|||
import com.crawler.model.Article; |
|||
import com.crawler.util.ColorUtil; |
|||
|
|||
public class ConsoleView { |
|||
private static final Scanner scanner = new Scanner(System.in); |
|||
private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
|
|||
public void displayWelcome() { |
|||
System.out.println(ColorUtil.cyan("========================================")); |
|||
System.out.println(ColorUtil.cyan(" Welcome to My Crawler ")); |
|||
System.out.println(ColorUtil.cyan("========================================")); |
|||
System.out.println(); |
|||
} |
|||
|
|||
public void displayHelp() { |
|||
System.out.println(ColorUtil.yellow("Available commands:")); |
|||
System.out.println(ColorUtil.green(" crawl <url> [strategy] - Crawl a website")); |
|||
System.out.println(ColorUtil.green(" list - List all crawled articles")); |
|||
System.out.println(ColorUtil.green(" save - Save articles to data file")); |
|||
System.out.println(ColorUtil.green(" load - Load articles from data file")); |
|||
System.out.println(ColorUtil.green(" help - Show this help message")); |
|||
System.out.println(ColorUtil.green(" exit - Exit the application")); |
|||
System.out.println(); |
|||
System.out.println(ColorUtil.yellow("Available strategies:")); |
|||
System.out.println(ColorUtil.cyan(" blog - Blog crawling strategy")); |
|||
System.out.println(ColorUtil.cyan(" news - News crawling strategy")); |
|||
System.out.println(ColorUtil.cyan(" jsoup - Generic JSoup strategy (default)")); |
|||
System.out.println(); |
|||
} |
|||
|
|||
public void displayArticleList(List<Article> articles) { |
|||
if (articles.isEmpty()) { |
|||
System.out.println(ColorUtil.yellow("No articles found.")); |
|||
return; |
|||
} |
|||
|
|||
System.out.println(ColorUtil.cyan("=== Crawled Articles (" + articles.size() + ") ===")); |
|||
System.out.println(); |
|||
|
|||
for (int i = 0; i < articles.size(); i++) { |
|||
displayArticleDetail(articles.get(i), i + 1); |
|||
} |
|||
} |
|||
|
|||
public void displayArticleDetail(Article article, int index) { |
|||
System.out.println(ColorUtil.bold(ColorUtil.green("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"))); |
|||
System.out.println(ColorUtil.bold(ColorUtil.yellow("[" + index + "] " + article.getTitle()))); |
|||
System.out.println(ColorUtil.bold(ColorUtil.green("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"))); |
|||
System.out.println(ColorUtil.cyan(" ID: ") + article.getId()); |
|||
System.out.println(ColorUtil.cyan(" URL: ") + article.getUrl()); |
|||
if (article.getAuthor() != null) { |
|||
System.out.println(ColorUtil.cyan(" Author: ") + article.getAuthor()); |
|||
} |
|||
if (article.getSource() != null) { |
|||
System.out.println(ColorUtil.cyan(" Source: ") + article.getSource()); |
|||
} |
|||
if (article.getPublishDate() != null) { |
|||
System.out.println(ColorUtil.cyan(" Published: ") + article.getPublishDate().format(DATE_FORMATTER)); |
|||
} |
|||
System.out.println(ColorUtil.cyan(" Crawled: ") + article.getCrawlDate().format(DATE_FORMATTER)); |
|||
System.out.println(ColorUtil.cyan(" Content: ")); |
|||
if (article.getContent() != null) { |
|||
String[] lines = article.getContent().split("(?<=\\G.{80})"); |
|||
for (String line : lines) { |
|||
System.out.println(" " + line); |
|||
} |
|||
} |
|||
System.out.println(); |
|||
} |
|||
|
|||
public void displaySuccess(String message) { |
|||
System.out.println(ColorUtil.green("✓ " + message)); |
|||
} |
|||
|
|||
public void displayError(String message) { |
|||
System.out.println(ColorUtil.red("✗ " + message)); |
|||
} |
|||
|
|||
public void displayInfo(String message) { |
|||
System.out.println(ColorUtil.blue("ℹ " + message)); |
|||
} |
|||
|
|||
public void displayWarning(String message) { |
|||
System.out.println(ColorUtil.yellow("⚠ " + message)); |
|||
} |
|||
|
|||
public String readInput() { |
|||
System.out.print(ColorUtil.purple("> ")); |
|||
return scanner.nextLine().trim(); |
|||
} |
|||
|
|||
public void displayGoodbye() { |
|||
System.out.println(ColorUtil.cyan("Goodbye! Thank you for using My Crawler.")); |
|||
} |
|||
} |
|||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in new issue