Browse Source

project

main
SALAH ABDULLAH阿山 2 weeks ago
commit
184d50aa0b
  1. 39
      .gitignore
  2. 10
      .idea/.gitignore
  3. 7
      .idea/encodings.xml
  4. 14
      .idea/misc.xml
  5. 6
      .idea/vcs.xml
  6. 22
      export_٢٠٢٦٠٦٠٤_٠٥٥١٠٢.json
  7. 246
      logs/crawler.log
  8. 82
      pom.xml
  9. 18
      src/main/java/com/abod/crawler/Main.java
  10. 12
      src/main/java/com/abod/crawler/command/ClearCommand.java
  11. 19
      src/main/java/com/abod/crawler/command/Command.java
  12. 12
      src/main/java/com/abod/crawler/command/CountCommand.java
  13. 126
      src/main/java/com/abod/crawler/command/CrawlCommand.java
  14. 32
      src/main/java/com/abod/crawler/command/CrawlMultipleCommand.java
  15. 12
      src/main/java/com/abod/crawler/command/ExitCommand.java
  16. 32
      src/main/java/com/abod/crawler/command/ExportCommand.java
  17. 11
      src/main/java/com/abod/crawler/command/HelpCommand.java
  18. 47
      src/main/java/com/abod/crawler/command/ImportCommand.java
  19. 31
      src/main/java/com/abod/crawler/command/ListCommand.java
  20. 77
      src/main/java/com/abod/crawler/controller/CrawlerController.java
  21. 180
      src/main/java/com/abod/crawler/model/Article.java
  22. 144
      src/main/java/com/abod/crawler/model/ArticleRepository.java
  23. 74
      src/main/java/com/abod/crawler/util/JsonUtil.java
  24. 164
      src/main/java/com/abod/crawler/view/ConsoleView.java
  25. 19
      src/main/resources/logback.xml

39
.gitignore

@ -0,0 +1,39 @@
target/
!.mvn/wrapper/maven-wrapper.jar
!**/src/main/**/target/
!**/src/test/**/target/
.kotlin
### IntelliJ IDEA ###
.idea/modules.xml
.idea/jarRepositories.xml
.idea/compiler.xml
.idea/libraries/
*.iws
*.iml
*.ipr
### Eclipse ###
.apt_generated
.classpath
.factorypath
.project
.settings
.springBeans
.sts4-cache
### NetBeans ###
/nbproject/private/
/nbbuild/
/dist/
/nbdist/
/.nb-gradle/
build/
!**/src/main/**/build/
!**/src/test/**/build/
### VS Code ###
.vscode/
### Mac OS ###
.DS_Store

10
.idea/.gitignore

@ -0,0 +1,10 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Ignored default folder with query files
/queries/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

7
.idea/encodings.xml

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding">
<file url="file://$PROJECT_DIR$/src/main/java" charset="UTF-8" />
<file url="file://$PROJECT_DIR$/src/main/resources" charset="UTF-8" />
</component>
</project>

14
.idea/misc.xml

@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ExternalStorageConfigurationManager" enabled="true" />
<component name="MavenProjectsManager">
<option name="originalFiles">
<list>
<option value="$PROJECT_DIR$/pom.xml" />
</list>
</option>
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_21" default="true" project-jdk-name="21" project-jdk-type="JavaSDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
</project>

6
.idea/vcs.xml

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

22
export_٢٠٢٦٠٦٠٤_٠٥٥١٠٢.json

@ -0,0 +1,22 @@
[ {
"title" : "Example Domain",
"url" : "https://example.com",
"content" : "Example Domain This domain is for use in documentation examples without needing permission. Avoid use in operations. Learn more",
"crawled_at" : [ 2026, 6, 4, 5, 50, 59, 489282900 ],
"website_name" : "example.com",
"word_count" : 19
}, {
"title" : "No Title",
"url" : "https://httpbin.org/html",
"content" : "Herman Melville - Moby-Dick Availing himself of the mild, summer-cool weather that now reigned in these latitudes, and in preparation for the peculiarly active pursuits shortly to be anticipated, Perth, the begrimed, blistered old blacksmith, had not removed his portable forge to the hold again, after concluding his contributory work for Ahab's leg, but still retained it on deck, fast lashed to ringbolts by the foremast; being now almost incessantly invoked by the headsmen, and harpooneers, and ...",
"crawled_at" : [ 2026, 6, 4, 5, 51, 0, 805255900 ],
"website_name" : "httpbin.org",
"word_count" : 78
}, {
"title" : "Quotes to Scrape",
"url" : "https://quotes.toscrape.com",
"content" : "Quotes to Scrape Login “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.” by Albert Einstein (about) Tags: change deep-thoughts thinking world “It is our choices, Harry, that show what we truly are, far more than our abilities.” by J.K. Rowling (about) Tags: abilities choices “There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.” by Albert Einstein (about)...",
"crawled_at" : [ 2026, 6, 4, 5, 51, 1, 895941300 ],
"website_name" : "quotes.toscrape.com",
"word_count" : 87
} ]

246
logs/crawler.log

@ -0,0 +1,246 @@
2026-06-04 04:19:28.967 [main] INFO com.abod.crawler.Main - Web Crawler Application Started
2026-06-04 04:19:28.970 [main] INFO com.abod.crawler.Main - Web Crawler Application Shutdown
2026-06-04 04:36:38.625 [main] INFO com.abod.crawler.Main - Web Crawler Application Started
2026-06-04 04:36:38.628 [main] INFO c.y.crawler.model.ArticleRepository - ArticleRepository initialized
2026-06-04 04:36:38.636 [main] INFO com.abod.crawler.Main - Web Crawler Application Shutdown
2026-06-04 04:54:49.725 [main] INFO com.abod.crawler.Main - Web Crawler Application Started
2026-06-04 04:54:49.728 [main] INFO c.y.crawler.model.ArticleRepository - ArticleRepository initialized
2026-06-04 04:54:49.746 [main] INFO c.y.c.controller.CrawlerController - CrawlerController initialized with 7 commands
2026-06-04 05:02:07.337 [main] INFO com.abod.crawler.Main - Web Crawler Application Started
2026-06-04 05:02:07.340 [main] INFO c.y.crawler.model.ArticleRepository - ArticleRepository initialized
2026-06-04 05:02:07.358 [main] INFO c.y.c.controller.CrawlerController - CrawlerController initialized with 7 commands
2026-06-04 05:03:00.376 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://example.com
2026-06-04 05:03:01.514 [main] INFO c.yourname.crawler.view.ConsoleView - Crawled successfully!
2026-06-04 05:03:14.499 [main] INFO c.yourname.crawler.view.ConsoleView - Showing all 1 articles (newest first)
2026-06-04 05:03:32.683 [main] INFO c.yourname.crawler.view.ConsoleView - Total articles in repository: 1
2026-06-04 05:03:40.361 [main] INFO c.yourname.crawler.view.ConsoleView - Starting multi-URL crawl...
2026-06-04 05:03:40.361 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://example.com
2026-06-04 05:03:40.361 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://example.com
2026-06-04 05:03:40.361 [main] WARN c.yourname.crawler.view.ConsoleView - URL already crawled! Use import to load existing data.
2026-06-04 05:03:40.361 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://www.google.com
2026-06-04 05:03:40.362 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://www.google.com
2026-06-04 05:03:50.401 [main] ERROR c.y.crawler.command.CrawlCommand - Failed to crawl URL: https://www.google.com
java.net.SocketTimeoutException: Connect timed out
at java.base/sun.nio.ch.NioSocketImpl.timedFinishConnect(NioSocketImpl.java:546)
at java.base/sun.nio.ch.NioSocketImpl.connect(NioSocketImpl.java:592)
at java.base/java.net.SocksSocketImpl.connect(SocksSocketImpl.java:327)
at java.base/java.net.Socket.connect(Socket.java:751)
at java.base/sun.security.ssl.SSLSocketImpl.connect(SSLSocketImpl.java:304)
at java.base/sun.net.NetworkClient.doConnect(NetworkClient.java:178)
at java.base/sun.net.www.http.HttpClient.openServer(HttpClient.java:531)
at java.base/sun.net.www.http.HttpClient.openServer(HttpClient.java:636)
at java.base/sun.net.www.protocol.https.HttpsClient.<init>(HttpsClient.java:264)
at java.base/sun.net.www.protocol.https.HttpsClient.New(HttpsClient.java:377)
at java.base/sun.net.www.protocol.https.AbstractDelegateHttpsURLConnection.getNewHttpClient(AbstractDelegateHttpsURLConnection.java:193)
at java.base/sun.net.www.protocol.http.HttpURLConnection.plainConnect0(HttpURLConnection.java:1253)
at java.base/sun.net.www.protocol.http.HttpURLConnection.plainConnect(HttpURLConnection.java:1139)
at java.base/sun.net.www.protocol.https.AbstractDelegateHttpsURLConnection.connect(AbstractDelegateHttpsURLConnection.java:179)
at java.base/sun.net.www.protocol.https.HttpsURLConnectionImpl.connect(HttpsURLConnectionImpl.java:141)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:848)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:818)
at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:346)
at org.jsoup.helper.HttpConnection.get(HttpConnection.java:331)
at command.com.abod.crawler.CrawlCommand.execute(CrawlCommand.java:39)
at command.com.abod.crawler.CrawlMultipleCommand.execute(CrawlMultipleCommand.java:25)
at controller.com.abod.crawler.CrawlerController.start(CrawlerController.java:74)
at com.abod.crawler.Main.main(Main.java:15)
2026-06-04 05:03:50.403 [main] ERROR c.yourname.crawler.view.ConsoleView - Failed to crawl: Connect timed out
2026-06-04 05:03:50.403 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://www.github.com
2026-06-04 05:03:50.403 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://www.github.com
2026-06-04 05:03:56.409 [main] ERROR c.y.crawler.command.CrawlCommand - Failed to crawl URL: https://www.github.com
java.io.IOException: Underlying input stream returned zero bytes
at java.base/sun.nio.cs.StreamDecoder.readBytes(StreamDecoder.java:354)
at java.base/sun.nio.cs.StreamDecoder.implRead(StreamDecoder.java:393)
at java.base/sun.nio.cs.StreamDecoder.lockedRead(StreamDecoder.java:217)
at java.base/sun.nio.cs.StreamDecoder.read(StreamDecoder.java:171)
at java.base/java.io.InputStreamReader.read(InputStreamReader.java:188)
at java.base/java.io.BufferedReader.fill(BufferedReader.java:160)
at java.base/java.io.BufferedReader.read1(BufferedReader.java:225)
at java.base/java.io.BufferedReader.implRead(BufferedReader.java:314)
at java.base/java.io.BufferedReader.read(BufferedReader.java:296)
at org.jsoup.parser.CharacterReader.bufferUp(CharacterReader.java:87)
at org.jsoup.parser.CharacterReader.<init>(CharacterReader.java:43)
at org.jsoup.parser.CharacterReader.<init>(CharacterReader.java:47)
at org.jsoup.parser.TreeBuilder.initialiseParse(TreeBuilder.java:48)
at org.jsoup.parser.HtmlTreeBuilder.initialiseParse(HtmlTreeBuilder.java:79)
at org.jsoup.parser.TreeBuilder.parse(TreeBuilder.java:60)
at org.jsoup.parser.Parser.parseInput(Parser.java:57)
at org.jsoup.helper.DataUtil.parseInputStream(DataUtil.java:218)
at org.jsoup.helper.HttpConnection$Response.parse(HttpConnection.java:959)
at org.jsoup.helper.HttpConnection.get(HttpConnection.java:333)
at command.com.abod.crawler.CrawlCommand.execute(CrawlCommand.java:39)
at command.com.abod.crawler.CrawlMultipleCommand.execute(CrawlMultipleCommand.java:25)
at controller.com.abod.crawler.CrawlerController.start(CrawlerController.java:74)
at com.abod.crawler.Main.main(Main.java:15)
2026-06-04 05:03:56.409 [main] ERROR c.yourname.crawler.view.ConsoleView - Failed to crawl: Underlying input stream returned zero bytes
2026-06-04 05:03:56.411 [main] INFO c.yourname.crawler.view.ConsoleView - Completed! Crawled 3 URLs.
2026-06-04 05:03:56.411 [main] INFO c.yourname.crawler.view.ConsoleView - Total articles in repository: 1
2026-06-04 05:04:10.838 [main] INFO c.yourname.crawler.view.ConsoleView - Starting multi-URL crawl...
2026-06-04 05:04:10.838 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://example.com
2026-06-04 05:04:10.838 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://example.com
2026-06-04 05:04:10.838 [main] WARN c.yourname.crawler.view.ConsoleView - URL already crawled! Use import to load existing data.
2026-06-04 05:04:10.838 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://www.google.com
2026-06-04 05:04:10.838 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://www.google.com
2026-06-04 05:04:20.847 [main] ERROR c.y.crawler.command.CrawlCommand - Failed to crawl URL: https://www.google.com
java.net.SocketTimeoutException: Connect timed out
at java.base/sun.nio.ch.NioSocketImpl.timedFinishConnect(NioSocketImpl.java:546)
at java.base/sun.nio.ch.NioSocketImpl.connect(NioSocketImpl.java:592)
at java.base/java.net.SocksSocketImpl.connect(SocksSocketImpl.java:327)
at java.base/java.net.Socket.connect(Socket.java:751)
at java.base/sun.security.ssl.SSLSocketImpl.connect(SSLSocketImpl.java:304)
at java.base/sun.net.NetworkClient.doConnect(NetworkClient.java:178)
at java.base/sun.net.www.http.HttpClient.openServer(HttpClient.java:531)
at java.base/sun.net.www.http.HttpClient.openServer(HttpClient.java:636)
at java.base/sun.net.www.protocol.https.HttpsClient.<init>(HttpsClient.java:264)
at java.base/sun.net.www.protocol.https.HttpsClient.New(HttpsClient.java:377)
at java.base/sun.net.www.protocol.https.AbstractDelegateHttpsURLConnection.getNewHttpClient(AbstractDelegateHttpsURLConnection.java:193)
at java.base/sun.net.www.protocol.http.HttpURLConnection.plainConnect0(HttpURLConnection.java:1253)
at java.base/sun.net.www.protocol.http.HttpURLConnection.plainConnect(HttpURLConnection.java:1139)
at java.base/sun.net.www.protocol.https.AbstractDelegateHttpsURLConnection.connect(AbstractDelegateHttpsURLConnection.java:179)
at java.base/sun.net.www.protocol.https.HttpsURLConnectionImpl.connect(HttpsURLConnectionImpl.java:141)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:848)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:818)
at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:346)
at org.jsoup.helper.HttpConnection.get(HttpConnection.java:331)
at command.com.abod.crawler.CrawlCommand.execute(CrawlCommand.java:39)
at command.com.abod.crawler.CrawlMultipleCommand.execute(CrawlMultipleCommand.java:25)
at controller.com.abod.crawler.CrawlerController.start(CrawlerController.java:74)
at com.abod.crawler.Main.main(Main.java:15)
2026-06-04 05:04:20.848 [main] ERROR c.yourname.crawler.view.ConsoleView - Failed to crawl: Connect timed out
2026-06-04 05:04:20.848 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://www.github.com
2026-06-04 05:04:20.848 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://www.github.com
2026-06-04 05:04:30.865 [main] ERROR c.y.crawler.command.CrawlCommand - Failed to crawl URL: https://www.github.com
java.net.SocketTimeoutException: Connect timed out
at java.base/sun.nio.ch.NioSocketImpl.timedFinishConnect(NioSocketImpl.java:546)
at java.base/sun.nio.ch.NioSocketImpl.connect(NioSocketImpl.java:592)
at java.base/java.net.SocksSocketImpl.connect(SocksSocketImpl.java:327)
at java.base/java.net.Socket.connect(Socket.java:751)
at java.base/sun.security.ssl.SSLSocketImpl.connect(SSLSocketImpl.java:304)
at java.base/sun.net.NetworkClient.doConnect(NetworkClient.java:178)
at java.base/sun.net.www.http.HttpClient.openServer(HttpClient.java:531)
at java.base/sun.net.www.http.HttpClient.openServer(HttpClient.java:636)
at java.base/sun.net.www.protocol.https.HttpsClient.<init>(HttpsClient.java:264)
at java.base/sun.net.www.protocol.https.HttpsClient.New(HttpsClient.java:377)
at java.base/sun.net.www.protocol.https.AbstractDelegateHttpsURLConnection.getNewHttpClient(AbstractDelegateHttpsURLConnection.java:193)
at java.base/sun.net.www.protocol.http.HttpURLConnection.plainConnect0(HttpURLConnection.java:1253)
at java.base/sun.net.www.protocol.http.HttpURLConnection.plainConnect(HttpURLConnection.java:1139)
at java.base/sun.net.www.protocol.https.AbstractDelegateHttpsURLConnection.connect(AbstractDelegateHttpsURLConnection.java:179)
at java.base/sun.net.www.protocol.https.HttpsURLConnectionImpl.connect(HttpsURLConnectionImpl.java:141)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:848)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:818)
at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:346)
at org.jsoup.helper.HttpConnection.get(HttpConnection.java:331)
at command.com.abod.crawler.CrawlCommand.execute(CrawlCommand.java:39)
at command.com.abod.crawler.CrawlMultipleCommand.execute(CrawlMultipleCommand.java:25)
at controller.com.abod.crawler.CrawlerController.start(CrawlerController.java:74)
at com.abod.crawler.Main.main(Main.java:15)
2026-06-04 05:04:30.865 [main] ERROR c.yourname.crawler.view.ConsoleView - Failed to crawl: Connect timed out
2026-06-04 05:04:30.865 [main] INFO c.yourname.crawler.view.ConsoleView - Completed! Crawled 3 URLs.
2026-06-04 05:04:30.865 [main] INFO c.yourname.crawler.view.ConsoleView - Total articles in repository: 1
2026-06-04 05:04:56.866 [main] INFO c.yourname.crawler.view.ConsoleView - Showing 1 articles from example.com
2026-06-04 05:05:12.736 [main] INFO c.y.crawler.model.ArticleRepository - Repository cleared
2026-06-04 05:05:12.736 [main] INFO c.yourname.crawler.view.ConsoleView - All articles cleared from repository
2026-06-04 05:05:16.858 [main] INFO c.yourname.crawler.view.ConsoleView - Goodbye!
2026-06-04 05:06:54.222 [main] INFO com.abod.crawler.Main - Web Crawler Application Started
2026-06-04 05:06:54.227 [main] INFO c.y.crawler.model.ArticleRepository - ArticleRepository initialized
2026-06-04 05:06:54.244 [main] INFO c.y.c.controller.CrawlerController - CrawlerController initialized with 7 commands
2026-06-04 05:06:58.112 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://example.com
2026-06-04 05:06:59.193 [main] INFO c.yourname.crawler.view.ConsoleView - Crawled successfully!
2026-06-04 05:07:10.211 [main] INFO c.yourname.crawler.view.ConsoleView - Showing all 1 articles (newest first)
2026-06-04 05:08:15.454 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://www.github.com
2026-06-04 05:08:25.545 [main] ERROR c.y.crawler.command.CrawlCommand - Failed to crawl URL: https://www.github.com
java.net.SocketTimeoutException: Connect timed out
at java.base/sun.nio.ch.NioSocketImpl.timedFinishConnect(NioSocketImpl.java:546)
at java.base/sun.nio.ch.NioSocketImpl.connect(NioSocketImpl.java:592)
at java.base/java.net.SocksSocketImpl.connect(SocksSocketImpl.java:327)
at java.base/java.net.Socket.connect(Socket.java:751)
at java.base/sun.security.ssl.SSLSocketImpl.connect(SSLSocketImpl.java:304)
at java.base/sun.net.NetworkClient.doConnect(NetworkClient.java:178)
at java.base/sun.net.www.http.HttpClient.openServer(HttpClient.java:531)
at java.base/sun.net.www.http.HttpClient.openServer(HttpClient.java:636)
at java.base/sun.net.www.protocol.https.HttpsClient.<init>(HttpsClient.java:264)
at java.base/sun.net.www.protocol.https.HttpsClient.New(HttpsClient.java:377)
at java.base/sun.net.www.protocol.https.AbstractDelegateHttpsURLConnection.getNewHttpClient(AbstractDelegateHttpsURLConnection.java:193)
at java.base/sun.net.www.protocol.http.HttpURLConnection.plainConnect0(HttpURLConnection.java:1253)
at java.base/sun.net.www.protocol.http.HttpURLConnection.plainConnect(HttpURLConnection.java:1139)
at java.base/sun.net.www.protocol.https.AbstractDelegateHttpsURLConnection.connect(AbstractDelegateHttpsURLConnection.java:179)
at java.base/sun.net.www.protocol.https.HttpsURLConnectionImpl.connect(HttpsURLConnectionImpl.java:141)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:848)
at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:818)
at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:346)
at org.jsoup.helper.HttpConnection.get(HttpConnection.java:331)
at command.com.abod.crawler.CrawlCommand.execute(CrawlCommand.java:39)
at controller.com.abod.crawler.CrawlerController.start(CrawlerController.java:74)
at com.abod.crawler.Main.main(Main.java:15)
2026-06-04 05:08:25.547 [main] ERROR c.yourname.crawler.view.ConsoleView - Failed to crawl: Connect timed out
2026-06-04 05:09:21.026 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://httpbin.org/html
2026-06-04 05:09:22.177 [main] INFO c.yourname.crawler.view.ConsoleView - Crawled successfully!
2026-06-04 05:09:36.720 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://books.toscrape.com
2026-06-04 05:09:38.631 [main] INFO c.yourname.crawler.view.ConsoleView - Crawled successfully!
2026-06-04 05:10:22.093 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://www.example.com
2026-06-04 05:10:22.983 [main] INFO c.yourname.crawler.view.ConsoleView - Crawled successfully!
2026-06-04 05:10:46.164 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://quotes.toscrape.com
2026-06-04 05:10:47.608 [main] INFO c.yourname.crawler.view.ConsoleView - Crawled successfully!
2026-06-04 05:11:21.873 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://quotes.toscrape.com
2026-06-04 05:11:21.873 [main] WARN c.yourname.crawler.view.ConsoleView - URL already crawled! Use import to load existing data.
2026-06-04 05:12:26.629 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://httpbin.org/html
2026-06-04 05:12:26.629 [main] WARN c.yourname.crawler.view.ConsoleView - URL already crawled! Use import to load existing data.
2026-06-04 05:13:47.019 [main] INFO c.yourname.crawler.view.ConsoleView - Showing all 5 articles (newest first)
2026-06-04 05:13:59.278 [main] INFO c.yourname.crawler.view.ConsoleView - Total articles in repository: 5
2026-06-04 05:20:17.775 [main] INFO c.yourname.crawler.view.ConsoleView - Showing all 5 articles (newest first)
2026-06-04 05:20:56.447 [main] ERROR c.yourname.crawler.view.ConsoleView - Unknown command: 'export'. Type 'help' for available commands.
2026-06-04 05:36:05.721 [main] ERROR c.yourname.crawler.view.ConsoleView - Unknown command: 'export'. Type 'help' for available commands.
2026-06-04 05:36:31.242 [main] INFO c.yourname.crawler.view.ConsoleView - Showing all 5 articles (newest first)
2026-06-04 05:36:43.324 [main] INFO c.yourname.crawler.view.ConsoleView - Showing all 5 articles (newest first)
2026-06-04 05:36:50.737 [main] ERROR c.yourname.crawler.view.ConsoleView - Unknown command: 'export'. Type 'help' for available commands.
2026-06-04 05:42:46.740 [main] INFO com.abod.crawler.Main - Web Crawler Application Started
2026-06-04 05:42:46.744 [main] INFO c.y.crawler.model.ArticleRepository - ArticleRepository initialized
2026-06-04 05:42:46.763 [main] INFO c.y.c.controller.CrawlerController - CrawlerController initialized with 9 commands
2026-06-04 05:42:53.748 [main] ERROR c.yourname.crawler.view.ConsoleView - No articles to export. Please crawl some websites first.
2026-06-04 05:43:38.606 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://example.com
2026-06-04 05:43:39.589 [main] INFO c.yourname.crawler.view.ConsoleView - Crawled successfully!
2026-06-04 05:43:51.022 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://httpbin.org/html
2026-06-04 05:43:52.399 [main] INFO c.yourname.crawler.view.ConsoleView - Crawled successfully!
2026-06-04 05:43:56.782 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://quotes.toscrape.com
2026-06-04 05:43:57.924 [main] INFO c.yourname.crawler.view.ConsoleView - Crawled successfully!
2026-06-04 05:44:01.371 [main] INFO c.yourname.crawler.view.ConsoleView - Showing all 3 articles (newest first)
2026-06-04 05:44:06.030 [main] INFO c.yourname.crawler.view.ConsoleView - Exporting 3 articles to JSON...
2026-06-04 05:44:06.215 [main] INFO util.com.abod.crawler.JsonUtil - Exported 3 articles to export_٢٠٢٦٠٦٠٤_٠٥٤٤٠٦.json
2026-06-04 05:44:06.217 [main] INFO c.yourname.crawler.view.ConsoleView - Exported 3 articles to: export_٢٠٢٦٠٦٠٤_٠٥٤٤٠٦.json
2026-06-04 05:44:06.217 [main] INFO c.yourname.crawler.view.ConsoleView - File location: C:\Users\2040a\IdeaProjects\web-crawler/export_٢٠٢٦٠٦٠٤_٠٥٤٤٠٦.json
2026-06-04 05:45:48.703 [main] INFO com.abod.crawler.Main - Web Crawler Application Started
2026-06-04 05:45:48.706 [main] INFO c.y.crawler.model.ArticleRepository - ArticleRepository initialized
2026-06-04 05:45:48.726 [main] INFO c.y.c.controller.CrawlerController - CrawlerController initialized with 9 commands
2026-06-04 05:45:50.197 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://example.com
2026-06-04 05:45:51.277 [main] INFO c.yourname.crawler.view.ConsoleView - Crawled successfully!
2026-06-04 05:45:51.279 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://httpbin.org/html
2026-06-04 05:45:52.419 [main] INFO c.yourname.crawler.view.ConsoleView - Crawled successfully!
2026-06-04 05:45:52.421 [main] INFO c.yourname.crawler.view.ConsoleView - Crawling: https://quotes.toscrape.com
2026-06-04 05:45:53.808 [main] INFO c.yourname.crawler.view.ConsoleView - Crawled successfully!
2026-06-04 05:45:53.812 [main] INFO c.yourname.crawler.view.ConsoleView - Showing all 3 articles (newest first)
2026-06-04 05:45:53.813 [main] INFO c.yourname.crawler.view.ConsoleView - Exporting 3 articles to JSON...
2026-06-04 05:45:53.987 [main] INFO util.com.abod.crawler.JsonUtil - Exported 3 articles to export_٢٠٢٦٠٦٠٤_٠٥٤٥٥٣.json
2026-06-04 05:45:53.989 [main] INFO c.yourname.crawler.view.ConsoleView - Exported 3 articles to: export_٢٠٢٦٠٦٠٤_٠٥٤٥٥٣.json
2026-06-04 05:45:53.990 [main] INFO c.yourname.crawler.view.ConsoleView - File location: C:\Users\2040a\IdeaProjects\web-crawler/export_٢٠٢٦٠٦٠٤_٠٥٤٥٥٣.json
2026-06-04 05:45:53.990 [main] INFO c.y.crawler.model.ArticleRepository - Repository cleared
2026-06-04 05:45:53.990 [main] INFO c.yourname.crawler.view.ConsoleView - All articles cleared from repository
2026-06-04 05:45:53.990 [main] INFO c.yourname.crawler.view.ConsoleView - Showing all 0 articles (newest first)
2026-06-04 05:45:53.990 [main] INFO c.yourname.crawler.view.ConsoleView - No articles found.
2026-06-04 05:50:50.985 [main] INFO com.abod.crawler.Main - Web Crawler Application Started
2026-06-04 05:50:50.988 [main] INFO c.a.crawler.model.ArticleRepository - ArticleRepository initialized
2026-06-04 05:50:51.008 [main] INFO c.a.c.controller.CrawlerController - CrawlerController initialized with 9 commands
2026-06-04 05:50:58.283 [main] INFO com.abod.crawler.view.ConsoleView - Crawling: https://example.com
2026-06-04 05:50:59.490 [main] INFO com.abod.crawler.view.ConsoleView - Crawled successfully!
2026-06-04 05:50:59.492 [main] INFO com.abod.crawler.view.ConsoleView - Crawling: https://httpbin.org/html
2026-06-04 05:51:00.805 [main] INFO com.abod.crawler.view.ConsoleView - Crawled successfully!
2026-06-04 05:51:00.807 [main] INFO com.abod.crawler.view.ConsoleView - Crawling: https://quotes.toscrape.com
2026-06-04 05:51:01.895 [main] INFO com.abod.crawler.view.ConsoleView - Crawled successfully!
2026-06-04 05:51:01.901 [main] INFO com.abod.crawler.view.ConsoleView - Showing all 3 articles (newest first)
2026-06-04 05:51:01.902 [main] INFO com.abod.crawler.view.ConsoleView - Exporting 3 articles to JSON...
2026-06-04 05:51:02.075 [main] INFO com.abod.crawler.util.JsonUtil - Exported 3 articles to export_٢٠٢٦٠٦٠٤_٠٥٥١٠٢.json
2026-06-04 05:51:02.078 [main] INFO com.abod.crawler.view.ConsoleView - Exported 3 articles to: export_٢٠٢٦٠٦٠٤_٠٥٥١٠٢.json
2026-06-04 05:51:02.078 [main] INFO com.abod.crawler.view.ConsoleView - File location: C:\Users\2040a\IdeaProjects\web-crawler/export_٢٠٢٦٠٦٠٤_٠٥٥١٠٢.json
2026-06-04 05:51:02.078 [main] INFO c.a.crawler.model.ArticleRepository - Repository cleared
2026-06-04 05:51:02.078 [main] INFO com.abod.crawler.view.ConsoleView - All articles cleared from repository
2026-06-04 05:51:02.078 [main] INFO com.abod.crawler.view.ConsoleView - Showing all 0 articles (newest first)
2026-06-04 05:51:02.078 [main] INFO com.abod.crawler.view.ConsoleView - No articles found.

82
pom.xml

@ -0,0 +1,82 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.yourname.crawler</groupId>
<artifactId>web-crawler</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
<properties>
<maven.compiler.source>17</maven.compiler.source>
<maven.compiler.target>17</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- Jsoup: HTML parsing (for crawling websites) -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>
<!-- Jackson: JSON serialization (for export/import) -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.16.1</version>
</dependency>
<!-- Jackson: Support for Java Time (LocalDateTime) -->
<dependency>
<groupId>com.fasterxml.jackson.datatype</groupId>
<artifactId>jackson-datatype-jsr310</artifactId>
<version>2.16.1</version>
</dependency>
<!-- Logback: Logging framework -->
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.5.3</version>
</dependency>
<!-- JUnit: Testing -->
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>5.10.2</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.11.0</version>
<configuration>
<source>17</source>
<target>17</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.3.0</version>
<configuration>
<archive>
<manifest>
<mainClass>com.abod.crawler.Main</mainClass>
</manifest>
</archive>
</configuration>
</plugin>
</plugins>
</build>
</project>

18
src/main/java/com/abod/crawler/Main.java

@ -0,0 +1,18 @@
package com.abod.crawler;
import com.abod.crawler.controller.CrawlerController;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Application entry point for the web crawler.
 *
 * <p>Logs a start-up message, hands control to a {@link CrawlerController},
 * and logs a shutdown message once the controller's {@code start()} call returns.
 */
public class Main {
    private static final Logger log = LoggerFactory.getLogger(Main.class);

    /**
     * Launches the crawler.
     *
     * @param args command-line arguments; not read by this method
     */
    public static void main(String[] args) {
        log.info("Web Crawler Application Started");
        // The controller is only needed for this single call, so no local is kept.
        new CrawlerController().start();
        log.info("Web Crawler Application Shutdown");
    }
}

12
src/main/java/com/abod/crawler/command/ClearCommand.java

@ -0,0 +1,12 @@
package com.abod.crawler.command;
import com.abod.crawler.model.ArticleRepository;
import com.abod.crawler.view.ConsoleView;
/**
 * Command that empties the article repository and reports that it did so.
 */
public class ClearCommand implements Command {
    /** Message printed through the view after the repository has been cleared. */
    private static final String CLEARED_MESSAGE = "All articles cleared from repository";

    /**
     * Clears the repository, then prints a success message through the view.
     *
     * @param args command arguments; not read by this command
     * @param repository repository to clear
     * @param view view that receives the success message
     */
    @Override
    public void execute(String[] args, ArticleRepository repository, ConsoleView view) {
        repository.clear();
        view.printSuccess(CLEARED_MESSAGE);
    }
}

19
src/main/java/com/abod/crawler/command/Command.java

@ -0,0 +1,19 @@
package com.abod.crawler.command;
import com.abod.crawler.model.ArticleRepository;
import com.abod.crawler.view.ConsoleView;
/**
 * Contract for a single user command (Command design pattern).
 * Every user command implements this interface and receives the command
 * arguments together with the article repository and the console view.
 */
public interface Command {
/**
 * Executes the command.
 *
 * NOTE(review): CrawlCommand reads its URL from args[1] and requires at
 * least two elements, so args[0] is presumably the command word itself;
 * confirm against the controller that builds the array.
 *
 * @param args command arguments (split by space)
 * @param repository the article data repository
 * @param view the console view used for output
 */
void execute(String[] args, ArticleRepository repository, ConsoleView view);
}

12
src/main/java/com/abod/crawler/command/CountCommand.java

@ -0,0 +1,12 @@
package com.abod.crawler.command;
import com.abod.crawler.model.ArticleRepository;
import com.abod.crawler.view.ConsoleView;
/**
 * Command that reports how many articles the repository currently holds.
 */
public class CountCommand implements Command {
    /** Text printed in front of the article count. */
    private static final String COUNT_PREFIX = "Total articles in repository: ";

    /**
     * Reads the repository size and prints it through the view as a success message.
     *
     * @param args command arguments; not read by this command
     * @param repository repository whose size is reported
     * @param view view that receives the message
     */
    @Override
    public void execute(String[] args, ArticleRepository repository, ConsoleView view) {
        final int total = repository.size();
        view.printSuccess(COUNT_PREFIX + total);
    }
}

126
src/main/java/com/abod/crawler/command/CrawlCommand.java

@ -0,0 +1,126 @@
package com.abod.crawler.command;
import com.abod.crawler.model.Article;
import com.abod.crawler.model.ArticleRepository;
import com.abod.crawler.view.ConsoleView;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.SocketTimeoutException;
/**
 * Command that downloads one page with Jsoup and stores it as an {@link Article}.
 *
 * <p>Usage: {@code crawl <url>}. The URL is taken from {@code args[1]}; URLs that are
 * already in the repository are skipped.
 */
public class CrawlCommand implements Command {
    private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class);

    // List of known accessible test websites
    private static final String[] TEST_WEBSITES = {
        "https://example.com",
        "https://httpbin.org/html",
        "https://books.toscrape.com",
        "https://quotes.toscrape.com"
    };

    /** Browser-like user agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";

    /** Connect/read timeout in milliseconds. */
    private static final int TIMEOUT_MILLIS = 15000;

    /** Maximum number of characters of body text kept per article. */
    private static final int MAX_CONTENT_LENGTH = 500;

    /**
     * Crawls the URL given in {@code args[1]} and saves the resulting article.
     *
     * @param args       command tokens; {@code args[1]} must be the URL to crawl
     * @param repository repository that receives the crawled article
     * @param view       view used for progress, result and error output
     */
    @Override
    public void execute(String[] args, ArticleRepository repository, ConsoleView view) {
        // Check if URL was provided
        if (args.length < 2) {
            view.printError("Usage: crawl <url>");
            view.printInfo("Example: crawl https://example.com");
            view.printInfo("Test sites you can use:");
            printTestSites(view, " - ", null);
            return;
        }

        String url = args[1];
        view.printInfo("Crawling: " + url);

        // Check if URL already exists
        if (repository.exists(url)) {
            view.printWarning("URL already crawled!");
            return;
        }

        try {
            Document document = Jsoup.connect(url)
                    .userAgent(USER_AGENT)
                    .timeout(TIMEOUT_MILLIS)
                    .ignoreHttpErrors(true) // Don't fail on 404, 403, etc.
                    .followRedirects(true)
                    .get();

            String title = document.title();
            if (title == null || title.isEmpty()) {
                title = "No Title";
            }

            Article article = new Article(title, url, extractContent(document), extractWebsiteName(url));
            repository.save(article);
            view.printSuccess("Crawled successfully!");
            view.displayArticle(article);
        } catch (SocketTimeoutException e) {
            logger.error("Timeout crawling URL: {}", url, e);
            view.printError("Connection timed out. The website might be slow or blocking requests.");
            view.printInfo("Try one of these test sites instead:");
            // Do not suggest the site that just timed out.
            printTestSites(view, " - crawl ", url);
        } catch (IOException e) {
            logger.error("Failed to crawl URL: {}", url, e);
            view.printError("Failed to crawl: " + e.getMessage());
            view.printInfo("Try one of these accessible test sites:");
            printTestSites(view, " - crawl ", null);
        } catch (IllegalArgumentException e) {
            // Jsoup rejects malformed / non-absolute URLs with IllegalArgumentException;
            // report that as a user input problem rather than an "unexpected" failure.
            logger.warn("Invalid URL: {}", url, e);
            view.printError("Invalid URL: " + url);
            view.printInfo("The URL must be absolute, e.g. https://example.com");
        } catch (Exception e) {
            logger.error("Unexpected error crawling URL: {}", url, e);
            view.printError("Unexpected error: " + e.getMessage());
        }
    }

    /**
     * Prints the known test websites, one info line each.
     *
     * @param view        view to print to
     * @param prefix      text placed before each site
     * @param excludedUrl site to leave out of the list, or {@code null} to print all
     */
    private static void printTestSites(ConsoleView view, String prefix, String excludedUrl) {
        for (String site : TEST_WEBSITES) {
            if (!site.equals(excludedUrl)) {
                view.printInfo(prefix + site);
            }
        }
    }

    /**
     * Returns the page's body text with whitespace collapsed, cut to
     * {@link #MAX_CONTENT_LENGTH} characters (plus "...") when longer.
     *
     * @param document parsed page
     * @return extracted text, or a placeholder when the body is empty or extraction fails
     */
    private String extractContent(Document document) {
        try {
            String bodyText = document.body() != null ? document.body().text() : "";
            String content = bodyText.replaceAll("\\s+", " ").trim();
            if (content.length() > MAX_CONTENT_LENGTH) {
                content = content.substring(0, MAX_CONTENT_LENGTH) + "...";
            }
            return content.isEmpty() ? "No content extracted" : content;
        } catch (Exception e) {
            return "Error extracting content";
        }
    }

    /**
     * Derives a website name from a URL by dropping the scheme, everything from
     * the first "/" onwards, and a leading "www.".
     *
     * @param url URL that was crawled
     * @return the remaining host part, or "Unknown" if it cannot be derived
     */
    private String extractWebsiteName(String url) {
        try {
            String domain = url.replace("https://", "").replace("http://", "");
            int slashIndex = domain.indexOf("/");
            if (slashIndex > 0) {
                domain = domain.substring(0, slashIndex);
            }
            // Remove www. if present
            if (domain.startsWith("www.")) {
                domain = domain.substring(4);
            }
            return domain;
        } catch (Exception e) {
            return "Unknown";
        }
    }
}

32
src/main/java/com/abod/crawler/command/CrawlMultipleCommand.java

@ -0,0 +1,32 @@
package com.abod.crawler.command;
import com.abod.crawler.model.ArticleRepository;
import com.abod.crawler.view.ConsoleView;
/**
 * Command that crawls a fixed list of URLs by delegating to {@link CrawlCommand}.
 */
public class CrawlMultipleCommand implements Command {
    // Predefined URLs to crawl for testing
    private static final String[] DEFAULT_URLS = {
        "https://example.com",
        "https://www.google.com",
        "https://www.github.com"
    };

    /**
     * Crawls every URL in {@link #DEFAULT_URLS} and reports how many were newly stored.
     *
     * <p>A URL counts as crawled only if it was absent from the repository before the
     * attempt and present afterwards, so failed crawls and already-stored URLs are not
     * reported as successes.
     *
     * @param args       command tokens (ignored by this command)
     * @param repository repository that receives the crawled articles
     * @param view       view used for progress and summary output
     */
    @Override
    public void execute(String[] args, ArticleRepository repository, ConsoleView view) {
        view.printInfo("Starting multi-URL crawl...");
        CrawlCommand crawlCommand = new CrawlCommand();
        int successCount = 0;
        for (String url : DEFAULT_URLS) {
            boolean alreadyStored = repository.exists(url);
            // CrawlCommand prints its own "Crawling: <url>" line, so none is printed here.
            crawlCommand.execute(new String[] {"crawl", url}, repository, view);
            if (!alreadyStored && repository.exists(url)) {
                successCount++;
            }
        }
        view.printSuccess("Completed! Crawled " + successCount + " of " + DEFAULT_URLS.length + " URLs.");
        view.printInfo("Total articles in repository: " + repository.size());
    }
}

12
src/main/java/com/abod/crawler/command/ExitCommand.java

@ -0,0 +1,12 @@
package com.abod.crawler.command;
import com.abod.crawler.model.ArticleRepository;
import com.abod.crawler.view.ConsoleView;
/**
 * Command that says goodbye and terminates the JVM.
 */
public class ExitCommand implements Command {
    /** Process exit status used for a normal shutdown. */
    private static final int EXIT_STATUS_OK = 0;

    /**
     * Prints a farewell message and then exits the process with status 0.
     *
     * @param args       command tokens (ignored by this command)
     * @param repository article repository (not used)
     * @param view       view used to print the farewell message
     */
    @Override
    public void execute(String[] args, ArticleRepository repository, ConsoleView view) {
        view.printSuccess("Goodbye!");
        System.exit(EXIT_STATUS_OK);
    }
}

32
src/main/java/com/abod/crawler/command/ExportCommand.java

@ -0,0 +1,32 @@
package com.abod.crawler.command;
import com.abod.crawler.model.Article;
import com.abod.crawler.model.ArticleRepository;
import com.abod.crawler.util.JsonUtil;
import com.abod.crawler.view.ConsoleView;
import java.util.List;
/**
 * Command that writes every stored article to a timestamped JSON file.
 */
public class ExportCommand implements Command {
    /**
     * Exports all articles through {@link JsonUtil#exportWithTimestamp} and reports
     * the resulting file name, or an error when there is nothing to export or the
     * export fails.
     *
     * @param args       command tokens (ignored by this command)
     * @param repository repository whose articles are exported
     * @param view       view used for progress and result output
     */
    @Override
    public void execute(String[] args, ArticleRepository repository, ConsoleView view) {
        List<Article> toExport = repository.getAll();
        if (toExport.isEmpty()) {
            view.printError("No articles to export. Please crawl some websites first.");
            return;
        }

        int total = toExport.size();
        view.printInfo("Exporting " + total + " articles to JSON...");

        String exportedFile = JsonUtil.exportWithTimestamp(toExport);
        if (exportedFile == null) {
            view.printError("Export failed");
            return;
        }

        view.printSuccess("Exported " + total + " articles to: " + exportedFile);
        view.printInfo("File location: " + JsonUtil.getWorkingDirectory() + "/" + exportedFile);
    }
}

11
src/main/java/com/abod/crawler/command/HelpCommand.java

@ -0,0 +1,11 @@
package com.abod.crawler.command;
import com.abod.crawler.model.ArticleRepository;
import com.abod.crawler.view.ConsoleView;
/**
 * Command that shows the list of available commands.
 */
public class HelpCommand implements Command {
    /**
     * Delegates to the view's help menu.
     *
     * @param args       command tokens (ignored by this command)
     * @param repository article repository (not used)
     * @param view       view that renders the help menu
     */
    @Override
    public void execute(String[] args, ArticleRepository repository, ConsoleView view) {
        view.showHelp();
    }
}

47
src/main/java/com/abod/crawler/command/ImportCommand.java

@ -0,0 +1,47 @@
package com.abod.crawler.command;
import com.abod.crawler.model.Article;
import com.abod.crawler.model.ArticleRepository;
import com.abod.crawler.util.JsonUtil;
import com.abod.crawler.view.ConsoleView;
import java.util.List;
/**
 * Command that loads articles from a JSON file into the repository.
 *
 * <p>Usage: {@code import <filename>}; ".json" is appended when the name lacks it.
 */
public class ImportCommand implements Command {
    /**
     * Imports the articles found in the file named by {@code args[1]}.
     *
     * <p>Articles whose URL is already stored are left untouched. Entries that are
     * {@code null} or have no URL cannot be stored by the repository, so they are
     * skipped and reported instead of being counted as imported.
     *
     * @param args       command tokens; {@code args[1]} must be the file name
     * @param repository repository that receives the new articles
     * @param view       view used for progress and result output
     */
    @Override
    public void execute(String[] args, ArticleRepository repository, ConsoleView view) {
        if (args.length < 2) {
            view.printError("Usage: import <filename>");
            view.printInfo("Example: import export_20260604_143022.json");
            return;
        }

        String filename = args[1];
        if (!filename.endsWith(".json")) {
            filename = filename + ".json";
        }

        view.printInfo("Importing articles from: " + filename);
        List<Article> imported = JsonUtil.importFromJson(filename);
        if (imported.isEmpty()) {
            view.printError("No articles found in " + filename);
            view.printInfo("File location should be: " + JsonUtil.getWorkingDirectory() + "/" + filename);
            return;
        }

        int newCount = 0;
        int invalidCount = 0;
        for (Article article : imported) {
            if (article == null || article.getUrl() == null) {
                // The repository is keyed by URL and rejects such entries.
                invalidCount++;
                continue;
            }
            if (!repository.exists(article.getUrl())) {
                repository.save(article);
                newCount++;
            }
        }

        if (invalidCount > 0) {
            view.printWarning("Skipped " + invalidCount + " entries without a URL");
        }
        view.printSuccess("Imported " + newCount + " new articles from " + filename);
        view.printInfo("Total articles in repository: " + repository.size());
    }
}

31
src/main/java/com/abod/crawler/command/ListCommand.java

@ -0,0 +1,31 @@
package com.abod.crawler.command;
import com.abod.crawler.model.Article;
import com.abod.crawler.model.ArticleRepository;
import com.abod.crawler.view.ConsoleView;
import java.util.List;
/**
 * Command that lists stored articles, optionally restricted to one website.
 *
 * <p>Usage: {@code list} or {@code list <website>}.
 */
public class ListCommand implements Command {
    /**
     * Shows all articles newest first, or only those whose website name equals
     * {@code args[1]} when that argument is present.
     *
     * @param args       command tokens; optional {@code args[1]} is the website name
     * @param repository repository to read from
     * @param view       view used to render the list
     */
    @Override
    public void execute(String[] args, ArticleRepository repository, ConsoleView view) {
        boolean filterRequested = args.length > 1;

        if (!filterRequested) {
            List<Article> everything = repository.getNewestFirst();
            view.printInfo("Showing all " + everything.size() + " articles (newest first)");
            view.displayArticles(everything);
            return;
        }

        String site = args[1];
        List<Article> matches = repository.getByWebsite(site);
        if (matches.isEmpty()) {
            view.printWarning("No articles found from website: " + site);
            return;
        }

        view.printInfo("Showing " + matches.size() + " articles from " + site);
        view.displayArticles(matches);
    }
}

77
src/main/java/com/abod/crawler/controller/CrawlerController.java

@ -0,0 +1,77 @@
package com.abod.crawler.controller;
import com.abod.crawler.command.*;
import com.abod.crawler.command.*;
import com.abod.crawler.model.ArticleRepository;
import com.abod.crawler.view.ConsoleView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap;
import java.util.Map;
/**
 * Controller of the CLI: owns the repository and view, maps command names to
 * {@link Command} implementations, and runs the read-dispatch loop.
 */
public class CrawlerController {
    private static final Logger logger = LoggerFactory.getLogger(CrawlerController.class);

    private final ArticleRepository repository;
    private final ConsoleView view;
    private final Map<String, Command> commands;

    /**
     * Creates the repository and view and registers all known commands.
     */
    public CrawlerController() {
        this.repository = new ArticleRepository();
        this.view = new ConsoleView();
        this.commands = new HashMap<>();
        registerCommands();
        logger.info("CrawlerController initialized with {} commands", commands.size());
    }

    /** Fills the command table; keys are the lower-case names users type. */
    private void registerCommands() {
        commands.put("help", new HelpCommand());
        commands.put("exit", new ExitCommand());
        commands.put("list", new ListCommand());
        commands.put("count", new CountCommand());
        commands.put("clear", new ClearCommand());
        commands.put("crawl", new CrawlCommand());
        commands.put("crawl-multiple", new CrawlMultipleCommand());
        commands.put("export", new ExportCommand());
        commands.put("import", new ImportCommand());
        logger.debug("Registered commands: {}", commands.keySet());
    }

    /**
     * Shows the welcome screen, then reads and dispatches commands until "exit"
     * is entered or the input stream ends.
     *
     * <p>Errors raised by a command are logged and reported, and the loop continues.
     */
    public void start() {
        view.showWelcome();
        while (true) {
            String input;
            try {
                input = view.readCommand();
            } catch (java.util.NoSuchElementException | IllegalStateException e) {
                // Scanner has no more input (EOF / closed stream). Retrying would fail
                // the same way forever, so leave the loop instead.
                logger.info("Input stream closed; leaving command loop");
                break;
            }

            if (input == null || input.trim().isEmpty()) {
                continue;
            }

            try {
                String[] parts = input.trim().split("\\s+");
                // Locale.ROOT keeps the lookup key stable regardless of the user's locale.
                String commandName = parts[0].toLowerCase(java.util.Locale.ROOT);

                if (commandName.equals("exit")) {
                    Command exitCommand = commands.get("exit");
                    if (exitCommand != null) {
                        exitCommand.execute(parts, repository, view);
                        break;
                    }
                    continue;
                }

                Command command = commands.get(commandName);
                if (command != null) {
                    command.execute(parts, repository, view);
                } else {
                    view.printError("Unknown command: '" + commandName + "'. Type 'help' for available commands.");
                }
            } catch (Exception e) {
                logger.error("Error processing command", e);
                view.printError("An error occurred: " + e.getMessage());
            }
        }
    }
}

180
src/main/java/com/abod/crawler/model/Article.java

@ -0,0 +1,180 @@
package com.abod.crawler.model;
import com.fasterxml.jackson.annotation.JsonProperty;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
/**
 * Article model representing one crawled web page.
 * This is the data structure that is stored, exported to JSON, and imported back.
 * Two articles are considered equal when their URLs are equal.
 */
public class Article {
    /** Timestamp layout used by {@link #toDisplayString()}. */
    private static final DateTimeFormatter DISPLAY_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    // ===== Fields =====
    @JsonProperty("title")
    private String title;

    @JsonProperty("url")
    private String url;

    @JsonProperty("content")
    private String content;

    @JsonProperty("crawled_at")
    private LocalDateTime crawledAt;

    @JsonProperty("website_name")
    private String websiteName;

    @JsonProperty("word_count")
    private int wordCount;

    // ===== Constructors =====

    /**
     * Default constructor (required for Jackson JSON deserialization).
     */
    public Article() {
        // Empty constructor needed for Jackson
    }

    /**
     * Constructor with essential fields. The crawl time is set to "now" and the
     * word count is derived from the content.
     *
     * @param title   Article title
     * @param url     Article URL
     * @param content Article content/text
     */
    public Article(String title, String url, String content) {
        this.title = title;
        this.url = url;
        this.content = content;
        this.crawledAt = LocalDateTime.now(); // Auto-set current time
        this.wordCount = countWords(content);
    }

    /**
     * Full constructor with all fields.
     *
     * @param title       Article title
     * @param url         Article URL
     * @param content     Article content
     * @param websiteName Name of the source website
     */
    public Article(String title, String url, String content, String websiteName) {
        this(title, url, content);
        this.websiteName = websiteName;
    }

    // ===== Getters and Setters =====

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getContent() {
        return content;
    }

    /**
     * Replaces the content and recalculates the word count from it.
     *
     * @param content new content, may be {@code null}
     */
    public void setContent(String content) {
        this.content = content;
        this.wordCount = countWords(content);
    }

    public LocalDateTime getCrawledAt() {
        return crawledAt;
    }

    public void setCrawledAt(LocalDateTime crawledAt) {
        this.crawledAt = crawledAt;
    }

    public String getWebsiteName() {
        return websiteName;
    }

    public void setWebsiteName(String websiteName) {
        this.websiteName = websiteName;
    }

    public int getWordCount() {
        return wordCount;
    }
    // No setter for wordCount - it's calculated automatically from content

    // ===== Utility Methods =====

    /**
     * Returns a formatted string for displaying in the CLI.
     *
     * @return Formatted article summary; the crawl time is shown as "N/A" when unknown
     */
    public String toDisplayString() {
        // crawledAt is null for articles created through the no-arg constructor
        // (e.g. imported JSON without "crawled_at"), so it must not be dereferenced blindly.
        String formattedDate = crawledAt != null ? crawledAt.format(DISPLAY_FORMAT) : "N/A";
        return String.format("[%s] %s\n URL: %s\n Words: %d | Crawled: %s\n",
                websiteName != null ? websiteName : "Unknown",
                truncate(title, 60),
                truncate(url, 50),
                wordCount,
                formattedDate);
    }

    /**
     * Returns a short summary for list commands.
     *
     * @return Short summary string
     */
    public String toShortString() {
        return String.format("%d. %s (%s) - %d words",
                -1, // Index will be added by the caller
                truncate(title, 40),
                websiteName != null ? websiteName : "Unknown",
                wordCount);
    }

    /**
     * Counts whitespace-separated words.
     *
     * <p>Surrounding whitespace is removed first: {@code "".split("\\s+")} yields one
     * empty token and leading whitespace yields an extra empty token, both of which
     * would otherwise be counted as words.
     *
     * @param text text to count, may be {@code null}
     * @return number of words; 0 for {@code null}, empty or whitespace-only text
     */
    private static int countWords(String text) {
        if (text == null) {
            return 0;
        }
        String trimmed = text.trim();
        return trimmed.isEmpty() ? 0 : trimmed.split("\\s+").length;
    }

    /**
     * Truncates a string to a maximum length.
     *
     * @param str       String to truncate
     * @param maxLength Maximum length
     * @return "N/A" for {@code null}, otherwise the string, shortened with "..." if needed
     */
    private String truncate(String str, int maxLength) {
        if (str == null) {
            return "N/A";
        }
        if (str.length() <= maxLength) {
            return str;
        }
        return str.substring(0, maxLength - 3) + "...";
    }

    // ===== Equals and HashCode (for duplicate detection) =====

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        Article article = (Article) obj;
        // Two articles are considered equal if they have the same URL
        return url != null ? url.equals(article.url) : article.url == null;
    }

    @Override
    public int hashCode() {
        return url != null ? url.hashCode() : 0;
    }

    @Override
    public String toString() {
        return String.format("Article{title='%s', url='%s', wordCount=%d, crawledAt=%s}",
                truncate(title, 30), truncate(url, 40), wordCount, crawledAt);
    }
}

144
src/main/java/com/abod/crawler/model/ArticleRepository.java

@ -0,0 +1,144 @@
package com.abod.crawler.model;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Repository class for storing and managing Article objects.
 * This acts as the data access layer (part of the Model in MVC).
 * Articles are kept in memory, keyed by their URL.
 */
public class ArticleRepository {
    private static final Logger logger = LoggerFactory.getLogger(ArticleRepository.class);

    // Articles indexed by URL, held in a ConcurrentHashMap
    private final Map<String, Article> articles;

    /**
     * Default constructor; starts with an empty store.
     */
    public ArticleRepository() {
        this.articles = new ConcurrentHashMap<>();
        logger.info("ArticleRepository initialized");
    }

    /**
     * Save or update an article. Articles that are {@code null} or have no URL
     * are ignored with a warning, because the URL is the storage key.
     *
     * @param article Article to save
     */
    public void save(Article article) {
        if (article == null) {
            logger.warn("Attempted to save null article");
            return;
        }
        if (article.getUrl() == null) {
            logger.warn("Article has no URL, cannot save: {}", article);
            return;
        }
        articles.put(article.getUrl(), article);
        logger.debug("Saved article: {}", article.getTitle());
    }

    /**
     * Find an article by its URL.
     *
     * @param url Article URL
     * @return Article if found, null otherwise
     */
    public Article findByUrl(String url) {
        if (url == null) {
            return null;
        }
        return articles.get(url);
    }

    /**
     * Check if an article with the given URL already exists.
     *
     * @param url URL to check
     * @return true if exists, false otherwise (including for a null URL)
     */
    public boolean exists(String url) {
        return url != null && articles.containsKey(url);
    }

    /**
     * Get all articles.
     *
     * @return List of all articles (new list, safe to modify)
     */
    public List<Article> getAll() {
        return new ArrayList<>(articles.values());
    }

    /**
     * Get articles sorted by crawl time (newest first). Articles without a crawl
     * time (possible for imported data) are placed at the end instead of causing
     * a NullPointerException during sorting.
     *
     * @return Sorted list of articles
     */
    public List<Article> getNewestFirst() {
        List<Article> list = getAll();
        list.sort(ArticleRepository::compareNewestFirst);
        return list;
    }

    /**
     * Orders two articles by descending crawl time, with missing times last.
     */
    private static int compareNewestFirst(Article first, Article second) {
        if (first.getCrawledAt() == null) {
            return second.getCrawledAt() == null ? 0 : 1;
        }
        if (second.getCrawledAt() == null) {
            return -1;
        }
        return second.getCrawledAt().compareTo(first.getCrawledAt());
    }

    /**
     * Get articles from a specific website (exact, case-sensitive name match).
     *
     * @param websiteName Name of the website
     * @return List of articles from that website; empty for a null name
     */
    public List<Article> getByWebsite(String websiteName) {
        if (websiteName == null) {
            return Collections.emptyList();
        }
        List<Article> result = new ArrayList<>();
        for (Article article : articles.values()) {
            if (websiteName.equals(article.getWebsiteName())) {
                result.add(article);
            }
        }
        return result;
    }

    /**
     * Get total number of articles.
     *
     * @return Article count
     */
    public int size() {
        return articles.size();
    }

    /**
     * Clear all articles from repository.
     */
    public void clear() {
        articles.clear();
        logger.info("Repository cleared");
    }

    /**
     * Get all URLs (for duplicate checking during crawling).
     *
     * @return Set of all URLs (a copy, safe to modify)
     */
    public Set<String> getAllUrls() {
        return new HashSet<>(articles.keySet());
    }

    /**
     * Save multiple articles at once.
     *
     * @param articleList List of articles to save
     * @return Number of articles actually stored; null entries and articles
     *         without a URL are not counted because {@link #save} rejects them
     */
    public int saveAll(List<Article> articleList) {
        if (articleList == null) {
            return 0;
        }
        int savedCount = 0;
        for (Article article : articleList) {
            if (article == null) {
                continue;
            }
            save(article); // logs a warning and stores nothing when the URL is missing
            if (article.getUrl() != null) {
                savedCount++;
            }
        }
        logger.info("Saved {} articles", savedCount);
        return savedCount;
    }
}

74
src/main/java/com/abod/crawler/util/JsonUtil.java

@ -0,0 +1,74 @@
package com.abod.crawler.util;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
import com.abod.crawler.model.Article;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
/**
 * Static helpers for exporting articles to, and importing them from, JSON files.
 */
public class JsonUtil {
    private static final Logger logger = LoggerFactory.getLogger(JsonUtil.class);

    /** Shared mapper: indented output, with java.time support registered. */
    private static final ObjectMapper objectMapper;

    static {
        objectMapper = new ObjectMapper();
        objectMapper.enable(SerializationFeature.INDENT_OUTPUT);
        objectMapper.registerModule(new JavaTimeModule());
    }

    /**
     * Writes the articles to the given file as a JSON array.
     *
     * @param articles articles to write
     * @param filePath target file path
     * @return true on success; false when there is nothing to export or writing fails
     */
    public static boolean exportToJson(List<Article> articles, String filePath) {
        if (articles == null || articles.isEmpty()) {
            logger.warn("No articles to export");
            return false;
        }
        try {
            objectMapper.writeValue(new File(filePath), articles);
            logger.info("Exported {} articles to {}", articles.size(), filePath);
            return true;
        } catch (IOException e) {
            logger.error("Export failed", e);
            return false;
        }
    }

    /**
     * Reads a JSON array of articles from the given file.
     *
     * @param filePath file to read
     * @return the articles read; an empty list when the file is missing or cannot be parsed
     */
    public static List<Article> importFromJson(String filePath) {
        File file = new File(filePath);
        if (!file.exists()) {
            logger.warn("File not found: {}", filePath);
            return new ArrayList<>();
        }
        try {
            List<Article> articles = objectMapper.readValue(file,
                    objectMapper.getTypeFactory().constructCollectionType(List.class, Article.class));
            logger.info("Imported {} articles from {}", articles.size(), filePath);
            return articles;
        } catch (IOException e) {
            logger.error("Import failed", e);
            return new ArrayList<>();
        }
    }

    /**
     * Exports the articles to {@code export_<yyyyMMdd_HHmmss>.json} in the working directory.
     *
     * @param articles articles to write
     * @return the generated file name, or null when there is nothing to export or writing fails
     */
    public static String exportWithTimestamp(List<Article> articles) {
        if (articles == null || articles.isEmpty()) {
            return null;
        }
        // Locale.ROOT forces ASCII digits. With the default locale the name can contain
        // locale-specific digits (e.g. Arabic-Indic), which is hard to type for "import".
        String timestamp = new SimpleDateFormat("yyyyMMdd_HHmmss", java.util.Locale.ROOT).format(new Date());
        String filename = "export_" + timestamp + ".json";
        boolean success = exportToJson(articles, filename);
        return success ? filename : null;
    }

    /**
     * Returns the JVM's current working directory (the "user.dir" system property),
     * which is where relative export/import file names are resolved.
     *
     * @return working directory path
     */
    public static String getWorkingDirectory() {
        return System.getProperty("user.dir");
    }
}

164
src/main/java/com/abod/crawler/view/ConsoleView.java

@ -0,0 +1,164 @@
package com.abod.crawler.view;
import com.abod.crawler.model.Article;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
import java.util.Scanner;
/**
 * View layer - Handles all user input and output
 * In MVC, the View ONLY handles display and user input collection
 * It contains NO business logic
 */
public class ConsoleView {
    private static final Logger logger = LoggerFactory.getLogger(ConsoleView.class);

    /** Number of characters of the timestamp text shown in the article list. */
    private static final int TIMESTAMP_WIDTH = 19;

    private final Scanner scanner;

    /**
     * Creates a view that reads commands from standard input.
     */
    public ConsoleView() {
        this.scanner = new Scanner(System.in);
        logger.debug("ConsoleView initialized");
    }

    /**
     * Display welcome banner followed by the help menu
     */
    public void showWelcome() {
        System.out.println();
        System.out.println("╔══════════════════════════════════════════════════════════╗");
        System.out.println("║ WEB CRAWLER v1.0 ║");
        System.out.println("║ ║");
        System.out.println("║ A CLI web crawler with MVC + Command + Strategy ║");
        System.out.println("╚══════════════════════════════════════════════════════════╝");
        System.out.println();
        showHelp();
    }

    /**
     * Display help menu
     */
    public void showHelp() {
        System.out.println("📖 Available Commands:");
        System.out.println(" ┌─────────────────────────────────────────────────────────┐");
        System.out.println(" │ crawl <url> - Crawl a single URL │");
        System.out.println(" │ crawl-multiple - Crawl multiple predefined URLs │");
        System.out.println(" │ list - List all articles │");
        System.out.println(" │ list <website> - List articles from a specific website │");
        System.out.println(" │ export - Export all articles to JSON │");
        System.out.println(" │ import <file> - Import articles from JSON file │");
        System.out.println(" │ count - Show total article count │");
        System.out.println(" │ clear - Clear all articles │");
        System.out.println(" │ help - Show this help menu │");
        System.out.println(" │ exit - Exit the application │");
        System.out.println(" └─────────────────────────────────────────────────────────┘");
        System.out.println();
    }

    /**
     * Display a single article; prints an error when the article is null
     */
    public void displayArticle(Article article) {
        if (article == null) {
            printError("Article not found");
            return;
        }
        System.out.println();
        System.out.println("┌─────────────────────────────────────────────────────────┐");
        System.out.println("│ 📄 ARTICLE DETAILS │");
        System.out.println("├─────────────────────────────────────────────────────────┤");
        System.out.printf ("│ Title: %-45s│%n", truncate(article.getTitle(), 45));
        System.out.printf ("│ Source: %-44s│%n", truncate(article.getWebsiteName(), 44));
        System.out.printf ("│ URL: %-48s│%n", truncate(article.getUrl(), 48));
        System.out.printf ("│ Words: %-46d│%n", article.getWordCount());
        System.out.printf ("│ Crawled: %-44s│%n", article.getCrawledAt());
        System.out.println("└─────────────────────────────────────────────────────────┘");
        System.out.println();
    }

    /**
     * Display list of articles, numbered from 1
     */
    public void displayArticles(List<Article> articles) {
        if (articles == null || articles.isEmpty()) {
            printInfo("No articles found.");
            return;
        }
        System.out.println();
        System.out.println("┌─────────────────────────────────────────────────────────┐");
        System.out.println("│ 📚 ARTICLES (" + articles.size() + " total) │");
        System.out.println("├─────────────────────────────────────────────────────────┤");
        int index = 1;
        for (Article article : articles) {
            System.out.printf("│ %2d. %-50s │%n", index, truncate(article.getTitle(), 50));
            System.out.printf("│ 📍 %-48s │%n", truncate(article.getUrl(), 48));
            System.out.printf("│ 🏠 %-10s | 📝 %-4d words | 🕐 %-19s │%n",
                    truncate(article.getWebsiteName(), 10),
                    article.getWordCount(),
                    formatCrawledAt(article));
            index++;
        }
        System.out.println("└─────────────────────────────────────────────────────────┘");
        System.out.println();
    }

    /**
     * Display success message
     */
    public void printSuccess(String message) {
        System.out.println("✅ " + message);
        logger.info(message);
    }

    /**
     * Display error message
     */
    public void printError(String message) {
        System.out.println("❌ " + message);
        logger.error(message);
    }

    /**
     * Display info message
     */
    public void printInfo(String message) {
        System.out.println("ℹ️ " + message);
        logger.info(message);
    }

    /**
     * Display warning message
     */
    public void printWarning(String message) {
        System.out.println("⚠️ " + message);
        logger.warn(message);
    }

    /**
     * Read user input (for CLI prompt)
     *
     * @return the entered line with surrounding whitespace removed
     * @throws java.util.NoSuchElementException if the input has no further line (end of input)
     */
    public String readCommand() {
        System.out.print("\n🐱 crawler> ");
        return scanner.nextLine().trim();
    }

    /**
     * Wait for user to press Enter (for pause between operations)
     */
    public void waitForEnter() {
        System.out.print("Press Enter to continue...");
        scanner.nextLine();
    }

    /**
     * Returns the article's crawl time as text limited to {@link #TIMESTAMP_WIDTH} characters.
     *
     * <p>The text form of a timestamp omits seconds when seconds and nanoseconds are zero,
     * so it can be shorter than the limit; an unconditional substring(0, 19) would throw.
     * A missing timestamp is rendered as "N/A".
     */
    private String formatCrawledAt(Article article) {
        if (article.getCrawledAt() == null) {
            return "N/A";
        }
        String text = article.getCrawledAt().toString();
        return text.length() > TIMESTAMP_WIDTH ? text.substring(0, TIMESTAMP_WIDTH) : text;
    }

    /**
     * Helper method to truncate long strings; null becomes "N/A"
     */
    private String truncate(String str, int maxLength) {
        if (str == null) {
            return "N/A";
        }
        if (str.length() <= maxLength) {
            return str;
        }
        return str.substring(0, maxLength - 3) + "...";
    }
}

19
src/main/resources/logback.xml

@ -0,0 +1,19 @@
<!-- Logback configuration: every log event goes to both the console and logs/crawler.log. -->
<configuration>
<!-- Console output; pattern shows time of day only. -->
<appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
</encoder>
</appender>
<!-- File output to logs/crawler.log; pattern includes the full date. -->
<appender name="FILE" class="ch.qos.logback.core.FileAppender">
<file>logs/crawler.log</file>
<encoder>
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
</encoder>
</appender>
<!-- Root level INFO: debug-level messages from the application are not written. -->
<root level="INFO">
<appender-ref ref="CONSOLE"/>
<appender-ref ref="FILE"/>
</root>
</configuration>
Loading…
Cancel
Save