diff --git a/project/java-cli/src/main/java/com/example/datacollect/command/CrawlCommand.java b/project/java-cli/src/main/java/com/example/datacollect/command/CrawlCommand.java deleted file mode 100644 index 3c790a5..0000000 --- a/project/java-cli/src/main/java/com/example/datacollect/command/CrawlCommand.java +++ /dev/null @@ -1,113 +0,0 @@ -package com.example.datacollect.command; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.example.datacollect.exception.ParseException; -import com.example.datacollect.repository.ArticleRepository; -import com.example.datacollect.strategy.CrawlStrategy; -import com.example.datacollect.strategy.StrategyFactory; -import com.example.datacollect.view.ConsoleView; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; - -import java.net.MalformedURLException; -import java.net.URL; -import java.util.regex.Pattern; - -public class CrawlCommand implements Command { - private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class); - - private final ConsoleView view; - private final StrategyFactory strategyFactory; - private static final Pattern URL_PATTERN = Pattern.compile( - "^(https?://)?([\\da-z.-]+)\\.([a-z.]{2,6})([/\\w.-]*)*(/?)$" - ); - private static final int MAX_RETRIES = 3; - private static final long RETRY_DELAY_MS = 2000; - - public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) { - this.view = view; - this.strategyFactory = strategyFactory; - } - - @Override - public String getName() { - return "crawl"; - } - - private boolean isValidUrl(String url) { - if (url == null || url.isEmpty()) { - return false; - } - if (!URL_PATTERN.matcher(url).matches()) { - return false; - } - try { - if (!url.startsWith("http://") && !url.startsWith("https://")) { - url = "http://" + url; - } - new URL(url); - return true; - } catch (MalformedURLException e) { - return false; - } - } - - @Override - public void execute(String[] args, ArticleRepository repository) { - if (args.length < 2) { - view.printError("Usage: crawl "); - return; - } - String url = args[1]; - - if (!isValidUrl(url)) { - view.printError("Invalid URL format: " + url); - return; - } - - if (!url.startsWith("http://") && !url.startsWith("https://")) { - url = "https://" + url; - } - - CrawlStrategy strategy = strategyFactory.getStrategy(url); - if (strategy == null) { - view.printError("No strategy found for: " + url); - return; - } - - view.printInfo("Crawling: " + url); - int retryCount = 0; - Exception lastException = null; - - while (retryCount < MAX_RETRIES) { - try { - Document doc = Jsoup.connect(url).get(); - var articles = strategy.parse(url, doc); - for (var article : articles) { - repository.add(article); - } - view.printSuccess("Crawled " + articles.size() + " articles."); - return; - } catch (ParseException e) { - view.printError("Parse error: " + e.getMessage()); - return; - } catch (Exception e) { - lastException = e; - retryCount++; - if (retryCount < MAX_RETRIES) { - view.printInfo("Retry " + retryCount + "/" + MAX_RETRIES + " in " + RETRY_DELAY_MS + "ms..."); - try { - Thread.sleep(RETRY_DELAY_MS); - } catch (InterruptedException ie) { - Thread.currentThread().interrupt(); - break; - } - } - } - } - - view.printError("Failed to crawl after " + MAX_RETRIES + " attempts: " + lastException.getMessage()); - } -}