|
|
@ -5,15 +5,22 @@ import internal.hw.crawler.repositories.ArticleRepository; |
|
|
import internal.hw.crawler.strategies.crawl.CrawlStrategy; |
|
|
import internal.hw.crawler.strategies.crawl.CrawlStrategy; |
|
|
import internal.hw.crawler.strategies.crawl.CrawlStrategyFactory; |
|
|
import internal.hw.crawler.strategies.crawl.CrawlStrategyFactory; |
|
|
import internal.hw.crawler.strategies.crawl.CrawlUtils; |
|
|
import internal.hw.crawler.strategies.crawl.CrawlUtils; |
|
|
|
|
|
import internal.hw.crawler.strategies.crawl.CrawlNetworkException; |
|
|
|
|
|
import internal.hw.crawler.strategies.crawl.CrawlParseException; |
|
|
|
|
|
import internal.hw.crawler.strategies.crawl.CrawlUnsupportedException; |
|
|
import internal.hw.crawler.views.CommandOutput; |
|
|
import internal.hw.crawler.views.CommandOutput; |
|
|
import org.jsoup.Jsoup; |
|
|
import org.jsoup.Jsoup; |
|
|
import org.jsoup.nodes.Document; |
|
|
import org.jsoup.nodes.Document; |
|
|
|
|
|
import org.slf4j.Logger; |
|
|
|
|
|
import org.slf4j.LoggerFactory; |
|
|
|
|
|
|
|
|
|
|
|
import java.io.IOException; |
|
|
import java.net.URL; |
|
|
import java.net.URL; |
|
|
import java.util.List; |
|
|
import java.util.List; |
|
|
import java.util.Objects; |
|
|
import java.util.Objects; |
|
|
|
|
|
|
|
|
public class CrawlCommand implements Command { |
|
|
public class CrawlCommand implements Command { |
|
|
|
|
|
private static final Logger log = LoggerFactory.getLogger(CrawlCommand.class); |
|
|
private final ArticleRepository repository; |
|
|
private final ArticleRepository repository; |
|
|
private final CommandOutput out; |
|
|
private final CommandOutput out; |
|
|
private final CrawlStrategyFactory crawlStrategyFactory = new CrawlStrategyFactory(); |
|
|
private final CrawlStrategyFactory crawlStrategyFactory = new CrawlStrategyFactory(); |
|
|
@ -34,15 +41,11 @@ public class CrawlCommand implements Command { |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
@Override |
|
|
@Override |
|
|
public void execute(String[] args) throws Exception { |
|
|
public void execute(String[] args) { |
|
|
String urlRaw = args[1]; |
|
|
String urlRaw = args[1]; |
|
|
|
|
|
try { |
|
|
URL url = new URL(urlRaw); |
|
|
URL url = new URL(urlRaw); |
|
|
CrawlStrategy strategy = crawlStrategyFactory.getStrategy(url); |
|
|
CrawlStrategy strategy = crawlStrategyFactory.getStrategy(url); |
|
|
if (strategy == null) { |
|
|
|
|
|
out.error("Unsupported URL: " + urlRaw); |
|
|
|
|
|
return; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
Document doc = Jsoup.connect(url.toString()).timeout(5000).get(); |
|
|
Document doc = Jsoup.connect(url.toString()).timeout(5000).get(); |
|
|
CrawlUtils.setProgressCallback(msg -> System.out.print("\r" + msg)); |
|
|
CrawlUtils.setProgressCallback(msg -> System.out.print("\r" + msg)); |
|
|
try { |
|
|
try { |
|
|
@ -53,5 +56,18 @@ public class CrawlCommand implements Command { |
|
|
} finally { |
|
|
} finally { |
|
|
CrawlUtils.clearProgressCallback(); |
|
|
CrawlUtils.clearProgressCallback(); |
|
|
} |
|
|
} |
|
|
|
|
|
} catch (CrawlUnsupportedException e) { |
|
|
|
|
|
out.error(e.getMessage()); |
|
|
|
|
|
log.warn("Unsupported URL: {}", urlRaw); |
|
|
|
|
|
} catch (CrawlNetworkException e) { |
|
|
|
|
|
out.error("Network error: " + e.getMessage()); |
|
|
|
|
|
log.error("Crawl network failure for {}", urlRaw, e); |
|
|
|
|
|
} catch (CrawlParseException e) { |
|
|
|
|
|
out.error("Parse error: " + e.getMessage()); |
|
|
|
|
|
log.error("Crawl parse failure for {}", urlRaw, e); |
|
|
|
|
|
} catch (IOException e) { |
|
|
|
|
|
out.error("I/O error: " + e.getMessage()); |
|
|
|
|
|
log.error("Crawl I/O failure for {}", urlRaw, e); |
|
|
|
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|