You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
253 lines
9.4 KiB
253 lines
9.4 KiB
package com.ski.crawler.command;
|
|
|
|
import com.ski.crawler.controller.CrawlerContext;
|
|
import com.ski.crawler.exception.NetworkException;
|
|
import com.ski.crawler.factory.StrategyFactory;
|
|
import com.ski.crawler.repository.SkiResortRepository;
|
|
import com.ski.crawler.service.ScraperService;
|
|
import com.ski.crawler.strategy.CrawlStrategy;
|
|
import com.ski.crawler.util.CliArgs;
|
|
import com.ski.crawler.util.ExcelUtil;
|
|
import com.ski.crawler.utils.CrawlerHttp;
|
|
import com.ski.crawler.view.ConsoleView;
|
|
|
|
import java.util.Arrays;
|
|
import java.util.LinkedHashMap;
|
|
import java.util.List;
|
|
import java.util.Locale;
|
|
import java.util.Map;
|
|
|
|
public class CrawlCommand implements Command {
|
|
@Override
|
|
public String name() {
|
|
return "crawl";
|
|
}
|
|
|
|
@Override
|
|
public void execute(String[] args, CrawlerContext context) throws Exception {
|
|
Map<String, String> opts = CliArgs.parseOptions(args, 1);
|
|
|
|
String siteId = normalizeSite(opts.getOrDefault("site", "skiresort"));
|
|
int limit = parseLimit(opts.get("limit"), 100);
|
|
int threads = CliArgs.parseInt(opts.get("threads"), 3);
|
|
int timeoutMs = CliArgs.parseInt(opts.get("timeout"), 20000);
|
|
int retry = CliArgs.parseInt(opts.get("retry"), 3);
|
|
long retrySleep = CliArgs.parseInt(opts.get("retry-sleep"), 1000);
|
|
boolean dryRun = CliArgs.parseBoolean(opts.get("dry-run"));
|
|
boolean full = CliArgs.parseBoolean(opts.get("full"));
|
|
boolean incremental = !full;
|
|
boolean noProxy = CliArgs.parseBoolean(opts.get("no-proxy"));
|
|
boolean color = CliArgs.parseBoolean(opts.get("color"));
|
|
boolean showFailures = CliArgs.parseBoolean(opts.get("show-failures"));
|
|
Integer widthArg = CliArgs.parseNullableInt(opts.get("width"));
|
|
|
|
String country = opts.get("country");
|
|
String startUrl = opts.get("start-url");
|
|
String outRaw = opts.get("out");
|
|
String out = (outRaw == null || outRaw.trim().isEmpty()) ? null : outRaw.trim();
|
|
String outJsonl = out;
|
|
String outXlsx = null;
|
|
if (out != null && out.toLowerCase(Locale.ROOT).endsWith(".xlsx")) {
|
|
outXlsx = out;
|
|
outJsonl = null;
|
|
}
|
|
|
|
String userAgent = opts.getOrDefault("ua", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36");
|
|
|
|
String proxyHost = opts.getOrDefault("proxy-host", "127.0.0.1");
|
|
int proxyPort = CliArgs.parseInt(opts.get("proxy-port"), 7890);
|
|
boolean proxyEnabled = !noProxy;
|
|
String proxy = opts.get("proxy");
|
|
if (proxy != null && !proxy.isEmpty()) {
|
|
String p = proxy.trim();
|
|
if (p.equalsIgnoreCase("none") || p.equalsIgnoreCase("off") || p.equalsIgnoreCase("false")) {
|
|
proxyEnabled = false;
|
|
} else {
|
|
int idx = p.lastIndexOf(':');
|
|
if (idx > 0 && idx < p.length() - 1) {
|
|
proxyHost = p.substring(0, idx);
|
|
proxyPort = CliArgs.parseInt(p.substring(idx + 1), proxyPort);
|
|
} else {
|
|
proxyHost = p;
|
|
}
|
|
}
|
|
}
|
|
|
|
CrawlerHttp http = new CrawlerHttp(userAgent, proxyHost, proxyPort, proxyEnabled, timeoutMs);
|
|
int width = resolveWidth(widthArg);
|
|
ConsoleView view = new ConsoleView(width, color);
|
|
|
|
StrategyFactory factory = context.strategies();
|
|
SkiResortRepository repo = context.repository();
|
|
ScraperService svc = context.scraper();
|
|
|
|
ScraperService.CrawlReport report;
|
|
if (siteId.equals("all")) {
|
|
if (outJsonl != null) {
|
|
System.err.println("When --site all, JSONL --out is not supported. Use --out result.xlsx or omit --out.");
|
|
return;
|
|
}
|
|
report = crawlAll(factory, svc, startUrl, limit, threads, country, http, repo, incremental, view, showFailures, dryRun, retry, retrySleep);
|
|
} else {
|
|
CrawlStrategy strategy = factory.create(siteId);
|
|
try {
|
|
report = svc.crawl(strategy, startUrl, limit, threads, country, http, repo, incremental, outJsonl, view, showFailures, dryRun, retry, retrySleep);
|
|
} catch (NetworkException e) {
|
|
throw e;
|
|
}
|
|
}
|
|
|
|
if (outXlsx != null) {
|
|
if (dryRun) {
|
|
System.err.println("dry-run is enabled, skip writing: " + outXlsx);
|
|
} else {
|
|
ExcelUtil.exportResortsBySiteToXlsx(repo.getAll(), outXlsx);
|
|
System.err.println("Excel exported: " + repo.getAll().size() + " -> " + outXlsx);
|
|
}
|
|
}
|
|
|
|
Map<String, Object> summary = new LinkedHashMap<>();
|
|
summary.put("site", report.site);
|
|
summary.put("total", report.total);
|
|
summary.put("success", report.success);
|
|
summary.put("filteredOut", report.filteredOut);
|
|
summary.put("skipped", report.skipped);
|
|
summary.put("failed", report.failed);
|
|
if (outXlsx != null && !dryRun) {
|
|
summary.put("out", outXlsx);
|
|
} else if (outJsonl != null && !dryRun) {
|
|
summary.put("out", outJsonl);
|
|
}
|
|
|
|
view.printSummary(summary, sortByValueDesc(report.byCountry), showFailures ? report.failures : null);
|
|
}
|
|
|
|
private String normalizeSite(String raw) {
|
|
if (raw == null) {
|
|
return "skiresort";
|
|
}
|
|
String t = raw.trim().toLowerCase(Locale.ROOT);
|
|
if (t.equals("wiki")) {
|
|
return "wikipedia";
|
|
}
|
|
return t;
|
|
}
|
|
|
|
private ScraperService.CrawlReport crawlAll(
|
|
StrategyFactory factory,
|
|
ScraperService svc,
|
|
String startUrl,
|
|
int limit,
|
|
int threads,
|
|
String countryFilter,
|
|
CrawlerHttp http,
|
|
SkiResortRepository repo,
|
|
boolean incremental,
|
|
ConsoleView view,
|
|
boolean showFailures,
|
|
boolean dryRun,
|
|
int retryAttempts,
|
|
long retrySleepMs
|
|
) throws Exception {
|
|
List<String> sites = Arrays.asList("skiresort", "wikipedia", "skimap");
|
|
Map<String, Long> byCountry = new LinkedHashMap<>();
|
|
List<String> failures = new java.util.ArrayList<>();
|
|
int total = 0;
|
|
int success = 0;
|
|
int filteredOut = 0;
|
|
int skipped = 0;
|
|
int failed = 0;
|
|
|
|
for (String s : sites) {
|
|
CrawlStrategy strategy = factory.create(s);
|
|
try {
|
|
ScraperService.CrawlReport r = svc.crawl(strategy, null, limit, threads, countryFilter, http, repo, incremental, null, view, showFailures, dryRun, retryAttempts, retrySleepMs);
|
|
total += r.total;
|
|
success += r.success;
|
|
filteredOut += r.filteredOut;
|
|
skipped += r.skipped;
|
|
failed += r.failed;
|
|
mergeByCountry(byCountry, r.byCountry);
|
|
if (showFailures && r.failures != null) {
|
|
for (String f : r.failures) {
|
|
if (failures.size() >= 200) {
|
|
break;
|
|
}
|
|
failures.add(f);
|
|
}
|
|
}
|
|
} catch (Exception e) {
|
|
failed += 1;
|
|
if (showFailures && failures.size() < 200) {
|
|
failures.add("site=" + s + " [" + e.getClass().getSimpleName() + "] " + (e.getMessage() == null ? "" : e.getMessage()));
|
|
}
|
|
}
|
|
}
|
|
|
|
ScraperService.CrawlReport out = new ScraperService.CrawlReport();
|
|
out.site = "all";
|
|
out.total = total;
|
|
out.success = success;
|
|
out.filteredOut = filteredOut;
|
|
out.skipped = skipped;
|
|
out.failed = failed;
|
|
out.byCountry = byCountry;
|
|
out.failures = failures;
|
|
return out;
|
|
}
|
|
|
|
private void mergeByCountry(Map<String, Long> acc, Map<String, Long> add) {
|
|
if (acc == null || add == null || add.isEmpty()) {
|
|
return;
|
|
}
|
|
for (Map.Entry<String, Long> e : add.entrySet()) {
|
|
if (e.getKey() == null) {
|
|
continue;
|
|
}
|
|
long v = e.getValue() == null ? 0L : e.getValue();
|
|
acc.put(e.getKey(), acc.getOrDefault(e.getKey(), 0L) + v);
|
|
}
|
|
}
|
|
|
|
private int parseLimit(String v, int def) {
|
|
if (v == null || v.trim().isEmpty()) {
|
|
return def;
|
|
}
|
|
String t = v.trim();
|
|
if (t.equalsIgnoreCase("all")) {
|
|
return -1;
|
|
}
|
|
try {
|
|
int n = Integer.parseInt(t);
|
|
return n <= 0 ? def : n;
|
|
} catch (Exception e) {
|
|
return def;
|
|
}
|
|
}
|
|
|
|
private int resolveWidth(Integer widthArg) {
|
|
if (widthArg != null && widthArg > 20) {
|
|
return widthArg;
|
|
}
|
|
String cols = System.getenv("COLUMNS");
|
|
if (cols != null) {
|
|
try {
|
|
int n = Integer.parseInt(cols.trim());
|
|
if (n > 20) {
|
|
return n;
|
|
}
|
|
} catch (Exception ignored) {
|
|
}
|
|
}
|
|
return 120;
|
|
}
|
|
|
|
private Map<String, Long> sortByValueDesc(Map<String, Long> m) {
|
|
if (m == null || m.isEmpty()) {
|
|
return m;
|
|
}
|
|
return m.entrySet().stream()
|
|
.sorted((a, b) -> Long.compare(b.getValue(), a.getValue()))
|
|
.collect(LinkedHashMap::new, (acc, e) -> acc.put(e.getKey(), e.getValue()), Map::putAll);
|
|
}
|
|
}
|
|
|