You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

253 lines
9.4 KiB

package com.ski.crawler.command;
import com.ski.crawler.controller.CrawlerContext;
import com.ski.crawler.exception.NetworkException;
import com.ski.crawler.factory.StrategyFactory;
import com.ski.crawler.repository.SkiResortRepository;
import com.ski.crawler.service.ScraperService;
import com.ski.crawler.strategy.CrawlStrategy;
import com.ski.crawler.util.CliArgs;
import com.ski.crawler.util.ExcelUtil;
import com.ski.crawler.utils.CrawlerHttp;
import com.ski.crawler.view.ConsoleView;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
/**
 * CLI command {@code crawl}.
 *
 * <p>Parses the command-line options, runs a crawl for one site (or for every site in
 * {@link #ALL_SITES} when the site option is {@code all}), optionally exports the repository
 * contents to an Excel workbook, and prints a summary through a {@link ConsoleView}.
 */
public class CrawlCommand implements Command {

    /** Site id used when the {@code site} option is absent. */
    private static final String DEFAULT_SITE = "skiresort";

    /** User-Agent used when the {@code ua} option is absent. */
    private static final String DEFAULT_USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36";

    /** Sites crawled, in this order, when the {@code site} option is {@code all}. */
    private static final List<String> ALL_SITES = Arrays.asList("skiresort", "wikipedia", "skimap");

    /** Upper bound on the number of failure messages collected across all sites. */
    private static final int MAX_REPORTED_FAILURES = 200;

    /** A console width is only accepted when it is strictly greater than this value. */
    private static final int MIN_WIDTH_EXCLUSIVE = 20;

    /** Console width used when neither the {@code width} option nor {@code COLUMNS} is usable. */
    private static final int DEFAULT_WIDTH = 120;

    @Override
    public String name() {
        return "crawl";
    }

    /**
     * Runs the crawl described by {@code args}.
     *
     * <p>Option keys looked up in the parsed option map (defaults in parentheses):
     * {@code site} (skiresort), {@code limit} (100, or {@code all} for -1), {@code threads} (3),
     * {@code timeout} (20000), {@code retry} (3), {@code retry-sleep} (1000), {@code dry-run},
     * {@code full}, {@code no-proxy}, {@code color}, {@code show-failures}, {@code width},
     * {@code country}, {@code start-url}, {@code out}, {@code ua}, {@code proxy-host}
     * (127.0.0.1), {@code proxy-port} (7890) and {@code proxy}.
     *
     * <p>An {@code out} value ending in {@code .xlsx} (case-insensitive) selects an Excel export of
     * the whole repository after the crawl; any other non-blank value is handed to the scraper as
     * its JSONL output path. When the site is {@code all} and a JSONL path was given, an error is
     * printed to standard error and nothing is crawled.
     *
     * @param args    raw command-line arguments; options are parsed starting at index 1
     * @param context supplies the strategy factory, repository and scraper service
     * @throws Exception any failure raised while crawling or exporting is propagated unchanged,
     *                   including a {@link NetworkException} from a single-site crawl
     */
    @Override
    public void execute(String[] args, CrawlerContext context) throws Exception {
        Map<String, String> opts = CliArgs.parseOptions(args, 1);

        String siteId = normalizeSite(opts.getOrDefault("site", DEFAULT_SITE));
        int limit = parseLimit(opts.get("limit"), 100);
        int threads = CliArgs.parseInt(opts.get("threads"), 3);
        int timeoutMs = CliArgs.parseInt(opts.get("timeout"), 20000);
        int retry = CliArgs.parseInt(opts.get("retry"), 3);
        long retrySleep = CliArgs.parseInt(opts.get("retry-sleep"), 1000);
        boolean dryRun = CliArgs.parseBoolean(opts.get("dry-run"));
        boolean full = CliArgs.parseBoolean(opts.get("full"));
        // Incremental crawling is the default; "full" turns it off.
        boolean incremental = !full;
        boolean noProxy = CliArgs.parseBoolean(opts.get("no-proxy"));
        boolean color = CliArgs.parseBoolean(opts.get("color"));
        boolean showFailures = CliArgs.parseBoolean(opts.get("show-failures"));
        Integer widthArg = CliArgs.parseNullableInt(opts.get("width"));
        String country = opts.get("country");
        String startUrl = opts.get("start-url");

        // A blank "out" is treated the same as no "out" at all.
        String outRaw = opts.get("out");
        String out = (outRaw == null || outRaw.trim().isEmpty()) ? null : outRaw.trim();
        // The extension decides the output kind: *.xlsx -> Excel export, anything else -> JSONL.
        String outJsonl = out;
        String outXlsx = null;
        if (out != null && out.toLowerCase(Locale.ROOT).endsWith(".xlsx")) {
            outXlsx = out;
            outJsonl = null;
        }

        String userAgent = opts.getOrDefault("ua", DEFAULT_USER_AGENT);
        String proxyHost = opts.getOrDefault("proxy-host", "127.0.0.1");
        int proxyPort = CliArgs.parseInt(opts.get("proxy-port"), 7890);
        boolean proxyEnabled = !noProxy;

        // "proxy" is a shorthand that overrides proxy-host/proxy-port, or disables the proxy.
        String proxy = opts.get("proxy");
        if (proxy != null && !proxy.isEmpty()) {
            String p = proxy.trim();
            if (p.equalsIgnoreCase("none") || p.equalsIgnoreCase("off") || p.equalsIgnoreCase("false")) {
                proxyEnabled = false;
            } else {
                // Split on the last ':' so that only a trailing "<port>" is taken as the port.
                int idx = p.lastIndexOf(':');
                if (idx > 0 && idx < p.length() - 1) {
                    proxyHost = p.substring(0, idx);
                    proxyPort = CliArgs.parseInt(p.substring(idx + 1), proxyPort);
                } else {
                    proxyHost = p;
                }
            }
        }

        CrawlerHttp http = new CrawlerHttp(userAgent, proxyHost, proxyPort, proxyEnabled, timeoutMs);
        ConsoleView view = new ConsoleView(resolveWidth(widthArg), color);
        StrategyFactory factory = context.strategies();
        SkiResortRepository repo = context.repository();
        ScraperService svc = context.scraper();

        ScraperService.CrawlReport report;
        if (siteId.equals("all")) {
            if (outJsonl != null) {
                System.err.println("When --site all, JSONL --out is not supported. Use --out result.xlsx or omit --out.");
                return;
            }
            report = crawlAll(factory, svc, startUrl, limit, threads, country, http, repo, incremental, view,
                    showFailures, dryRun, retry, retrySleep);
        } else {
            CrawlStrategy strategy = factory.create(siteId);
            report = svc.crawl(strategy, startUrl, limit, threads, country, http, repo, incremental, outJsonl, view,
                    showFailures, dryRun, retry, retrySleep);
        }

        if (outXlsx != null) {
            if (dryRun) {
                System.err.println("dry-run is enabled, skip writing: " + outXlsx);
            } else {
                ExcelUtil.exportResortsBySiteToXlsx(repo.getAll(), outXlsx);
                System.err.println("Excel exported: " + repo.getAll().size() + " -> " + outXlsx);
            }
        }

        Map<String, Object> summary = buildSummary(report, outXlsx, outJsonl, dryRun);
        view.printSummary(summary, sortByValueDesc(report.byCountry), showFailures ? report.failures : null);
    }

    /**
     * Builds the ordered key/value summary handed to the console view.
     *
     * <p>The {@code out} entry is only present when an output path was given and dry-run is off.
     */
    private static Map<String, Object> buildSummary(
            ScraperService.CrawlReport report, String outXlsx, String outJsonl, boolean dryRun) {
        Map<String, Object> summary = new LinkedHashMap<>();
        summary.put("site", report.site);
        summary.put("total", report.total);
        summary.put("success", report.success);
        summary.put("filteredOut", report.filteredOut);
        summary.put("skipped", report.skipped);
        summary.put("failed", report.failed);
        if (!dryRun) {
            if (outXlsx != null) {
                summary.put("out", outXlsx);
            } else if (outJsonl != null) {
                summary.put("out", outJsonl);
            }
        }
        return summary;
    }

    /**
     * Normalizes a site id: trims, lower-cases, and maps the alias {@code wiki} to
     * {@code wikipedia}. A {@code null} input yields the default site.
     */
    private static String normalizeSite(String raw) {
        if (raw == null) {
            return DEFAULT_SITE;
        }
        String site = raw.trim().toLowerCase(Locale.ROOT);
        return site.equals("wiki") ? "wikipedia" : site;
    }

    /**
     * Crawls every site in {@link #ALL_SITES} and folds the per-site reports into one report whose
     * site is {@code all}.
     *
     * <p>A site whose crawl throws is counted as one failure and the remaining sites are still
     * crawled. The exception to that is interruption: an {@link InterruptedException} restores the
     * thread's interrupt flag and is rethrown so a cancellation request is not swallowed. At most
     * {@link #MAX_REPORTED_FAILURES} failure messages are collected, and only when
     * {@code showFailures} is set.
     *
     * @param startUrl accepted for signature compatibility but not used: each site is crawled with
     *                 a {@code null} start URL and a {@code null} JSONL output path
     * @return the aggregated report
     * @throws Exception if a strategy cannot be created or the crawl is interrupted
     */
    private ScraperService.CrawlReport crawlAll(
            StrategyFactory factory,
            ScraperService svc,
            String startUrl,
            int limit,
            int threads,
            String countryFilter,
            CrawlerHttp http,
            SkiResortRepository repo,
            boolean incremental,
            ConsoleView view,
            boolean showFailures,
            boolean dryRun,
            int retryAttempts,
            long retrySleepMs
    ) throws Exception {
        Map<String, Long> byCountry = new LinkedHashMap<>();
        List<String> failures = new java.util.ArrayList<>();
        int total = 0;
        int success = 0;
        int filteredOut = 0;
        int skipped = 0;
        int failed = 0;

        for (String site : ALL_SITES) {
            CrawlStrategy strategy = factory.create(site);
            try {
                ScraperService.CrawlReport r = svc.crawl(strategy, null, limit, threads, countryFilter, http, repo,
                        incremental, null, view, showFailures, dryRun, retryAttempts, retrySleepMs);
                total += r.total;
                success += r.success;
                filteredOut += r.filteredOut;
                skipped += r.skipped;
                failed += r.failed;
                mergeByCountry(byCountry, r.byCountry);
                if (showFailures && r.failures != null) {
                    for (String f : r.failures) {
                        if (failures.size() >= MAX_REPORTED_FAILURES) {
                            break;
                        }
                        failures.add(f);
                    }
                }
            } catch (Exception e) {
                if (e instanceof InterruptedException) {
                    // Interruption is a cancellation request, not a per-site failure.
                    Thread.currentThread().interrupt();
                    throw e;
                }
                // Best effort: one broken site must not prevent the others from being crawled.
                failed += 1;
                if (showFailures && failures.size() < MAX_REPORTED_FAILURES) {
                    failures.add("site=" + site + " [" + e.getClass().getSimpleName() + "] "
                            + (e.getMessage() == null ? "" : e.getMessage()));
                }
            }
        }

        ScraperService.CrawlReport out = new ScraperService.CrawlReport();
        out.site = "all";
        out.total = total;
        out.success = success;
        out.filteredOut = filteredOut;
        out.skipped = skipped;
        out.failed = failed;
        out.byCountry = byCountry;
        out.failures = failures;
        return out;
    }

    /**
     * Adds the counts in {@code add} to {@code acc}. Entries with a {@code null} key are ignored
     * and a {@code null} count is treated as zero. Does nothing when either map is {@code null}.
     */
    private static void mergeByCountry(Map<String, Long> acc, Map<String, Long> add) {
        if (acc == null || add == null || add.isEmpty()) {
            return;
        }
        for (Map.Entry<String, Long> e : add.entrySet()) {
            if (e.getKey() == null) {
                continue;
            }
            long count = e.getValue() == null ? 0L : e.getValue();
            acc.merge(e.getKey(), count, Long::sum);
        }
    }

    /**
     * Parses the {@code limit} option.
     *
     * @param v   raw option value; may be {@code null}
     * @param def value returned for a missing, blank, non-numeric or non-positive input
     * @return {@code -1} for {@code all} (case-insensitive), the parsed positive number, or
     *         {@code def}
     */
    private static int parseLimit(String v, int def) {
        if (v == null || v.trim().isEmpty()) {
            return def;
        }
        String t = v.trim();
        if (t.equalsIgnoreCase("all")) {
            return -1;
        }
        try {
            int n = Integer.parseInt(t);
            return n <= 0 ? def : n;
        } catch (NumberFormatException e) {
            return def;
        }
    }

    /**
     * Chooses the console width: the explicit argument if it is greater than
     * {@link #MIN_WIDTH_EXCLUSIVE}, otherwise the {@code COLUMNS} environment variable under the
     * same condition, otherwise {@link #DEFAULT_WIDTH}.
     */
    private static int resolveWidth(Integer widthArg) {
        if (widthArg != null && widthArg > MIN_WIDTH_EXCLUSIVE) {
            return widthArg;
        }
        String cols = System.getenv("COLUMNS");
        if (cols != null) {
            try {
                int n = Integer.parseInt(cols.trim());
                if (n > MIN_WIDTH_EXCLUSIVE) {
                    return n;
                }
            } catch (NumberFormatException ignored) {
                // COLUMNS is not a number: fall through to the default width.
            }
        }
        return DEFAULT_WIDTH;
    }

    /**
     * Returns a new insertion-ordered map with the entries of {@code m} sorted by count, highest
     * first. A {@code null} count sorts as zero (its value is kept as-is in the result). A
     * {@code null} or empty input is returned unchanged.
     */
    private static Map<String, Long> sortByValueDesc(Map<String, Long> m) {
        if (m == null || m.isEmpty()) {
            return m;
        }
        return m.entrySet().stream()
                .sorted((a, b) -> Long.compare(countOrZero(b.getValue()), countOrZero(a.getValue())))
                .collect(LinkedHashMap::new, (acc, e) -> acc.put(e.getKey(), e.getValue()), Map::putAll);
    }

    /** Unboxes a count, mapping {@code null} to zero. */
    private static long countOrZero(Long value) {
        return value == null ? 0L : value;
    }
}