commit a52792f019da9b32c3bf67a09bb9b8760ee400d7 Author: Chengwuyi <3394813085@qq.com> Date: Sat May 30 21:44:26 2026 +0800 提交项目源码 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8b21d0d Binary files /dev/null and b/.gitignore differ diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..f796551 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,45 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "type": "java", + "name": "CrawlerScheduler", + "request": "launch", + "mainClass": "com.ski.crawler.spider.CrawlerScheduler", + "projectName": "crawler" + }, + { + "type": "java", + "name": "Main (no proxy)", + "request": "launch", + "mainClass": "com.ski.crawler.Main", + "args": [ + "crawl", "--site skiresort", "--limit 5", "--proxy 127.0.0.1:7890", "--timeout 60000", "--retry 2", "--show-failures" + ] + }, + { + "type": "java", + "name": "Main (real, Clash proxy)", + "request": "launch", + "mainClass": "com.ski.crawler.Main", + "vmArgs": [ + "-Djava.net.useSystemProxies=true", + "-Dhttp.proxyHost=127.0.0.1", + "-Dhttp.proxyPort=7890", + "-Dhttps.proxyHost=127.0.0.1", + "-Dhttps.proxyPort=7890" + ] + }, + { + "type": "java", + "name": "Main (real, Clash SOCKS5)", + "request": "launch", + "mainClass": "com.ski.crawler.Main", + "vmArgs": [ + "-Djava.net.useSystemProxies=true", + "-DsocksProxyHost=127.0.0.1", + "-DsocksProxyPort=7891" + ] + } + ] +} diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..ea28f6c --- /dev/null +++ b/pom.xml @@ -0,0 +1,132 @@ + + + 4.0.0 + + com.ski + crawler + 1.0.0 + jar + + Web Crawler + A Java web crawler project + + + 11 + 11 + UTF-8 + 1.15.3 + 4.5.13 + 2.15.2 + 5.2.5 + + + + + + org.jsoup + jsoup + ${jsoup.version} + + + + + org.apache.httpcomponents + httpclient + ${httpclient.version} + + + + + com.fasterxml.jackson.core + jackson-databind + ${jackson.version} + + + + + com.fasterxml.jackson.core + jackson-core + ${jackson.version} + + + + + 
com.fasterxml.jackson.core + jackson-annotations + ${jackson.version} + + + + + org.projectlombok + lombok + 1.18.28 + provided + + + + + org.slf4j + slf4j-api + 1.7.36 + + + + ch.qos.logback + logback-classic + 1.2.12 + + + + + junit + junit + 4.13.2 + test + + + + org.apache.poi + poi-ooxml + ${poi.version} + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.11.0 + + 11 + 11 + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.4.1 + + + package + + shade + + + + + com.ski.crawler.Main + + + + + + + + + diff --git a/result.jsonl b/result.jsonl new file mode 100644 index 0000000..a1550e6 --- /dev/null +++ b/result.jsonl @@ -0,0 +1,20 @@ +{"id":null,"name":"Thredbo","country":"Australia","region":"Oceania","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/1121#ski-map-42368","sourceSite":"skimap","crawlTime":null} +{"id":null,"name":"Valle Nevado","country":"Chile","region":"Americas","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/1144#ski-map-42367","sourceSite":"skimap","crawlTime":null} +{"id":null,"name":"Las 
Leñas","country":"Argentina","region":"Americas","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/1129#ski-map-42366","sourceSite":"skimap","crawlTime":null} +{"id":null,"name":"Damüls-Mellau Au, Damüls, Mellau","country":"Vorarlberg","region":"Austria","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/2700#ski-map-37810","sourceSite":"skimap","crawlTime":null} +{"id":null,"name":"Appalachian Ski Mtn.","country":"North Carolina","region":"United States","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/285#ski-map-34865","sourceSite":"skimap","crawlTime":null} +{"id":null,"name":"Las Leñas","country":"Argentina","region":"Americas","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/1129#ski-map-42365","sourceSite":"skimap","crawlTime":null} +{"id":null,"name":"Blue 
Mountain","country":"Ontario","region":"Canada","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/113#ski-map-39542","sourceSite":"skimap","crawlTime":null} +{"id":null,"name":"Smugglers' Notch Resort","country":"Vermont","region":"United States","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/209#ski-map-6815","sourceSite":"skimap","crawlTime":null} +{"id":null,"name":"Granlibakken Ski Resort","country":"California","region":"United States","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/535#ski-map-40733","sourceSite":"skimap","crawlTime":null} +{"id":null,"name":"Magic Mountain","country":"Vermont","region":"United States","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/201#ski-map-7492","sourceSite":"skimap","crawlTime":null} +{"id":null,"name":"Bromley Mountain","country":"Vermont","region":"United 
States","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/217#ski-map-4224","sourceSite":"skimap","crawlTime":null} +{"id":null,"name":"Magic Mountain","country":"Vermont","region":"United States","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/201#ski-map-6965","sourceSite":"skimap","crawlTime":null} +{"id":null,"name":"Wurmberg","country":"Central Uplands","region":"Germany","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/4190#ski-map-7596","sourceSite":"skimap","crawlTime":null} +{"id":null,"name":"Vail","country":"Colorado","region":"United States","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/507#ski-map-2580","sourceSite":"skimap","crawlTime":null} +{"id":null,"name":"King Pine Ski Area","country":"New Hampshire","region":"United 
States","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/354#ski-map-11664","sourceSite":"skimap","crawlTime":null} +{"id":null,"name":"Pigeon Mountain","country":"Alberta","region":"Canada","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/2131#ski-map-23689","sourceSite":"skimap","crawlTime":null} +{"id":null,"name":"The Pines","country":"New York","region":"United States","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/4872#ski-map-10199","sourceSite":"skimap","crawlTime":null} +{"id":null,"name":"Ski Cooper","country":"Colorado","region":"United States","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/512#ski-map-6863","sourceSite":"skimap","crawlTime":null} +{"id":null,"name":"Staller 
Sattel","country":"Tyrol","region":"Austria","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/12393#ski-map-17983","sourceSite":"skimap","crawlTime":null} +{"id":null,"name":"Val Neigette","country":"Quebec","region":"Canada","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/2205#ski-map-2834","sourceSite":"skimap","crawlTime":null} diff --git a/src/main/java/com/ski/crawler/Main.java b/src/main/java/com/ski/crawler/Main.java new file mode 100644 index 0000000..99c4ea1 --- /dev/null +++ b/src/main/java/com/ski/crawler/Main.java @@ -0,0 +1,40 @@ +package com.ski.crawler; + +import com.ski.crawler.command.CrawlCommand; +import com.ski.crawler.command.ExportCommand; +import com.ski.crawler.command.FilterCommand; +import com.ski.crawler.command.HelpCommand; +import com.ski.crawler.command.ListCommand; +import com.ski.crawler.command.ResumeCommand; +import com.ski.crawler.command.SitesCommand; +import com.ski.crawler.command.StatsCommand; +import com.ski.crawler.controller.CrawlerContext; +import com.ski.crawler.controller.CrawlerController; +import com.ski.crawler.factory.StrategyFactory; +import com.ski.crawler.repository.SkiResortRepository; +import com.ski.crawler.service.ScraperService; + +public class Main { + public static void main(String[] args) { + try { + SkiResortRepository repo = new SkiResortRepository(); + StrategyFactory factory = new StrategyFactory(); + ScraperService service = new ScraperService(); + CrawlerContext context = new 
CrawlerContext(repo, factory, service); + + CrawlerController controller = new CrawlerController( + new CrawlCommand(), + new ListCommand(), + new FilterCommand(), + new ExportCommand(), + new ResumeCommand(), + new StatsCommand(), + new SitesCommand(), + new HelpCommand() + ); + controller.run(args, context); + } catch (Exception e) { + System.err.println("Crawler failed: " + e.getMessage()); + } + } +} diff --git a/src/main/java/com/ski/crawler/command/Command.java b/src/main/java/com/ski/crawler/command/Command.java new file mode 100644 index 0000000..699ea81 --- /dev/null +++ b/src/main/java/com/ski/crawler/command/Command.java @@ -0,0 +1,10 @@ +package com.ski.crawler.command; + +import com.ski.crawler.controller.CrawlerContext; + +public interface Command { + String name(); + + void execute(String[] args, CrawlerContext context) throws Exception; +} + diff --git a/src/main/java/com/ski/crawler/command/CrawlCommand.java b/src/main/java/com/ski/crawler/command/CrawlCommand.java new file mode 100644 index 0000000..5b122e9 --- /dev/null +++ b/src/main/java/com/ski/crawler/command/CrawlCommand.java @@ -0,0 +1,253 @@ +package com.ski.crawler.command; + +import com.ski.crawler.controller.CrawlerContext; +import com.ski.crawler.exception.NetworkException; +import com.ski.crawler.factory.StrategyFactory; +import com.ski.crawler.repository.SkiResortRepository; +import com.ski.crawler.service.ScraperService; +import com.ski.crawler.strategy.CrawlStrategy; +import com.ski.crawler.util.CliArgs; +import com.ski.crawler.util.ExcelUtil; +import com.ski.crawler.utils.CrawlerHttp; +import com.ski.crawler.view.ConsoleView; + +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +public class CrawlCommand implements Command { + @Override + public String name() { + return "crawl"; + } + + @Override + public void execute(String[] args, CrawlerContext context) throws Exception { + Map opts = 
CliArgs.parseOptions(args, 1); + + String siteId = normalizeSite(opts.getOrDefault("site", "skiresort")); + int limit = parseLimit(opts.get("limit"), 100); + int threads = CliArgs.parseInt(opts.get("threads"), 3); + int timeoutMs = CliArgs.parseInt(opts.get("timeout"), 20000); + int retry = CliArgs.parseInt(opts.get("retry"), 3); + long retrySleep = CliArgs.parseInt(opts.get("retry-sleep"), 1000); + boolean dryRun = CliArgs.parseBoolean(opts.get("dry-run")); + boolean full = CliArgs.parseBoolean(opts.get("full")); + boolean incremental = !full; + boolean noProxy = CliArgs.parseBoolean(opts.get("no-proxy")); + boolean color = CliArgs.parseBoolean(opts.get("color")); + boolean showFailures = CliArgs.parseBoolean(opts.get("show-failures")); + Integer widthArg = CliArgs.parseNullableInt(opts.get("width")); + + String country = opts.get("country"); + String startUrl = opts.get("start-url"); + String outRaw = opts.get("out"); + String out = (outRaw == null || outRaw.trim().isEmpty()) ? null : outRaw.trim(); + String outJsonl = out; + String outXlsx = null; + if (out != null && out.toLowerCase(Locale.ROOT).endsWith(".xlsx")) { + outXlsx = out; + outJsonl = null; + } + + String userAgent = opts.getOrDefault("ua", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"); + + String proxyHost = opts.getOrDefault("proxy-host", "127.0.0.1"); + int proxyPort = CliArgs.parseInt(opts.get("proxy-port"), 7890); + boolean proxyEnabled = !noProxy; + String proxy = opts.get("proxy"); + if (proxy != null && !proxy.isEmpty()) { + String p = proxy.trim(); + if (p.equalsIgnoreCase("none") || p.equalsIgnoreCase("off") || p.equalsIgnoreCase("false")) { + proxyEnabled = false; + } else { + int idx = p.lastIndexOf(':'); + if (idx > 0 && idx < p.length() - 1) { + proxyHost = p.substring(0, idx); + proxyPort = CliArgs.parseInt(p.substring(idx + 1), proxyPort); + } else { + proxyHost = p; + } + } + } + + CrawlerHttp http = new 
CrawlerHttp(userAgent, proxyHost, proxyPort, proxyEnabled, timeoutMs); + int width = resolveWidth(widthArg); + ConsoleView view = new ConsoleView(width, color); + + StrategyFactory factory = context.strategies(); + SkiResortRepository repo = context.repository(); + ScraperService svc = context.scraper(); + + ScraperService.CrawlReport report; + if (siteId.equals("all")) { + if (outJsonl != null) { + System.err.println("When --site all, JSONL --out is not supported. Use --out result.xlsx or omit --out."); + return; + } + report = crawlAll(factory, svc, startUrl, limit, threads, country, http, repo, incremental, view, showFailures, dryRun, retry, retrySleep); + } else { + CrawlStrategy strategy = factory.create(siteId); + try { + report = svc.crawl(strategy, startUrl, limit, threads, country, http, repo, incremental, outJsonl, view, showFailures, dryRun, retry, retrySleep); + } catch (NetworkException e) { + throw e; + } + } + + if (outXlsx != null) { + if (dryRun) { + System.err.println("dry-run is enabled, skip writing: " + outXlsx); + } else { + ExcelUtil.exportResortsBySiteToXlsx(repo.getAll(), outXlsx); + System.err.println("Excel exported: " + repo.getAll().size() + " -> " + outXlsx); + } + } + + Map summary = new LinkedHashMap<>(); + summary.put("site", report.site); + summary.put("total", report.total); + summary.put("success", report.success); + summary.put("filteredOut", report.filteredOut); + summary.put("skipped", report.skipped); + summary.put("failed", report.failed); + if (outXlsx != null && !dryRun) { + summary.put("out", outXlsx); + } else if (outJsonl != null && !dryRun) { + summary.put("out", outJsonl); + } + + view.printSummary(summary, sortByValueDesc(report.byCountry), showFailures ? 
report.failures : null); + } + + private String normalizeSite(String raw) { + if (raw == null) { + return "skiresort"; + } + String t = raw.trim().toLowerCase(Locale.ROOT); + if (t.equals("wiki")) { + return "wikipedia"; + } + return t; + } + + private ScraperService.CrawlReport crawlAll( + StrategyFactory factory, + ScraperService svc, + String startUrl, + int limit, + int threads, + String countryFilter, + CrawlerHttp http, + SkiResortRepository repo, + boolean incremental, + ConsoleView view, + boolean showFailures, + boolean dryRun, + int retryAttempts, + long retrySleepMs + ) throws Exception { + List sites = Arrays.asList("skiresort", "wikipedia", "skimap"); + Map byCountry = new LinkedHashMap<>(); + List failures = new java.util.ArrayList<>(); + int total = 0; + int success = 0; + int filteredOut = 0; + int skipped = 0; + int failed = 0; + + for (String s : sites) { + CrawlStrategy strategy = factory.create(s); + try { + ScraperService.CrawlReport r = svc.crawl(strategy, null, limit, threads, countryFilter, http, repo, incremental, null, view, showFailures, dryRun, retryAttempts, retrySleepMs); + total += r.total; + success += r.success; + filteredOut += r.filteredOut; + skipped += r.skipped; + failed += r.failed; + mergeByCountry(byCountry, r.byCountry); + if (showFailures && r.failures != null) { + for (String f : r.failures) { + if (failures.size() >= 200) { + break; + } + failures.add(f); + } + } + } catch (Exception e) { + failed += 1; + if (showFailures && failures.size() < 200) { + failures.add("site=" + s + " [" + e.getClass().getSimpleName() + "] " + (e.getMessage() == null ? 
"" : e.getMessage())); + } + } + } + + ScraperService.CrawlReport out = new ScraperService.CrawlReport(); + out.site = "all"; + out.total = total; + out.success = success; + out.filteredOut = filteredOut; + out.skipped = skipped; + out.failed = failed; + out.byCountry = byCountry; + out.failures = failures; + return out; + } + + private void mergeByCountry(Map acc, Map add) { + if (acc == null || add == null || add.isEmpty()) { + return; + } + for (Map.Entry e : add.entrySet()) { + if (e.getKey() == null) { + continue; + } + long v = e.getValue() == null ? 0L : e.getValue(); + acc.put(e.getKey(), acc.getOrDefault(e.getKey(), 0L) + v); + } + } + + private int parseLimit(String v, int def) { + if (v == null || v.trim().isEmpty()) { + return def; + } + String t = v.trim(); + if (t.equalsIgnoreCase("all")) { + return -1; + } + try { + int n = Integer.parseInt(t); + return n <= 0 ? def : n; + } catch (Exception e) { + return def; + } + } + + private int resolveWidth(Integer widthArg) { + if (widthArg != null && widthArg > 20) { + return widthArg; + } + String cols = System.getenv("COLUMNS"); + if (cols != null) { + try { + int n = Integer.parseInt(cols.trim()); + if (n > 20) { + return n; + } + } catch (Exception ignored) { + } + } + return 120; + } + + private Map sortByValueDesc(Map m) { + if (m == null || m.isEmpty()) { + return m; + } + return m.entrySet().stream() + .sorted((a, b) -> Long.compare(b.getValue(), a.getValue())) + .collect(LinkedHashMap::new, (acc, e) -> acc.put(e.getKey(), e.getValue()), Map::putAll); + } +} diff --git a/src/main/java/com/ski/crawler/command/ExportCommand.java b/src/main/java/com/ski/crawler/command/ExportCommand.java new file mode 100644 index 0000000..a625cf9 --- /dev/null +++ b/src/main/java/com/ski/crawler/command/ExportCommand.java @@ -0,0 +1,77 @@ +package com.ski.crawler.command; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.ski.crawler.controller.CrawlerContext; +import com.ski.crawler.model.SkiResort; 
+import com.ski.crawler.repository.SkiResortRepository; +import com.ski.crawler.util.CliArgs; +import com.ski.crawler.util.ExcelUtil; +import com.ski.crawler.util.JsonUtil; + +import java.io.BufferedWriter; +import java.util.Locale; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +public class ExportCommand implements Command { + @Override + public String name() { + return "export"; + } + + @Override + public void execute(String[] args, CrawlerContext context) throws Exception { + Map opts = CliArgs.parseOptions(args, 1); + String out = opts.get("out"); + if (out == null || out.trim().isEmpty()) { + System.err.println("Missing --out "); + return; + } + + SkiResortRepository repo = context.repository(); + List all = repo.getAll(); + String path = out.trim(); + if (path.toLowerCase(Locale.ROOT).endsWith(".xlsx")) { + ExcelUtil.exportResortsBySiteToXlsx(all, path); + System.err.println("Exported: " + all.size() + " -> " + path); + return; + } + + ObjectMapper mapper = JsonUtil.mapper(); + try (BufferedWriter w = JsonUtil.openJsonlWriter(path)) { + for (SkiResort r : all) { + w.write(mapper.writeValueAsString(toJson(r))); + w.newLine(); + } + } + System.err.println("Exported: " + all.size() + " -> " + path); + } + + private Map toJson(SkiResort r) { + Map obj = new LinkedHashMap<>(); + obj.put("id", r.getId()); + obj.put("name", r.getName()); + obj.put("country", r.getCountry()); + obj.put("region", r.getRegion()); + obj.put("latitude", r.getLatitude()); + obj.put("longitude", r.getLongitude()); + obj.put("altitudeMin", r.getAltitudeMin()); + obj.put("altitudeMax", r.getAltitudeMax()); + obj.put("totalKm", r.getTotalKm()); + obj.put("slopeCount", r.getSlopeCount()); + obj.put("liftCount", r.getLiftCount()); + obj.put("ticketPriceMin", r.getTicketPriceMin()); + obj.put("ticketPriceMax", r.getTicketPriceMax()); + obj.put("currency", r.getCurrency()); + obj.put("openTime", r.getOpenTime()); + obj.put("snowDepthCm", r.getSnowDepthCm()); + 
obj.put("temperatureC", r.getTemperatureC()); + obj.put("nearbyHotels", r.getNearbyHotels()); + obj.put("rentalShops", r.getRentalShops()); + obj.put("url", r.getSourceUrl()); + obj.put("sourceSite", r.getSourceSite()); + obj.put("crawlTime", r.getCrawledAt() == null ? null : r.getCrawledAt().toString()); + return obj; + } +} diff --git a/src/main/java/com/ski/crawler/command/FilterCommand.java b/src/main/java/com/ski/crawler/command/FilterCommand.java new file mode 100644 index 0000000..3dfe9d5 --- /dev/null +++ b/src/main/java/com/ski/crawler/command/FilterCommand.java @@ -0,0 +1,18 @@ +package com.ski.crawler.command; + +import com.ski.crawler.controller.CrawlerContext; + +public class FilterCommand implements Command { + private final ListCommand delegate = new ListCommand(); + + @Override + public String name() { + return "filter"; + } + + @Override + public void execute(String[] args, CrawlerContext context) { + delegate.execute(args, context); + } +} + diff --git a/src/main/java/com/ski/crawler/command/HelpCommand.java b/src/main/java/com/ski/crawler/command/HelpCommand.java new file mode 100644 index 0000000..ebcae3f --- /dev/null +++ b/src/main/java/com/ski/crawler/command/HelpCommand.java @@ -0,0 +1,38 @@ +package com.ski.crawler.command; + +import com.ski.crawler.controller.CrawlerContext; + +public class HelpCommand implements Command { + @Override + public String name() { + return "help"; + } + + @Override + public void execute(String[] args, CrawlerContext context) { + System.out.println("命令:"); + System.out.println(" crawl --site --limit [--country <关键词>] [--out ] [--dry-run] [--no-proxy]"); + System.out.println(" list [--country <关键词>]"); + System.out.println(" export --out "); + System.out.println(" resume --in "); + System.out.println(" stats"); + System.out.println(" sites"); + System.out.println(" help"); + System.out.println(); + System.out.println("crawl 参数:"); + System.out.println(" --threads 默认 3"); + System.out.println(" --start-url 
覆盖站点入口"); + System.out.println(" --timeout 默认 20000"); + System.out.println(" --ua 覆盖 UA"); + System.out.println(" --proxy 代理配置"); + System.out.println(" --proxy-host / --proxy-port "); + System.out.println(" --no-proxy 禁用代理"); + System.out.println(" --width 表格宽度"); + System.out.println(" --color 表头上色(可选)"); + System.out.println(" --show-failures 结束时输出失败列表(可选)"); + System.out.println(" --full 全量抓取(忽略去重,仍然不会往仓库写重复 URL)"); + System.out.println(" --retry 默认 3"); + System.out.println(" --retry-sleep 默认 1000"); + System.out.println(" --dry-run 不写入仓库/不导出文件(仅展示)"); + } +} diff --git a/src/main/java/com/ski/crawler/command/ListCommand.java b/src/main/java/com/ski/crawler/command/ListCommand.java new file mode 100644 index 0000000..2981502 --- /dev/null +++ b/src/main/java/com/ski/crawler/command/ListCommand.java @@ -0,0 +1,53 @@ +package com.ski.crawler.command; + +import com.ski.crawler.controller.CrawlerContext; +import com.ski.crawler.model.SkiResort; +import com.ski.crawler.repository.SkiResortRepository; +import com.ski.crawler.util.CliArgs; +import com.ski.crawler.view.ConsoleView; + +import java.util.List; +import java.util.Map; + +public class ListCommand implements Command { + @Override + public String name() { + return "list"; + } + + @Override + public void execute(String[] args, CrawlerContext context) { + Map opts = CliArgs.parseOptions(args, 1); + String country = opts.get("country"); + boolean color = CliArgs.parseBoolean(opts.get("color")); + Integer widthArg = CliArgs.parseNullableInt(opts.get("width")); + int width = resolveWidth(widthArg); + + SkiResortRepository repo = context.repository(); + List list = (country == null || country.trim().isEmpty()) ? 
repo.getAll() : repo.filterByCountry(country); + + ConsoleView view = new ConsoleView(width, color); + view.printHeader(); + for (SkiResort r : list) { + view.printResort(r); + } + } + + private int resolveWidth(Integer widthArg) { + if (widthArg != null && widthArg > 20) { + return widthArg; + } + String cols = System.getenv("COLUMNS"); + if (cols != null) { + try { + int n = Integer.parseInt(cols.trim()); + if (n > 20) { + return n; + } + } catch (Exception ignored) { + } + } + return 120; + } +} + diff --git a/src/main/java/com/ski/crawler/command/ResumeCommand.java b/src/main/java/com/ski/crawler/command/ResumeCommand.java new file mode 100644 index 0000000..319ce28 --- /dev/null +++ b/src/main/java/com/ski/crawler/command/ResumeCommand.java @@ -0,0 +1,140 @@ +package com.ski.crawler.command; + +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.ski.crawler.controller.CrawlerContext; +import com.ski.crawler.model.SkiResort; +import com.ski.crawler.repository.SkiResortRepository; +import com.ski.crawler.util.CliArgs; +import com.ski.crawler.util.JsonUtil; + +import java.io.BufferedReader; +import java.time.LocalDateTime; +import java.util.List; +import java.util.Map; + +public class ResumeCommand implements Command { + @Override + public String name() { + return "resume"; + } + + @Override + public void execute(String[] args, CrawlerContext context) throws Exception { + Map opts = CliArgs.parseOptions(args, 1); + String in = opts.get("in"); + if (in == null || in.trim().isEmpty()) { + System.err.println("Missing --in "); + return; + } + + SkiResortRepository repo = context.repository(); + ObjectMapper mapper = JsonUtil.mapper(); + int loaded = 0; + int skipped = 0; + try (BufferedReader br = JsonUtil.openJsonlReader(in.trim())) { + String line; + while ((line = br.readLine()) != null) { + String t = line.trim(); + if (t.isEmpty()) { + continue; + } + Map obj = mapper.readValue(t, new 
TypeReference>() {}); + SkiResort r = fromJson(obj); + if (r.getSourceUrl() == null && obj.get("url") != null) { + r.setSourceUrl(String.valueOf(obj.get("url"))); + } + if (repo.add(r)) { + loaded++; + } else { + skipped++; + } + } + } + System.err.println("Resumed: loaded=" + loaded + " skipped=" + skipped + " totalInRepo=" + repo.getAll().size()); + } + + private SkiResort fromJson(Map obj) { + SkiResort r = new SkiResort(); + r.setName(asString(obj.get("name"))); + r.setCountry(asString(obj.get("country"))); + r.setRegion(asString(obj.get("region"))); + r.setLatitude(asDouble(obj.get("latitude"))); + r.setLongitude(asDouble(obj.get("longitude"))); + r.setAltitudeMin(asInt(obj.get("altitudeMin"))); + r.setAltitudeMax(asInt(obj.get("altitudeMax"))); + r.setTotalKm(asDouble(obj.get("totalKm"))); + r.setSlopeCount(asInt(obj.get("slopeCount"))); + r.setLiftCount(asInt(obj.get("liftCount"))); + r.setTicketPriceMin(asDouble(obj.get("ticketPriceMin"))); + r.setTicketPriceMax(asDouble(obj.get("ticketPriceMax"))); + r.setCurrency(asString(obj.get("currency"))); + r.setOpenTime(asString(obj.get("openTime"))); + r.setSnowDepthCm(asInt(obj.get("snowDepthCm"))); + r.setTemperatureC(asDouble(obj.get("temperatureC"))); + r.setSourceSite(asString(obj.get("sourceSite"))); + r.setSourceUrl(asString(obj.get("url"))); + String crawlTime = asString(obj.get("crawlTime")); + if (crawlTime != null) { + try { + r.setCrawledAt(LocalDateTime.parse(crawlTime)); + } catch (Exception ignored) { + } + } + + Object hotels = obj.get("nearbyHotels"); + if (hotels instanceof List) { + r.setNearbyHotels((List) hotels); + } + Object shops = obj.get("rentalShops"); + if (shops instanceof List) { + r.setRentalShops((List) shops); + } + return r; + } + + private String asString(Object v) { + if (v == null) { + return null; + } + String s = String.valueOf(v).replace('\u00A0', ' ').trim(); + return s.isEmpty() ? 
null : s; + } + + private Integer asInt(Object v) { + try { + if (v == null) { + return null; + } + if (v instanceof Number) { + return ((Number) v).intValue(); + } + String s = String.valueOf(v).trim(); + if (s.isEmpty()) { + return null; + } + return Integer.parseInt(s); + } catch (Exception e) { + return null; + } + } + + private Double asDouble(Object v) { + try { + if (v == null) { + return null; + } + if (v instanceof Number) { + return ((Number) v).doubleValue(); + } + String s = String.valueOf(v).trim().replace(",", "."); + if (s.isEmpty()) { + return null; + } + return Double.parseDouble(s); + } catch (Exception e) { + return null; + } + } +} + diff --git a/src/main/java/com/ski/crawler/command/SitesCommand.java b/src/main/java/com/ski/crawler/command/SitesCommand.java new file mode 100644 index 0000000..12f5d91 --- /dev/null +++ b/src/main/java/com/ski/crawler/command/SitesCommand.java @@ -0,0 +1,19 @@ +package com.ski.crawler.command; + +import com.ski.crawler.controller.CrawlerContext; + +public class SitesCommand implements Command { + @Override + public String name() { + return "sites"; + } + + @Override + public void execute(String[] args, CrawlerContext context) { + System.out.println("sites:"); + System.out.println(" skiresort https://www.skiresort.info"); + System.out.println(" wikipedia https://en.wikipedia.org/wiki/List_of_ski_areas_and_resorts"); + System.out.println(" skimap https://skimap.org"); + } +} + diff --git a/src/main/java/com/ski/crawler/command/StatsCommand.java b/src/main/java/com/ski/crawler/command/StatsCommand.java new file mode 100644 index 0000000..c65cf8f --- /dev/null +++ b/src/main/java/com/ski/crawler/command/StatsCommand.java @@ -0,0 +1,66 @@ +package com.ski.crawler.command; + +import com.ski.crawler.controller.CrawlerContext; +import com.ski.crawler.model.SkiResort; +import com.ski.crawler.repository.SkiResortRepository; + +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import 
java.util.stream.Collectors; + +public class StatsCommand implements Command { + @Override + public String name() { + return "stats"; + } + + @Override + public void execute(String[] args, CrawlerContext context) { + SkiResortRepository repo = context.repository(); + List all = repo.getAll(); + System.out.println("total=" + all.size()); + + Map byCountry = repo.countByCountry(); + List> top = byCountry.entrySet().stream() + .sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())) + .limit(20) + .collect(Collectors.toList()); + if (!top.isEmpty()) { + System.out.println("byCountry(top20):"); + long max = top.get(0).getValue() == null ? 0 : top.get(0).getValue(); + for (Map.Entry e : top) { + long v = e.getValue() == null ? 0 : e.getValue(); + System.out.println(" " + e.getKey() + ": " + v + " " + bar(v, max, 30)); + } + } + + double sum = 0; + int cnt = 0; + for (SkiResort r : all) { + Double p = r.getTicketPriceMin(); + if (p != null && p >= 0) { + sum += p; + cnt++; + } + } + if (cnt > 0) { + System.out.println("avgTicketPriceMin=" + (sum / cnt) + " samples=" + cnt); + } + } + + private String bar(long v, long max, int width) { + if (max <= 0 || width <= 0) { + return ""; + } + int n = (int) Math.round((double) v * width / (double) max); + if (n <= 0) { + return ""; + } + StringBuilder sb = new StringBuilder(n); + for (int i = 0; i < n; i++) { + sb.append('#'); + } + return sb.toString(); + } +} diff --git a/src/main/java/com/ski/crawler/controller/CrawlerContext.java b/src/main/java/com/ski/crawler/controller/CrawlerContext.java new file mode 100644 index 0000000..8f86520 --- /dev/null +++ b/src/main/java/com/ski/crawler/controller/CrawlerContext.java @@ -0,0 +1,30 @@ +package com.ski.crawler.controller; + +import com.ski.crawler.factory.StrategyFactory; +import com.ski.crawler.repository.SkiResortRepository; +import com.ski.crawler.service.ScraperService; + +public class CrawlerContext { + private final SkiResortRepository repository; + private final 
StrategyFactory strategyFactory; + private final ScraperService scraperService; + + public CrawlerContext(SkiResortRepository repository, StrategyFactory strategyFactory, ScraperService scraperService) { + this.repository = repository; + this.strategyFactory = strategyFactory; + this.scraperService = scraperService; + } + + public SkiResortRepository repository() { + return repository; + } + + public StrategyFactory strategies() { + return strategyFactory; + } + + public ScraperService scraper() { + return scraperService; + } +} + diff --git a/src/main/java/com/ski/crawler/controller/CrawlerController.java b/src/main/java/com/ski/crawler/controller/CrawlerController.java new file mode 100644 index 0000000..f161ca1 --- /dev/null +++ b/src/main/java/com/ski/crawler/controller/CrawlerController.java @@ -0,0 +1,72 @@ +package com.ski.crawler.controller; + +import com.ski.crawler.command.Command; + +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; + +public class CrawlerController { + private final Map commands = new HashMap<>(); + + public CrawlerController(Command... 
cmds) { + if (cmds != null) { + for (Command c : cmds) { + if (c != null && c.name() != null) { + commands.put(c.name().toLowerCase(Locale.ROOT), c); + } + } + } + } + + public void run(String[] args, CrawlerContext context) throws Exception { + String cmd = firstArg(args); + if (cmd.isEmpty()) { + execute("help", args, context); + return; + } + + if (isLegacyLimit(cmd)) { + execute("crawl", new String[]{"crawl", "--limit", cmd}, context); + return; + } + if ("all".equalsIgnoreCase(cmd)) { + execute("crawl", new String[]{"crawl", "--limit", "all"}, context); + return; + } + + execute(cmd, args, context); + } + + private void execute(String cmd, String[] args, CrawlerContext context) throws Exception { + Command c = commands.get(cmd.toLowerCase(Locale.ROOT)); + if (c == null) { + Command help = commands.get("help"); + if (help != null) { + help.execute(args, context); + } + return; + } + c.execute(args, context); + } + + private String firstArg(String[] args) { + if (args == null || args.length == 0 || args[0] == null) { + return ""; + } + return args[0].trim(); + } + + private boolean isLegacyLimit(String s) { + try { + if (s == null) { + return false; + } + Integer.parseInt(s.trim()); + return true; + } catch (Exception e) { + return false; + } + } +} + diff --git a/src/main/java/com/ski/crawler/exception/CrawlerException.java b/src/main/java/com/ski/crawler/exception/CrawlerException.java new file mode 100644 index 0000000..34b5db7 --- /dev/null +++ b/src/main/java/com/ski/crawler/exception/CrawlerException.java @@ -0,0 +1,12 @@ +package com.ski.crawler.exception; + +public class CrawlerException extends Exception { + public CrawlerException(String message) { + super(message); + } + + public CrawlerException(String message, Throwable cause) { + super(message, cause); + } +} + diff --git a/src/main/java/com/ski/crawler/exception/NetworkException.java b/src/main/java/com/ski/crawler/exception/NetworkException.java new file mode 100644 index 0000000..5d767ec --- 
/dev/null +++ b/src/main/java/com/ski/crawler/exception/NetworkException.java @@ -0,0 +1,12 @@ +package com.ski.crawler.exception; + +public class NetworkException extends CrawlerException { + public NetworkException(String message) { + super(message); + } + + public NetworkException(String message, Throwable cause) { + super(message, cause); + } +} + diff --git a/src/main/java/com/ski/crawler/exception/ParseException.java b/src/main/java/com/ski/crawler/exception/ParseException.java new file mode 100644 index 0000000..acdfd84 --- /dev/null +++ b/src/main/java/com/ski/crawler/exception/ParseException.java @@ -0,0 +1,12 @@ +package com.ski.crawler.exception; + +public class ParseException extends CrawlerException { + public ParseException(String message) { + super(message); + } + + public ParseException(String message, Throwable cause) { + super(message, cause); + } +} + diff --git a/src/main/java/com/ski/crawler/factory/StrategyFactory.java b/src/main/java/com/ski/crawler/factory/StrategyFactory.java new file mode 100644 index 0000000..4163ef5 --- /dev/null +++ b/src/main/java/com/ski/crawler/factory/StrategyFactory.java @@ -0,0 +1,31 @@ +package com.ski.crawler.factory; + +import com.ski.crawler.strategy.CrawlStrategy; +import com.ski.crawler.strategy.SkiResortInfoStrategy; +import com.ski.crawler.strategy.SkimapStrategy; +import com.ski.crawler.strategy.WikipediaStrategy; + +import java.util.Locale; + +public class StrategyFactory { + public CrawlStrategy create(String id) { + if (id == null) { + return new SkiResortInfoStrategy(); + } + String t = id.trim().toLowerCase(Locale.ROOT); + if (t.equals("wiki")) { + t = "wikipedia"; + } + switch (t) { + case "skiresort": + return new SkiResortInfoStrategy(); + case "wikipedia": + return new WikipediaStrategy(); + case "skimap": + return new SkimapStrategy(); + default: + throw new IllegalArgumentException("Unknown site: " + id); + } + } +} + diff --git a/src/main/java/com/ski/crawler/model/SkiLift.java 
b/src/main/java/com/ski/crawler/model/SkiLift.java
new file mode 100644
index 0000000..9c40e14
--- /dev/null
+++ b/src/main/java/com/ski/crawler/model/SkiLift.java
@@ -0,0 +1,83 @@
+package com.ski.crawler.model;
+/**
+ * Mutable bean holding lift counts for one resort, referenced by
+ * {@code resortId}. All fields are boxed and therefore nullable; the class
+ * contains no logic beyond getters and setters.
+ */
+public class SkiLift {
+    private Long id;
+    private Long resortId;
+    private Integer totalLifts;
+    private Integer gondolas;
+    private Integer chairlifts;
+    private Integer surfaceLifts;
+    private Integer cableCars;
+    private Integer travelators;
+
+    public SkiLift() {
+    }
+
+    public SkiLift(Long resortId) {
+        this.resortId = resortId;
+    }
+
+    public Long getId() {
+        return id;
+    }
+
+    public void setId(Long id) {
+        this.id = id;
+    }
+
+    public Long getResortId() {
+        return resortId;
+    }
+
+    public void setResortId(Long resortId) {
+        this.resortId = resortId;
+    }
+
+    public Integer getTotalLifts() {
+        return totalLifts;
+    }
+
+    public void setTotalLifts(Integer totalLifts) {
+        this.totalLifts = totalLifts;
+    }
+
+    public Integer getGondolas() {
+        return gondolas;
+    }
+
+    public void setGondolas(Integer gondolas) {
+        this.gondolas = gondolas;
+    }
+
+    public Integer getChairlifts() {
+        return chairlifts;
+    }
+
+    public void setChairlifts(Integer chairlifts) {
+        this.chairlifts = chairlifts;
+    }
+
+    public Integer getSurfaceLifts() {
+        return surfaceLifts;
+    }
+
+    public void setSurfaceLifts(Integer surfaceLifts) {
+        this.surfaceLifts = surfaceLifts;
+    }
+
+    public Integer getCableCars() {
+        return cableCars;
+    }
+
+    public void setCableCars(Integer cableCars) {
+        this.cableCars = cableCars;
+    }
+
+    public Integer getTravelators() {
+        return travelators;
+    }
+
+    public void setTravelators(Integer travelators) {
+        this.travelators = travelators;
+    }
+}
diff --git a/src/main/java/com/ski/crawler/model/SkiResort.java b/src/main/java/com/ski/crawler/model/SkiResort.java
new file mode 100644
index 0000000..f870969
--- /dev/null
+++ b/src/main/java/com/ski/crawler/model/SkiResort.java
@@ -0,0 +1,252 @@
+package com.ski.crawler.model;
+
+import 
java.math.BigDecimal;
+import java.time.LocalDateTime;
+import java.util.List;
+/**
+ * Mutable bean describing one crawled ski resort: identity and location,
+ * nested trail/lift/ticket beans, provenance (source URL, source site, crawl
+ * time) and a flat set of summary fields. Every field is nullable; the class
+ * contains no logic beyond getters and setters.
+ */
+public class SkiResort {
+    private Long id;
+    private String name;
+    private String country;
+    private String region;
+    private Double latitude;
+    private Double longitude;
+    private Integer altitudeMin;
+    private Integer altitudeMax;
+    private Double totalKm;
+    private BigDecimal overallScore;
+    private SkiTrail skiTrail;
+    private SkiLift skiLift;
+    private SkiTicket skiTicket;
+    private String sourceUrl;
+    private String sourceSite;
+    private LocalDateTime crawledAt;
+
+    private Integer slopeCount;
+    private Integer liftCount;
+    private Double ticketPriceMin;
+    private Double ticketPriceMax;
+    private String currency;
+    private String openTime;
+    private Double temperatureC;
+    private Integer snowDepthCm;
+    private List nearbyHotels; // NOTE(review): raw List as shown; element type presumably String - confirm
+    private List rentalShops; // NOTE(review): raw List as shown; element type presumably String - confirm
+
+    public SkiResort() {
+    }
+
+    public SkiResort(String name, String country, String sourceUrl) {
+        this.name = name;
+        this.country = country;
+        this.sourceUrl = sourceUrl;
+    }
+
+    public Long getId() {
+        return id;
+    }
+
+    public void setId(Long id) {
+        this.id = id;
+    }
+
+    public String getName() {
+        return name;
+    }
+
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    public String getCountry() {
+        return country;
+    }
+
+    public void setCountry(String country) {
+        this.country = country;
+    }
+
+    public String getRegion() {
+        return region;
+    }
+
+    public void setRegion(String region) {
+        this.region = region;
+    }
+
+    public Double getLatitude() {
+        return latitude;
+    }
+
+    public void setLatitude(Double latitude) {
+        this.latitude = latitude;
+    }
+
+    public Double getLongitude() {
+        return longitude;
+    }
+
+    public void setLongitude(Double longitude) {
+        this.longitude = longitude;
+    }
+
+    public Integer getAltitudeMin() {
+        return altitudeMin;
+    }
+
+    public void setAltitudeMin(Integer altitudeMin) {
+        this.altitudeMin = altitudeMin;
+    }
+
+    public Integer getAltitudeMax() {
+        return altitudeMax;
+    }
+
+    public void setAltitudeMax(Integer altitudeMax) {
+        this.altitudeMax = altitudeMax;
+    }
+
+    public Double getTotalKm() {
+        return totalKm;
+    }
+
+    public void setTotalKm(Double totalKm) {
+        this.totalKm = totalKm;
+    }
+
+    public BigDecimal getOverallScore() {
+        return overallScore;
+    }
+
+    public void setOverallScore(BigDecimal overallScore) {
+        this.overallScore = overallScore;
+    }
+
+    public SkiTrail getSkiTrail() {
+        return skiTrail;
+    }
+
+    public void setSkiTrail(SkiTrail skiTrail) {
+        this.skiTrail = skiTrail;
+    }
+
+    public SkiLift getSkiLift() {
+        return skiLift;
+    }
+
+    public void setSkiLift(SkiLift skiLift) {
+        this.skiLift = skiLift;
+    }
+
+    public SkiTicket getSkiTicket() {
+        return skiTicket;
+    }
+
+    public void setSkiTicket(SkiTicket skiTicket) {
+        this.skiTicket = skiTicket;
+    }
+
+    public String getSourceUrl() {
+        return sourceUrl;
+    }
+
+    public void setSourceUrl(String sourceUrl) {
+        this.sourceUrl = sourceUrl;
+    }
+
+    public String getSourceSite() {
+        return sourceSite;
+    }
+
+    public void setSourceSite(String sourceSite) {
+        this.sourceSite = sourceSite;
+    }
+
+    public LocalDateTime getCrawledAt() {
+        return crawledAt;
+    }
+
+    public void setCrawledAt(LocalDateTime crawledAt) {
+        this.crawledAt = crawledAt;
+    }
+
+    public Integer getSlopeCount() {
+        return slopeCount;
+    }
+
+    public void setSlopeCount(Integer slopeCount) {
+        this.slopeCount = slopeCount;
+    }
+
+    public Integer getLiftCount() {
+        return liftCount;
+    }
+
+    public void setLiftCount(Integer liftCount) {
+        this.liftCount = liftCount;
+    }
+
+    public Double getTicketPriceMin() {
+        return ticketPriceMin;
+    }
+
+    public void setTicketPriceMin(Double ticketPriceMin) {
+        this.ticketPriceMin = ticketPriceMin;
+    }
+
+    public Double getTicketPriceMax() {
+        return ticketPriceMax;
+    }
+
+    public void setTicketPriceMax(Double ticketPriceMax) {
+        this.ticketPriceMax = ticketPriceMax;
+    }
+
+    public String getCurrency() {
+        return currency;
+    }
+
+    public void setCurrency(String currency) {
+        this.currency = currency;
+    }
+
+    public String getOpenTime() {
+        return openTime;
+    }
+
+    public void setOpenTime(String openTime) {
+        this.openTime = openTime;
+    }
+
+    public Double getTemperatureC() {
+        return temperatureC;
+    }
+
+    public void setTemperatureC(Double temperatureC) {
+        this.temperatureC = temperatureC;
+    }
+
+    public Integer getSnowDepthCm() {
+        return snowDepthCm;
+    }
+
+    public void setSnowDepthCm(Integer snowDepthCm) {
+        this.snowDepthCm = snowDepthCm;
+    }
+
+    public List getNearbyHotels() {
+        return nearbyHotels;
+    }
+
+    public void setNearbyHotels(List nearbyHotels) {
+        this.nearbyHotels = nearbyHotels;
+    }
+
+    public List getRentalShops() {
+        return rentalShops;
+    }
+
+    public void setRentalShops(List rentalShops) {
+        this.rentalShops = rentalShops;
+    }
+}
diff --git a/src/main/java/com/ski/crawler/model/SkiReview.java b/src/main/java/com/ski/crawler/model/SkiReview.java
new file mode 100644
index 0000000..09de6bb
--- /dev/null
+++ b/src/main/java/com/ski/crawler/model/SkiReview.java
@@ -0,0 +1,76 @@
+package com.ski.crawler.model;
+
+import java.time.LocalDateTime;
+/**
+ * Mutable bean holding review scores and the review count for one resort,
+ * referenced by {@code resortId}, plus the time they were crawled.
+ * All fields are nullable.
+ */
+public class SkiReview {
+    private Long id;
+    private Long resortId;
+    private Double overallScore;
+    private Double snowScore;
+    private Double facilitiesScore;
+    private Integer totalReviews;
+    private LocalDateTime crawledAt;
+
+    public SkiReview() {
+    }
+
+    public SkiReview(Long resortId) {
+        this.resortId = resortId;
+    }
+
+    public Long getId() {
+        return id;
+    }
+
+    public void setId(Long id) {
+        this.id = id;
+    }
+
+    public Long getResortId() {
+        return resortId;
+    }
+
+    public void setResortId(Long resortId) {
+        this.resortId = resortId;
+    }
+
+    public Double getOverallScore() {
+        return overallScore;
+    }
+
+    public void setOverallScore(Double overallScore) {
+        this.overallScore = overallScore;
+    }
+
+    public Double getSnowScore() {
+        return snowScore;
+    }
+
+    public void setSnowScore(Double snowScore) {
+        this.snowScore = snowScore;
+    }
+
+    public Double getFacilitiesScore() {
+        return facilitiesScore;
+    }
+
+    public void setFacilitiesScore(Double facilitiesScore) {
+        this.facilitiesScore = facilitiesScore;
+    }
+
+    public Integer getTotalReviews() {
+        return totalReviews;
+    }
+
+    public void setTotalReviews(Integer totalReviews) {
+        this.totalReviews = totalReviews;
+    }
+
+    public LocalDateTime getCrawledAt() {
+        return crawledAt;
+    }
+
+    public void setCrawledAt(LocalDateTime crawledAt) {
+        this.crawledAt = crawledAt;
+    }
+}
diff --git a/src/main/java/com/ski/crawler/model/SkiTicket.java b/src/main/java/com/ski/crawler/model/SkiTicket.java
new file mode 100644
index 0000000..b682501
--- /dev/null
+++ b/src/main/java/com/ski/crawler/model/SkiTicket.java
@@ -0,0 +1,74 @@
+package com.ski.crawler.model;
+/**
+ * Mutable bean holding ticket prices (adult and child), their currency, the
+ * ticket type and the season for one resort, referenced by {@code resortId}.
+ * All fields are nullable.
+ */
+public class SkiTicket {
+    private Long id;
+    private Long resortId;
+    private String ticketType;
+    private Double priceAdult;
+    private Double priceChild;
+    private String currency;
+    private String season;
+
+    public SkiTicket() {
+    }
+
+    public SkiTicket(Long resortId) {
+        this.resortId = resortId;
+    }
+
+    public Long getId() {
+        return id;
+    }
+
+    public void setId(Long id) {
+        this.id = id;
+    }
+
+    public Long getResortId() {
+        return resortId;
+    }
+
+    public void setResortId(Long resortId) {
+        this.resortId = resortId;
+    }
+
+    public String getTicketType() {
+        return ticketType;
+    }
+
+    public void setTicketType(String ticketType) {
+        this.ticketType = ticketType;
+    }
+
+    public Double getPriceAdult() {
+        return priceAdult;
+    }
+
+    public void setPriceAdult(Double priceAdult) {
+        this.priceAdult = priceAdult;
+    }
+
+    public Double getPriceChild() {
+        return priceChild;
+    }
+
+    public void setPriceChild(Double priceChild) {
+        this.priceChild = priceChild;
+    }
+
+    public String getCurrency() {
+        return currency;
+    }
+
+    public void setCurrency(String currency) {
+        this.currency = currency;
+    }
+
+    public String getSeason() {
+        return season;
+    }
+
+    public void setSeason(String season) {
+        this.season = season;
+    }
+}
diff --git a/src/main/java/com/ski/crawler/model/SkiTrail.java b/src/main/java/com/ski/crawler/model/SkiTrail.java
new file mode 100644
index 0000000..b29d641
--- /dev/null
+++ b/src/main/java/com/ski/crawler/model/SkiTrail.java
@@ -0,0 +1,92 @@
+package com.ski.crawler.model;
+/**
+ * Mutable bean holding slope lengths in kilometres (total and per difficulty
+ * level), the run count, a snow-making flag and the snow depth for one resort,
+ * referenced by {@code resortId}. All fields are nullable.
+ */
+public class SkiTrail {
+    private Long id;
+    private Long resortId;
+    private Double totalKm;
+    private Double beginnerKm;
+    private Double intermediateKm;
+    private Double expertKm;
+    private Integer totalRuns;
+    private Boolean snowMaking;
+    private Integer snowDepthCm;
+
+    public SkiTrail() {
+    }
+
+    public SkiTrail(Long resortId) {
+        this.resortId = resortId;
+    }
+
+    public Long getId() {
+        return id;
+    }
+
+    public void setId(Long id) {
+        this.id = id;
+    }
+
+    public Long getResortId() {
+        return resortId;
+    }
+
+    public void setResortId(Long resortId) {
+        this.resortId = resortId;
+    }
+
+    public Double getTotalKm() {
+        return totalKm;
+    }
+
+    public void setTotalKm(Double totalKm) {
+        this.totalKm = totalKm;
+    }
+
+    public Double getBeginnerKm() {
+        return beginnerKm;
+    }
+
+    public void setBeginnerKm(Double beginnerKm) {
+        this.beginnerKm = beginnerKm;
+    }
+
+    public Double getIntermediateKm() {
+        return intermediateKm;
+    }
+
+    public void setIntermediateKm(Double intermediateKm) {
+        this.intermediateKm = intermediateKm;
+    }
+
+    public Double getExpertKm() {
+        return expertKm;
+    }
+
+    public void setExpertKm(Double expertKm) {
+        this.expertKm = expertKm;
+    }
+
+    public Integer getTotalRuns() {
+        return totalRuns;
+    }
+
+    public void setTotalRuns(Integer totalRuns) {
+        this.totalRuns = totalRuns;
+    }
+
+    public Boolean getSnowMaking() {
+        return snowMaking;
+    }
+
+    public void setSnowMaking(Boolean snowMaking) {
+        this.snowMaking = snowMaking;
+    }
+
+    public Integer getSnowDepthCm() {
+        return snowDepthCm;
+    }
+
+    public void setSnowDepthCm(Integer snowDepthCm) {
+        this.snowDepthCm = snowDepthCm;
+    }
+}
diff --git a/src/main/java/com/ski/crawler/parser/ResortDetailParser.java 
b/src/main/java/com/ski/crawler/parser/ResortDetailParser.java new file mode 100644 index 0000000..f3cada9 --- /dev/null +++ b/src/main/java/com/ski/crawler/parser/ResortDetailParser.java @@ -0,0 +1,471 @@ +package com.ski.crawler.parser; + +import com.ski.crawler.model.SkiLift; +import com.ski.crawler.model.SkiResort; +import com.ski.crawler.model.SkiTicket; +import com.ski.crawler.model.SkiTrail; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.math.BigDecimal; +import java.math.RoundingMode; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class ResortDetailParser { + private static final Pattern INT_M_PATTERN = Pattern.compile("(\\d{2,5})\\s*m\\b", Pattern.CASE_INSENSITIVE); + private static final Pattern ALT_RANGE_PATTERN = Pattern.compile("(\\d{2,5})\\s*m\\s*(?:-|–|to)\\s*(\\d{2,5})\\s*m\\b", Pattern.CASE_INSENSITIVE); + private static final Pattern KM_PATTERN = Pattern.compile("(\\d+(?:[\\.,]\\d+)?)\\s*km\\b", Pattern.CASE_INSENSITIVE); + private static final Pattern PERCENT_PATTERN = Pattern.compile("(\\d{1,3})\\s*%\\b"); + private static final Pattern NUMBER_PATTERN = Pattern.compile("(\\d+(?:[\\.,]\\d+)?)"); + private static final Pattern CURRENCY_FIRST_PATTERN = Pattern.compile("(?:(SFr\\.)|CHF|€|\\$|£)\\s*(\\d+(?:[\\.,]\\d+)?)"); + private static final Pattern CURRENCY_LAST_PATTERN = Pattern.compile("(\\d+(?:[\\.,]\\d+)?)\\s*(€|\\$|£|CHF|SFr\\.)"); +//解析滑雪场详情页的 HTML 内容,提取出滑雪场的详细信息。 +//它使用 Jsoup 解析 HTML 内容,然后根据 HTML 结构提取出滑雪场的名称、国家、区域、海拔、总距离、总轨迹、总 lift、总票、总评分等信息。 +//最后,它将这些信息封装到 SkiResort 对象中。 + public SkiResort parse(String html) { + SkiResort resort = new SkiResort(); + resort.setSkiTrail(new SkiTrail()); + resort.setSkiLift(new SkiLift()); + resort.setSkiTicket(new SkiTicket()); + + if (html == null || html.isEmpty()) { + return resort; + } + + Document 
doc; + try { + doc = Jsoup.parse(html); + } catch (Exception e) { + return resort; + } + + tryFillName(doc, resort); + tryFillCountryRegionFromBreadcrumb(doc, resort); + tryFillAltitude(doc, resort); + tryFillTotalKmAndTrailBreakdown(doc, resort); + tryFillLifts(doc, resort.getSkiLift()); + tryFillTickets(doc, resort.getSkiTicket()); + tryFillOverallScore(doc, resort); + + return resort; + } + + private void tryFillName(Document doc, SkiResort resort) { + try { + Element nameEl = doc.selectFirst(".resort-name"); + if (nameEl == null) { + nameEl = doc.selectFirst("h1"); + } + if (nameEl != null) { + String name = cleanText(nameEl.text()); + if (!name.isEmpty()) { + resort.setName(name); + } + } + } catch (Exception ignored) { + } + } + + private void tryFillCountryRegionFromBreadcrumb(Document doc, SkiResort resort) { + try { + Elements crumbs = doc.select(".breadcrumb a, nav.breadcrumb a, ol.breadcrumb a, ul.breadcrumb a, .breadcrumb li, nav.breadcrumb li, ol.breadcrumb li, ul.breadcrumb li"); + List items = new ArrayList<>(); + for (Element el : crumbs) { + String t = cleanText(el.text()); + if (t.isEmpty()) { + continue; + } + String lower = t.toLowerCase(Locale.ROOT); + if (lower.equals("ski resorts") || lower.equals("ski-resorts") || lower.equals("home") || lower.equals("worldwide")) { + continue; + } + items.add(t); + } + + if (items.size() >= 3) { + resort.setCountry(items.get(items.size() - 3)); + resort.setRegion(items.get(items.size() - 2)); + } else if (items.size() == 2) { + resort.setCountry(items.get(0)); + resort.setRegion(items.get(1)); + } else if (items.size() == 1) { + resort.setCountry(items.get(0)); + } + } catch (Exception ignored) { + } + } + + private void tryFillAltitude(Document doc, SkiResort resort) { + try { + String text = doc.text(); + + Integer min = null; + Integer max = null; + + Matcher range = ALT_RANGE_PATTERN.matcher(text); + if (range.find()) { + min = safeParseInt(range.group(1)); + max = safeParseInt(range.group(2)); + } else 
{ + List ms = new ArrayList<>(); + Matcher m = INT_M_PATTERN.matcher(text); + while (m.find() && ms.size() < 3) { + Integer v = safeParseInt(m.group(1)); + if (v != null) { + ms.add(v); + } + } + if (ms.size() >= 2) { + min = ms.get(ms.size() - 2); + max = ms.get(ms.size() - 1); + } + } + + resort.setAltitudeMin(min); + resort.setAltitudeMax(max); + } catch (Exception ignored) { + } + } + + private void tryFillTotalKmAndTrailBreakdown(Document doc, SkiResort resort) { + try { + Double totalKm = null; + + Element kmEl = firstElementContaining(doc, "km", "slope", "slopes", "piste"); + if (kmEl != null) { + totalKm = firstDoubleFrom(KM_PATTERN, kmEl.text()); + } + if (totalKm == null) { + totalKm = firstDoubleFrom(KM_PATTERN, doc.text()); + } + resort.setTotalKm(totalKm); + + SkiTrail trail = resort.getSkiTrail(); + if (trail == null) { + trail = new SkiTrail(); + resort.setSkiTrail(trail); + } + trail.setTotalKm(totalKm); + + Integer beginnerPct = percentNearKeyword(doc, "beginner", "easy"); + Integer intermediatePct = percentNearKeyword(doc, "intermediate", "medium"); + Integer expertPct = percentNearKeyword(doc, "expert", "advanced", "difficult"); + + if (totalKm != null) { + if (beginnerPct != null) { + trail.setBeginnerKm(roundKm(totalKm * beginnerPct / 100.0)); + } + if (intermediatePct != null) { + trail.setIntermediateKm(roundKm(totalKm * intermediatePct / 100.0)); + } + if (expertPct != null) { + trail.setExpertKm(roundKm(totalKm * expertPct / 100.0)); + } + } + } catch (Exception ignored) { + } + } + + private void tryFillLifts(Document doc, SkiLift lift) { + if (lift == null) { + return; + } + try { + String text = doc.text(); + lift.setTotalLifts(intNear(text, "lift", "lifts")); + lift.setGondolas(intNear(text, "gondola", "gondolas")); + lift.setChairlifts(intNear(text, "chairlift", "chairlifts")); + lift.setSurfaceLifts(intNear(text, "surface lift", "surface lifts", "t-bar", "drag lift", "platter lift")); + lift.setCableCars(intNear(text, "cable car", 
"cable cars")); + lift.setTravelators(intNear(text, "travelator", "travelators", "moving carpet")); + } catch (Exception ignored) { + } + } + + private void tryFillTickets(Document doc, SkiTicket ticket) { + if (ticket == null) { + return; + } + try { + Element adultEl = firstElementContaining(doc, "adult", "adults"); + Element childEl = firstElementContaining(doc, "child", "children", "kid", "kids"); + + Price adult = (adultEl != null) ? extractPrice(adultEl.text()) : null; + Price child = (childEl != null) ? extractPrice(childEl.text()) : null; + + if (adult == null || child == null) { + List prices = extractAllPrices(doc.text(), 4); + if (adult == null && !prices.isEmpty()) { + adult = prices.get(0); + } + if (child == null && prices.size() >= 2) { + child = prices.get(1); + } + } + + if (adult != null) { + ticket.setPriceAdult(adult.amount); + ticket.setCurrency(adult.currency); + } + if (child != null) { + ticket.setPriceChild(child.amount); + if (ticket.getCurrency() == null) { + ticket.setCurrency(child.currency); + } + } + } catch (Exception ignored) { + } + } + + private void tryFillOverallScore(Document doc, SkiResort resort) { + try { + Element scoreEl = firstElementContaining(doc, "score", "rating", "stars"); + BigDecimal score = null; + if (scoreEl != null) { + score = firstBigDecimal(scoreEl.text()); + } + if (score == null) { + score = firstBigDecimal(doc.text()); + } + if (score != null) { + if (score.compareTo(BigDecimal.ZERO) < 0 || score.compareTo(new BigDecimal("10")) > 0) { + return; + } + resort.setOverallScore(score); + } + } catch (Exception ignored) { + } + } + + private Element firstElementContaining(Document doc, String... 
keywords) { + Elements candidates = doc.select("div, span, p, li, td, th"); + for (Element el : candidates) { + String t = el.text(); + if (t == null || t.isEmpty()) { + continue; + } + String lower = t.toLowerCase(Locale.ROOT); + for (String k : keywords) { + if (k != null && !k.isEmpty() && lower.contains(k.toLowerCase(Locale.ROOT))) { + return el; + } + } + } + return null; + } + + private Integer percentNearKeyword(Document doc, String... keywords) { + Elements candidates = doc.select("div, span, p, li, td, th"); + for (Element el : candidates) { + String t = el.text(); + if (t == null || t.isEmpty()) { + continue; + } + String lower = t.toLowerCase(Locale.ROOT); + boolean hit = false; + for (String k : keywords) { + if (k != null && !k.isEmpty() && lower.contains(k.toLowerCase(Locale.ROOT))) { + hit = true; + break; + } + } + if (!hit) { + continue; + } + Matcher m = PERCENT_PATTERN.matcher(t); + if (m.find()) { + Integer pct = safeParseInt(m.group(1)); + if (pct != null && pct >= 0 && pct <= 100) { + return pct; + } + } + } + return null; + } + + private Integer intNear(String text, String... 
keywords) { + if (text == null || text.isEmpty()) { + return null; + } + String lower = text.toLowerCase(Locale.ROOT); + int bestIndex = -1; + for (String k : keywords) { + if (k == null || k.isEmpty()) { + continue; + } + int idx = lower.indexOf(k.toLowerCase(Locale.ROOT)); + if (idx >= 0) { + bestIndex = idx; + break; + } + } + if (bestIndex < 0) { + return null; + } + int start = Math.max(0, bestIndex - 40); + int end = Math.min(text.length(), bestIndex + 40); + String window = text.substring(start, end); + Matcher m = Pattern.compile("(\\d{1,4})").matcher(window); + if (m.find()) { + return safeParseInt(m.group(1)); + } + return null; + } + + private Double roundKm(double v) { + return new BigDecimal(v).setScale(2, RoundingMode.HALF_UP).doubleValue(); + } + + private Double firstDoubleFrom(Pattern pattern, String text) { + if (text == null) { + return null; + } + Matcher m = pattern.matcher(text); + if (m.find()) { + return safeParseDouble(m.group(1)); + } + return null; + } + + private BigDecimal firstBigDecimal(String text) { + if (text == null) { + return null; + } + Matcher m = NUMBER_PATTERN.matcher(text); + if (m.find()) { + Double d = safeParseDouble(m.group(1)); + if (d == null) { + return null; + } + return BigDecimal.valueOf(d).setScale(2, RoundingMode.HALF_UP); + } + return null; + } + + private Price extractPrice(String text) { + if (text == null) { + return null; + } + Matcher m1 = CURRENCY_FIRST_PATTERN.matcher(text); + if (m1.find()) { + String cur = normalizeCurrency(m1.group(1), text.substring(m1.start(), Math.min(text.length(), m1.end()))); + Double amount = safeParseDouble(m1.group(2)); + if (amount != null) { + return new Price(cur, amount); + } + } + Matcher m2 = CURRENCY_LAST_PATTERN.matcher(text); + if (m2.find()) { + Double amount = safeParseDouble(m2.group(1)); + String cur = normalizeCurrency(null, m2.group(2)); + if (amount != null) { + return new Price(cur, amount); + } + } + return null; + } + + private List extractAllPrices(String 
text, int limit) { + List out = new ArrayList<>(); + if (text == null || text.isEmpty()) { + return out; + } + Matcher m1 = CURRENCY_FIRST_PATTERN.matcher(text); + while (m1.find() && out.size() < limit) { + Double amount = safeParseDouble(m1.group(2)); + if (amount == null) { + continue; + } + String cur = normalizeCurrency(m1.group(1), text.substring(m1.start(), Math.min(text.length(), m1.end()))); + out.add(new Price(cur, amount)); + } + Matcher m2 = CURRENCY_LAST_PATTERN.matcher(text); + while (m2.find() && out.size() < limit) { + Double amount = safeParseDouble(m2.group(1)); + if (amount == null) { + continue; + } + String cur = normalizeCurrency(null, m2.group(2)); + out.add(new Price(cur, amount)); + } + return out; + } + + private String normalizeCurrency(String group1, String raw) { + String src = (group1 != null && !group1.isEmpty()) ? group1 : raw; + if (src == null) { + return null; + } + String s = src.trim(); + if (s.startsWith("SFr")) { + return "SFr."; + } + if (s.equalsIgnoreCase("CHF")) { + return "CHF"; + } + if (s.contains("€")) { + return "€"; + } + if (s.contains("$")) { + return "$"; + } + if (s.contains("£")) { + return "£"; + } + return s.isEmpty() ? 
null : s; + } + + private String cleanText(String s) { + if (s == null) { + return ""; + } + return s.replace('\u00A0', ' ').trim(); + } + + private Integer safeParseInt(String s) { + try { + if (s == null) { + return null; + } + String t = s.replaceAll("[^0-9]", ""); + if (t.isEmpty()) { + return null; + } + return Integer.parseInt(t); + } catch (Exception e) { + return null; + } + } + + private Double safeParseDouble(String s) { + try { + if (s == null) { + return null; + } + String t = s.trim().replace(",", "."); + t = t.replaceAll("[^0-9.]", ""); + if (t.isEmpty()) { + return null; + } + return Double.parseDouble(t); + } catch (Exception e) { + return null; + } + } + + private static class Price { + private final String currency; + private final Double amount; + + private Price(String currency, Double amount) { + this.currency = currency; + this.amount = amount; + } + } +} diff --git a/src/main/java/com/ski/crawler/parser/ResortParser.java b/src/main/java/com/ski/crawler/parser/ResortParser.java new file mode 100644 index 0000000..5f65855 --- /dev/null +++ b/src/main/java/com/ski/crawler/parser/ResortParser.java @@ -0,0 +1,34 @@ +package com.ski.crawler.parser; + +import com.ski.crawler.model.SkiResort; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; + +import java.time.LocalDateTime; +//ResortParser 类是解析器,负责解析滑雪场的 HTML 页内容。 +//它使用 Jsoup 解析 HTML 内容,然后根据 HTML 结构提取出滑雪场的名称、国家、区域、海拔、总距离、总轨迹、总 lift、总票、总评分等信息。 +//最后,它将这些信息封装到 SkiResort 对象中。 +public class ResortParser { + public SkiResort parseResort(String html, String sourceUrl) { + Document doc = Jsoup.parse(html); + SkiResort resort = new SkiResort(); + + String title = doc.title(); + resort.setName((title == null || title.isEmpty()) ? 
"UNKNOWN" : title); + resort.setSourceUrl(sourceUrl); + resort.setCrawledAt(LocalDateTime.now()); + + Element countryMeta = doc.selectFirst("meta[name=country]"); + if (countryMeta != null) { + resort.setCountry(countryMeta.attr("content")); + } + + Element regionMeta = doc.selectFirst("meta[name=region]"); + if (regionMeta != null) { + resort.setRegion(regionMeta.attr("content")); + } + + return resort; + } +} diff --git a/src/main/java/com/ski/crawler/repository/SkiResortRepository.java b/src/main/java/com/ski/crawler/repository/SkiResortRepository.java new file mode 100644 index 0000000..1255533 --- /dev/null +++ b/src/main/java/com/ski/crawler/repository/SkiResortRepository.java @@ -0,0 +1,74 @@ +package com.ski.crawler.repository; + +import com.ski.crawler.model.SkiResort; +import com.ski.crawler.util.ValidationUtil; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.LongAdder; + +public class SkiResortRepository { + private final Map byUrl = new LinkedHashMap<>(); + + public synchronized boolean containsUrl(String url) { + if (url == null || url.trim().isEmpty()) { + return false; + } + return byUrl.containsKey(url.trim()); + } + + public synchronized boolean add(SkiResort resort) { + if (resort == null) { + return false; + } + SkiResort cleaned = ValidationUtil.clean(resort); + ValidationUtil.validate(cleaned); + String url = cleaned.getSourceUrl().trim(); + if (byUrl.containsKey(url)) { + return false; + } + byUrl.put(url, cleaned); + return true; + } + + public synchronized List getAll() { + return Collections.unmodifiableList(new ArrayList<>(byUrl.values())); + } + + public synchronized List filterByCountry(String keyword) { + String k = ValidationUtil.normalizeCountryKey(keyword); + if (k.isEmpty()) { + return getAll(); + } + List out = new ArrayList<>(); + for (SkiResort r : 
byUrl.values()) { + String c = ValidationUtil.normalizeCountryKey(r.getCountry()); + if (!c.isEmpty() && (c.equals(k) || c.contains(k))) { + out.add(r); + } + } + return Collections.unmodifiableList(out); + } + + public Map countByCountry() { + Map tmp = new ConcurrentHashMap<>(); + for (SkiResort r : getAll()) { + String c = r.getCountry(); + if (c == null || c.trim().isEmpty()) { + continue; + } + String key = c.replace('\u00A0', ' ').trim(); + tmp.computeIfAbsent(key, x -> new LongAdder()).increment(); + } + Map out = new LinkedHashMap<>(); + for (Map.Entry e : tmp.entrySet()) { + out.put(e.getKey(), e.getValue().sum()); + } + return out; + } +} + diff --git a/src/main/java/com/ski/crawler/service/ScraperService.java b/src/main/java/com/ski/crawler/service/ScraperService.java new file mode 100644 index 0000000..e77c9d7 --- /dev/null +++ b/src/main/java/com/ski/crawler/service/ScraperService.java @@ -0,0 +1,243 @@ +package com.ski.crawler.service; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.ski.crawler.exception.NetworkException; +import com.ski.crawler.exception.ParseException; +import com.ski.crawler.model.SkiResort; +import com.ski.crawler.repository.SkiResortRepository; +import com.ski.crawler.strategy.CrawlStrategy; +import com.ski.crawler.util.JsonUtil; +import com.ski.crawler.util.RetryUtil; +import com.ski.crawler.util.ValidationUtil; +import com.ski.crawler.utils.CrawlerHttp; +import com.ski.crawler.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import 
java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.LongAdder; + +public class ScraperService { + private static final Logger log = LoggerFactory.getLogger(ScraperService.class); + + public CrawlReport crawl( + CrawlStrategy strategy, + String startUrl, + int limit, + int threads, + String countryFilter, + CrawlerHttp http, + SkiResortRepository repo, + boolean incremental, + String outPath, + ConsoleView view, + boolean showFailures, + boolean dryRun, + int retryAttempts, + long retrySleepMs + ) throws NetworkException { + String actualStartUrl = (startUrl == null || startUrl.isEmpty()) ? strategy.defaultStartUrl() : startUrl; + + List urls; + try { + urls = RetryUtil.retry(() -> strategy.collectDetailUrls(actualStartUrl, limit, http), retryAttempts, retrySleepMs); + } catch (NetworkException e) { + throw e; + } catch (Exception e) { + throw new NetworkException("Collect urls failed: " + e.getMessage(), e); + } + + int total = urls.size(); + Queue queue = new ConcurrentLinkedQueue<>(urls); + AtomicInteger done = new AtomicInteger(0); + AtomicInteger success = new AtomicInteger(0); + AtomicInteger skipped = new AtomicInteger(0); + AtomicInteger failed = new AtomicInteger(0); + AtomicInteger filteredOut = new AtomicInteger(0); + + Map byCountry = new ConcurrentHashMap<>(); + List failures = Collections.synchronizedList(new ArrayList<>()); + Map seenThisRun = new ConcurrentHashMap<>(); + Object outLock = new Object(); + + BufferedWriter outWriter = null; + ObjectMapper mapper = null; + if (outPath != null && !outPath.trim().isEmpty()) { + try { + outWriter = JsonUtil.openJsonlWriter(outPath.trim()); + mapper = JsonUtil.mapper(); + } catch (Exception e) { + throw new NetworkException("Open out file failed: " + e.getMessage(), e); + } + } + final BufferedWriter outWriterFinal = outWriter; + final ObjectMapper mapperFinal = mapper; + + view.printHeader(); + + int workerCount = Math.max(1, threads); + ExecutorService pool = 
Executors.newFixedThreadPool(workerCount); + for (int i = 0; i < workerCount; i++) { + pool.submit(() -> { + while (true) { + String url = queue.poll(); + if (url == null) { + return; + } + + try { + if (incremental) { + if (repo.containsUrl(url)) { + skipped.incrementAndGet(); + continue; + } + if (dryRun) { + if (seenThisRun.putIfAbsent(url, Boolean.TRUE) != null) { + skipped.incrementAndGet(); + continue; + } + } + } + + String html = RetryUtil.retry(() -> http.getHtml(url), retryAttempts, retrySleepMs); + SkiResort resort = strategy.parseDetail(url, html); + resort.setSourceSite(strategy.id()); + resort.setSourceUrl(url); + SkiResort cleaned = ValidationUtil.clean(resort); + ValidationUtil.validate(cleaned); + success.incrementAndGet(); + + String country = cleaned.getCountry(); + if (country != null && !country.trim().isEmpty()) { + String key = country.replace('\u00A0', ' ').trim(); + byCountry.computeIfAbsent(key, k -> new LongAdder()).increment(); + } + + String filter = ValidationUtil.normalizeCountryKey(countryFilter); + if (!filter.isEmpty()) { + String c = ValidationUtil.normalizeCountryKey(cleaned.getCountry()); + if (c.isEmpty() || (!c.equals(filter) && !c.contains(filter))) { + filteredOut.incrementAndGet(); + if (!dryRun) { + repo.add(cleaned); + } + continue; + } + } + + synchronized (outLock) { + view.printResort(cleaned); + if (!dryRun && outWriterFinal != null && mapperFinal != null) { + outWriterFinal.write(mapperFinal.writeValueAsString(toJson(cleaned))); + outWriterFinal.newLine(); + } + } + + if (!dryRun) { + repo.add(cleaned); + } + } catch (ParseException e) { + failed.incrementAndGet(); + if (showFailures && failures.size() < 200) { + failures.add(url + " [ParseException] " + safeMsg(e.getMessage())); + } + log.error("Parse failed: {}", url, e); + } catch (Exception e) { + failed.incrementAndGet(); + if (showFailures && failures.size() < 200) { + failures.add(url + " [" + e.getClass().getSimpleName() + "] " + safeMsg(e.getMessage())); + } 
+ log.error("Crawl failed: {}", url, e); + } finally { + int finished = done.incrementAndGet(); + log.info("{}/{} success={} skipped={} failed={}", finished, total, success.get(), skipped.get(), failed.get()); + } + } + }); + } + + pool.shutdown(); + try { + pool.awaitTermination(7, TimeUnit.DAYS); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + + if (outWriterFinal != null) { + try (BufferedWriter w = outWriterFinal) { + w.flush(); + } catch (Exception ignored) { + } + } + + Map byCountryOut = new LinkedHashMap<>(); + for (Map.Entry e : byCountry.entrySet()) { + byCountryOut.put(e.getKey(), e.getValue().sum()); + } + + CrawlReport report = new CrawlReport(); + report.site = strategy.id(); + report.total = total; + report.success = success.get(); + report.filteredOut = filteredOut.get(); + report.skipped = skipped.get(); + report.failed = failed.get(); + report.byCountry = byCountryOut; + report.failures = new ArrayList<>(failures); + return report; + } + + private Map toJson(SkiResort r) { + Map obj = new LinkedHashMap<>(); + obj.put("id", r.getId()); + obj.put("name", r.getName()); + obj.put("country", r.getCountry()); + obj.put("region", r.getRegion()); + obj.put("latitude", r.getLatitude()); + obj.put("longitude", r.getLongitude()); + obj.put("altitudeMin", r.getAltitudeMin()); + obj.put("altitudeMax", r.getAltitudeMax()); + obj.put("totalKm", r.getTotalKm()); + obj.put("slopeCount", r.getSlopeCount()); + obj.put("liftCount", r.getLiftCount()); + obj.put("ticketPriceMin", r.getTicketPriceMin()); + obj.put("ticketPriceMax", r.getTicketPriceMax()); + obj.put("currency", r.getCurrency()); + obj.put("openTime", r.getOpenTime()); + obj.put("snowDepthCm", r.getSnowDepthCm()); + obj.put("temperatureC", r.getTemperatureC()); + obj.put("nearbyHotels", r.getNearbyHotels()); + obj.put("rentalShops", r.getRentalShops()); + obj.put("url", r.getSourceUrl()); + obj.put("sourceSite", r.getSourceSite()); + obj.put("crawlTime", r.getCrawledAt() 
== null ? null : r.getCrawledAt().toString()); + return obj; + } + + private String safeMsg(String s) { + return s == null ? "" : s.replace('\n', ' ').replace('\r', ' ').trim(); + } + + public static class CrawlReport { + public String site; + public int total; + public int success; + public int filteredOut; + public int skipped; + public int failed; + public Map byCountry; + public List failures; + } +} diff --git a/src/main/java/com/ski/crawler/site/CrawlerSite.java b/src/main/java/com/ski/crawler/site/CrawlerSite.java new file mode 100644 index 0000000..738970b --- /dev/null +++ b/src/main/java/com/ski/crawler/site/CrawlerSite.java @@ -0,0 +1,22 @@ +//站点抽象接口 :每个站点实现“列表采集 + 详情解析”两件事 +//每个站点需要实现以下方法: +//id():返回站点的唯一标识符,用于在命令行中指定要采集的站点。 +//defaultStartUrl():返回站点的默认采集起始 URL。 +//collectDetailUrls():采集站点的详情页 URL 列表,返回一个字符串列表。 +//parseDetail():解析详情页 HTML,返回一个 SkiResort 实例。 +package com.ski.crawler.site; + +import com.ski.crawler.model.SkiResort; +import com.ski.crawler.utils.CrawlerHttp; + +import java.util.List; + +public interface CrawlerSite { + String id();// + + String defaultStartUrl();// + + List collectDetailUrls(String startUrl, int limit, CrawlerHttp http) throws Exception; + + SkiResort parseDetail(String sourceUrl, String html) throws Exception; +} diff --git a/src/main/java/com/ski/crawler/site/SkimapOrgSite.java b/src/main/java/com/ski/crawler/site/SkimapOrgSite.java new file mode 100644 index 0000000..5871ef5 --- /dev/null +++ b/src/main/java/com/ski/crawler/site/SkimapOrgSite.java @@ -0,0 +1,194 @@ +package com.ski.crawler.site; + +import com.ski.crawler.model.SkiResort; +import com.ski.crawler.utils.CrawlerHttp; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; + +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Locale; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class SkimapOrgSite implements CrawlerSite { + private static 
final Pattern LAT_LON_PATTERN = Pattern.compile("(-?\\d+(?:\\.\\d+)?)\\s*,\\s*(-?\\d+(?:\\.\\d+)?)"); + + @Override + public String id() { + return "skimap"; + } + + @Override + public String defaultStartUrl() { + return "https://skimap.org"; + } + + @Override + public List collectDetailUrls(String startUrl, int limit, CrawlerHttp http) { + Set out = new LinkedHashSet<>(); + Set visited = new LinkedHashSet<>(); + + String page = startUrl; + while (page != null && !page.isEmpty() && !visited.contains(page)) { + visited.add(page); + if (page.toLowerCase(Locale.ROOT).contains("/skiareas/view/")) { + out.add(page); + break; + } + + Document doc = http.getDocument(page); + for (Element a : doc.select("a[href]")) { + String href = a.attr("href"); + if (href == null || href.isEmpty()) { + continue; + } + String abs = a.absUrl("href"); + if (abs == null || abs.isEmpty()) { + continue; + } + String lower = abs.toLowerCase(Locale.ROOT); + if (!lower.contains("/skiareas/view/")) { + continue; + } + out.add(abs); + if (limit > 0 && out.size() >= limit) { + return new ArrayList<>(out); + } + } + + String next = findNext(doc); + page = (next != null && !visited.contains(next)) ? 
next : null; + } + + return new ArrayList<>(out); + } + + @Override + public SkiResort parseDetail(String sourceUrl, String html) { + Document doc = org.jsoup.Jsoup.parse(html, sourceUrl); + SkiResort resort = new SkiResort(); + + String name = null; + Element h1 = doc.selectFirst("h1"); + if (h1 != null) { + name = clean(h1.text()); + } + if (name == null || name.isEmpty()) { + Element ogTitle = doc.selectFirst("meta[property=og:title]"); + if (ogTitle != null) { + name = clean(ogTitle.attr("content")); + } + } + if (name != null && !name.isEmpty()) { + resort.setName(name); + } + + List crumbs = new ArrayList<>(); + for (Element a : doc.select(".breadcrumb a, nav.breadcrumb a, ol.breadcrumb a, ul.breadcrumb a")) { + String t = clean(a.text()); + if (!t.isEmpty()) { + crumbs.add(t); + } + } + if (crumbs.size() >= 1) { + resort.setCountry(crumbs.get(crumbs.size() - 1)); + } + if (crumbs.size() >= 2) { + resort.setRegion(crumbs.get(crumbs.size() - 2)); + } + + Double[] latLon = extractLatLon(doc); + if (latLon != null) { + resort.setLatitude(latLon[0]); + resort.setLongitude(latLon[1]); + } + + return resort; + } + + private String findNext(Document doc) { + Element e = doc.selectFirst("a[rel=next], a.next, li.pagination-next a, a[aria-label=Next]"); + if (e != null) { + String abs = e.absUrl("href"); + return abs == null || abs.isEmpty() ? 
null : abs; + } + for (Element a : doc.select("a[href]")) { + String t = clean(a.text()).toLowerCase(Locale.ROOT); + if (t.equals("next") || t.equals("next ›") || t.contains("next")) { + String abs = a.absUrl("href"); + if (abs != null && !abs.isEmpty()) { + return abs; + } + } + } + return null; + }
+
+    /**
+     * Extracts the resort coordinates from a parsed detail page.
+     *
+     * <p>Sources are tried in order: a separate latitude/longitude meta pair, a single meta tag
+     * holding both values, and finally the first in-range "lat, lon" pair found in the page text.
+     *
+     * @param doc parsed detail page
+     * @return a two-element array {latitude, longitude}, or null when no usable pair is found
+     */
+    private Double[] extractLatLon(Document doc) {
+        Element metaLat = doc.selectFirst("meta[property=place:location:latitude], meta[name=geo.position]");
+        Element metaLon = doc.selectFirst("meta[property=place:location:longitude]");
+        if (metaLat != null && metaLon != null) {
+            Double lat = safeParseDouble(metaLat.attr("content"));
+            Double lon = safeParseDouble(metaLon.attr("content"));
+            if (lat != null && lon != null) {
+                return new Double[]{lat, lon};
+            }
+        }
+        if (metaLat != null) {
+            // geo.position conventionally separates the two values with ';', whereas
+            // LAT_LON_PATTERN only recognises ','. Normalise the separator so this meta tag
+            // can actually be parsed instead of always falling through to the text scan.
+            Double[] ll = parseLatLon(metaLat.attr("content").replace(';', ','));
+            if (ll != null) {
+                return ll;
+            }
+        }
+        // Last resort: scan the visible text; parseLatLon already returns null when nothing matches.
+        return parseLatLon(doc.text());
+    }
+
+ private Double[] parseLatLon(String text) { + if (text == null || text.isEmpty()) { + return null; + } + Matcher m = LAT_LON_PATTERN.matcher(text); + while (m.find()) { + Double lat = safeParseDouble(m.group(1)); + Double lon = safeParseDouble(m.group(2)); + if (lat == null || lon == null) { + continue; + } + if (lat < -90 || lat > 90 || lon < -180 || lon > 180) { + continue; + } + return new Double[]{lat, lon}; + } + return null; + } + + private String clean(String s) { + if (s == null) { + return ""; + } + return s.replace('\u00A0', ' ').trim(); + } + + private Double safeParseDouble(String s) { + try { + if (s == null) { + return null; + } + String t = s.trim().replace(",", "."); + t = t.replaceAll("[^0-9.\\-]", ""); + if (t.isEmpty()) { + return null; + } + return Double.parseDouble(t); + } catch (Exception e) { + return null; + } + } +} diff --git a/src/main/java/com/ski/crawler/site/SkiresortInfoSite.java b/src/main/java/com/ski/crawler/site/SkiresortInfoSite.java new file mode 100644
index 0000000..7b6cbef --- /dev/null +++ b/src/main/java/com/ski/crawler/site/SkiresortInfoSite.java @@ -0,0 +1,33 @@ +package com.ski.crawler.site; + +import com.ski.crawler.model.SkiResort; +import com.ski.crawler.parser.ResortDetailParser; +import com.ski.crawler.spider.ResortListSpider; +import com.ski.crawler.utils.CrawlerHttp; + +import java.util.List; + +public class SkiresortInfoSite implements CrawlerSite { + private final ResortDetailParser detailParser = new ResortDetailParser(); + + @Override + public String id() { + return "skiresort"; + } + + @Override + public String defaultStartUrl() { + return "https://www.skiresort.info/ski-resorts/"; + } + + @Override + public List collectDetailUrls(String startUrl, int limit, CrawlerHttp http) throws Exception { + ResortListSpider listSpider = new ResortListSpider(http); + return limit > 0 ? listSpider.fetchFirst(startUrl, limit) : listSpider.fetchAll(startUrl); + } + + @Override + public SkiResort parseDetail(String sourceUrl, String html) throws Exception { + return detailParser.parse(html); + } +} diff --git a/src/main/java/com/ski/crawler/site/WikipediaSite.java b/src/main/java/com/ski/crawler/site/WikipediaSite.java new file mode 100644 index 0000000..1971bd1 --- /dev/null +++ b/src/main/java/com/ski/crawler/site/WikipediaSite.java @@ -0,0 +1,204 @@ +package com.ski.crawler.site; + +import com.ski.crawler.model.SkiResort; +import com.ski.crawler.utils.CrawlerHttp; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; + +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Locale; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class WikipediaSite implements CrawlerSite { + private static final Pattern GEO_SEMI_PATTERN = Pattern.compile("(-?\\d+(?:\\.\\d+)?)\\s*;\\s*(-?\\d+(?:\\.\\d+)?)"); + private static final Pattern GEO_COMMA_PATTERN = 
Pattern.compile("(-?\\d+(?:\\.\\d+)?)\\s*,\\s*(-?\\d+(?:\\.\\d+)?)"); + + @Override + public String id() { + return "wikipedia"; + } + + @Override + public String defaultStartUrl() { + return "https://en.wikipedia.org/wiki/List_of_ski_areas_and_resorts"; + } + + @Override + public List collectDetailUrls(String startUrl, int limit, CrawlerHttp http) { + Document doc = http.getDocument(startUrl); + Element content = doc.selectFirst("#mw-content-text"); + if (content == null) { + content = doc.body(); + } + + Set out = new LinkedHashSet<>(); + for (Element a : content.select("a[href]")) { + String href = a.attr("href"); + if (href == null || href.isEmpty()) { + continue; + } + if (!href.startsWith("/wiki/")) { + continue; + } + if (href.contains(":")) { + continue; + } + if (href.contains("#")) { + href = href.substring(0, href.indexOf('#')); + } + String abs = a.absUrl("href"); + if (abs == null || abs.isEmpty()) { + continue; + } + String lower = abs.toLowerCase(Locale.ROOT); + if (lower.contains("list_of_")) { + continue; + } + out.add(abs); + if (limit > 0 && out.size() >= limit) { + break; + } + } + return new ArrayList<>(out); + } + + @Override + public SkiResort parseDetail(String sourceUrl, String html) { + Document doc = org.jsoup.Jsoup.parse(html, sourceUrl); + SkiResort resort = new SkiResort(); + + Element h1 = doc.selectFirst("#firstHeading"); + if (h1 == null) { + h1 = doc.selectFirst("h1"); + } + if (h1 != null) { + String name = clean(h1.text()); + if (!name.isEmpty()) { + resort.setName(name); + } + } + + Element infobox = doc.selectFirst("table.infobox"); + if (infobox != null) { + String country = extractInfoboxValue(infobox, "Country"); + if (country != null && !country.isEmpty()) { + resort.setCountry(country); + } + String region = extractInfoboxValue(infobox, "Location"); + if (region != null && !region.isEmpty()) { + resort.setRegion(region); + } + } + + Double[] latLon = extractLatLon(doc); + if (latLon != null) { + 
resort.setLatitude(latLon[0]); + resort.setLongitude(latLon[1]); + } + + return resort; + } + + private Double[] extractLatLon(Document doc) { + Element geoDec = doc.selectFirst("span.geo-dec"); + if (geoDec != null) { + Double[] ll = parseLatLon(geoDec.text()); + if (ll != null) { + return ll; + } + } + Element geo = doc.selectFirst("span.geo"); + if (geo != null) { + Double[] ll = parseLatLon(geo.text()); + if (ll != null) { + return ll; + } + } + Element metaLat = doc.selectFirst("meta[property=place:location:latitude], meta[name=geo.position]"); + Element metaLon = doc.selectFirst("meta[property=place:location:longitude]"); + if (metaLat != null && metaLon != null) { + Double lat = safeParseDouble(metaLat.attr("content")); + Double lon = safeParseDouble(metaLon.attr("content")); + if (lat != null && lon != null) { + return new Double[]{lat, lon}; + } + } + if (metaLat != null) { + Double[] ll = parseLatLon(metaLat.attr("content")); + if (ll != null) { + return ll; + } + } + return null; + } + + private Double[] parseLatLon(String text) { + if (text == null || text.isEmpty()) { + return null; + } + Matcher m1 = GEO_SEMI_PATTERN.matcher(text); + if (m1.find()) { + Double lat = safeParseDouble(m1.group(1)); + Double lon = safeParseDouble(m1.group(2)); + if (lat != null && lon != null) { + return new Double[]{lat, lon}; + } + } + Matcher m2 = GEO_COMMA_PATTERN.matcher(text); + if (m2.find()) { + Double lat = safeParseDouble(m2.group(1)); + Double lon = safeParseDouble(m2.group(2)); + if (lat != null && lon != null) { + return new Double[]{lat, lon}; + } + } + return null; + } + + private String extractInfoboxValue(Element infobox, String header) { + for (Element row : infobox.select("tr")) { + Element th = row.selectFirst("th"); + Element td = row.selectFirst("td"); + if (th == null || td == null) { + continue; + } + String key = clean(th.text()); + if (!header.equalsIgnoreCase(key)) { + continue; + } + String value = clean(td.text()); + if (value.isEmpty()) { + 
return null; + } + return value; + } + return null; + } + + private String clean(String s) { + if (s == null) { + return ""; + } + return s.replace('\u00A0', ' ').trim(); + } + + private Double safeParseDouble(String s) { + try { + if (s == null) { + return null; + } + String t = s.trim().replace(",", "."); + t = t.replaceAll("[^0-9.\\-]", ""); + if (t.isEmpty()) { + return null; + } + return Double.parseDouble(t); + } catch (Exception e) { + return null; + } + } +} diff --git a/src/main/java/com/ski/crawler/spider/ResortListSpider.java b/src/main/java/com/ski/crawler/spider/ResortListSpider.java new file mode 100644 index 0000000..7559362 --- /dev/null +++ b/src/main/java/com/ski/crawler/spider/ResortListSpider.java @@ -0,0 +1,98 @@ +//负责收集所有滑雪场的详情页地址 +package com.ski.crawler.spider; + +import com.ski.crawler.utils.CrawlerHttp; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.io.IOException; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Random; +import java.util.Set; +//ResortListSpider 类是爬虫的列表采集器,负责采集所有滑雪场的列表页。 +//它使用一个队列来存储待采集的 URL,每次从队列中取出一个 URL,然后使用 Jsoup 连接该 URL 并获取 HTML 内容。 +//最后,它解析 HTML 内容,提取出所有滑雪场的详情页 URL,并将它们添加到队列中。 +public class ResortListSpider { + private final CrawlerHttp http; + private final Random random = new Random(); + private final LinkedList queue = new LinkedList<>();// + + public ResortListSpider(CrawlerHttp http) { + this.http = http; + } + + public List fetchAll(String startUrl) throws IOException, InterruptedException { + return fetchFirst(startUrl, -1); + } + + public List fetchFirst(String startUrl, int limit) throws IOException, InterruptedException { + Set visitedPages = new HashSet<>(); + Set detailUrls = new HashSet<>(); + String page = startUrl; + + while (page != null && !visitedPages.contains(page)) { + visitedPages.add(page); + + Document doc = http.getDocument(page); + + Elements links = 
doc.select("a[href]"); + for (Element a : links) { + String href = a.attr("href"); + if (href == null || href.isEmpty()) { + continue; + } + if (href.startsWith("/ski-resort/") || href.startsWith("https://www.skiresort.info/ski-resort/")) { + String abs = a.absUrl("href"); + if (!abs.isEmpty()) { + detailUrls.add(abs); + if (limit > 0 && detailUrls.size() >= limit) { + break; + } + } + } + } + + if (limit > 0 && detailUrls.size() >= limit) { + break; + } + + String next = findNext(doc); + if (next != null && !visitedPages.contains(next)) { + page = next; + } else { + page = null; + } + + Thread.sleep(2000 + random.nextInt(2001)); + } + + queue.clear(); + queue.addAll(detailUrls); + return new LinkedList<>(queue); + }
+
+    /**
+     * Finds the URL of the next list page.
+     *
+     * <p>A link matched by the pagination selector wins; otherwise the first link whose text
+     * contains "next", "下一页" or "weiter" and whose href resolves to an absolute URL is used.
+     *
+     * @param doc the list page that was just fetched
+     * @return the absolute URL of the next page, or null when there is none
+     */
+    private String findNext(Document doc) {
+        Element e = doc.selectFirst("a[rel=next], a.next, li.pagination-next a");
+        if (e != null) {
+            // absUrl yields "" when the href cannot be resolved. Report that as "no next page"
+            // so the caller never tries to fetch an empty URL.
+            String abs = e.absUrl("href");
+            return abs.isEmpty() ? null : abs;
+        }
+
+        for (Element a : doc.select("a[href]")) {
+            // Locale.ROOT keeps the match independent of the JVM default locale
+            // (fully qualified because this file does not import Locale).
+            String t = a.text().toLowerCase(java.util.Locale.ROOT);
+            if (t.contains("next") || t.contains("下一页") || t.contains("weiter")) {
+                String abs = a.absUrl("href");
+                if (!abs.isEmpty()) {
+                    return abs;
+                }
+            }
+        }
+        return null;
+    }
+
+ public LinkedList getQueue() { + return new LinkedList<>(queue); + } +} diff --git a/src/main/java/com/ski/crawler/strategy/CrawlStrategy.java b/src/main/java/com/ski/crawler/strategy/CrawlStrategy.java new file mode 100644 index 0000000..967214c --- /dev/null +++ b/src/main/java/com/ski/crawler/strategy/CrawlStrategy.java @@ -0,0 +1,19 @@ +package com.ski.crawler.strategy; + +import com.ski.crawler.exception.NetworkException; +import com.ski.crawler.exception.ParseException; +import com.ski.crawler.model.SkiResort; +import com.ski.crawler.utils.CrawlerHttp; + +import java.util.List; + +public interface CrawlStrategy { + String id(); + + String defaultStartUrl(); + + List collectDetailUrls(String startUrl, int limit, CrawlerHttp http) throws NetworkException; + + SkiResort parseDetail(String sourceUrl, String html) throws
ParseException; +} + diff --git a/src/main/java/com/ski/crawler/strategy/SkiResortInfoStrategy.java b/src/main/java/com/ski/crawler/strategy/SkiResortInfoStrategy.java new file mode 100644 index 0000000..0e786a0 --- /dev/null +++ b/src/main/java/com/ski/crawler/strategy/SkiResortInfoStrategy.java @@ -0,0 +1,81 @@ +package com.ski.crawler.strategy; + +import com.ski.crawler.exception.NetworkException; +import com.ski.crawler.exception.ParseException; +import com.ski.crawler.model.SkiLift; +import com.ski.crawler.model.SkiResort; +import com.ski.crawler.model.SkiTicket; +import com.ski.crawler.model.SkiTrail; +import com.ski.crawler.parser.ResortDetailParser; +import com.ski.crawler.spider.ResortListSpider; +import com.ski.crawler.utils.CrawlerHttp; + +import java.util.List; + +public class SkiResortInfoStrategy implements CrawlStrategy { + private final ResortDetailParser detailParser = new ResortDetailParser(); + + @Override + public String id() { + return "skiresort"; + } + + @Override + public String defaultStartUrl() { + return "https://www.skiresort.info/ski-resorts/"; + } + + @Override + public List collectDetailUrls(String startUrl, int limit, CrawlerHttp http) throws NetworkException { + try { + ResortListSpider listSpider = new ResortListSpider(http); + return limit > 0 ? 
listSpider.fetchFirst(startUrl, limit) : listSpider.fetchAll(startUrl); + } catch (Exception e) { + throw new NetworkException("Collect urls failed: " + e.getMessage(), e); + } + } + + @Override + public SkiResort parseDetail(String sourceUrl, String html) throws ParseException { + try { + SkiResort resort = detailParser.parse(html); + resort.setSourceUrl(sourceUrl); + resort.setSourceSite(id()); + + SkiTrail trail = resort.getSkiTrail(); + if (trail != null && trail.getTotalRuns() != null) { + resort.setSlopeCount(trail.getTotalRuns()); + } + + SkiLift lift = resort.getSkiLift(); + if (lift != null && lift.getTotalLifts() != null) { + resort.setLiftCount(lift.getTotalLifts()); + } + + SkiTicket ticket = resort.getSkiTicket(); + if (ticket != null) { + Double a = ticket.getPriceAdult(); + Double c = ticket.getPriceChild(); + if (ticket.getCurrency() != null && resort.getCurrency() == null) { + resort.setCurrency(ticket.getCurrency()); + } + Double min = null; + Double max = null; + if (a != null) { + min = a; + max = a; + } + if (c != null) { + min = min == null ? c : Math.min(min, c); + max = max == null ? 
c : Math.max(max, c); + } + resort.setTicketPriceMin(min); + resort.setTicketPriceMax(max); + } + + return resort; + } catch (Exception e) { + throw new ParseException("Parse detail failed: " + e.getMessage(), e); + } + } +} diff --git a/src/main/java/com/ski/crawler/strategy/SkimapStrategy.java b/src/main/java/com/ski/crawler/strategy/SkimapStrategy.java new file mode 100644 index 0000000..8acb633 --- /dev/null +++ b/src/main/java/com/ski/crawler/strategy/SkimapStrategy.java @@ -0,0 +1,199 @@ +package com.ski.crawler.strategy; + +import com.ski.crawler.exception.NetworkException; +import com.ski.crawler.exception.ParseException; +import com.ski.crawler.model.SkiResort; +import com.ski.crawler.utils.CrawlerHttp; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; + +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Locale; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class SkimapStrategy implements CrawlStrategy { + private static final Pattern LAT_LON_PATTERN = Pattern.compile("(-?\\d+(?:\\.\\d+)?)\\s*,\\s*(-?\\d+(?:\\.\\d+)?)"); + + @Override + public String id() { + return "skimap"; + } + + @Override + public String defaultStartUrl() { + return "https://skimap.org"; + } + + @Override + public List collectDetailUrls(String startUrl, int limit, CrawlerHttp http) throws NetworkException { + try { + Set out = new LinkedHashSet<>(); + Set visited = new LinkedHashSet<>(); + + String page = startUrl; + while (page != null && !page.isEmpty() && !visited.contains(page)) { + visited.add(page); + if (page.toLowerCase(Locale.ROOT).contains("/skiareas/view/")) { + out.add(page); + break; + } + + Document doc = http.getDocument(page); + for (Element a : doc.select("a[href]")) { + String abs = a.absUrl("href"); + if (abs == null || abs.isEmpty()) { + continue; + } + String lower = abs.toLowerCase(Locale.ROOT); + if (!lower.contains("/skiareas/view/")) { + 
continue; + } + out.add(abs); + if (limit > 0 && out.size() >= limit) { + return new ArrayList<>(out); + } + } + + String next = findNext(doc); + page = (next != null && !visited.contains(next)) ? next : null; + } + + return new ArrayList<>(out); + } catch (Exception e) { + throw new NetworkException("Collect urls failed: " + e.getMessage(), e); + } + } + + @Override + public SkiResort parseDetail(String sourceUrl, String html) throws ParseException { + try { + Document doc = org.jsoup.Jsoup.parse(html, sourceUrl); + SkiResort resort = new SkiResort(); + resort.setSourceUrl(sourceUrl); + resort.setSourceSite(id()); + + String name = null; + Element h1 = doc.selectFirst("h1"); + if (h1 != null) { + name = clean(h1.text()); + } + if (name == null || name.isEmpty()) { + Element ogTitle = doc.selectFirst("meta[property=og:title]"); + if (ogTitle != null) { + name = clean(ogTitle.attr("content")); + } + } + if (name != null && !name.isEmpty()) { + resort.setName(name); + } + + List crumbs = new ArrayList<>(); + for (Element a : doc.select(".breadcrumb a, nav.breadcrumb a, ol.breadcrumb a, ul.breadcrumb a")) { + String t = clean(a.text()); + if (!t.isEmpty()) { + crumbs.add(t); + } + } + if (crumbs.size() >= 1) { + resort.setCountry(crumbs.get(crumbs.size() - 1)); + } + if (crumbs.size() >= 2) { + resort.setRegion(crumbs.get(crumbs.size() - 2)); + } + + Double[] latLon = extractLatLon(doc); + if (latLon != null) { + resort.setLatitude(latLon[0]); + resort.setLongitude(latLon[1]); + } + + return resort; + } catch (Exception e) { + throw new ParseException("Parse detail failed: " + e.getMessage(), e); + } + } + + private String findNext(Document doc) { + Element e = doc.selectFirst("a[rel=next], a.next, li.pagination-next a, a[aria-label=Next]"); + if (e != null) { + String abs = e.absUrl("href"); + return abs == null || abs.isEmpty() ? 
null : abs; + } + for (Element a : doc.select("a[href]")) { + String t = clean(a.text()).toLowerCase(Locale.ROOT); + if (t.equals("next") || t.equals("next ›") || t.contains("next")) { + String abs = a.absUrl("href"); + if (abs != null && !abs.isEmpty()) { + return abs; + } + } + } + return null; + } + + private Double[] extractLatLon(Document doc) { + Element metaLat = doc.selectFirst("meta[property=place:location:latitude], meta[name=geo.position]"); + Element metaLon = doc.selectFirst("meta[property=place:location:longitude]"); + if (metaLat != null && metaLon != null) { + Double lat = safeParseDouble(metaLat.attr("content")); + Double lon = safeParseDouble(metaLon.attr("content")); + if (lat != null && lon != null) { + return new Double[]{lat, lon}; + } + } + if (metaLat != null) { + Double[] ll = parseLatLon(metaLat.attr("content")); + if (ll != null) { + return ll; + } + } + return parseLatLon(doc.text()); + } + + private Double[] parseLatLon(String text) { + if (text == null || text.isEmpty()) { + return null; + } + Matcher m = LAT_LON_PATTERN.matcher(text); + while (m.find()) { + Double lat = safeParseDouble(m.group(1)); + Double lon = safeParseDouble(m.group(2)); + if (lat == null || lon == null) { + continue; + } + if (lat < -90 || lat > 90 || lon < -180 || lon > 180) { + continue; + } + return new Double[]{lat, lon}; + } + return null; + } + + private String clean(String s) { + if (s == null) { + return ""; + } + return s.replace('\u00A0', ' ').trim(); + } + + private Double safeParseDouble(String s) { + try { + if (s == null) { + return null; + } + String t = s.trim().replace(",", "."); + t = t.replaceAll("[^0-9.\\-]", ""); + if (t.isEmpty()) { + return null; + } + return Double.parseDouble(t); + } catch (Exception e) { + return null; + } + } +} + diff --git a/src/main/java/com/ski/crawler/strategy/WikipediaStrategy.java b/src/main/java/com/ski/crawler/strategy/WikipediaStrategy.java new file mode 100644 index 0000000..3848c98 --- /dev/null +++ 
b/src/main/java/com/ski/crawler/strategy/WikipediaStrategy.java @@ -0,0 +1,244 @@ +package com.ski.crawler.strategy; + +import com.ski.crawler.exception.NetworkException; +import com.ski.crawler.exception.ParseException; +import com.ski.crawler.model.SkiResort; +import com.ski.crawler.utils.CrawlerHttp; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; + +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Locale; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class WikipediaStrategy implements CrawlStrategy { + private static final Pattern GEO_SEMI_PATTERN = Pattern.compile("(-?\\d+(?:\\.\\d+)?)\\s*;\\s*(-?\\d+(?:\\.\\d+)?)"); + private static final Pattern GEO_COMMA_PATTERN = Pattern.compile("(-?\\d+(?:\\.\\d+)?)\\s*,\\s*(-?\\d+(?:\\.\\d+)?)"); + private static final Pattern INT_M_PATTERN = Pattern.compile("(\\d{2,5})\\s*m\\b", Pattern.CASE_INSENSITIVE); + + @Override + public String id() { + return "wikipedia"; + } + + @Override + public String defaultStartUrl() { + return "https://en.wikipedia.org/wiki/List_of_ski_areas_and_resorts"; + } + + @Override + public List collectDetailUrls(String startUrl, int limit, CrawlerHttp http) throws NetworkException { + try { + Document doc = http.getDocument(startUrl); + Element content = doc.selectFirst("#mw-content-text"); + if (content == null) { + content = doc.body(); + } + + Set out = new LinkedHashSet<>(); + for (Element a : content.select("a[href]")) { + String href = a.attr("href"); + if (href == null || href.isEmpty()) { + continue; + } + if (!href.startsWith("/wiki/")) { + continue; + } + if (href.contains(":")) { + continue; + } + if (href.contains("#")) { + href = href.substring(0, href.indexOf('#')); + } + String abs = a.absUrl("href"); + if (abs == null || abs.isEmpty()) { + continue; + } + String lower = abs.toLowerCase(Locale.ROOT); + if (lower.contains("list_of_")) { + continue; + } + 
out.add(abs); + if (limit > 0 && out.size() >= limit) { + break; + } + } + return new ArrayList<>(out); + } catch (Exception e) { + throw new NetworkException("Collect urls failed: " + e.getMessage(), e); + } + } + + @Override + public SkiResort parseDetail(String sourceUrl, String html) throws ParseException { + try { + Document doc = org.jsoup.Jsoup.parse(html, sourceUrl); + SkiResort resort = new SkiResort(); + resort.setSourceUrl(sourceUrl); + resort.setSourceSite(id()); + + Element h1 = doc.selectFirst("#firstHeading"); + if (h1 == null) { + h1 = doc.selectFirst("h1"); + } + if (h1 != null) { + String name = clean(h1.text()); + if (!name.isEmpty()) { + resort.setName(name); + } + } + + Element infobox = doc.selectFirst("table.infobox"); + if (infobox != null) { + String country = extractInfoboxValue(infobox, "Country"); + if (country != null && !country.isEmpty()) { + resort.setCountry(country); + } + String location = extractInfoboxValue(infobox, "Location"); + if (location != null && !location.isEmpty()) { + resort.setRegion(location); + } + + Integer top = extractElevation(infobox, "Top elevation", "Highest elevation"); + Integer base = extractElevation(infobox, "Base elevation", "Lowest elevation"); + if (base != null) { + resort.setAltitudeMin(base); + } + if (top != null) { + resort.setAltitudeMax(top); + } + } + + Double[] latLon = extractLatLon(doc); + if (latLon != null) { + resort.setLatitude(latLon[0]); + resort.setLongitude(latLon[1]); + } + + return resort; + } catch (Exception e) { + throw new ParseException("Parse detail failed: " + e.getMessage(), e); + } + } + + private Double[] extractLatLon(Document doc) { + Element geoDec = doc.selectFirst("span.geo-dec"); + if (geoDec != null) { + Double[] ll = parseLatLon(geoDec.text()); + if (ll != null) { + return ll; + } + } + Element geo = doc.selectFirst("span.geo"); + if (geo != null) { + Double[] ll = parseLatLon(geo.text()); + if (ll != null) { + return ll; + } + } + Element metaLat = 
doc.selectFirst("meta[property=place:location:latitude], meta[name=geo.position]"); + Element metaLon = doc.selectFirst("meta[property=place:location:longitude]"); + if (metaLat != null && metaLon != null) { + Double lat = safeParseDouble(metaLat.attr("content")); + Double lon = safeParseDouble(metaLon.attr("content")); + if (lat != null && lon != null) { + return new Double[]{lat, lon}; + } + } + if (metaLat != null) { + Double[] ll = parseLatLon(metaLat.attr("content")); + if (ll != null) { + return ll; + } + } + return null; + } + + private Double[] parseLatLon(String text) { + if (text == null || text.isEmpty()) { + return null; + } + Matcher m1 = GEO_SEMI_PATTERN.matcher(text); + if (m1.find()) { + Double lat = safeParseDouble(m1.group(1)); + Double lon = safeParseDouble(m1.group(2)); + if (lat != null && lon != null) { + return new Double[]{lat, lon}; + } + } + Matcher m2 = GEO_COMMA_PATTERN.matcher(text); + if (m2.find()) { + Double lat = safeParseDouble(m2.group(1)); + Double lon = safeParseDouble(m2.group(2)); + if (lat != null && lon != null) { + return new Double[]{lat, lon}; + } + } + return null; + } + + private Integer extractElevation(Element infobox, String... 
headers) { + for (String h : headers) { + String v = extractInfoboxValue(infobox, h); + if (v == null || v.isEmpty()) { + continue; + } + Matcher m = INT_M_PATTERN.matcher(v); + if (m.find()) { + try { + return Integer.parseInt(m.group(1)); + } catch (Exception ignored) { + } + } + } + return null; + } + + private String extractInfoboxValue(Element infobox, String header) { + for (Element row : infobox.select("tr")) { + Element th = row.selectFirst("th"); + Element td = row.selectFirst("td"); + if (th == null || td == null) { + continue; + } + String key = clean(th.text()); + if (!header.equalsIgnoreCase(key)) { + continue; + } + String value = clean(td.text()); + if (value.isEmpty()) { + return null; + } + return value; + } + return null; + } + + private String clean(String s) { + if (s == null) { + return ""; + } + return s.replace('\u00A0', ' ').trim(); + } + + private Double safeParseDouble(String s) { + try { + if (s == null) { + return null; + } + String t = s.trim().replace(",", "."); + t = t.replaceAll("[^0-9.\\-]", ""); + if (t.isEmpty()) { + return null; + } + return Double.parseDouble(t); + } catch (Exception e) { + return null; + } + } +} + diff --git a/src/main/java/com/ski/crawler/util/CliArgs.java b/src/main/java/com/ski/crawler/util/CliArgs.java new file mode 100644 index 0000000..bca51d0 --- /dev/null +++ b/src/main/java/com/ski/crawler/util/CliArgs.java @@ -0,0 +1,74 @@ +package com.ski.crawler.util; + +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; + +public class CliArgs { + public static Map parseOptions(String[] args, int startIndex) { + Map out = new HashMap<>(); + if (args == null) { + return out; + } + for (int i = startIndex; i < args.length; i++) { + String a = args[i]; + if (a == null) { + continue; + } + String t = a.trim(); + if (!t.startsWith("--")) { + continue; + } + String body = t.substring(2); + String key; + String value; + int eq = body.indexOf('='); + if (eq >= 0) { + key = body.substring(0, 
eq).trim().toLowerCase(Locale.ROOT); + value = body.substring(eq + 1).trim(); + } else { + key = body.trim().toLowerCase(Locale.ROOT); + if (i + 1 < args.length && args[i + 1] != null && !args[i + 1].trim().startsWith("--")) { + value = args[++i].trim(); + } else { + value = "true"; + } + } + if (!key.isEmpty()) { + out.put(key, value); + } + } + return out; + } + + public static int parseInt(String v, int def) { + try { + if (v == null || v.trim().isEmpty()) { + return def; + } + return Integer.parseInt(v.trim()); + } catch (Exception e) { + return def; + } + } + + public static Integer parseNullableInt(String v) { + try { + if (v == null || v.trim().isEmpty()) { + return null; + } + return Integer.parseInt(v.trim()); + } catch (Exception e) { + return null; + } + } + + public static boolean parseBoolean(String v) { + if (v == null) { + return false; + } + String t = v.trim().toLowerCase(Locale.ROOT); + return t.equals("true") || t.equals("1") || t.equals("yes") || t.equals("y") || t.equals("on"); + } +} + diff --git a/src/main/java/com/ski/crawler/util/ExcelUtil.java b/src/main/java/com/ski/crawler/util/ExcelUtil.java new file mode 100644 index 0000000..de233ca --- /dev/null +++ b/src/main/java/com/ski/crawler/util/ExcelUtil.java @@ -0,0 +1,179 @@ +package com.ski.crawler.util; + +import com.ski.crawler.model.SkiResort; +import org.apache.poi.ss.usermodel.Cell; +import org.apache.poi.ss.usermodel.CellStyle; +import org.apache.poi.ss.usermodel.Font; +import org.apache.poi.ss.usermodel.Row; +import org.apache.poi.ss.usermodel.Sheet; +import org.apache.poi.ss.usermodel.Workbook; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; + +import java.io.FileOutputStream; +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +public class ExcelUtil { + private static final List DEFAULT_SHEETS = Arrays.asList("skiresort", "wikipedia", 
"skimap"); + + public static void exportResortsBySiteToXlsx(List resorts, String path) throws Exception { + Map> bySite = new LinkedHashMap<>(); + for (String s : DEFAULT_SHEETS) { + bySite.put(s, new ArrayList<>()); + } + List other = new ArrayList<>(); + if (resorts != null) { + for (SkiResort r : resorts) { + String site = normalizeSite(r == null ? null : r.getSourceSite()); + if (bySite.containsKey(site)) { + bySite.get(site).add(r); + } else { + other.add(r); + } + } + } + + try (Workbook wb = new XSSFWorkbook()) { + CellStyle headerStyle = createHeaderStyle(wb); + for (Map.Entry> e : bySite.entrySet()) { + writeSheet(wb, headerStyle, e.getKey(), e.getValue()); + } + if (!other.isEmpty()) { + writeSheet(wb, headerStyle, "other", other); + } + try (FileOutputStream out = new FileOutputStream(path)) { + wb.write(out); + } + } + } + + private static void writeSheet(Workbook wb, CellStyle headerStyle, String sheetName, List rows) { + Sheet sheet = wb.createSheet(safeSheetName(sheetName)); + sheet.createFreezePane(0, 1); + + int r = 0; + Row header = sheet.createRow(r++); + String[] cols = new String[]{ + "sourceSite", "name", "country", "region", + "latitude", "longitude", + "altitudeMin", "altitudeMax", + "totalKm", "slopeCount", "liftCount", + "ticketPriceMin", "ticketPriceMax", "currency", + "overallScore", + "url", + "crawlTime" + }; + for (int i = 0; i < cols.length; i++) { + Cell c = header.createCell(i); + c.setCellValue(cols[i]); + c.setCellStyle(headerStyle); + } + + if (rows != null) { + for (SkiResort sr : rows) { + if (sr == null) { + continue; + } + Row row = sheet.createRow(r++); + int i = 0; + setCell(row, i++, sr.getSourceSite()); + setCell(row, i++, sr.getName()); + setCell(row, i++, sr.getCountry()); + setCell(row, i++, sr.getRegion()); + setCell(row, i++, sr.getLatitude()); + setCell(row, i++, sr.getLongitude()); + setCell(row, i++, sr.getAltitudeMin()); + setCell(row, i++, sr.getAltitudeMax()); + setCell(row, i++, sr.getTotalKm()); + 
setCell(row, i++, sr.getSlopeCount()); + setCell(row, i++, sr.getLiftCount()); + setCell(row, i++, sr.getTicketPriceMin()); + setCell(row, i++, sr.getTicketPriceMax()); + setCell(row, i++, sr.getCurrency()); + setCell(row, i++, sr.getOverallScore()); + setCell(row, i++, sr.getSourceUrl()); + setCell(row, i, sr.getCrawledAt() == null ? null : sr.getCrawledAt().toString()); + } + } + + int[] widths = new int[]{ + 12, 28, 14, 18, + 12, 12, + 12, 12, + 10, 10, 10, + 14, 14, 10, + 12, + 40, + 22 + }; + for (int i = 0; i < widths.length; i++) { + sheet.setColumnWidth(i, Math.min(255, Math.max(8, widths[i])) * 256); + } + } + + private static CellStyle createHeaderStyle(Workbook wb) { + CellStyle style = wb.createCellStyle(); + Font font = wb.createFont(); + font.setBold(true); + style.setFont(font); + return style; + } + + private static void setCell(Row row, int col, Object v) { + Cell cell = row.createCell(col); + if (v == null) { + return; + } + if (v instanceof Integer) { + cell.setCellValue(((Integer) v).doubleValue()); + return; + } + if (v instanceof Long) { + cell.setCellValue(((Long) v).doubleValue()); + return; + } + if (v instanceof Double) { + cell.setCellValue((Double) v); + return; + } + if (v instanceof Float) { + cell.setCellValue(((Float) v).doubleValue()); + return; + } + if (v instanceof BigDecimal) { + cell.setCellValue(((BigDecimal) v).doubleValue()); + return; + } + cell.setCellValue(String.valueOf(v)); + } + + private static String normalizeSite(String s) { + if (s == null) { + return ""; + } + return s.trim().toLowerCase(Locale.ROOT); + } + + private static String safeSheetName(String name) { + String n = name == null ? 
"sheet" : name.trim(); + if (n.isEmpty()) { + n = "sheet"; + } + n = n.replace(':', '-') + .replace('\\', '-') + .replace('/', '-') + .replace('?', '-') + .replace('*', '-') + .replace('[', '(') + .replace(']', ')'); + if (n.length() > 31) { + n = n.substring(0, 31); + } + return n; + } +} diff --git a/src/main/java/com/ski/crawler/util/JsonUtil.java b/src/main/java/com/ski/crawler/util/JsonUtil.java new file mode 100644 index 0000000..1a7508b --- /dev/null +++ b/src/main/java/com/ski/crawler/util/JsonUtil.java @@ -0,0 +1,43 @@ +package com.ski.crawler.util; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +public class JsonUtil { + private static final ObjectMapper MAPPER = new ObjectMapper(); + + public static ObjectMapper mapper() { + return MAPPER; + } + + public static BufferedWriter openJsonlWriter(String path) throws Exception { + return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path), StandardCharsets.UTF_8)); + } + + public static BufferedReader openJsonlReader(String path) throws Exception { + return new BufferedReader(new InputStreamReader(new FileInputStream(path), StandardCharsets.UTF_8)); + } + + public static List readAllLines(String path) throws Exception { + List out = new ArrayList<>(); + try (BufferedReader br = openJsonlReader(path)) { + String line; + while ((line = br.readLine()) != null) { + if (!line.trim().isEmpty()) { + out.add(line); + } + } + } + return out; + } +} + diff --git a/src/main/java/com/ski/crawler/util/RetryUtil.java b/src/main/java/com/ski/crawler/util/RetryUtil.java new file mode 100644 index 0000000..fa0be2f --- /dev/null +++ b/src/main/java/com/ski/crawler/util/RetryUtil.java @@ -0,0 +1,33 @@ 
+package com.ski.crawler.util; + +import com.ski.crawler.exception.NetworkException; + +import java.util.concurrent.Callable; + +public class RetryUtil { + public static T retry(Callable task, int maxAttempts, long baseSleepMs) throws Exception { + Exception last = null; + int attempts = Math.max(1, maxAttempts); + for (int i = 1; i <= attempts; i++) { + try { + return task.call(); + } catch (Exception e) { + last = e; + if (i == attempts) { + throw e; + } + long sleep = baseSleepMs <= 0 ? 0 : baseSleepMs; + if (sleep > 0) { + try { + Thread.sleep(sleep); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + throw new NetworkException("Retry interrupted", ie); + } + } + } + } + throw last == null ? new NetworkException("Retry failed") : last; + } +} + diff --git a/src/main/java/com/ski/crawler/util/ValidationUtil.java b/src/main/java/com/ski/crawler/util/ValidationUtil.java new file mode 100644 index 0000000..bf82e71 --- /dev/null +++ b/src/main/java/com/ski/crawler/util/ValidationUtil.java @@ -0,0 +1,60 @@ +package com.ski.crawler.util; + +import com.ski.crawler.model.SkiResort; + +import java.util.Locale; + +public class ValidationUtil { + public static SkiResort clean(SkiResort r) { + if (r == null) { + return null; + } + r.setName(trimToNull(r.getName())); + r.setCountry(trimToNull(r.getCountry())); + r.setRegion(trimToNull(r.getRegion())); + r.setSourceUrl(trimToNull(r.getSourceUrl())); + r.setSourceSite(trimToNull(r.getSourceSite())); + + if (r.getLatitude() != null && (r.getLatitude() < -90 || r.getLatitude() > 90)) { + r.setLatitude(null); + } + if (r.getLongitude() != null && (r.getLongitude() < -180 || r.getLongitude() > 180)) { + r.setLongitude(null); + } + if (r.getTicketPriceMin() != null && r.getTicketPriceMin() < 0) { + r.setTicketPriceMin(null); + } + if (r.getTicketPriceMax() != null && r.getTicketPriceMax() < 0) { + r.setTicketPriceMax(null); + } + return r; + } + + public static void validate(SkiResort r) { + if (r == 
null) { + throw new IllegalArgumentException("SkiResort is null"); + } + if (r.getSourceUrl() == null || r.getSourceUrl().isEmpty()) { + throw new IllegalArgumentException("sourceUrl is empty"); + } + if (r.getName() == null || r.getName().isEmpty()) { + throw new IllegalArgumentException("name is empty"); + } + } + + public static String normalizeCountryKey(String country) { + if (country == null) { + return ""; + } + return country.replace('\u00A0', ' ').trim().toLowerCase(Locale.ROOT); + } + + private static String trimToNull(String s) { + if (s == null) { + return null; + } + String t = s.replace('\u00A0', ' ').trim(); + return t.isEmpty() ? null : t; + } +} + diff --git a/src/main/java/com/ski/crawler/utils/CrawlerHttp.java b/src/main/java/com/ski/crawler/utils/CrawlerHttp.java new file mode 100644 index 0000000..a0f77fd --- /dev/null +++ b/src/main/java/com/ski/crawler/utils/CrawlerHttp.java @@ -0,0 +1,52 @@ +//统一 HTTP 配置 :UA/代理/超时集中管理,避免各处硬编码 +package com.ski.crawler.utils; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + +import java.net.InetSocketAddress; +import java.net.Socket; + +public class CrawlerHttp { + private final String userAgent; + private final String proxyHost; + private final int proxyPort; + private final boolean proxyEnabled; + private final int timeoutMs; + + public CrawlerHttp(String userAgent, String proxyHost, int proxyPort, boolean proxyEnabled, int timeoutMs) { + this.userAgent = userAgent; + this.proxyHost = proxyHost; + this.proxyPort = proxyPort; + this.proxyEnabled = proxyEnabled; + this.timeoutMs = timeoutMs; + } + + public Document getDocument(String url) { + org.jsoup.Connection conn = Jsoup.connect(url) + .userAgent(userAgent) + .timeout(timeoutMs) + .followRedirects(true); + if (proxyEnabled && proxyHost != null && !proxyHost.isEmpty() && proxyPort > 0 && isProxyReachable(proxyHost, proxyPort, 300)) { + conn = conn.proxy(proxyHost, proxyPort); + } + try { + return conn.get(); + } catch (Exception e) { + throw 
new RuntimeException(e); + } + } + + public String getHtml(String url) { + return getDocument(url).outerHtml(); + } + + private boolean isProxyReachable(String host, int port, int timeoutMs) { + try (Socket socket = new Socket()) { + socket.connect(new InetSocketAddress(host, port), timeoutMs); + return true; + } catch (Exception e) { + return false; + } + } +} diff --git a/src/main/java/com/ski/crawler/utils/HttpClientUtil.java b/src/main/java/com/ski/crawler/utils/HttpClientUtil.java new file mode 100644 index 0000000..671e91f --- /dev/null +++ b/src/main/java/com/ski/crawler/utils/HttpClientUtil.java @@ -0,0 +1,27 @@ +//网络请求工具 +package com.ski.crawler.utils; + +import org.apache.http.HttpEntity; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.apache.http.util.EntityUtils; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; + +public class HttpClientUtil { + public String get(String url) throws IOException { + HttpGet request = new HttpGet(url); + + try (CloseableHttpClient client = HttpClients.createDefault(); + CloseableHttpResponse response = client.execute(request)) { + HttpEntity entity = response.getEntity(); + if (entity == null) { + return ""; + } + return EntityUtils.toString(entity, StandardCharsets.UTF_8); + } + } +} diff --git a/src/main/java/com/ski/crawler/view/ConsoleView.java b/src/main/java/com/ski/crawler/view/ConsoleView.java new file mode 100644 index 0000000..a1f288c --- /dev/null +++ b/src/main/java/com/ski/crawler/view/ConsoleView.java @@ -0,0 +1,336 @@ +package com.ski.crawler.view; + +import com.ski.crawler.model.SkiResort; + +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +public class ConsoleView { + private final boolean color; + private final int width; + private final 
TablePrinter table; + + public ConsoleView(int width, boolean color) { + this.width = Math.max(60, width); + this.color = color; + this.table = new TablePrinter(this.width, this.color); + } + + public void printHeader() { + table.printHeader(); + } + + public void printResort(SkiResort r) { + table.printRow(r); + } + + public void printSummary(Map summary, Map byCountry, List failures) { + System.err.println("---- summary ----"); + for (Map.Entry e : summary.entrySet()) { + System.err.println(e.getKey() + "=" + (e.getValue() == null ? "" : e.getValue())); + } + if (byCountry != null && !byCountry.isEmpty()) { + System.err.println("by country:"); + for (Map.Entry e : byCountry.entrySet()) { + System.err.println(" " + e.getKey() + ": " + e.getValue()); + } + } + if (failures != null && !failures.isEmpty()) { + System.err.println("failures:"); + for (String f : failures) { + System.err.println(" " + f); + } + } + } + + private static class TablePrinter { + private final int width; + private final boolean color; + private boolean headerPrinted; + + private TablePrinter(int width, boolean color) { + this.width = width; + this.color = color; + } + + private void printHeader() { + if (headerPrinted) { + return; + } + headerPrinted = true; + List cols = columns(); + String line = formatRow(cols, new String[]{"SITE", "NAME", "COUNTRY", "REGION", "COORD", "ALT", "KM", "LIFTS", "PRICE", "SCORE", "URL"}, true); + System.out.println(line); + System.out.println(repeat("-", displayWidth(stripAnsi(line)))); + } + + private void printRow(SkiResort r) { + List cols = columns(); + String coord = formatCoord(r.getLatitude(), r.getLongitude()); + String alt = formatAlt(r.getAltitudeMin(), r.getAltitudeMax()); + String km = r.getTotalKm() == null ? "" : String.valueOf(r.getTotalKm()); + String lifts = r.getLiftCount() == null ? 
"" : String.valueOf(r.getLiftCount()); + String price = formatPrice(r.getTicketPriceMin(), r.getTicketPriceMax(), r.getCurrency()); + String score = r.getOverallScore() == null ? "" : r.getOverallScore().toPlainString(); + String line = formatRow(cols, new String[]{ + safe0(r.getSourceSite()), + safe0(r.getName()), + safe0(r.getCountry()), + safe0(r.getRegion()), + coord, + alt, + km, + lifts, + price, + score, + safe0(r.getSourceUrl()) + }, false); + System.out.println(line); + } + + private List columns() { + List cols = new ArrayList<>(); + cols.add(new Col("SITE", 6, 10, 1)); + cols.add(new Col("NAME", 10, 26, 3)); + cols.add(new Col("COUNTRY", 6, 14, 2)); + cols.add(new Col("REGION", 6, 16, 2)); + cols.add(new Col("COORD", 10, 22, 2)); + cols.add(new Col("ALT", 5, 12, 1)); + cols.add(new Col("KM", 2, 8, 1)); + cols.add(new Col("LIFTS", 4, 8, 1)); + cols.add(new Col("PRICE", 4, 14, 1)); + cols.add(new Col("SCORE", 4, 8, 1)); + cols.add(new Col("URL", 10, 200, 4)); + allocate(cols, width); + return cols; + } + + private String formatRow(List cols, String[] values, boolean header) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < cols.size() && i < values.length; i++) { + Col c = cols.get(i); + String v = values[i] == null ? 
"" : values[i]; + String cell = padRight(truncate(v, c.width), c.width); + if (header && color) { + cell = Ansi.cyan(cell); + } + sb.append(cell); + if (i != cols.size() - 1) { + sb.append(" "); + } + } + return sb.toString(); + } + + private void allocate(List cols, int totalWidth) { + int gaps = (cols.size() - 1) * 2; + int available = Math.max(20, totalWidth - gaps); + int minSum = 0; + for (Col c : cols) { + c.width = c.min; + minSum += c.width; + } + int remaining = available - minSum; + if (remaining <= 0) { + return; + } + + int totalWeight = 0; + for (Col c : cols) { + totalWeight += c.weight; + } + for (int loop = 0; loop < 3 && remaining > 0; loop++) { + boolean any = false; + for (Col c : cols) { + if (remaining <= 0) { + break; + } + int maxAdd = c.max - c.width; + if (maxAdd <= 0) { + continue; + } + int add = Math.max(1, remaining * c.weight / Math.max(1, totalWeight)); + add = Math.min(add, maxAdd); + c.width += add; + remaining -= add; + any = true; + } + if (!any) { + break; + } + } + int idx = cols.size() - 1; + while (remaining > 0 && idx >= 0) { + Col c = cols.get(idx); + int maxAdd = c.max - c.width; + if (maxAdd > 0) { + int add = Math.min(maxAdd, remaining); + c.width += add; + remaining -= add; + } + idx--; + } + } + + private String formatCoord(Double lat, Double lon) { + if (lat == null || lon == null) { + return ""; + } + return String.format("%.5f,%.5f", lat, lon); + } + + private String formatAlt(Integer min, Integer max) { + if (min == null && max == null) { + return ""; + } + if (min != null && max != null) { + return min + "-" + max + "m"; + } + if (min != null) { + return min + "m"; + } + return max + "m"; + } + + private String formatPrice(Double min, Double max, String currency) { + if (min == null && max == null) { + return ""; + } + String cur = currency == null ? 
"" : currency.trim(); + String range; + if (min != null && max != null) { + range = stripTrailingZeros(min) + "-" + stripTrailingZeros(max); + } else if (min != null) { + range = stripTrailingZeros(min); + } else { + range = stripTrailingZeros(max); + } + return cur.isEmpty() ? range : cur + " " + range; + } + + private String stripTrailingZeros(Double v) { + try { + return BigDecimal.valueOf(v).stripTrailingZeros().toPlainString(); + } catch (Exception e) { + return String.valueOf(v); + } + } + } + + private static class Col { + private final int min; + private final int max; + private final int weight; + private int width; + + private Col(String name, int min, int max, int weight) { + this.min = min; + this.max = max; + this.weight = weight; + } + } + + private static class Ansi { + private static String wrap(String code, String s) { + return "\u001B[" + code + "m" + s + "\u001B[0m"; + } + + private static String cyan(String s) { + return wrap("36", s); + } + } + + private static String safe0(String s) { + if (s == null) { + return ""; + } + return s.replace('\t', ' ').trim(); + } + + private static String padRight(String s, int width) { + int w = displayWidth(s); + if (w >= width) { + return s; + } + StringBuilder sb = new StringBuilder(s); + for (int i = 0; i < (width - w); i++) { + sb.append(' '); + } + return sb.toString(); + } + + private static String truncate(String s, int width) { + if (s == null) { + return ""; + } + if (displayWidth(s) <= width) { + return s; + } + String ell = "..."; + int target = Math.max(0, width - displayWidth(ell)); + StringBuilder sb = new StringBuilder(); + int w = 0; + for (int i = 0; i < s.length(); ) { + int cp = s.codePointAt(i); + String ch = new String(Character.toChars(cp)); + int cw = displayWidth(ch); + if (w + cw > target) { + break; + } + sb.append(ch); + w += cw; + i += Character.charCount(cp); + } + sb.append(ell); + return sb.toString(); + } + + private static int displayWidth(String s) { + if (s == null || 
s.isEmpty()) { + return 0; + } + int w = 0; + for (int i = 0; i < s.length(); ) { + int cp = s.codePointAt(i); + if (cp == 27) { + int m = s.indexOf('m', i); + if (m > i) { + i = m + 1; + continue; + } + } + if (cp <= 0x1F || (cp >= 0x7F && cp <= 0x9F)) { + i += Character.charCount(cp); + continue; + } + if (cp <= 0x7F) { + w += 1; + } else { + w += 2; + } + i += Character.charCount(cp); + } + return w; + } + + private static String repeat(String s, int n) { + if (n <= 0) { + return ""; + } + StringBuilder sb = new StringBuilder(n * s.length()); + for (int i = 0; i < n; i++) { + sb.append(s); + } + return sb.toString(); + } + + private static String stripAnsi(String s) { + if (s == null) { + return ""; + } + return s.replaceAll("\\u001B\\[[;\\d]*m", ""); + } +} + diff --git a/src/main/resources/logback.xml b/src/main/resources/logback.xml new file mode 100644 index 0000000..7b62fdf --- /dev/null +++ b/src/main/resources/logback.xml @@ -0,0 +1,13 @@ + + + + + %d{HH:mm:ss.SSS} %-5level [%thread] %logger{36} - %msg%n + + + + + + + + diff --git a/target/classes/com/ski/crawler/Main.class b/target/classes/com/ski/crawler/Main.class new file mode 100644 index 0000000..9a4feb9 Binary files /dev/null and b/target/classes/com/ski/crawler/Main.class differ diff --git a/target/classes/com/ski/crawler/command/Command.class b/target/classes/com/ski/crawler/command/Command.class new file mode 100644 index 0000000..e994cbb Binary files /dev/null and b/target/classes/com/ski/crawler/command/Command.class differ diff --git a/target/classes/com/ski/crawler/command/CrawlCommand.class b/target/classes/com/ski/crawler/command/CrawlCommand.class new file mode 100644 index 0000000..d3892af Binary files /dev/null and b/target/classes/com/ski/crawler/command/CrawlCommand.class differ diff --git a/target/classes/com/ski/crawler/command/ExportCommand.class b/target/classes/com/ski/crawler/command/ExportCommand.class new file mode 100644 index 0000000..835542f Binary files /dev/null and 
b/target/classes/com/ski/crawler/command/ExportCommand.class differ diff --git a/target/classes/com/ski/crawler/command/FilterCommand.class b/target/classes/com/ski/crawler/command/FilterCommand.class new file mode 100644 index 0000000..e10bac2 Binary files /dev/null and b/target/classes/com/ski/crawler/command/FilterCommand.class differ diff --git a/target/classes/com/ski/crawler/command/HelpCommand.class b/target/classes/com/ski/crawler/command/HelpCommand.class new file mode 100644 index 0000000..f4089db Binary files /dev/null and b/target/classes/com/ski/crawler/command/HelpCommand.class differ diff --git a/target/classes/com/ski/crawler/command/ListCommand.class b/target/classes/com/ski/crawler/command/ListCommand.class new file mode 100644 index 0000000..8d139e1 Binary files /dev/null and b/target/classes/com/ski/crawler/command/ListCommand.class differ diff --git a/target/classes/com/ski/crawler/command/ResumeCommand$1.class b/target/classes/com/ski/crawler/command/ResumeCommand$1.class new file mode 100644 index 0000000..1a78d80 Binary files /dev/null and b/target/classes/com/ski/crawler/command/ResumeCommand$1.class differ diff --git a/target/classes/com/ski/crawler/command/ResumeCommand.class b/target/classes/com/ski/crawler/command/ResumeCommand.class new file mode 100644 index 0000000..bdcb6de Binary files /dev/null and b/target/classes/com/ski/crawler/command/ResumeCommand.class differ diff --git a/target/classes/com/ski/crawler/command/SitesCommand.class b/target/classes/com/ski/crawler/command/SitesCommand.class new file mode 100644 index 0000000..4292ed9 Binary files /dev/null and b/target/classes/com/ski/crawler/command/SitesCommand.class differ diff --git a/target/classes/com/ski/crawler/command/StatsCommand.class b/target/classes/com/ski/crawler/command/StatsCommand.class new file mode 100644 index 0000000..40ad25d Binary files /dev/null and b/target/classes/com/ski/crawler/command/StatsCommand.class differ diff --git 
a/target/classes/com/ski/crawler/controller/CrawlerContext.class b/target/classes/com/ski/crawler/controller/CrawlerContext.class new file mode 100644 index 0000000..1b6ff29 Binary files /dev/null and b/target/classes/com/ski/crawler/controller/CrawlerContext.class differ diff --git a/target/classes/com/ski/crawler/controller/CrawlerController.class b/target/classes/com/ski/crawler/controller/CrawlerController.class new file mode 100644 index 0000000..5aaac48 Binary files /dev/null and b/target/classes/com/ski/crawler/controller/CrawlerController.class differ diff --git a/target/classes/com/ski/crawler/exception/CrawlerException.class b/target/classes/com/ski/crawler/exception/CrawlerException.class new file mode 100644 index 0000000..81ec85a Binary files /dev/null and b/target/classes/com/ski/crawler/exception/CrawlerException.class differ diff --git a/target/classes/com/ski/crawler/exception/NetworkException.class b/target/classes/com/ski/crawler/exception/NetworkException.class new file mode 100644 index 0000000..2b4cd3d Binary files /dev/null and b/target/classes/com/ski/crawler/exception/NetworkException.class differ diff --git a/target/classes/com/ski/crawler/exception/ParseException.class b/target/classes/com/ski/crawler/exception/ParseException.class new file mode 100644 index 0000000..17b52f5 Binary files /dev/null and b/target/classes/com/ski/crawler/exception/ParseException.class differ diff --git a/target/classes/com/ski/crawler/factory/StrategyFactory.class b/target/classes/com/ski/crawler/factory/StrategyFactory.class new file mode 100644 index 0000000..490814e Binary files /dev/null and b/target/classes/com/ski/crawler/factory/StrategyFactory.class differ diff --git a/target/classes/com/ski/crawler/model/SkiLift.class b/target/classes/com/ski/crawler/model/SkiLift.class new file mode 100644 index 0000000..75c329f Binary files /dev/null and b/target/classes/com/ski/crawler/model/SkiLift.class differ diff --git 
a/target/classes/com/ski/crawler/model/SkiResort.class b/target/classes/com/ski/crawler/model/SkiResort.class new file mode 100644 index 0000000..e89f08e Binary files /dev/null and b/target/classes/com/ski/crawler/model/SkiResort.class differ diff --git a/target/classes/com/ski/crawler/model/SkiReview.class b/target/classes/com/ski/crawler/model/SkiReview.class new file mode 100644 index 0000000..2476812 Binary files /dev/null and b/target/classes/com/ski/crawler/model/SkiReview.class differ diff --git a/target/classes/com/ski/crawler/model/SkiTicket.class b/target/classes/com/ski/crawler/model/SkiTicket.class new file mode 100644 index 0000000..6de9f47 Binary files /dev/null and b/target/classes/com/ski/crawler/model/SkiTicket.class differ diff --git a/target/classes/com/ski/crawler/model/SkiTrail.class b/target/classes/com/ski/crawler/model/SkiTrail.class new file mode 100644 index 0000000..8124539 Binary files /dev/null and b/target/classes/com/ski/crawler/model/SkiTrail.class differ diff --git a/target/classes/com/ski/crawler/parser/ResortDetailParser$Price.class b/target/classes/com/ski/crawler/parser/ResortDetailParser$Price.class new file mode 100644 index 0000000..0a76a7a Binary files /dev/null and b/target/classes/com/ski/crawler/parser/ResortDetailParser$Price.class differ diff --git a/target/classes/com/ski/crawler/parser/ResortDetailParser.class b/target/classes/com/ski/crawler/parser/ResortDetailParser.class new file mode 100644 index 0000000..e93b3a2 Binary files /dev/null and b/target/classes/com/ski/crawler/parser/ResortDetailParser.class differ diff --git a/target/classes/com/ski/crawler/parser/ResortParser.class b/target/classes/com/ski/crawler/parser/ResortParser.class new file mode 100644 index 0000000..1503f6c Binary files /dev/null and b/target/classes/com/ski/crawler/parser/ResortParser.class differ diff --git a/target/classes/com/ski/crawler/repository/SkiResortRepository.class 
b/target/classes/com/ski/crawler/repository/SkiResortRepository.class new file mode 100644 index 0000000..dda3fc1 Binary files /dev/null and b/target/classes/com/ski/crawler/repository/SkiResortRepository.class differ diff --git a/target/classes/com/ski/crawler/service/ScraperService$CrawlReport.class b/target/classes/com/ski/crawler/service/ScraperService$CrawlReport.class new file mode 100644 index 0000000..4c33615 Binary files /dev/null and b/target/classes/com/ski/crawler/service/ScraperService$CrawlReport.class differ diff --git a/target/classes/com/ski/crawler/service/ScraperService.class b/target/classes/com/ski/crawler/service/ScraperService.class new file mode 100644 index 0000000..403123d Binary files /dev/null and b/target/classes/com/ski/crawler/service/ScraperService.class differ diff --git a/target/classes/com/ski/crawler/site/CrawlerSite.class b/target/classes/com/ski/crawler/site/CrawlerSite.class new file mode 100644 index 0000000..feffd1e Binary files /dev/null and b/target/classes/com/ski/crawler/site/CrawlerSite.class differ diff --git a/target/classes/com/ski/crawler/site/SkimapOrgSite.class b/target/classes/com/ski/crawler/site/SkimapOrgSite.class new file mode 100644 index 0000000..5dab4d8 Binary files /dev/null and b/target/classes/com/ski/crawler/site/SkimapOrgSite.class differ diff --git a/target/classes/com/ski/crawler/site/SkiresortInfoSite.class b/target/classes/com/ski/crawler/site/SkiresortInfoSite.class new file mode 100644 index 0000000..431305c Binary files /dev/null and b/target/classes/com/ski/crawler/site/SkiresortInfoSite.class differ diff --git a/target/classes/com/ski/crawler/site/WikipediaSite.class b/target/classes/com/ski/crawler/site/WikipediaSite.class new file mode 100644 index 0000000..3381901 Binary files /dev/null and b/target/classes/com/ski/crawler/site/WikipediaSite.class differ diff --git a/target/classes/com/ski/crawler/spider/ResortListSpider.class b/target/classes/com/ski/crawler/spider/ResortListSpider.class 
new file mode 100644 index 0000000..66e989d Binary files /dev/null and b/target/classes/com/ski/crawler/spider/ResortListSpider.class differ diff --git a/target/classes/com/ski/crawler/strategy/CrawlStrategy.class b/target/classes/com/ski/crawler/strategy/CrawlStrategy.class new file mode 100644 index 0000000..7c93ae1 Binary files /dev/null and b/target/classes/com/ski/crawler/strategy/CrawlStrategy.class differ diff --git a/target/classes/com/ski/crawler/strategy/SkiResortInfoStrategy.class b/target/classes/com/ski/crawler/strategy/SkiResortInfoStrategy.class new file mode 100644 index 0000000..2996d81 Binary files /dev/null and b/target/classes/com/ski/crawler/strategy/SkiResortInfoStrategy.class differ diff --git a/target/classes/com/ski/crawler/strategy/SkimapStrategy.class b/target/classes/com/ski/crawler/strategy/SkimapStrategy.class new file mode 100644 index 0000000..5ef7c14 Binary files /dev/null and b/target/classes/com/ski/crawler/strategy/SkimapStrategy.class differ diff --git a/target/classes/com/ski/crawler/strategy/WikipediaStrategy.class b/target/classes/com/ski/crawler/strategy/WikipediaStrategy.class new file mode 100644 index 0000000..4c939f2 Binary files /dev/null and b/target/classes/com/ski/crawler/strategy/WikipediaStrategy.class differ diff --git a/target/classes/com/ski/crawler/util/CliArgs.class b/target/classes/com/ski/crawler/util/CliArgs.class new file mode 100644 index 0000000..fa15689 Binary files /dev/null and b/target/classes/com/ski/crawler/util/CliArgs.class differ diff --git a/target/classes/com/ski/crawler/util/ExcelUtil.class b/target/classes/com/ski/crawler/util/ExcelUtil.class new file mode 100644 index 0000000..1d17da1 Binary files /dev/null and b/target/classes/com/ski/crawler/util/ExcelUtil.class differ diff --git a/target/classes/com/ski/crawler/util/JsonUtil.class b/target/classes/com/ski/crawler/util/JsonUtil.class new file mode 100644 index 0000000..3a104ff Binary files /dev/null and 
b/target/classes/com/ski/crawler/util/JsonUtil.class differ diff --git a/target/classes/com/ski/crawler/util/RetryUtil.class b/target/classes/com/ski/crawler/util/RetryUtil.class new file mode 100644 index 0000000..01d95cf Binary files /dev/null and b/target/classes/com/ski/crawler/util/RetryUtil.class differ diff --git a/target/classes/com/ski/crawler/util/ValidationUtil.class b/target/classes/com/ski/crawler/util/ValidationUtil.class new file mode 100644 index 0000000..54fcd48 Binary files /dev/null and b/target/classes/com/ski/crawler/util/ValidationUtil.class differ diff --git a/target/classes/com/ski/crawler/utils/CrawlerHttp.class b/target/classes/com/ski/crawler/utils/CrawlerHttp.class new file mode 100644 index 0000000..c9bfb44 Binary files /dev/null and b/target/classes/com/ski/crawler/utils/CrawlerHttp.class differ diff --git a/target/classes/com/ski/crawler/utils/HttpClientUtil.class b/target/classes/com/ski/crawler/utils/HttpClientUtil.class new file mode 100644 index 0000000..178dcdf Binary files /dev/null and b/target/classes/com/ski/crawler/utils/HttpClientUtil.class differ diff --git a/target/classes/com/ski/crawler/view/ConsoleView$Ansi.class b/target/classes/com/ski/crawler/view/ConsoleView$Ansi.class new file mode 100644 index 0000000..ef99d03 Binary files /dev/null and b/target/classes/com/ski/crawler/view/ConsoleView$Ansi.class differ diff --git a/target/classes/com/ski/crawler/view/ConsoleView$Col.class b/target/classes/com/ski/crawler/view/ConsoleView$Col.class new file mode 100644 index 0000000..9a7afd8 Binary files /dev/null and b/target/classes/com/ski/crawler/view/ConsoleView$Col.class differ diff --git a/target/classes/com/ski/crawler/view/ConsoleView$TablePrinter.class b/target/classes/com/ski/crawler/view/ConsoleView$TablePrinter.class new file mode 100644 index 0000000..4fc519b Binary files /dev/null and b/target/classes/com/ski/crawler/view/ConsoleView$TablePrinter.class differ diff --git 
a/target/classes/com/ski/crawler/view/ConsoleView.class b/target/classes/com/ski/crawler/view/ConsoleView.class new file mode 100644 index 0000000..95199cf Binary files /dev/null and b/target/classes/com/ski/crawler/view/ConsoleView.class differ diff --git a/target/classes/logback.xml b/target/classes/logback.xml new file mode 100644 index 0000000..7b62fdf --- /dev/null +++ b/target/classes/logback.xml @@ -0,0 +1,13 @@ + + + + + %d{HH:mm:ss.SSS} %-5level [%thread] %logger{36} - %msg%n + + + + + + + +