Browse Source

提交项目源码

pull/1/head
Chengwuyi 3 weeks ago
commit
a52792f019
  1. BIN
      .gitignore
  2. 45
      .vscode/launch.json
  3. 132
      pom.xml
  4. 20
      result.jsonl
  5. 40
      src/main/java/com/ski/crawler/Main.java
  6. 10
      src/main/java/com/ski/crawler/command/Command.java
  7. 253
      src/main/java/com/ski/crawler/command/CrawlCommand.java
  8. 77
      src/main/java/com/ski/crawler/command/ExportCommand.java
  9. 18
      src/main/java/com/ski/crawler/command/FilterCommand.java
  10. 38
      src/main/java/com/ski/crawler/command/HelpCommand.java
  11. 53
      src/main/java/com/ski/crawler/command/ListCommand.java
  12. 140
      src/main/java/com/ski/crawler/command/ResumeCommand.java
  13. 19
      src/main/java/com/ski/crawler/command/SitesCommand.java
  14. 66
      src/main/java/com/ski/crawler/command/StatsCommand.java
  15. 30
      src/main/java/com/ski/crawler/controller/CrawlerContext.java
  16. 72
      src/main/java/com/ski/crawler/controller/CrawlerController.java
  17. 12
      src/main/java/com/ski/crawler/exception/CrawlerException.java
  18. 12
      src/main/java/com/ski/crawler/exception/NetworkException.java
  19. 12
      src/main/java/com/ski/crawler/exception/ParseException.java
  20. 31
      src/main/java/com/ski/crawler/factory/StrategyFactory.java
  21. 83
      src/main/java/com/ski/crawler/model/SkiLift.java
  22. 252
      src/main/java/com/ski/crawler/model/SkiResort.java
  23. 76
      src/main/java/com/ski/crawler/model/SkiReview.java
  24. 74
      src/main/java/com/ski/crawler/model/SkiTicket.java
  25. 92
      src/main/java/com/ski/crawler/model/SkiTrail.java
  26. 471
      src/main/java/com/ski/crawler/parser/ResortDetailParser.java
  27. 34
      src/main/java/com/ski/crawler/parser/ResortParser.java
  28. 74
      src/main/java/com/ski/crawler/repository/SkiResortRepository.java
  29. 243
      src/main/java/com/ski/crawler/service/ScraperService.java
  30. 22
      src/main/java/com/ski/crawler/site/CrawlerSite.java
  31. 194
      src/main/java/com/ski/crawler/site/SkimapOrgSite.java
  32. 33
      src/main/java/com/ski/crawler/site/SkiresortInfoSite.java
  33. 204
      src/main/java/com/ski/crawler/site/WikipediaSite.java
  34. 98
      src/main/java/com/ski/crawler/spider/ResortListSpider.java
  35. 19
      src/main/java/com/ski/crawler/strategy/CrawlStrategy.java
  36. 81
      src/main/java/com/ski/crawler/strategy/SkiResortInfoStrategy.java
  37. 199
      src/main/java/com/ski/crawler/strategy/SkimapStrategy.java
  38. 244
      src/main/java/com/ski/crawler/strategy/WikipediaStrategy.java
  39. 74
      src/main/java/com/ski/crawler/util/CliArgs.java
  40. 179
      src/main/java/com/ski/crawler/util/ExcelUtil.java
  41. 43
      src/main/java/com/ski/crawler/util/JsonUtil.java
  42. 33
      src/main/java/com/ski/crawler/util/RetryUtil.java
  43. 60
      src/main/java/com/ski/crawler/util/ValidationUtil.java
  44. 52
      src/main/java/com/ski/crawler/utils/CrawlerHttp.java
  45. 27
      src/main/java/com/ski/crawler/utils/HttpClientUtil.java
  46. 336
      src/main/java/com/ski/crawler/view/ConsoleView.java
  47. 13
      src/main/resources/logback.xml
  48. BIN
      target/classes/com/ski/crawler/Main.class
  49. BIN
      target/classes/com/ski/crawler/command/Command.class
  50. BIN
      target/classes/com/ski/crawler/command/CrawlCommand.class
  51. BIN
      target/classes/com/ski/crawler/command/ExportCommand.class
  52. BIN
      target/classes/com/ski/crawler/command/FilterCommand.class
  53. BIN
      target/classes/com/ski/crawler/command/HelpCommand.class
  54. BIN
      target/classes/com/ski/crawler/command/ListCommand.class
  55. BIN
      target/classes/com/ski/crawler/command/ResumeCommand$1.class
  56. BIN
      target/classes/com/ski/crawler/command/ResumeCommand.class
  57. BIN
      target/classes/com/ski/crawler/command/SitesCommand.class
  58. BIN
      target/classes/com/ski/crawler/command/StatsCommand.class
  59. BIN
      target/classes/com/ski/crawler/controller/CrawlerContext.class
  60. BIN
      target/classes/com/ski/crawler/controller/CrawlerController.class
  61. BIN
      target/classes/com/ski/crawler/exception/CrawlerException.class
  62. BIN
      target/classes/com/ski/crawler/exception/NetworkException.class
  63. BIN
      target/classes/com/ski/crawler/exception/ParseException.class
  64. BIN
      target/classes/com/ski/crawler/factory/StrategyFactory.class
  65. BIN
      target/classes/com/ski/crawler/model/SkiLift.class
  66. BIN
      target/classes/com/ski/crawler/model/SkiResort.class
  67. BIN
      target/classes/com/ski/crawler/model/SkiReview.class
  68. BIN
      target/classes/com/ski/crawler/model/SkiTicket.class
  69. BIN
      target/classes/com/ski/crawler/model/SkiTrail.class
  70. BIN
      target/classes/com/ski/crawler/parser/ResortDetailParser$Price.class
  71. BIN
      target/classes/com/ski/crawler/parser/ResortDetailParser.class
  72. BIN
      target/classes/com/ski/crawler/parser/ResortParser.class
  73. BIN
      target/classes/com/ski/crawler/repository/SkiResortRepository.class
  74. BIN
      target/classes/com/ski/crawler/service/ScraperService$CrawlReport.class
  75. BIN
      target/classes/com/ski/crawler/service/ScraperService.class
  76. BIN
      target/classes/com/ski/crawler/site/CrawlerSite.class
  77. BIN
      target/classes/com/ski/crawler/site/SkimapOrgSite.class
  78. BIN
      target/classes/com/ski/crawler/site/SkiresortInfoSite.class
  79. BIN
      target/classes/com/ski/crawler/site/WikipediaSite.class
  80. BIN
      target/classes/com/ski/crawler/spider/ResortListSpider.class
  81. BIN
      target/classes/com/ski/crawler/strategy/CrawlStrategy.class
  82. BIN
      target/classes/com/ski/crawler/strategy/SkiResortInfoStrategy.class
  83. BIN
      target/classes/com/ski/crawler/strategy/SkimapStrategy.class
  84. BIN
      target/classes/com/ski/crawler/strategy/WikipediaStrategy.class
  85. BIN
      target/classes/com/ski/crawler/util/CliArgs.class
  86. BIN
      target/classes/com/ski/crawler/util/ExcelUtil.class
  87. BIN
      target/classes/com/ski/crawler/util/JsonUtil.class
  88. BIN
      target/classes/com/ski/crawler/util/RetryUtil.class
  89. BIN
      target/classes/com/ski/crawler/util/ValidationUtil.class
  90. BIN
      target/classes/com/ski/crawler/utils/CrawlerHttp.class
  91. BIN
      target/classes/com/ski/crawler/utils/HttpClientUtil.class
  92. BIN
      target/classes/com/ski/crawler/view/ConsoleView$Ansi.class
  93. BIN
      target/classes/com/ski/crawler/view/ConsoleView$Col.class
  94. BIN
      target/classes/com/ski/crawler/view/ConsoleView$TablePrinter.class
  95. BIN
      target/classes/com/ski/crawler/view/ConsoleView.class
  96. 13
      target/classes/logback.xml

BIN
.gitignore

Binary file not shown.

45
.vscode/launch.json

@ -0,0 +1,45 @@
{
"version": "0.2.0",
"configurations": [
{
"type": "java",
"name": "CrawlerScheduler",
"request": "launch",
"mainClass": "com.ski.crawler.spider.CrawlerScheduler",
"projectName": "crawler"
},
{
"type": "java",
"name": "Main (no proxy)",
"request": "launch",
"mainClass": "com.ski.crawler.Main",
"args": [
"crawl", "--site skiresort", "--limit 5", "--proxy 127.0.0.1:7890", "--timeout 60000", "--retry 2", "--show-failures"
]
},
{
"type": "java",
"name": "Main (real, Clash proxy)",
"request": "launch",
"mainClass": "com.ski.crawler.Main",
"vmArgs": [
"-Djava.net.useSystemProxies=true",
"-Dhttp.proxyHost=127.0.0.1",
"-Dhttp.proxyPort=7890",
"-Dhttps.proxyHost=127.0.0.1",
"-Dhttps.proxyPort=7890"
]
},
{
"type": "java",
"name": "Main (real, Clash SOCKS5)",
"request": "launch",
"mainClass": "com.ski.crawler.Main",
"vmArgs": [
"-Djava.net.useSystemProxies=true",
"-DsocksProxyHost=127.0.0.1",
"-DsocksProxyPort=7891"
]
}
]
}

132
pom.xml

@ -0,0 +1,132 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.ski</groupId>
<artifactId>crawler</artifactId>
<version>1.0.0</version>
<packaging>jar</packaging>
<name>Web Crawler</name>
<description>A Java web crawler project</description>
<properties>
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<jsoup.version>1.15.3</jsoup.version>
<httpclient.version>4.5.13</httpclient.version>
<jackson.version>2.15.2</jackson.version>
<poi.version>5.2.5</poi.version>
</properties>
<dependencies>
<!-- jsoup HTML parser -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>${jsoup.version}</version>
</dependency>
<!-- Apache HttpClient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>${httpclient.version}</version>
</dependency>
<!-- Jackson JSON processor -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>${jackson.version}</version>
</dependency>
<!-- Jackson core -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>${jackson.version}</version>
</dependency>
<!-- Jackson annotations -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>${jackson.version}</version>
</dependency>
<!-- Lombok for @Data annotation -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.28</version>
<scope>provided</scope>
</dependency>
<!-- Logging -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.36</version>
</dependency>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.2.12</version>
</dependency>
<!-- JUnit for testing -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.13.2</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>${poi.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.11.0</version>
<configuration>
<source>11</source>
<target>11</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.4.1</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>com.ski.crawler.Main</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

20
result.jsonl

@ -0,0 +1,20 @@
{"id":null,"name":"Thredbo","country":"Australia","region":"Oceania","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/1121#ski-map-42368","sourceSite":"skimap","crawlTime":null}
{"id":null,"name":"Valle Nevado","country":"Chile","region":"Americas","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/1144#ski-map-42367","sourceSite":"skimap","crawlTime":null}
{"id":null,"name":"Las Leñas","country":"Argentina","region":"Americas","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/1129#ski-map-42366","sourceSite":"skimap","crawlTime":null}
{"id":null,"name":"Damüls-Mellau Au, Damüls, Mellau","country":"Vorarlberg","region":"Austria","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/2700#ski-map-37810","sourceSite":"skimap","crawlTime":null}
{"id":null,"name":"Appalachian Ski Mtn.","country":"North Carolina","region":"United States","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/285#ski-map-34865","sourceSite":"skimap","crawlTime":null}
{"id":null,"name":"Las Leñas","country":"Argentina","region":"Americas","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/1129#ski-map-42365","sourceSite":"skimap","crawlTime":null}
{"id":null,"name":"Blue Mountain","country":"Ontario","region":"Canada","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/113#ski-map-39542","sourceSite":"skimap","crawlTime":null}
{"id":null,"name":"Smugglers' Notch Resort","country":"Vermont","region":"United States","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/209#ski-map-6815","sourceSite":"skimap","crawlTime":null}
{"id":null,"name":"Granlibakken Ski Resort","country":"California","region":"United States","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/535#ski-map-40733","sourceSite":"skimap","crawlTime":null}
{"id":null,"name":"Magic Mountain","country":"Vermont","region":"United States","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/201#ski-map-7492","sourceSite":"skimap","crawlTime":null}
{"id":null,"name":"Bromley Mountain","country":"Vermont","region":"United States","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/217#ski-map-4224","sourceSite":"skimap","crawlTime":null}
{"id":null,"name":"Magic Mountain","country":"Vermont","region":"United States","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/201#ski-map-6965","sourceSite":"skimap","crawlTime":null}
{"id":null,"name":"Wurmberg","country":"Central Uplands","region":"Germany","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/4190#ski-map-7596","sourceSite":"skimap","crawlTime":null}
{"id":null,"name":"Vail","country":"Colorado","region":"United States","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/507#ski-map-2580","sourceSite":"skimap","crawlTime":null}
{"id":null,"name":"King Pine Ski Area","country":"New Hampshire","region":"United States","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/354#ski-map-11664","sourceSite":"skimap","crawlTime":null}
{"id":null,"name":"Pigeon Mountain","country":"Alberta","region":"Canada","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/2131#ski-map-23689","sourceSite":"skimap","crawlTime":null}
{"id":null,"name":"The Pines","country":"New York","region":"United States","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/4872#ski-map-10199","sourceSite":"skimap","crawlTime":null}
{"id":null,"name":"Ski Cooper","country":"Colorado","region":"United States","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/512#ski-map-6863","sourceSite":"skimap","crawlTime":null}
{"id":null,"name":"Staller Sattel","country":"Tyrol","region":"Austria","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/12393#ski-map-17983","sourceSite":"skimap","crawlTime":null}
{"id":null,"name":"Val Neigette","country":"Quebec","region":"Canada","latitude":null,"longitude":null,"altitudeMin":null,"altitudeMax":null,"totalKm":null,"slopeCount":null,"liftCount":null,"ticketPriceMin":null,"ticketPriceMax":null,"currency":null,"openTime":null,"snowDepthCm":null,"temperatureC":null,"nearbyHotels":null,"rentalShops":null,"url":"https://skimap.org/skiareas/view/2205#ski-map-2834","sourceSite":"skimap","crawlTime":null}

40
src/main/java/com/ski/crawler/Main.java

@ -0,0 +1,40 @@
package com.ski.crawler;
import com.ski.crawler.command.CrawlCommand;
import com.ski.crawler.command.ExportCommand;
import com.ski.crawler.command.FilterCommand;
import com.ski.crawler.command.HelpCommand;
import com.ski.crawler.command.ListCommand;
import com.ski.crawler.command.ResumeCommand;
import com.ski.crawler.command.SitesCommand;
import com.ski.crawler.command.StatsCommand;
import com.ski.crawler.controller.CrawlerContext;
import com.ski.crawler.controller.CrawlerController;
import com.ski.crawler.factory.StrategyFactory;
import com.ski.crawler.repository.SkiResortRepository;
import com.ski.crawler.service.ScraperService;
/**
 * Command-line entry point of the crawler.
 *
 * <p>Builds the shared collaborators (repository, strategy factory, scraper
 * service), bundles them into a {@link CrawlerContext}, registers every
 * available command with a {@link CrawlerController} and lets the controller
 * dispatch on the raw command-line arguments.
 */
public class Main {
    /**
     * Runs the crawler CLI.
     *
     * <p>Any exception escaping the controller is reported on standard error
     * and the JVM terminates with exit status 1, so that shells and scripts
     * can detect the failure.
     *
     * @param args raw command-line arguments, passed unchanged to the controller
     */
    public static void main(String[] args) {
        try {
            SkiResortRepository repo = new SkiResortRepository();
            StrategyFactory factory = new StrategyFactory();
            ScraperService service = new ScraperService();
            CrawlerContext context = new CrawlerContext(repo, factory, service);
            CrawlerController controller = new CrawlerController(
                new CrawlCommand(),
                new ListCommand(),
                new FilterCommand(),
                new ExportCommand(),
                new ResumeCommand(),
                new StatsCommand(),
                new SitesCommand(),
                new HelpCommand()
            );
            controller.run(args, context);
        } catch (Exception e) {
            // Exceptions without a message would otherwise print "Crawler failed: null";
            // fall back to toString(), which always includes the exception type.
            String detail = e.getMessage() != null ? e.getMessage() : e.toString();
            System.err.println("Crawler failed: " + detail);
            // Signal the failure to the caller instead of exiting with status 0.
            System.exit(1);
        }
    }
}

10
src/main/java/com/ski/crawler/command/Command.java

@ -0,0 +1,10 @@
package com.ski.crawler.command;
import com.ski.crawler.controller.CrawlerContext;
/**
 * A single CLI sub-command (for example {@code crawl} or {@code export}).
 */
public interface Command {
/**
 * Returns the identifier of this command, e.g. {@code "crawl"}.
 * NOTE(review): presumably the keyword the controller matches against the
 * first CLI argument; confirm against CrawlerController.
 *
 * @return the command name
 */
String name();
/**
 * Executes the command.
 *
 * @param args the command-line argument array; implementations in this
 *             package parse options starting at index 1, so index 0 is
 *             presumably the command name itself
 * @param context shared collaborators (repository, strategies, scraper)
 * @throws Exception if the command fails
 */
void execute(String[] args, CrawlerContext context) throws Exception;
}

253
src/main/java/com/ski/crawler/command/CrawlCommand.java

@ -0,0 +1,253 @@
package com.ski.crawler.command;
import com.ski.crawler.controller.CrawlerContext;
import com.ski.crawler.exception.NetworkException;
import com.ski.crawler.factory.StrategyFactory;
import com.ski.crawler.repository.SkiResortRepository;
import com.ski.crawler.service.ScraperService;
import com.ski.crawler.strategy.CrawlStrategy;
import com.ski.crawler.util.CliArgs;
import com.ski.crawler.util.ExcelUtil;
import com.ski.crawler.utils.CrawlerHttp;
import com.ski.crawler.view.ConsoleView;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
/**
 * The {@code crawl} command: parses the crawl options, configures the HTTP
 * client and console view, runs the crawl for one site (or for every known
 * site when {@code --site all} is given), optionally exports the repository
 * to an Excel file, and prints a summary.
 */
public class CrawlCommand implements Command {
    /** Maximum number of failure lines collected when aggregating over all sites. */
    private static final int MAX_REPORTED_FAILURES = 200;

    @Override
    public String name() {
        return "crawl";
    }

    /**
     * Runs a crawl according to the options found in {@code args}.
     *
     * <p>An {@code --out} value ending in {@code .xlsx} (case-insensitive)
     * selects an Excel export of the whole repository after the crawl; any
     * other {@code --out} value is passed to the scraper as a JSONL target.
     * JSONL output is rejected for {@code --site all}.
     *
     * @param args command-line arguments; options are parsed from index 1
     * @param context provides the strategy factory, repository and scraper
     * @throws Exception if strategy creation, crawling or exporting fails
     */
    @Override
    public void execute(String[] args, CrawlerContext context) throws Exception {
        Map<String, String> opts = CliArgs.parseOptions(args, 1);
        String siteId = normalizeSite(opts.getOrDefault("site", "skiresort"));
        int limit = parseLimit(opts.get("limit"), 100);
        int threads = CliArgs.parseInt(opts.get("threads"), 3);
        int timeoutMs = CliArgs.parseInt(opts.get("timeout"), 20000);
        int retry = CliArgs.parseInt(opts.get("retry"), 3);
        long retrySleep = CliArgs.parseInt(opts.get("retry-sleep"), 1000);
        boolean dryRun = CliArgs.parseBoolean(opts.get("dry-run"));
        boolean full = CliArgs.parseBoolean(opts.get("full"));
        boolean incremental = !full;
        boolean noProxy = CliArgs.parseBoolean(opts.get("no-proxy"));
        boolean color = CliArgs.parseBoolean(opts.get("color"));
        boolean showFailures = CliArgs.parseBoolean(opts.get("show-failures"));
        Integer widthArg = CliArgs.parseNullableInt(opts.get("width"));
        String country = opts.get("country");
        String startUrl = opts.get("start-url");

        // Decide the output format from the file extension.
        String outRaw = opts.get("out");
        String out = (outRaw == null || outRaw.trim().isEmpty()) ? null : outRaw.trim();
        String outJsonl = out;
        String outXlsx = null;
        if (out != null && out.toLowerCase(Locale.ROOT).endsWith(".xlsx")) {
            outXlsx = out;
            outJsonl = null;
        }

        String userAgent = opts.getOrDefault("ua", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36");
        String proxyHost = opts.getOrDefault("proxy-host", "127.0.0.1");
        int proxyPort = CliArgs.parseInt(opts.get("proxy-port"), 7890);
        boolean proxyEnabled = !noProxy;

        // --proxy accepts "host:port", a bare host, or none/off/false to disable.
        // Trim before the emptiness check so a whitespace-only value is ignored
        // instead of replacing the proxy host with an empty string.
        String proxy = opts.get("proxy");
        String p = proxy == null ? "" : proxy.trim();
        if (!p.isEmpty()) {
            if (p.equalsIgnoreCase("none") || p.equalsIgnoreCase("off") || p.equalsIgnoreCase("false")) {
                proxyEnabled = false;
            } else {
                int idx = p.lastIndexOf(':');
                if (idx > 0 && idx < p.length() - 1) {
                    proxyHost = p.substring(0, idx);
                    proxyPort = CliArgs.parseInt(p.substring(idx + 1), proxyPort);
                } else {
                    proxyHost = p;
                }
            }
        }

        CrawlerHttp http = new CrawlerHttp(userAgent, proxyHost, proxyPort, proxyEnabled, timeoutMs);
        int width = resolveWidth(widthArg);
        ConsoleView view = new ConsoleView(width, color);
        StrategyFactory factory = context.strategies();
        SkiResortRepository repo = context.repository();
        ScraperService svc = context.scraper();

        ScraperService.CrawlReport report;
        if (siteId.equals("all")) {
            if (outJsonl != null) {
                System.err.println("When --site all, JSONL --out is not supported. Use --out result.xlsx or omit --out.");
                return;
            }
            report = crawlAll(factory, svc, startUrl, limit, threads, country, http, repo, incremental, view, showFailures, dryRun, retry, retrySleep);
        } else {
            CrawlStrategy strategy = factory.create(siteId);
            // Exceptions (including NetworkException) propagate to the caller unchanged.
            report = svc.crawl(strategy, startUrl, limit, threads, country, http, repo, incremental, outJsonl, view, showFailures, dryRun, retry, retrySleep);
        }

        if (outXlsx != null) {
            if (dryRun) {
                System.err.println("dry-run is enabled, skip writing: " + outXlsx);
            } else {
                ExcelUtil.exportResortsBySiteToXlsx(repo.getAll(), outXlsx);
                System.err.println("Excel exported: " + repo.getAll().size() + " -> " + outXlsx);
            }
        }

        Map<String, Object> summary = new LinkedHashMap<>();
        summary.put("site", report.site);
        summary.put("total", report.total);
        summary.put("success", report.success);
        summary.put("filteredOut", report.filteredOut);
        summary.put("skipped", report.skipped);
        summary.put("failed", report.failed);
        if (outXlsx != null && !dryRun) {
            summary.put("out", outXlsx);
        } else if (outJsonl != null && !dryRun) {
            summary.put("out", outJsonl);
        }
        view.printSummary(summary, sortByValueDesc(report.byCountry), showFailures ? report.failures : null);
    }

    /**
     * Canonicalizes a site identifier: trims, lower-cases, and maps the alias
     * {@code wiki} to {@code wikipedia}. A {@code null} input yields
     * {@code skiresort}.
     */
    private String normalizeSite(String raw) {
        if (raw == null) {
            return "skiresort";
        }
        String t = raw.trim().toLowerCase(Locale.ROOT);
        if (t.equals("wiki")) {
            return "wikipedia";
        }
        return t;
    }

    /**
     * Crawls {@code skiresort}, {@code wikipedia} and {@code skimap} in that
     * order and folds the per-site reports into one report labelled
     * {@code all}.
     *
     * <p>Each site is crawled with a {@code null} start URL and no JSONL
     * output; the {@code startUrl} parameter is accepted but not forwarded.
     * A site whose crawl throws is counted as one failure and does not stop
     * the remaining sites. At most {@link #MAX_REPORTED_FAILURES} failure
     * lines are collected, and only when {@code showFailures} is set.
     *
     * @return the aggregated report
     * @throws Exception if a strategy cannot be created for a site
     */
    private ScraperService.CrawlReport crawlAll(
        StrategyFactory factory,
        ScraperService svc,
        String startUrl,
        int limit,
        int threads,
        String countryFilter,
        CrawlerHttp http,
        SkiResortRepository repo,
        boolean incremental,
        ConsoleView view,
        boolean showFailures,
        boolean dryRun,
        int retryAttempts,
        long retrySleepMs
    ) throws Exception {
        List<String> sites = Arrays.asList("skiresort", "wikipedia", "skimap");
        Map<String, Long> byCountry = new LinkedHashMap<>();
        List<String> failures = new java.util.ArrayList<>();
        int total = 0;
        int success = 0;
        int filteredOut = 0;
        int skipped = 0;
        int failed = 0;
        for (String s : sites) {
            CrawlStrategy strategy = factory.create(s);
            try {
                ScraperService.CrawlReport r = svc.crawl(strategy, null, limit, threads, countryFilter, http, repo, incremental, null, view, showFailures, dryRun, retryAttempts, retrySleepMs);
                total += r.total;
                success += r.success;
                filteredOut += r.filteredOut;
                skipped += r.skipped;
                failed += r.failed;
                mergeByCountry(byCountry, r.byCountry);
                if (showFailures && r.failures != null) {
                    for (String f : r.failures) {
                        if (failures.size() >= MAX_REPORTED_FAILURES) {
                            break;
                        }
                        failures.add(f);
                    }
                }
            } catch (Exception e) {
                // Best effort: record the failed site and continue with the next one.
                failed += 1;
                if (showFailures && failures.size() < MAX_REPORTED_FAILURES) {
                    failures.add("site=" + s + " [" + e.getClass().getSimpleName() + "] " + (e.getMessage() == null ? "" : e.getMessage()));
                }
            }
        }
        ScraperService.CrawlReport out = new ScraperService.CrawlReport();
        out.site = "all";
        out.total = total;
        out.success = success;
        out.filteredOut = filteredOut;
        out.skipped = skipped;
        out.failed = failed;
        out.byCountry = byCountry;
        out.failures = failures;
        return out;
    }

    /**
     * Adds the counts in {@code add} onto {@code acc}. Entries with a
     * {@code null} key are skipped and {@code null} counts are treated as 0.
     * Does nothing when either map is {@code null} or {@code add} is empty.
     */
    private void mergeByCountry(Map<String, Long> acc, Map<String, Long> add) {
        if (acc == null || add == null || add.isEmpty()) {
            return;
        }
        for (Map.Entry<String, Long> e : add.entrySet()) {
            if (e.getKey() == null) {
                continue;
            }
            acc.merge(e.getKey(), countOf(e), Long::sum);
        }
    }

    /**
     * Parses the {@code --limit} value.
     *
     * @param v raw option value; may be {@code null}
     * @param def value returned for a missing, blank, non-numeric or
     *            non-positive input
     * @return {@code -1} for {@code all} (case-insensitive), the parsed
     *         positive number, or {@code def}
     */
    private int parseLimit(String v, int def) {
        if (v == null || v.trim().isEmpty()) {
            return def;
        }
        String t = v.trim();
        if (t.equalsIgnoreCase("all")) {
            return -1;
        }
        try {
            int n = Integer.parseInt(t);
            return n <= 0 ? def : n;
        } catch (NumberFormatException e) {
            return def;
        }
    }

    /**
     * Picks the table width: the explicit argument if greater than 20,
     * otherwise the {@code COLUMNS} environment variable if it parses to a
     * number greater than 20, otherwise 120.
     */
    private int resolveWidth(Integer widthArg) {
        if (widthArg != null && widthArg > 20) {
            return widthArg;
        }
        String cols = System.getenv("COLUMNS");
        if (cols != null) {
            try {
                int n = Integer.parseInt(cols.trim());
                if (n > 20) {
                    return n;
                }
            } catch (NumberFormatException ignored) {
                // COLUMNS is not numeric; fall through to the default width.
            }
        }
        return 120;
    }

    /**
     * Returns a new insertion-ordered map with the entries of {@code m}
     * sorted by count, largest first. A {@code null} count sorts as 0 rather
     * than failing on unboxing. A {@code null} or empty input is returned
     * as-is.
     */
    private Map<String, Long> sortByValueDesc(Map<String, Long> m) {
        if (m == null || m.isEmpty()) {
            return m;
        }
        return m.entrySet().stream()
            .sorted((a, b) -> Long.compare(countOf(b), countOf(a)))
            .collect(LinkedHashMap::new, (acc, e) -> acc.put(e.getKey(), e.getValue()), Map::putAll);
    }

    /** Returns the entry's count, reading a {@code null} value as 0. */
    private static long countOf(Map.Entry<String, Long> e) {
        Long v = e.getValue();
        return v == null ? 0L : v;
    }
}

77
src/main/java/com/ski/crawler/command/ExportCommand.java

@ -0,0 +1,77 @@
package com.ski.crawler.command;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.ski.crawler.controller.CrawlerContext;
import com.ski.crawler.model.SkiResort;
import com.ski.crawler.repository.SkiResortRepository;
import com.ski.crawler.util.CliArgs;
import com.ski.crawler.util.ExcelUtil;
import com.ski.crawler.util.JsonUtil;
import java.io.BufferedWriter;
import java.util.Locale;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
/**
 * The {@code export} command: writes every resort held by the repository to
 * the file named by {@code --out}, as an Excel workbook when the path ends in
 * {@code .xlsx} (case-insensitive) and as JSON Lines otherwise.
 */
public class ExportCommand implements Command {
    @Override
    public String name() {
        return "export";
    }

    /**
     * Exports the repository contents to the {@code --out} target.
     * Prints a usage hint and returns when {@code --out} is missing or blank.
     *
     * @param args command-line arguments; options are parsed from index 1
     * @param context supplies the repository to export
     * @throws Exception if writing the output file fails
     */
    @Override
    public void execute(String[] args, CrawlerContext context) throws Exception {
        Map<String, String> options = CliArgs.parseOptions(args, 1);
        String requested = options.get("out");
        boolean hasTarget = requested != null && !requested.trim().isEmpty();
        if (!hasTarget) {
            System.err.println("Missing --out <file.jsonl|file.xlsx>");
            return;
        }

        SkiResortRepository repository = context.repository();
        List<SkiResort> resorts = repository.getAll();
        String target = requested.trim();

        boolean asWorkbook = target.toLowerCase(Locale.ROOT).endsWith(".xlsx");
        if (asWorkbook) {
            ExcelUtil.exportResortsBySiteToXlsx(resorts, target);
        } else {
            writeJsonl(resorts, target);
        }
        System.err.println("Exported: " + resorts.size() + " -> " + target);
    }

    /** Writes one JSON object per line, one line per resort, to {@code target}. */
    private void writeJsonl(List<SkiResort> resorts, String target) throws Exception {
        ObjectMapper json = JsonUtil.mapper();
        try (BufferedWriter writer = JsonUtil.openJsonlWriter(target)) {
            for (SkiResort resort : resorts) {
                String line = json.writeValueAsString(toJson(resort));
                writer.write(line);
                writer.newLine();
            }
        }
    }

    /**
     * Flattens a resort into an insertion-ordered map whose keys are the
     * exported JSON field names. The crawl timestamp is exported via
     * {@code toString()}, or as {@code null} when absent.
     */
    private Map<String, Object> toJson(SkiResort r) {
        Object crawlTime;
        if (r.getCrawledAt() == null) {
            crawlTime = null;
        } else {
            crawlTime = r.getCrawledAt().toString();
        }

        Map<String, Object> fields = new LinkedHashMap<>();
        // Identity and location.
        fields.put("id", r.getId());
        fields.put("name", r.getName());
        fields.put("country", r.getCountry());
        fields.put("region", r.getRegion());
        fields.put("latitude", r.getLatitude());
        fields.put("longitude", r.getLongitude());
        // Terrain figures.
        fields.put("altitudeMin", r.getAltitudeMin());
        fields.put("altitudeMax", r.getAltitudeMax());
        fields.put("totalKm", r.getTotalKm());
        fields.put("slopeCount", r.getSlopeCount());
        fields.put("liftCount", r.getLiftCount());
        // Pricing.
        fields.put("ticketPriceMin", r.getTicketPriceMin());
        fields.put("ticketPriceMax", r.getTicketPriceMax());
        fields.put("currency", r.getCurrency());
        // Conditions and surroundings.
        fields.put("openTime", r.getOpenTime());
        fields.put("snowDepthCm", r.getSnowDepthCm());
        fields.put("temperatureC", r.getTemperatureC());
        fields.put("nearbyHotels", r.getNearbyHotels());
        fields.put("rentalShops", r.getRentalShops());
        // Provenance.
        fields.put("url", r.getSourceUrl());
        fields.put("sourceSite", r.getSourceSite());
        fields.put("crawlTime", crawlTime);
        return fields;
    }
}

18
src/main/java/com/ski/crawler/command/FilterCommand.java

@ -0,0 +1,18 @@
package com.ski.crawler.command;
import com.ski.crawler.controller.CrawlerContext;
/**
 * The {@code filter} command. It has no logic of its own: every invocation is
 * forwarded to a private {@link ListCommand} instance.
 */
public class FilterCommand implements Command {
    /** Command that actually handles the request. */
    private final ListCommand listCommand = new ListCommand();

    @Override
    public String name() {
        return "filter";
    }

    /**
     * Forwards the arguments and context unchanged to the list command.
     *
     * @param args command-line arguments
     * @param context shared crawler collaborators
     */
    @Override
    public void execute(String[] args, CrawlerContext context) {
        this.listCommand.execute(args, context);
    }
}

38
src/main/java/com/ski/crawler/command/HelpCommand.java

@ -0,0 +1,38 @@
package com.ski.crawler.command;
import com.ski.crawler.controller.CrawlerContext;
/**
 * The {@code help} command: prints the usage text to standard output.
 */
public class HelpCommand implements Command {

    /** Usage text, one entry per printed line; the empty entry is the blank separator line. */
    private static final String[] USAGE_LINES = {
        "命令:",
        " crawl --site <skiresort|wikipedia|skimap|all> --limit <N|all> [--country <关键词>] [--out <result.jsonl|result.xlsx>] [--dry-run] [--no-proxy]",
        " list [--country <关键词>]",
        " export --out <result.jsonl|result.xlsx>",
        " resume --in <result.jsonl>",
        " stats",
        " sites",
        " help",
        "",
        "crawl 参数:",
        " --threads <N> 默认 3",
        " --start-url <URL> 覆盖站点入口",
        " --timeout <ms> 默认 20000",
        " --ua <UserAgent> 覆盖 UA",
        " --proxy <host:port|none> 代理配置",
        " --proxy-host <host> / --proxy-port <port>",
        " --no-proxy 禁用代理",
        " --width <N> 表格宽度",
        " --color 表头上色(可选)",
        " --show-failures 结束时输出失败列表(可选)",
        " --full 全量抓取(忽略去重,仍然不会往仓库写重复 URL)",
        " --retry <N> 默认 3",
        " --retry-sleep <ms> 默认 1000",
        " --dry-run 不写入仓库/不导出文件(仅展示)",
    };

    /**
     * @return the keyword this command is registered under, {@code "help"}
     */
    @Override
    public String name() {
        return "help";
    }

    /**
     * Writes every usage line to {@code System.out}; both parameters are unused.
     *
     * @param args    raw command-line arguments (ignored)
     * @param context shared crawler services (ignored)
     */
    @Override
    public void execute(String[] args, CrawlerContext context) {
        for (String usageLine : USAGE_LINES) {
            System.out.println(usageLine);
        }
    }
}

53
src/main/java/com/ski/crawler/command/ListCommand.java

@ -0,0 +1,53 @@
package com.ski.crawler.command;
import com.ski.crawler.controller.CrawlerContext;
import com.ski.crawler.model.SkiResort;
import com.ski.crawler.repository.SkiResortRepository;
import com.ski.crawler.util.CliArgs;
import com.ski.crawler.view.ConsoleView;
import java.util.List;
import java.util.Map;
/**
 * The {@code list} command: prints the resorts held in the repository as a
 * console table, optionally restricted by the {@code --country} option.
 */
public class ListCommand implements Command {

    /** Widths at or below this value are treated as unusable. */
    private static final int MIN_USABLE_WIDTH = 20;

    /** Table width used when neither {@code --width} nor {@code COLUMNS} gives a usable value. */
    private static final int FALLBACK_WIDTH = 120;

    /**
     * @return the keyword this command is registered under, {@code "list"}
     */
    @Override
    public String name() {
        return "list";
    }

    /**
     * Reads {@code --country}, {@code --color} and {@code --width}, then prints a
     * header followed by one row per selected resort.
     *
     * @param args    raw command-line arguments; options are parsed from index 1
     * @param context supplies the resort repository
     */
    @Override
    public void execute(String[] args, CrawlerContext context) {
        Map<String, String> opts = CliArgs.parseOptions(args, 1);
        String country = opts.get("country");
        boolean color = CliArgs.parseBoolean(opts.get("color"));
        int width = resolveWidth(CliArgs.parseNullableInt(opts.get("width")));

        SkiResortRepository repo = context.repository();
        List<SkiResort> rows;
        if (country != null && !country.trim().isEmpty()) {
            rows = repo.filterByCountry(country);
        } else {
            rows = repo.getAll();
        }

        ConsoleView view = new ConsoleView(width, color);
        view.printHeader();
        for (SkiResort row : rows) {
            view.printResort(row);
        }
    }

    /**
     * Picks the table width: the explicit argument when it is greater than 20,
     * otherwise the {@code COLUMNS} environment variable when it parses to a value
     * greater than 20, otherwise 120.
     *
     * @param widthArg width given on the command line, may be {@code null}
     * @return the width to render with
     */
    private int resolveWidth(Integer widthArg) {
        if (widthArg != null && widthArg > MIN_USABLE_WIDTH) {
            return widthArg;
        }
        String cols = System.getenv("COLUMNS");
        if (cols == null) {
            return FALLBACK_WIDTH;
        }
        int fromEnv;
        try {
            fromEnv = Integer.parseInt(cols.trim());
        } catch (Exception ignored) {
            // unparsable COLUMNS value: use the fallback width
            return FALLBACK_WIDTH;
        }
        return fromEnv > MIN_USABLE_WIDTH ? fromEnv : FALLBACK_WIDTH;
    }
}

140
src/main/java/com/ski/crawler/command/ResumeCommand.java

@ -0,0 +1,140 @@
package com.ski.crawler.command;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.ski.crawler.controller.CrawlerContext;
import com.ski.crawler.model.SkiResort;
import com.ski.crawler.repository.SkiResortRepository;
import com.ski.crawler.util.CliArgs;
import com.ski.crawler.util.JsonUtil;
import java.io.BufferedReader;
import java.time.LocalDateTime;
import java.util.List;
import java.util.Map;
/**
 * The {@code resume} command: reloads resorts from a JSONL file (one JSON object
 * per line) into the repository.
 *
 * <p>Lines that cannot be read as a JSON object are reported on standard error and
 * counted as skipped instead of aborting the run, so a file whose last line was cut
 * off mid-write can still be resumed.
 */
public class ResumeCommand implements Command {

    /**
     * @return the keyword this command is registered under, {@code "resume"}
     */
    @Override
    public String name() {
        return "resume";
    }

    /**
     * Reads the file named by {@code --in} line by line and adds every decoded
     * resort to the repository, then prints a summary to standard error.
     *
     * <p>{@code skipped} counts both records the repository refused
     * ({@code add} returned {@code false}) and lines that were not a JSON object.
     *
     * @param args    raw command-line arguments; options are parsed from index 1
     * @param context supplies the resort repository
     * @throws Exception when the input file cannot be opened or read
     */
    @Override
    public void execute(String[] args, CrawlerContext context) throws Exception {
        Map<String, String> opts = CliArgs.parseOptions(args, 1);
        String in = opts.get("in");
        if (in == null || in.trim().isEmpty()) {
            System.err.println("Missing --in <file.jsonl>");
            return;
        }
        SkiResortRepository repo = context.repository();
        ObjectMapper mapper = JsonUtil.mapper();
        int loaded = 0;
        int skipped = 0;
        int lineNo = 0;
        try (BufferedReader br = JsonUtil.openJsonlReader(in.trim())) {
            String line;
            while ((line = br.readLine()) != null) {
                lineNo++;
                String t = line.trim();
                if (t.isEmpty()) {
                    continue;
                }
                Map<String, Object> obj;
                try {
                    obj = mapper.readValue(t, new TypeReference<Map<String, Object>>() {});
                } catch (java.io.IOException e) {
                    // Jackson's parse/mapping failures are IOException subclasses; only this
                    // line is bad, so report it and keep going with the rest of the file.
                    System.err.println("Skipping malformed line " + lineNo + ": " + e.getMessage());
                    skipped++;
                    continue;
                }
                if (obj == null) {
                    // the line was the JSON literal null
                    System.err.println("Skipping malformed line " + lineNo + ": not a JSON object");
                    skipped++;
                    continue;
                }
                SkiResort r = fromJson(obj);
                // Kept from the original flow: when the normalised URL came out empty but a raw
                // "url" value exists, fall back to its plain string form.
                if (r.getSourceUrl() == null && obj.get("url") != null) {
                    r.setSourceUrl(String.valueOf(obj.get("url")));
                }
                if (repo.add(r)) {
                    loaded++;
                } else {
                    skipped++;
                }
            }
        }
        System.err.println("Resumed: loaded=" + loaded + " skipped=" + skipped + " totalInRepo=" + repo.getAll().size());
    }

    /**
     * Builds a resort from one decoded JSONL record. Values that are missing or
     * cannot be converted are left {@code null}; the {@code "id"} key is not read.
     *
     * @param obj decoded JSON object, never {@code null}
     * @return a new resort populated from the record
     */
    private SkiResort fromJson(Map<String, Object> obj) {
        SkiResort r = new SkiResort();
        r.setName(asString(obj.get("name")));
        r.setCountry(asString(obj.get("country")));
        r.setRegion(asString(obj.get("region")));
        r.setLatitude(asDouble(obj.get("latitude")));
        r.setLongitude(asDouble(obj.get("longitude")));
        r.setAltitudeMin(asInt(obj.get("altitudeMin")));
        r.setAltitudeMax(asInt(obj.get("altitudeMax")));
        r.setTotalKm(asDouble(obj.get("totalKm")));
        r.setSlopeCount(asInt(obj.get("slopeCount")));
        r.setLiftCount(asInt(obj.get("liftCount")));
        r.setTicketPriceMin(asDouble(obj.get("ticketPriceMin")));
        r.setTicketPriceMax(asDouble(obj.get("ticketPriceMax")));
        r.setCurrency(asString(obj.get("currency")));
        r.setOpenTime(asString(obj.get("openTime")));
        r.setSnowDepthCm(asInt(obj.get("snowDepthCm")));
        r.setTemperatureC(asDouble(obj.get("temperatureC")));
        r.setSourceSite(asString(obj.get("sourceSite")));
        r.setSourceUrl(asString(obj.get("url")));
        String crawlTime = asString(obj.get("crawlTime"));
        if (crawlTime != null) {
            try {
                r.setCrawledAt(LocalDateTime.parse(crawlTime));
            } catch (Exception ignored) {
                // unparsable timestamp: leave crawledAt unset rather than reject the record
            }
        }
        Object hotels = obj.get("nearbyHotels");
        if (hotels instanceof List) {
            r.setNearbyHotels(asStringList((List<?>) hotels));
        }
        Object shops = obj.get("rentalShops");
        if (shops instanceof List) {
            r.setRentalShops(asStringList((List<?>) shops));
        }
        return r;
    }

    /**
     * Copies a decoded JSON array into a list that really contains strings.
     * Non-string elements (numbers, booleans, nested values) are converted with
     * {@link String#valueOf(Object)}; {@code null} elements are preserved.
     *
     * @param raw decoded JSON array
     * @return a new list with the same length and order
     */
    private List<String> asStringList(List<?> raw) {
        List<String> out = new java.util.ArrayList<>(raw.size());
        for (Object item : raw) {
            out.add(item == null ? null : String.valueOf(item));
        }
        return out;
    }

    /**
     * Converts a value to trimmed text, treating non-breaking spaces as spaces.
     *
     * @param v decoded JSON value, may be {@code null}
     * @return the trimmed text, or {@code null} when the value is null or blank
     */
    private String asString(Object v) {
        if (v == null) {
            return null;
        }
        String s = String.valueOf(v).replace('\u00A0', ' ').trim();
        return s.isEmpty() ? null : s;
    }

    /**
     * Converts a value to an integer: numbers are truncated with {@code intValue()},
     * anything else is parsed from its string form.
     *
     * @param v decoded JSON value, may be {@code null}
     * @return the integer, or {@code null} when absent, blank or unparsable
     */
    private Integer asInt(Object v) {
        try {
            if (v == null) {
                return null;
            }
            if (v instanceof Number) {
                return ((Number) v).intValue();
            }
            String s = String.valueOf(v).trim();
            if (s.isEmpty()) {
                return null;
            }
            return Integer.parseInt(s);
        } catch (Exception e) {
            return null;
        }
    }

    /**
     * Converts a value to a double; string input may use a comma as the decimal separator.
     *
     * @param v decoded JSON value, may be {@code null}
     * @return the double, or {@code null} when absent, blank or unparsable
     */
    private Double asDouble(Object v) {
        try {
            if (v == null) {
                return null;
            }
            if (v instanceof Number) {
                return ((Number) v).doubleValue();
            }
            String s = String.valueOf(v).trim().replace(",", ".");
            if (s.isEmpty()) {
                return null;
            }
            return Double.parseDouble(s);
        } catch (Exception e) {
            return null;
        }
    }
}

19
src/main/java/com/ski/crawler/command/SitesCommand.java

@ -0,0 +1,19 @@
package com.ski.crawler.command;
import com.ski.crawler.controller.CrawlerContext;
/**
 * The {@code sites} command: prints the supported site ids with their entry URLs.
 */
public class SitesCommand implements Command {

    /** Output lines, printed in this order. */
    private static final String[] LINES = {
        "sites:",
        " skiresort https://www.skiresort.info",
        " wikipedia https://en.wikipedia.org/wiki/List_of_ski_areas_and_resorts",
        " skimap https://skimap.org",
    };

    /**
     * @return the keyword this command is registered under, {@code "sites"}
     */
    @Override
    public String name() {
        return "sites";
    }

    /**
     * Writes the site list to {@code System.out}; both parameters are unused.
     *
     * @param args    raw command-line arguments (ignored)
     * @param context shared crawler services (ignored)
     */
    @Override
    public void execute(String[] args, CrawlerContext context) {
        for (String line : LINES) {
            System.out.println(line);
        }
    }
}

66
src/main/java/com/ski/crawler/command/StatsCommand.java

@ -0,0 +1,66 @@
package com.ski.crawler.command;
import com.ski.crawler.controller.CrawlerContext;
import com.ski.crawler.model.SkiResort;
import com.ski.crawler.repository.SkiResortRepository;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * The {@code stats} command: prints the total number of resorts, a text
 * histogram of the 20 countries with the most resorts, and the average of the
 * non-negative minimum ticket prices.
 */
public class StatsCommand implements Command {

    /**
     * @return the keyword this command is registered under, {@code "stats"}
     */
    @Override
    public String name() {
        return "stats";
    }

    /**
     * Prints the statistics to {@code System.out}.
     *
     * @param args    raw command-line arguments (ignored)
     * @param context supplies the resort repository
     */
    @Override
    public void execute(String[] args, CrawlerContext context) {
        SkiResortRepository repo = context.repository();
        List<SkiResort> all = repo.getAll();
        System.out.println("total=" + all.size());
        printTopCountries(repo.countByCountry());
        printAverageMinPrice(all);
    }

    /**
     * Prints up to 20 countries ordered by descending count, each with a bar
     * scaled so the largest count spans 30 characters. Prints nothing when the
     * map is empty.
     */
    private void printTopCountries(Map<String, Long> byCountry) {
        List<Map.Entry<String, Long>> top = byCountry.entrySet().stream()
                .sorted(Map.Entry.<String, Long>comparingByValue(Comparator.reverseOrder()))
                .limit(20)
                .collect(Collectors.toList());
        if (top.isEmpty()) {
            return;
        }
        System.out.println("byCountry(top20):");
        long max = countOf(top.get(0));
        for (Map.Entry<String, Long> entry : top) {
            long count = countOf(entry);
            System.out.println(" " + entry.getKey() + ": " + count + " " + bar(count, max, 30));
        }
    }

    /** Reads an entry's count, treating a {@code null} value as zero. */
    private long countOf(Map.Entry<String, Long> entry) {
        Long value = entry.getValue();
        return value == null ? 0 : value;
    }

    /**
     * Prints the mean of all non-null, non-negative minimum ticket prices together
     * with the sample count; prints nothing when there are no samples. Prices are
     * averaged as stored, without looking at the resort's currency.
     */
    private void printAverageMinPrice(List<SkiResort> resorts) {
        double sum = 0;
        int cnt = 0;
        for (SkiResort resort : resorts) {
            Double price = resort.getTicketPriceMin();
            if (price == null || price < 0) {
                continue;
            }
            sum += price;
            cnt++;
        }
        if (cnt > 0) {
            System.out.println("avgTicketPriceMin=" + (sum / cnt) + " samples=" + cnt);
        }
    }

    /**
     * Renders a bar of {@code '#'} characters whose length is {@code v / max}
     * of {@code width}, rounded to the nearest integer.
     *
     * @param v     value to draw
     * @param max   value that corresponds to a full-width bar
     * @param width full bar length in characters
     * @return the bar, or an empty string when {@code max} or {@code width} is not
     *         positive or the rounded length is not positive
     */
    private String bar(long v, long max, int width) {
        if (max <= 0 || width <= 0) {
            return "";
        }
        int length = (int) Math.round((double) v * width / (double) max);
        if (length <= 0) {
            return "";
        }
        StringBuilder out = new StringBuilder(length);
        while (out.length() < length) {
            out.append('#');
        }
        return out.toString();
    }
}

30
src/main/java/com/ski/crawler/controller/CrawlerContext.java

@ -0,0 +1,30 @@
package com.ski.crawler.controller;
import com.ski.crawler.factory.StrategyFactory;
import com.ski.crawler.repository.SkiResortRepository;
import com.ski.crawler.service.ScraperService;
/**
 * Bundle of the three shared collaborators handed to every command: the resort
 * repository, the strategy factory and the scraper service. The references are
 * fixed at construction and stored as given (no null checks are performed).
 */
public class CrawlerContext {
private final SkiResortRepository repository;
private final StrategyFactory strategyFactory;
private final ScraperService scraperService;
/**
 * @param repository      store of crawled resorts
 * @param strategyFactory factory for per-site crawl strategies
 * @param scraperService  scraper service shared by commands
 */
public CrawlerContext(SkiResortRepository repository, StrategyFactory strategyFactory, ScraperService scraperService) {
this.repository = repository;
this.strategyFactory = strategyFactory;
this.scraperService = scraperService;
}
/** @return the repository passed to the constructor */
public SkiResortRepository repository() {
return repository;
}
/** @return the strategy factory passed to the constructor */
public StrategyFactory strategies() {
return strategyFactory;
}
/** @return the scraper service passed to the constructor */
public ScraperService scraper() {
return scraperService;
}
}

72
src/main/java/com/ski/crawler/controller/CrawlerController.java

@ -0,0 +1,72 @@
package com.ski.crawler.controller;
import com.ski.crawler.command.Command;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
/**
 * Dispatches the command line to the {@link Command} registered under the name
 * given as first argument. Command names are matched case-insensitively.
 *
 * <p>Two legacy shorthands are supported: a bare integer or the word {@code all}
 * as first argument is rewritten to {@code crawl --limit <value>}.
 */
public class CrawlerController {
    private final Map<String, Command> commands = new HashMap<>();

    /**
     * Registers the given commands under their lower-cased names. {@code null}
     * commands and commands with a {@code null} name are ignored; a later command
     * replaces an earlier one with the same name.
     *
     * @param cmds commands to register, may be {@code null} or empty
     */
    public CrawlerController(Command... cmds) {
        if (cmds != null) {
            for (Command c : cmds) {
                if (c != null && c.name() != null) {
                    commands.put(c.name().toLowerCase(Locale.ROOT), c);
                }
            }
        }
    }

    /**
     * Runs the command selected by {@code args[0]}; with no arguments the
     * {@code help} command runs.
     *
     * @param args    raw command-line arguments
     * @param context shared services passed through to the command
     * @throws Exception whatever the selected command throws
     */
    public void run(String[] args, CrawlerContext context) throws Exception {
        String cmd = firstArg(args);
        if (cmd.isEmpty()) {
            execute("help", args, context);
            return;
        }
        if (isLegacyLimit(cmd)) {
            execute("crawl", legacyCrawlArgs(cmd, args), context);
            return;
        }
        if ("all".equalsIgnoreCase(cmd)) {
            execute("crawl", legacyCrawlArgs("all", args), context);
            return;
        }
        execute(cmd, args, context);
    }

    /**
     * Builds the argument vector for a legacy shorthand invocation:
     * {@code crawl --limit <limit>} followed by everything the user typed after
     * the shorthand, so trailing options are passed on instead of being dropped.
     *
     * @param limit value for {@code --limit}
     * @param args  original arguments; has at least one element when this is called
     * @return the rewritten argument array
     */
    private String[] legacyCrawlArgs(String limit, String[] args) {
        int extra = args.length - 1;
        String[] rewritten = new String[3 + extra];
        rewritten[0] = "crawl";
        rewritten[1] = "--limit";
        rewritten[2] = limit;
        System.arraycopy(args, 1, rewritten, 3, extra);
        return rewritten;
    }

    /**
     * Looks up and runs a command. An unknown name is reported on standard error
     * and the {@code help} command (when registered) runs instead.
     */
    private void execute(String cmd, String[] args, CrawlerContext context) throws Exception {
        Command c = commands.get(cmd.toLowerCase(Locale.ROOT));
        if (c == null) {
            System.err.println("Unknown command: " + cmd);
            Command help = commands.get("help");
            if (help != null) {
                help.execute(args, context);
            }
            return;
        }
        c.execute(args, context);
    }

    /**
     * @return the trimmed first argument, or an empty string when there is none
     */
    private String firstArg(String[] args) {
        if (args == null || args.length == 0 || args[0] == null) {
            return "";
        }
        return args[0].trim();
    }

    /**
     * @return {@code true} when {@code s} parses as an {@code int}
     *         (signs are accepted, as by {@link Integer#parseInt(String)})
     */
    private boolean isLegacyLimit(String s) {
        try {
            if (s == null) {
                return false;
            }
            Integer.parseInt(s.trim());
            return true;
        } catch (Exception e) {
            return false;
        }
    }
}

12
src/main/java/com/ski/crawler/exception/CrawlerException.java

@ -0,0 +1,12 @@
package com.ski.crawler.exception;
/**
 * Checked base exception of the crawler; {@code NetworkException} and
 * {@code ParseException} in this package extend it.
 */
public class CrawlerException extends Exception {
/** @param message description of the failure */
public CrawlerException(String message) {
super(message);
}
/**
 * @param message description of the failure
 * @param cause   underlying exception, kept as the cause
 */
public CrawlerException(String message, Throwable cause) {
super(message, cause);
}
}

12
src/main/java/com/ski/crawler/exception/NetworkException.java

@ -0,0 +1,12 @@
package com.ski.crawler.exception;
/**
 * {@link CrawlerException} subtype for network failures (purpose taken from the
 * class name; no throw site is visible in this part of the project).
 */
public class NetworkException extends CrawlerException {
/** @param message description of the failure */
public NetworkException(String message) {
super(message);
}
/**
 * @param message description of the failure
 * @param cause   underlying exception, kept as the cause
 */
public NetworkException(String message, Throwable cause) {
super(message, cause);
}
}

12
src/main/java/com/ski/crawler/exception/ParseException.java

@ -0,0 +1,12 @@
package com.ski.crawler.exception;
/**
 * {@link CrawlerException} subtype for parsing failures (purpose taken from the
 * class name; no throw site is visible in this part of the project).
 * Note: shares its simple name with {@code java.text.ParseException}.
 */
public class ParseException extends CrawlerException {
/** @param message description of the failure */
public ParseException(String message) {
super(message);
}
/**
 * @param message description of the failure
 * @param cause   underlying exception, kept as the cause
 */
public ParseException(String message, Throwable cause) {
super(message, cause);
}
}

31
src/main/java/com/ski/crawler/factory/StrategyFactory.java

@ -0,0 +1,31 @@
package com.ski.crawler.factory;
import com.ski.crawler.strategy.CrawlStrategy;
import com.ski.crawler.strategy.SkiResortInfoStrategy;
import com.ski.crawler.strategy.SkimapStrategy;
import com.ski.crawler.strategy.WikipediaStrategy;
import java.util.Locale;
/**
 * Creates the crawl strategy for a site id.
 */
public class StrategyFactory {

    /**
     * Maps a site id to a new strategy instance. The id is trimmed and compared
     * case-insensitively; {@code wiki} is accepted as an alias of {@code wikipedia}.
     *
     * @param id site id, or {@code null} for the default site
     * @return a new {@link SkiResortInfoStrategy} for {@code null} or {@code skiresort},
     *         a {@link WikipediaStrategy} for {@code wiki}/{@code wikipedia},
     *         a {@link SkimapStrategy} for {@code skimap}
     * @throws IllegalArgumentException for any other id, including a blank one
     */
    public CrawlStrategy create(String id) {
        if (id == null) {
            return new SkiResortInfoStrategy();
        }
        final String key = id.trim().toLowerCase(Locale.ROOT);
        switch (key) {
            case "skiresort":
                return new SkiResortInfoStrategy();
            case "wiki":
            case "wikipedia":
                return new WikipediaStrategy();
            case "skimap":
                return new SkimapStrategy();
            default:
                throw new IllegalArgumentException("Unknown site: " + id);
        }
    }
}

83
src/main/java/com/ski/crawler/model/SkiLift.java

@ -0,0 +1,83 @@
package com.ski.crawler.model;
/**
 * Lift inventory of a ski resort: a total plus one count per lift type.
 * Plain mutable bean; every value is nullable and nothing is validated.
 * In this project, ResortDetailParser.tryFillLifts fills the counts from
 * numbers found near matching keywords in the page text.
 */
public class SkiLift {
// Identifier of this record; this class never assigns it itself.
private Long id;
// Identifier of the resort the lifts belong to.
private Long resortId;
// Total number of lifts.
private Integer totalLifts;
// Per-type counts.
private Integer gondolas;
private Integer chairlifts;
private Integer surfaceLifts;
private Integer cableCars;
private Integer travelators;
/** Creates an instance with every field {@code null}. */
public SkiLift() {
}
/** @param resortId identifier of the owning resort */
public SkiLift(Long resortId) {
this.resortId = resortId;
}
// Plain accessors below: each getter returns the field, each setter stores the argument as-is.
public Long getId() {
return id;
}
public void setId(Long id) {
this.id = id;
}
public Long getResortId() {
return resortId;
}
public void setResortId(Long resortId) {
this.resortId = resortId;
}
public Integer getTotalLifts() {
return totalLifts;
}
public void setTotalLifts(Integer totalLifts) {
this.totalLifts = totalLifts;
}
public Integer getGondolas() {
return gondolas;
}
public void setGondolas(Integer gondolas) {
this.gondolas = gondolas;
}
public Integer getChairlifts() {
return chairlifts;
}
public void setChairlifts(Integer chairlifts) {
this.chairlifts = chairlifts;
}
public Integer getSurfaceLifts() {
return surfaceLifts;
}
public void setSurfaceLifts(Integer surfaceLifts) {
this.surfaceLifts = surfaceLifts;
}
public Integer getCableCars() {
return cableCars;
}
public void setCableCars(Integer cableCars) {
this.cableCars = cableCars;
}
public Integer getTravelators() {
return travelators;
}
public void setTravelators(Integer travelators) {
this.travelators = travelators;
}
}

252
src/main/java/com/ski/crawler/model/SkiResort.java

@ -0,0 +1,252 @@
package com.ski.crawler.model;
import java.math.BigDecimal;
import java.time.LocalDateTime;
import java.util.List;
/**
 * One crawled ski resort. Plain mutable bean: every value is nullable, setters
 * store their argument as-is (lists are not copied) and nothing is validated.
 *
 * <p>The class holds two groups of data: flat summary fields (counts, prices,
 * conditions), most of which the export/resume commands read and write, and
 * nested detail objects ({@link SkiTrail}, {@link SkiLift}, {@link SkiTicket})
 * that ResortDetailParser creates and fills.
 */
public class SkiResort {
// Identifier of this record; this class never assigns it itself.
private Long id;
private String name;
private String country;
private String region;
// Geographic position — presumably decimal degrees; TODO confirm against the producers.
private Double latitude;
private Double longitude;
// Altitude bounds; ResortDetailParser reads them from "<number> m" text, i.e. metres.
private Integer altitudeMin;
private Integer altitudeMax;
// Total slope length; ResortDetailParser reads it from "<number> km" text.
private Double totalKm;
// Overall rating; ResortDetailParser only stores values between 0 and 10.
private BigDecimal overallScore;
// Nested detail objects.
private SkiTrail skiTrail;
private SkiLift skiLift;
private SkiTicket skiTicket;
// Provenance: page the record came from, site id, and crawl timestamp.
private String sourceUrl;
private String sourceSite;
private LocalDateTime crawledAt;
// Flat summary fields.
private Integer slopeCount;
private Integer liftCount;
private Double ticketPriceMin;
private Double ticketPriceMax;
private String currency;
private String openTime;
// Units as given by the field names (degrees Celsius, centimetres).
private Double temperatureC;
private Integer snowDepthCm;
private List<String> nearbyHotels;
private List<String> rentalShops;
/** Creates an instance with every field {@code null}. */
public SkiResort() {
}
/**
 * @param name      resort name
 * @param country   country the resort is in
 * @param sourceUrl URL of the page the record comes from
 */
public SkiResort(String name, String country, String sourceUrl) {
this.name = name;
this.country = country;
this.sourceUrl = sourceUrl;
}
// Plain accessors below: each getter returns the field, each setter stores the argument as-is.
public Long getId() {
return id;
}
public void setId(Long id) {
this.id = id;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getCountry() {
return country;
}
public void setCountry(String country) {
this.country = country;
}
public String getRegion() {
return region;
}
public void setRegion(String region) {
this.region = region;
}
public Double getLatitude() {
return latitude;
}
public void setLatitude(Double latitude) {
this.latitude = latitude;
}
public Double getLongitude() {
return longitude;
}
public void setLongitude(Double longitude) {
this.longitude = longitude;
}
public Integer getAltitudeMin() {
return altitudeMin;
}
public void setAltitudeMin(Integer altitudeMin) {
this.altitudeMin = altitudeMin;
}
public Integer getAltitudeMax() {
return altitudeMax;
}
public void setAltitudeMax(Integer altitudeMax) {
this.altitudeMax = altitudeMax;
}
public Double getTotalKm() {
return totalKm;
}
public void setTotalKm(Double totalKm) {
this.totalKm = totalKm;
}
public BigDecimal getOverallScore() {
return overallScore;
}
public void setOverallScore(BigDecimal overallScore) {
this.overallScore = overallScore;
}
public SkiTrail getSkiTrail() {
return skiTrail;
}
public void setSkiTrail(SkiTrail skiTrail) {
this.skiTrail = skiTrail;
}
public SkiLift getSkiLift() {
return skiLift;
}
public void setSkiLift(SkiLift skiLift) {
this.skiLift = skiLift;
}
public SkiTicket getSkiTicket() {
return skiTicket;
}
public void setSkiTicket(SkiTicket skiTicket) {
this.skiTicket = skiTicket;
}
public String getSourceUrl() {
return sourceUrl;
}
public void setSourceUrl(String sourceUrl) {
this.sourceUrl = sourceUrl;
}
public String getSourceSite() {
return sourceSite;
}
public void setSourceSite(String sourceSite) {
this.sourceSite = sourceSite;
}
public LocalDateTime getCrawledAt() {
return crawledAt;
}
public void setCrawledAt(LocalDateTime crawledAt) {
this.crawledAt = crawledAt;
}
public Integer getSlopeCount() {
return slopeCount;
}
public void setSlopeCount(Integer slopeCount) {
this.slopeCount = slopeCount;
}
public Integer getLiftCount() {
return liftCount;
}
public void setLiftCount(Integer liftCount) {
this.liftCount = liftCount;
}
public Double getTicketPriceMin() {
return ticketPriceMin;
}
public void setTicketPriceMin(Double ticketPriceMin) {
this.ticketPriceMin = ticketPriceMin;
}
public Double getTicketPriceMax() {
return ticketPriceMax;
}
public void setTicketPriceMax(Double ticketPriceMax) {
this.ticketPriceMax = ticketPriceMax;
}
public String getCurrency() {
return currency;
}
public void setCurrency(String currency) {
this.currency = currency;
}
public String getOpenTime() {
return openTime;
}
public void setOpenTime(String openTime) {
this.openTime = openTime;
}
public Double getTemperatureC() {
return temperatureC;
}
public void setTemperatureC(Double temperatureC) {
this.temperatureC = temperatureC;
}
public Integer getSnowDepthCm() {
return snowDepthCm;
}
public void setSnowDepthCm(Integer snowDepthCm) {
this.snowDepthCm = snowDepthCm;
}
public List<String> getNearbyHotels() {
return nearbyHotels;
}
public void setNearbyHotels(List<String> nearbyHotels) {
this.nearbyHotels = nearbyHotels;
}
public List<String> getRentalShops() {
return rentalShops;
}
public void setRentalShops(List<String> rentalShops) {
this.rentalShops = rentalShops;
}
}

76
src/main/java/com/ski/crawler/model/SkiReview.java

@ -0,0 +1,76 @@
package com.ski.crawler.model;
import java.time.LocalDateTime;
/**
 * Review scores of a ski resort: an overall score, two category scores, the
 * number of reviews and the time the data was crawled.
 * Plain mutable bean; every value is nullable and nothing is validated.
 * The score scale is not defined in this class.
 */
public class SkiReview {
// Identifier of this record; this class never assigns it itself.
private Long id;
// Identifier of the resort the review data belongs to.
private Long resortId;
private Double overallScore;
private Double snowScore;
private Double facilitiesScore;
private Integer totalReviews;
private LocalDateTime crawledAt;
/** Creates an instance with every field {@code null}. */
public SkiReview() {
}
/** @param resortId identifier of the owning resort */
public SkiReview(Long resortId) {
this.resortId = resortId;
}
// Plain accessors below: each getter returns the field, each setter stores the argument as-is.
public Long getId() {
return id;
}
public void setId(Long id) {
this.id = id;
}
public Long getResortId() {
return resortId;
}
public void setResortId(Long resortId) {
this.resortId = resortId;
}
public Double getOverallScore() {
return overallScore;
}
public void setOverallScore(Double overallScore) {
this.overallScore = overallScore;
}
public Double getSnowScore() {
return snowScore;
}
public void setSnowScore(Double snowScore) {
this.snowScore = snowScore;
}
public Double getFacilitiesScore() {
return facilitiesScore;
}
public void setFacilitiesScore(Double facilitiesScore) {
this.facilitiesScore = facilitiesScore;
}
public Integer getTotalReviews() {
return totalReviews;
}
public void setTotalReviews(Integer totalReviews) {
this.totalReviews = totalReviews;
}
public LocalDateTime getCrawledAt() {
return crawledAt;
}
public void setCrawledAt(LocalDateTime crawledAt) {
this.crawledAt = crawledAt;
}
}

74
src/main/java/com/ski/crawler/model/SkiTicket.java

@ -0,0 +1,74 @@
package com.ski.crawler.model;
/**
 * Lift-ticket pricing of a ski resort: adult and child price, currency,
 * ticket type and season.
 * Plain mutable bean; every value is nullable and nothing is validated.
 * In this project, ResortDetailParser.tryFillTickets fills the two prices and
 * the currency from price patterns found in the page text.
 */
public class SkiTicket {
// Identifier of this record; this class never assigns it itself.
private Long id;
// Identifier of the resort the ticket belongs to.
private Long resortId;
private String ticketType;
private Double priceAdult;
private Double priceChild;
private String currency;
private String season;
/** Creates an instance with every field {@code null}. */
public SkiTicket() {
}
/** @param resortId identifier of the owning resort */
public SkiTicket(Long resortId) {
this.resortId = resortId;
}
// Plain accessors below: each getter returns the field, each setter stores the argument as-is.
public Long getId() {
return id;
}
public void setId(Long id) {
this.id = id;
}
public Long getResortId() {
return resortId;
}
public void setResortId(Long resortId) {
this.resortId = resortId;
}
public String getTicketType() {
return ticketType;
}
public void setTicketType(String ticketType) {
this.ticketType = ticketType;
}
public Double getPriceAdult() {
return priceAdult;
}
public void setPriceAdult(Double priceAdult) {
this.priceAdult = priceAdult;
}
public Double getPriceChild() {
return priceChild;
}
public void setPriceChild(Double priceChild) {
this.priceChild = priceChild;
}
public String getCurrency() {
return currency;
}
public void setCurrency(String currency) {
this.currency = currency;
}
public String getSeason() {
return season;
}
public void setSeason(String season) {
this.season = season;
}
}

92
src/main/java/com/ski/crawler/model/SkiTrail.java

@ -0,0 +1,92 @@
package com.ski.crawler.model;
/**
 * Slope data of a ski resort: total length, length per difficulty level,
 * number of runs, snow-making flag and snow depth.
 * Plain mutable bean; every value is nullable and nothing is validated.
 * In this project, ResortDetailParser derives the per-difficulty lengths as a
 * percentage of the total length, rounded to two decimals.
 */
public class SkiTrail {
// Identifier of this record; this class never assigns it itself.
private Long id;
// Identifier of the resort the slopes belong to.
private Long resortId;
// Lengths in kilometres (unit taken from the field names).
private Double totalKm;
private Double beginnerKm;
private Double intermediateKm;
private Double expertKm;
private Integer totalRuns;
private Boolean snowMaking;
// Snow depth in centimetres (unit taken from the field name).
private Integer snowDepthCm;
/** Creates an instance with every field {@code null}. */
public SkiTrail() {
}
/** @param resortId identifier of the owning resort */
public SkiTrail(Long resortId) {
this.resortId = resortId;
}
// Plain accessors below: each getter returns the field, each setter stores the argument as-is.
public Long getId() {
return id;
}
public void setId(Long id) {
this.id = id;
}
public Long getResortId() {
return resortId;
}
public void setResortId(Long resortId) {
this.resortId = resortId;
}
public Double getTotalKm() {
return totalKm;
}
public void setTotalKm(Double totalKm) {
this.totalKm = totalKm;
}
public Double getBeginnerKm() {
return beginnerKm;
}
public void setBeginnerKm(Double beginnerKm) {
this.beginnerKm = beginnerKm;
}
public Double getIntermediateKm() {
return intermediateKm;
}
public void setIntermediateKm(Double intermediateKm) {
this.intermediateKm = intermediateKm;
}
public Double getExpertKm() {
return expertKm;
}
public void setExpertKm(Double expertKm) {
this.expertKm = expertKm;
}
public Integer getTotalRuns() {
return totalRuns;
}
public void setTotalRuns(Integer totalRuns) {
this.totalRuns = totalRuns;
}
public Boolean getSnowMaking() {
return snowMaking;
}
public void setSnowMaking(Boolean snowMaking) {
this.snowMaking = snowMaking;
}
public Integer getSnowDepthCm() {
return snowDepthCm;
}
public void setSnowDepthCm(Integer snowDepthCm) {
this.snowDepthCm = snowDepthCm;
}
}

471
src/main/java/com/ski/crawler/parser/ResortDetailParser.java

@ -0,0 +1,471 @@
package com.ski.crawler.parser;
import com.ski.crawler.model.SkiLift;
import com.ski.crawler.model.SkiResort;
import com.ski.crawler.model.SkiTicket;
import com.ski.crawler.model.SkiTrail;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class ResortDetailParser {
/** A 2-5 digit number followed by the unit {@code m}, e.g. {@code "1200 m"}; group 1 is the number. */
private static final Pattern INT_M_PATTERN = Pattern.compile("(\\d{2,5})\\s*m\\b", Pattern.CASE_INSENSITIVE);
/** Two such values joined by {@code -}, an en dash or {@code to}; groups 1 and 2 are the two numbers. */
private static final Pattern ALT_RANGE_PATTERN = Pattern.compile("(\\d{2,5})\\s*m\\s*(?:-|–|to)\\s*(\\d{2,5})\\s*m\\b", Pattern.CASE_INSENSITIVE);
/** A number (dot or comma decimals) followed by {@code km}; group 1 is the number. */
private static final Pattern KM_PATTERN = Pattern.compile("(\\d+(?:[\\.,]\\d+)?)\\s*km\\b", Pattern.CASE_INSENSITIVE);
/**
 * A 1-3 digit number followed by {@code %}; group 1 is the number.
 * No {@code \b} after the percent sign: {@code %} is not a word character, so a
 * trailing word boundary would only match when a letter or digit follows directly
 * and would reject the usual "30% easy" and "30%" at end of text.
 */
private static final Pattern PERCENT_PATTERN = Pattern.compile("(\\d{1,3})\\s*%");
/** Any number with optional dot or comma decimals; group 1 is the number. */
private static final Pattern NUMBER_PATTERN = Pattern.compile("(\\d+(?:[\\.,]\\d+)?)");
/** Currency marker before the amount; group 1 is set only for {@code SFr.}, group 2 is the amount. */
private static final Pattern CURRENCY_FIRST_PATTERN = Pattern.compile("(?:(SFr\\.)|CHF|€|\\$|£)\\s*(\\d+(?:[\\.,]\\d+)?)");
/** Amount before the currency marker; group 1 is the amount, group 2 the marker. */
private static final Pattern CURRENCY_LAST_PATTERN = Pattern.compile("(\\d+(?:[\\.,]\\d+)?)\\s*(€|\\$|£|CHF|SFr\\.)");
/**
 * Parses the HTML of a resort detail page into a {@link SkiResort}.
 *
 * <p>The page is parsed with Jsoup and the fill steps run in a fixed order:
 * name, country/region (breadcrumb), altitude, total slope length with trail
 * breakdown, lifts, tickets, overall score. The returned resort always carries
 * new {@link SkiTrail}, {@link SkiLift} and {@link SkiTicket} objects, even when
 * nothing could be extracted.
 *
 * @param html page source; {@code null} or empty yields an otherwise empty resort
 * @return the populated resort, never {@code null}
 */
public SkiResort parse(String html) {
    SkiResort result = new SkiResort();
    result.setSkiTrail(new SkiTrail());
    result.setSkiLift(new SkiLift());
    result.setSkiTicket(new SkiTicket());

    Document page = null;
    if (html != null && !html.isEmpty()) {
        try {
            page = Jsoup.parse(html);
        } catch (Exception e) {
            // unparsable markup: fall through and return the empty resort
            page = null;
        }
    }

    if (page != null) {
        tryFillName(page, result);
        tryFillCountryRegionFromBreadcrumb(page, result);
        tryFillAltitude(page, result);
        tryFillTotalKmAndTrailBreakdown(page, result);
        tryFillLifts(page, result.getSkiLift());
        tryFillTickets(page, result.getSkiTicket());
        tryFillOverallScore(page, result);
    }
    return result;
}
/**
 * Sets the resort name from the first {@code .resort-name} element, or from the
 * first {@code h1} when there is none. The name is left untouched when neither
 * exists or the cleaned text is empty; any exception is swallowed (best effort).
 */
private void tryFillName(Document doc, SkiResort resort) {
    try {
        Element heading = doc.selectFirst(".resort-name");
        heading = (heading != null) ? heading : doc.selectFirst("h1");
        if (heading == null) {
            return;
        }
        String cleaned = cleanText(heading.text());
        if (cleaned.isEmpty()) {
            return;
        }
        resort.setName(cleaned);
    } catch (Exception ignored) {
        // best effort: a failure here must not abort the rest of the parse
    }
}
/**
 * Derives country and region from the breadcrumb trail.
 *
 * <p>Generic crumbs ("home", "worldwide", "ski resorts", "ski-resorts") are dropped.
 * With three or more remaining crumbs the third- and second-to-last become country
 * and region; with two crumbs they are country and region in order; with one crumb
 * only the country is set. Any exception is swallowed (best effort).
 */
private void tryFillCountryRegionFromBreadcrumb(Document doc, SkiResort resort) {
    try {
        Elements crumbs = doc.select(".breadcrumb a, nav.breadcrumb a, ol.breadcrumb a, ul.breadcrumb a, .breadcrumb li, nav.breadcrumb li, ol.breadcrumb li, ul.breadcrumb li");
        List<String> items = new ArrayList<>();
        for (Element el : crumbs) {
            // The selector matches both <li> items and the links inside them. An <li> that
            // wraps a link would contribute the same crumb twice and shift the positions
            // used below, so such an <li> is skipped in favour of its link.
            if ("li".equalsIgnoreCase(el.tagName()) && el.selectFirst("a") != null) {
                continue;
            }
            String t = cleanText(el.text());
            if (t.isEmpty()) {
                continue;
            }
            String lower = t.toLowerCase(Locale.ROOT);
            if (lower.equals("ski resorts") || lower.equals("ski-resorts") || lower.equals("home") || lower.equals("worldwide")) {
                continue;
            }
            items.add(t);
        }
        if (items.size() >= 3) {
            resort.setCountry(items.get(items.size() - 3));
            resort.setRegion(items.get(items.size() - 2));
        } else if (items.size() == 2) {
            resort.setCountry(items.get(0));
            resort.setRegion(items.get(1));
        } else if (items.size() == 1) {
            resort.setCountry(items.get(0));
        }
    } catch (Exception ignored) {
        // best effort: a failure here must not abort the rest of the parse
    }
}
/**
 * Sets the resort's minimum and maximum altitude from the page text.
 *
 * <p>An explicit range ("1200 m - 2400 m") is preferred. Otherwise up to three
 * standalone "N m" values are collected and the last two of them are used. The two
 * values are ordered so the minimum is never greater than the maximum. Both fields
 * are overwritten with {@code null} when nothing is found; any exception is
 * swallowed (best effort).
 */
private void tryFillAltitude(Document doc, SkiResort resort) {
    try {
        String text = doc.text();
        Integer min = null;
        Integer max = null;
        Matcher range = ALT_RANGE_PATTERN.matcher(text);
        if (range.find()) {
            min = safeParseInt(range.group(1));
            max = safeParseInt(range.group(2));
        } else {
            List<Integer> ms = new ArrayList<>();
            Matcher m = INT_M_PATTERN.matcher(text);
            while (m.find() && ms.size() < 3) {
                Integer v = safeParseInt(m.group(1));
                if (v != null) {
                    ms.add(v);
                }
            }
            if (ms.size() >= 2) {
                min = ms.get(ms.size() - 2);
                max = ms.get(ms.size() - 1);
            }
        }
        // Pages may list the higher value first; keep the pair ordered.
        if (min != null && max != null && min > max) {
            Integer swap = min;
            min = max;
            max = swap;
        }
        resort.setAltitudeMin(min);
        resort.setAltitudeMax(max);
    } catch (Exception ignored) {
        // best effort: a failure here must not abort the rest of the parse
    }
}
/**
 * Sets the total slope length on the resort and its {@link SkiTrail}, then derives
 * the per-difficulty lengths from percentages found near difficulty keywords.
 *
 * <p>The length is the first "N km" in the first element mentioning km/slope/piste,
 * or else the first "N km" in the whole page; it is stored even when {@code null}.
 * Per-difficulty lengths are only set when both the total and the matching
 * percentage are known. Any exception is swallowed (best effort).
 */
private void tryFillTotalKmAndTrailBreakdown(Document doc, SkiResort resort) {
    try {
        Element kmEl = firstElementContaining(doc, "km", "slope", "slopes", "piste");
        Double totalKm = (kmEl == null) ? null : firstDoubleFrom(KM_PATTERN, kmEl.text());
        if (totalKm == null) {
            totalKm = firstDoubleFrom(KM_PATTERN, doc.text());
        }
        resort.setTotalKm(totalKm);

        SkiTrail trail = resort.getSkiTrail();
        if (trail == null) {
            trail = new SkiTrail();
            resort.setSkiTrail(trail);
        }
        trail.setTotalKm(totalKm);

        Integer beginnerPct = percentNearKeyword(doc, "beginner", "easy");
        Integer intermediatePct = percentNearKeyword(doc, "intermediate", "medium");
        Integer expertPct = percentNearKeyword(doc, "expert", "advanced", "difficult");
        if (totalKm == null) {
            return;
        }
        if (beginnerPct != null) {
            trail.setBeginnerKm(roundKm(totalKm * beginnerPct / 100.0));
        }
        if (intermediatePct != null) {
            trail.setIntermediateKm(roundKm(totalKm * intermediatePct / 100.0));
        }
        if (expertPct != null) {
            trail.setExpertKm(roundKm(totalKm * expertPct / 100.0));
        }
    } catch (Exception ignored) {
        // best effort: a failure here must not abort the rest of the parse
    }
}
/**
 * Fills the lift counts from numbers found near lift-type keywords in the page
 * text. Does nothing for a {@code null} target; any exception is swallowed, which
 * leaves the counts set so far in place.
 */
private void tryFillLifts(Document doc, SkiLift lift) {
    if (lift != null) {
        try {
            final String pageText = doc.text();
            lift.setTotalLifts(intNear(pageText, "lift", "lifts"));
            lift.setGondolas(intNear(pageText, "gondola", "gondolas"));
            lift.setChairlifts(intNear(pageText, "chairlift", "chairlifts"));
            lift.setSurfaceLifts(intNear(pageText, "surface lift", "surface lifts", "t-bar", "drag lift", "platter lift"));
            lift.setCableCars(intNear(pageText, "cable car", "cable cars"));
            lift.setTravelators(intNear(pageText, "travelator", "travelators", "moving carpet"));
        } catch (Exception ignored) {
            // best effort: a failure here must not abort the rest of the parse
        }
    }
}
/**
 * Fills adult and child ticket prices and the currency.
 *
 * <p>Each price is first looked for in the first element mentioning the matching
 * keyword. Whatever is still missing is taken from the first prices found in the
 * whole page (first for adult, second for child). The currency comes from the adult
 * price, or from the child price when no currency is set by then. Does nothing for
 * a {@code null} target; any exception is swallowed (best effort).
 */
private void tryFillTickets(Document doc, SkiTicket ticket) {
    if (ticket == null) {
        return;
    }
    try {
        Element adultEl = firstElementContaining(doc, "adult", "adults");
        Element childEl = firstElementContaining(doc, "child", "children", "kid", "kids");

        Price adult = null;
        if (adultEl != null) {
            adult = extractPrice(adultEl.text());
        }
        Price child = null;
        if (childEl != null) {
            child = extractPrice(childEl.text());
        }

        boolean needsFallback = (adult == null) || (child == null);
        if (needsFallback) {
            List<Price> pagePrices = extractAllPrices(doc.text(), 4);
            if (adult == null && !pagePrices.isEmpty()) {
                adult = pagePrices.get(0);
            }
            if (child == null && pagePrices.size() >= 2) {
                child = pagePrices.get(1);
            }
        }

        if (adult != null) {
            ticket.setPriceAdult(adult.amount);
            ticket.setCurrency(adult.currency);
        }
        if (child != null) {
            ticket.setPriceChild(child.amount);
            if (ticket.getCurrency() == null) {
                ticket.setCurrency(child.currency);
            }
        }
    } catch (Exception ignored) {
        // best effort: a failure here must not abort the rest of the parse
    }
}
/**
 * Sets the overall score from the first number in the first element mentioning
 * score/rating/stars, falling back to the first number in the whole page. Only
 * values from 0 to 10 inclusive are stored; any exception is swallowed (best effort).
 */
private void tryFillOverallScore(Document doc, SkiResort resort) {
    try {
        Element scoreEl = firstElementContaining(doc, "score", "rating", "stars");
        BigDecimal score = (scoreEl == null) ? null : firstBigDecimal(scoreEl.text());
        if (score == null) {
            score = firstBigDecimal(doc.text());
        }
        if (score == null) {
            return;
        }
        boolean withinScale = score.signum() >= 0 && score.compareTo(BigDecimal.TEN) <= 0;
        if (withinScale) {
            resort.setOverallScore(score);
        }
    } catch (Exception ignored) {
        // best effort: a failure here must not abort the rest of the parse
    }
}
/**
 * Returns the first {@code div/span/p/li/td/th} element, in the order Jsoup
 * returns them, whose text contains any of the keywords (case-insensitive).
 * NOTE(review): an element's text presumably includes its descendants' text, so
 * the match is often an enclosing container — confirm against the jsoup docs.
 *
 * @param doc      parsed page
 * @param keywords terms to look for; {@code null} and empty entries are ignored
 * @return the first matching element, or {@code null} when none matches
 */
private Element firstElementContaining(Document doc, String... keywords) {
    for (Element candidate : doc.select("div, span, p, li, td, th")) {
        String content = candidate.text();
        if (content == null || content.isEmpty()) {
            continue;
        }
        String haystack = content.toLowerCase(Locale.ROOT);
        for (int i = 0; i < keywords.length; i++) {
            String keyword = keywords[i];
            if (keyword == null || keyword.isEmpty()) {
                continue;
            }
            if (haystack.contains(keyword.toLowerCase(Locale.ROOT))) {
                return candidate;
            }
        }
    }
    return null;
}
/**
 * Finds the percentage that belongs to a keyword group.
 *
 * <p>Among all {@code div/span/p/li/td/th} elements whose text mentions one of the
 * keywords and contains a percentage from 0 to 100, the element with the shortest
 * text wins and its first percentage is returned. Preferring the shortest text keeps
 * an enclosing container, which mentions every keyword and would yield the same
 * first percentage for all of them, from shadowing the specific row or cell.
 * Ties keep the element that comes first.
 *
 * @param doc      parsed page
 * @param keywords terms to look for; {@code null} and empty entries are ignored
 * @return the percentage, or {@code null} when no element qualifies
 */
private Integer percentNearKeyword(Document doc, String... keywords) {
    Integer best = null;
    int bestLength = Integer.MAX_VALUE;
    Elements candidates = doc.select("div, span, p, li, td, th");
    for (Element el : candidates) {
        String t = el.text();
        if (t == null || t.isEmpty() || t.length() >= bestLength) {
            // nothing to read, or not more specific than the current best
            continue;
        }
        String lower = t.toLowerCase(Locale.ROOT);
        boolean hit = false;
        for (String k : keywords) {
            if (k != null && !k.isEmpty() && lower.contains(k.toLowerCase(Locale.ROOT))) {
                hit = true;
                break;
            }
        }
        if (!hit) {
            continue;
        }
        Matcher m = PERCENT_PATTERN.matcher(t);
        if (m.find()) {
            Integer pct = safeParseInt(m.group(1));
            if (pct != null && pct >= 0 && pct <= 100) {
                best = pct;
                bestLength = t.length();
            }
        }
    }
    return best;
}
/** One to four consecutive digits; compiled once instead of on every call. */
private static final Pattern SHORT_INT_PATTERN = Pattern.compile("(\\d{1,4})");

/**
 * Returns a number found close to a keyword in free text.
 *
 * <p>The keywords are tried in the given order and the first one that occurs
 * anywhere in the text (case-insensitive) is used. The result is the first 1-4 digit
 * number inside the window from 40 characters before to 40 characters after the
 * start of that occurrence. This is the first number in the window, not necessarily
 * the one closest to the keyword.
 *
 * <p>Search and window both run on the lower-cased copy: lower-casing can change
 * the length of a string, so an index found there is not reliable in the original.
 *
 * @param text     text to search, may be {@code null}
 * @param keywords terms to look for; {@code null} and empty entries are ignored
 * @return the number, or {@code null} when no keyword or no number is found
 */
private Integer intNear(String text, String... keywords) {
    if (text == null || text.isEmpty()) {
        return null;
    }
    String lower = text.toLowerCase(Locale.ROOT);
    int bestIndex = -1;
    for (String k : keywords) {
        if (k == null || k.isEmpty()) {
            continue;
        }
        int idx = lower.indexOf(k.toLowerCase(Locale.ROOT));
        if (idx >= 0) {
            bestIndex = idx;
            break;
        }
    }
    if (bestIndex < 0) {
        return null;
    }
    int start = Math.max(0, bestIndex - 40);
    int end = Math.min(lower.length(), bestIndex + 40);
    Matcher m = SHORT_INT_PATTERN.matcher(lower.substring(start, end));
    if (m.find()) {
        return safeParseInt(m.group(1));
    }
    return null;
}
/**
 * Rounds a length to two decimals, half up.
 *
 * <p>Uses {@link BigDecimal#valueOf(double)} so rounding works on the value's
 * shortest decimal form. {@code new BigDecimal(double)} carries the exact binary
 * expansion, which makes 1.005 round to 1.00 instead of 1.01.
 *
 * @param v length to round; must be finite (NaN or infinity throws
 *          {@link NumberFormatException})
 * @return the rounded value
 */
private Double roundKm(double v) {
    return BigDecimal.valueOf(v).setScale(2, RoundingMode.HALF_UP).doubleValue();
}
/**
 * Applies {@code pattern} to {@code text} and parses capture group 1 of the
 * first match as a double.
 *
 * @param pattern pattern whose first capture group holds the number
 * @param text    text to search; may be {@code null}
 * @return the parsed value, or {@code null} when there is no text, no match,
 *         or the captured group cannot be parsed
 */
private Double firstDoubleFrom(Pattern pattern, String text) {
    if (text != null) {
        Matcher hit = pattern.matcher(text);
        if (hit.find()) {
            return safeParseDouble(hit.group(1));
        }
    }
    return null;
}
/**
 * Extracts the first NUMBER_PATTERN match from {@code text} and returns it as
 * a {@link BigDecimal} with scale 2 (HALF_UP).
 *
 * @param text text to search; may be {@code null}
 * @return the number, or {@code null} when there is no text, no match, or the
 *         match cannot be parsed
 */
private BigDecimal firstBigDecimal(String text) {
    if (text == null) {
        return null;
    }
    Matcher number = NUMBER_PATTERN.matcher(text);
    if (!number.find()) {
        return null;
    }
    Double parsed = safeParseDouble(number.group(1));
    return parsed == null
            ? null
            : BigDecimal.valueOf(parsed).setScale(2, RoundingMode.HALF_UP);
}
/**
 * Extracts a single price from {@code text}. A CURRENCY_FIRST_PATTERN match is
 * tried first (group 1 = currency, group 2 = amount), then a
 * CURRENCY_LAST_PATTERN match (group 1 = amount, group 2 = currency).
 *
 * @param text text to search; may be {@code null}
 * @return the first price whose amount parses, or {@code null}
 */
private Price extractPrice(String text) {
    if (text == null) {
        return null;
    }
    Matcher leading = CURRENCY_FIRST_PATTERN.matcher(text);
    if (leading.find()) {
        Double value = safeParseDouble(leading.group(2));
        if (value != null) {
            // The whole match is the fallback source when group 1 is missing or empty.
            return new Price(normalizeCurrency(leading.group(1), leading.group()), value);
        }
    }
    Matcher trailing = CURRENCY_LAST_PATTERN.matcher(text);
    if (trailing.find()) {
        Double value = safeParseDouble(trailing.group(1));
        if (value != null) {
            return new Price(normalizeCurrency(null, trailing.group(2)), value);
        }
    }
    return null;
}
/**
 * Collects up to {@code limit} prices from {@code text}: first every
 * CURRENCY_FIRST_PATTERN match, then every CURRENCY_LAST_PATTERN match.
 * Matches whose amount cannot be parsed are skipped.
 *
 * @param text  text to search; {@code null} or empty yields an empty list
 * @param limit maximum number of prices to return
 * @return a mutable list of the prices found, never {@code null}
 */
private List<Price> extractAllPrices(String text, int limit) {
    List<Price> found = new ArrayList<>();
    if (text == null || text.isEmpty()) {
        return found;
    }
    Matcher leading = CURRENCY_FIRST_PATTERN.matcher(text);
    while (leading.find() && found.size() < limit) {
        Double value = safeParseDouble(leading.group(2));
        if (value != null) {
            found.add(new Price(normalizeCurrency(leading.group(1), leading.group()), value));
        }
    }
    Matcher trailing = CURRENCY_LAST_PATTERN.matcher(text);
    while (trailing.find() && found.size() < limit) {
        Double value = safeParseDouble(trailing.group(1));
        if (value != null) {
            found.add(new Price(normalizeCurrency(null, trailing.group(2)), value));
        }
    }
    return found;
}
/**
 * Maps a raw currency token to a canonical label.
 *
 * @param group1 preferred source; used when non-null and non-empty
 * @param raw    fallback source used otherwise
 * @return "SFr." for tokens starting with "SFr", "CHF" (any case), one of the
 *         symbols "€", "$", "£" when the token contains it, otherwise the
 *         trimmed token itself; {@code null} when there is no source or the
 *         trimmed token is empty
 */
private String normalizeCurrency(String group1, String raw) {
    String source = (group1 == null || group1.isEmpty()) ? raw : group1;
    if (source == null) {
        return null;
    }
    String token = source.trim();
    if (token.startsWith("SFr")) {
        return "SFr.";
    }
    if (token.equalsIgnoreCase("CHF")) {
        return "CHF";
    }
    // Symbols are checked in the same priority order as before: euro, dollar, pound.
    for (String symbol : new String[] {"€", "$", "£"}) {
        if (token.contains(symbol)) {
            return symbol;
        }
    }
    return token.isEmpty() ? null : token;
}
/**
 * Replaces non-breaking spaces with regular spaces and trims the result.
 *
 * @param s text to clean; may be {@code null}
 * @return the cleaned text, or an empty string for {@code null} input
 */
private String cleanText(String s) {
    return s == null ? "" : s.replace('\u00A0', ' ').trim();
}
/**
 * Parses an integer from the digits contained in {@code s}; every non-digit
 * character (including signs and separators) is discarded first.
 *
 * @param s text to parse; may be {@code null}
 * @return the parsed value, or {@code null} when the input is {@code null},
 *         contains no digits, or the digits do not fit an {@code int}
 */
private Integer safeParseInt(String s) {
    if (s == null) {
        return null;
    }
    String digits = s.replaceAll("[^0-9]", "");
    if (digits.isEmpty()) {
        return null;
    }
    try {
        return Integer.parseInt(digits);
    } catch (Exception ignored) {
        // Best-effort parsing: an unparsable value is reported as "absent".
        return null;
    }
}
/**
 * Parses a non-negative decimal number from {@code s}. Commas are treated as
 * decimal points and every character other than digits and dots is dropped
 * (so a leading minus sign is lost).
 *
 * @param s text to parse; may be {@code null}
 * @return the parsed value, or {@code null} when the input is {@code null},
 *         nothing numeric remains, or the remainder is not a valid double
 */
private Double safeParseDouble(String s) {
    if (s == null) {
        return null;
    }
    String numeric = s.trim().replace(",", ".").replaceAll("[^0-9.]", "");
    if (numeric.isEmpty()) {
        return null;
    }
    try {
        return Double.parseDouble(numeric);
    } catch (Exception ignored) {
        // Best-effort parsing: e.g. "1.234.56" is reported as "absent".
        return null;
    }
}
/** Immutable pairing of a currency label and an amount extracted from page text. */
private static class Price {
    /** Currency label as produced by the extractor; may be {@code null}. */
    private final String currency;
    /** Parsed amount. */
    private final Double amount;

    private Price(String currency, Double amount) {
        this.amount = amount;
        this.currency = currency;
    }
}
}

34
src/main/java/com/ski/crawler/parser/ResortParser.java

@ -0,0 +1,34 @@
package com.ski.crawler.parser;
import com.ski.crawler.model.SkiResort;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.time.LocalDateTime;
/**
 * Minimal resort page parser. It parses the HTML with Jsoup and fills a
 * {@link SkiResort} with the page title (as name), the source URL, the crawl
 * timestamp and, when present, the {@code country} and {@code region} meta tags.
 */
public class ResortParser {

    /**
     * Builds a {@link SkiResort} from a page's HTML.
     *
     * @param html      page markup handed to {@code Jsoup.parse}
     * @param sourceUrl URL stored on the resort as its source
     * @return a resort named after the page title, or "UNKNOWN" when the title is empty
     */
    public SkiResort parseResort(String html, String sourceUrl) {
        Document page = Jsoup.parse(html);
        String pageTitle = page.title();
        boolean hasTitle = pageTitle != null && !pageTitle.isEmpty();

        SkiResort result = new SkiResort();
        result.setName(hasTitle ? pageTitle : "UNKNOWN");
        result.setSourceUrl(sourceUrl);
        result.setCrawledAt(LocalDateTime.now());

        String country = metaContent(page, "meta[name=country]");
        if (country != null) {
            result.setCountry(country);
        }
        String region = metaContent(page, "meta[name=region]");
        if (region != null) {
            result.setRegion(region);
        }
        return result;
    }

    /** Returns the {@code content} attribute of the first element matching the selector, or null if none matches. */
    private static String metaContent(Document page, String selector) {
        Element meta = page.selectFirst(selector);
        return meta == null ? null : meta.attr("content");
    }
}

74
src/main/java/com/ski/crawler/repository/SkiResortRepository.java

@ -0,0 +1,74 @@
package com.ski.crawler.repository;
import com.ski.crawler.model.SkiResort;
import com.ski.crawler.util.ValidationUtil;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.LongAdder;
/**
 * In-memory store of crawled resorts keyed by their trimmed source URL.
 * Insertion order is preserved. All access to the backing map goes through
 * {@code synchronized} methods.
 */
public class SkiResortRepository {
    private final Map<String, SkiResort> byUrl = new LinkedHashMap<>();

    /**
     * @param url URL to look up; surrounding whitespace is ignored
     * @return {@code true} when a resort with this URL is stored; {@code false}
     *         for a null or blank URL
     */
    public synchronized boolean containsUrl(String url) {
        if (url == null || url.trim().isEmpty()) {
            return false;
        }
        return byUrl.containsKey(url.trim());
    }

    /**
     * Cleans and validates the resort via {@code ValidationUtil}, then stores it
     * unless a resort with the same source URL already exists.
     *
     * @param resort resort to add; {@code null} is rejected
     * @return {@code true} when stored, {@code false} for null input or a duplicate URL
     */
    public synchronized boolean add(SkiResort resort) {
        if (resort == null) {
            return false;
        }
        SkiResort cleaned = ValidationUtil.clean(resort);
        ValidationUtil.validate(cleaned);
        String url = cleaned.getSourceUrl().trim();
        if (byUrl.containsKey(url)) {
            return false;
        }
        byUrl.put(url, cleaned);
        return true;
    }

    /** @return an unmodifiable snapshot of all stored resorts in insertion order */
    public synchronized List<SkiResort> getAll() {
        return Collections.unmodifiableList(new ArrayList<>(byUrl.values()));
    }

    /**
     * Returns the resorts whose normalized country key contains the normalized keyword.
     *
     * @param keyword country keyword; when it normalizes to empty, all resorts are returned
     * @return an unmodifiable list of matching resorts
     */
    public synchronized List<SkiResort> filterByCountry(String keyword) {
        String k = ValidationUtil.normalizeCountryKey(keyword);
        if (k.isEmpty()) {
            return getAll();
        }
        List<SkiResort> out = new ArrayList<>();
        for (SkiResort r : byUrl.values()) {
            String c = ValidationUtil.normalizeCountryKey(r.getCountry());
            // contains() also covers the exact-match case.
            if (!c.isEmpty() && c.contains(k)) {
                out.add(r);
            }
        }
        return Collections.unmodifiableList(out);
    }

    /**
     * Counts stored resorts per country label (non-breaking spaces replaced,
     * trimmed). Resorts without a usable country are skipped.
     *
     * @return counts keyed by country, in order of first appearance
     */
    public Map<String, Long> countByCountry() {
        // Single-threaded aggregation over a snapshot: a plain ordered map is
        // enough and gives a deterministic iteration order.
        Map<String, Long> counts = new LinkedHashMap<>();
        for (SkiResort r : getAll()) {
            String c = r.getCountry();
            if (c == null) {
                continue;
            }
            // Normalize before the emptiness check so an NBSP-only country is
            // not counted under an empty key.
            String key = c.replace('\u00A0', ' ').trim();
            if (key.isEmpty()) {
                continue;
            }
            counts.merge(key, 1L, Long::sum);
        }
        return counts;
    }
}

243
src/main/java/com/ski/crawler/service/ScraperService.java

@ -0,0 +1,243 @@
package com.ski.crawler.service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.ski.crawler.exception.NetworkException;
import com.ski.crawler.exception.ParseException;
import com.ski.crawler.model.SkiResort;
import com.ski.crawler.repository.SkiResortRepository;
import com.ski.crawler.strategy.CrawlStrategy;
import com.ski.crawler.util.JsonUtil;
import com.ski.crawler.util.RetryUtil;
import com.ski.crawler.util.ValidationUtil;
import com.ski.crawler.utils.CrawlerHttp;
import com.ski.crawler.view.ConsoleView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.LongAdder;
/**
 * Runs one crawl: collects detail URLs through a {@link CrawlStrategy}, fetches
 * and parses them on a fixed-size thread pool, optionally writes each accepted
 * resort as a JSON line, and returns aggregated counters as a {@link CrawlReport}.
 */
public class ScraperService {
    private static final Logger log = LoggerFactory.getLogger(ScraperService.class);
    /** Maximum number of failure lines kept for the report. */
    private static final int MAX_REPORTED_FAILURES = 200;

    /**
     * Executes a crawl.
     *
     * @param strategy      site strategy used to collect URLs and parse detail pages
     * @param startUrl      list page to start from; null/empty falls back to the strategy default
     * @param limit         passed to the strategy when collecting detail URLs
     * @param threads       number of worker threads (values below 1 are treated as 1)
     * @param countryFilter only resorts whose normalized country contains this key are printed/written;
     *                      filtered-out resorts are still stored in the repository
     * @param http          HTTP helper used to download pages
     * @param repo          repository receiving parsed resorts (untouched when {@code dryRun})
     * @param incremental   when true, URLs already present in {@code repo} are skipped
     * @param outPath       JSONL output file; null/blank disables file output
     * @param view          console view used to print the header and each accepted resort
     * @param showFailures  when true, up to 200 failure lines are collected for the report
     * @param dryRun        when true, nothing is added to {@code repo} and nothing is written to the file
     * @param retryAttempts retry attempts passed to {@code RetryUtil}
     * @param retrySleepMs  retry sleep passed to {@code RetryUtil}
     * @return counters and per-country statistics for this run
     * @throws NetworkException when URL collection fails or the output file cannot be opened
     */
    public CrawlReport crawl(
            CrawlStrategy strategy,
            String startUrl,
            int limit,
            int threads,
            String countryFilter,
            CrawlerHttp http,
            SkiResortRepository repo,
            boolean incremental,
            String outPath,
            ConsoleView view,
            boolean showFailures,
            boolean dryRun,
            int retryAttempts,
            long retrySleepMs
    ) throws NetworkException {
        String actualStartUrl = (startUrl == null || startUrl.isEmpty()) ? strategy.defaultStartUrl() : startUrl;
        List<String> urls;
        try {
            urls = RetryUtil.retry(() -> strategy.collectDetailUrls(actualStartUrl, limit, http), retryAttempts, retrySleepMs);
        } catch (NetworkException e) {
            throw e;
        } catch (Exception e) {
            throw new NetworkException("Collect urls failed: " + e.getMessage(), e);
        }
        // Loop-invariant: normalize the filter once instead of once per URL.
        final String filter = ValidationUtil.normalizeCountryKey(countryFilter);
        int total = urls.size();
        Queue<String> queue = new ConcurrentLinkedQueue<>(urls);
        AtomicInteger done = new AtomicInteger(0);
        AtomicInteger success = new AtomicInteger(0);
        AtomicInteger skipped = new AtomicInteger(0);
        AtomicInteger failed = new AtomicInteger(0);
        AtomicInteger filteredOut = new AtomicInteger(0);
        Map<String, LongAdder> byCountry = new ConcurrentHashMap<>();
        List<String> failures = Collections.synchronizedList(new ArrayList<>());
        Map<String, Boolean> seenThisRun = new ConcurrentHashMap<>();
        Object outLock = new Object();
        BufferedWriter outWriter = null;
        ObjectMapper mapper = null;
        if (outPath != null && !outPath.trim().isEmpty()) {
            try {
                outWriter = JsonUtil.openJsonlWriter(outPath.trim());
                mapper = JsonUtil.mapper();
            } catch (Exception e) {
                throw new NetworkException("Open out file failed: " + e.getMessage(), e);
            }
        }
        final BufferedWriter outWriterFinal = outWriter;
        final ObjectMapper mapperFinal = mapper;
        try {
            view.printHeader();
            int workerCount = Math.max(1, threads);
            ExecutorService pool = Executors.newFixedThreadPool(workerCount);
            for (int i = 0; i < workerCount; i++) {
                pool.submit(() -> {
                    while (true) {
                        String url = queue.poll();
                        if (url == null) {
                            return;
                        }
                        try {
                            if (incremental) {
                                if (repo.containsUrl(url)) {
                                    skipped.incrementAndGet();
                                    continue;
                                }
                                // In a dry run nothing reaches the repository, so
                                // duplicates within this run are tracked separately.
                                if (dryRun && seenThisRun.putIfAbsent(url, Boolean.TRUE) != null) {
                                    skipped.incrementAndGet();
                                    continue;
                                }
                            }
                            String html = RetryUtil.retry(() -> http.getHtml(url), retryAttempts, retrySleepMs);
                            SkiResort resort = strategy.parseDetail(url, html);
                            resort.setSourceSite(strategy.id());
                            resort.setSourceUrl(url);
                            SkiResort cleaned = ValidationUtil.clean(resort);
                            ValidationUtil.validate(cleaned);
                            success.incrementAndGet();
                            String country = cleaned.getCountry();
                            if (country != null) {
                                // Normalize first so an NBSP-only country is not counted under "".
                                String key = country.replace('\u00A0', ' ').trim();
                                if (!key.isEmpty()) {
                                    byCountry.computeIfAbsent(key, k -> new LongAdder()).increment();
                                }
                            }
                            if (!filter.isEmpty()) {
                                String c = ValidationUtil.normalizeCountryKey(cleaned.getCountry());
                                if (c.isEmpty() || !c.contains(filter)) {
                                    filteredOut.incrementAndGet();
                                    // Filtered-out resorts are still stored so later
                                    // incremental runs can skip them.
                                    if (!dryRun) {
                                        repo.add(cleaned);
                                    }
                                    continue;
                                }
                            }
                            // Console output and file output are serialized together so
                            // lines from different workers do not interleave.
                            synchronized (outLock) {
                                view.printResort(cleaned);
                                if (!dryRun && outWriterFinal != null && mapperFinal != null) {
                                    outWriterFinal.write(mapperFinal.writeValueAsString(toJson(cleaned)));
                                    outWriterFinal.newLine();
                                }
                            }
                            if (!dryRun) {
                                repo.add(cleaned);
                            }
                        } catch (ParseException e) {
                            failed.incrementAndGet();
                            recordFailure(failures, showFailures, url + " [ParseException] " + safeMsg(e.getMessage()));
                            log.error("Parse failed: {}", url, e);
                        } catch (Exception e) {
                            failed.incrementAndGet();
                            recordFailure(failures, showFailures,
                                    url + " [" + e.getClass().getSimpleName() + "] " + safeMsg(e.getMessage()));
                            log.error("Crawl failed: {}", url, e);
                        } finally {
                            int finished = done.incrementAndGet();
                            log.info("{}/{} success={} skipped={} failed={}", finished, total, success.get(), skipped.get(), failed.get());
                        }
                    }
                });
            }
            pool.shutdown();
            try {
                pool.awaitTermination(7, TimeUnit.DAYS);
            } catch (InterruptedException e) {
                // Stop handing out work and interrupt in-flight workers before the
                // writer is closed below; then restore the interrupt flag.
                queue.clear();
                pool.shutdownNow();
                Thread.currentThread().interrupt();
            }
        } finally {
            // Always release the output file, even when setup above fails.
            if (outWriterFinal != null) {
                try (BufferedWriter w = outWriterFinal) {
                    w.flush();
                } catch (Exception e) {
                    log.warn("Close out file failed: {}", outPath, e);
                }
            }
        }
        Map<String, Long> byCountryOut = new LinkedHashMap<>();
        for (Map.Entry<String, LongAdder> e : byCountry.entrySet()) {
            byCountryOut.put(e.getKey(), e.getValue().sum());
        }
        CrawlReport report = new CrawlReport();
        report.site = strategy.id();
        report.total = total;
        report.success = success.get();
        report.filteredOut = filteredOut.get();
        report.skipped = skipped.get();
        report.failed = failed.get();
        report.byCountry = byCountryOut;
        report.failures = new ArrayList<>(failures);
        return report;
    }

    /**
     * Appends a failure line when failure reporting is enabled and the cap has
     * not been reached. The size check and the add happen under the list's own
     * monitor so concurrent workers cannot exceed the cap.
     */
    private void recordFailure(List<String> failures, boolean showFailures, String line) {
        if (!showFailures) {
            return;
        }
        synchronized (failures) {
            if (failures.size() < MAX_REPORTED_FAILURES) {
                failures.add(line);
            }
        }
    }

    /** Flattens a resort into the ordered key/value map that is serialized as one JSON line. */
    private Map<String, Object> toJson(SkiResort r) {
        Map<String, Object> obj = new LinkedHashMap<>();
        obj.put("id", r.getId());
        obj.put("name", r.getName());
        obj.put("country", r.getCountry());
        obj.put("region", r.getRegion());
        obj.put("latitude", r.getLatitude());
        obj.put("longitude", r.getLongitude());
        obj.put("altitudeMin", r.getAltitudeMin());
        obj.put("altitudeMax", r.getAltitudeMax());
        obj.put("totalKm", r.getTotalKm());
        obj.put("slopeCount", r.getSlopeCount());
        obj.put("liftCount", r.getLiftCount());
        obj.put("ticketPriceMin", r.getTicketPriceMin());
        obj.put("ticketPriceMax", r.getTicketPriceMax());
        obj.put("currency", r.getCurrency());
        obj.put("openTime", r.getOpenTime());
        obj.put("snowDepthCm", r.getSnowDepthCm());
        obj.put("temperatureC", r.getTemperatureC());
        obj.put("nearbyHotels", r.getNearbyHotels());
        obj.put("rentalShops", r.getRentalShops());
        obj.put("url", r.getSourceUrl());
        obj.put("sourceSite", r.getSourceSite());
        obj.put("crawlTime", r.getCrawledAt() == null ? null : r.getCrawledAt().toString());
        return obj;
    }

    /** Collapses a possibly-null message onto a single trimmed line. */
    private String safeMsg(String s) {
        return s == null ? "" : s.replace('\n', ' ').replace('\r', ' ').trim();
    }

    /** Result of one crawl run; plain public fields filled by {@link #crawl}. */
    public static class CrawlReport {
        /** Strategy id of the crawled site. */
        public String site;
        /** Number of detail URLs collected. */
        public int total;
        /** Number of pages fetched, parsed and validated successfully. */
        public int success;
        /** Number of successful resorts rejected by the country filter. */
        public int filteredOut;
        /** Number of URLs skipped as already known. */
        public int skipped;
        /** Number of URLs that failed. */
        public int failed;
        /** Successful resorts counted per country label. */
        public Map<String, Long> byCountry;
        /** Collected failure lines (empty unless failure reporting was enabled). */
        public List<String> failures;
    }
}

22
src/main/java/com/ski/crawler/site/CrawlerSite.java

@ -0,0 +1,22 @@
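// Site abstraction: each site implements two things, list collection and detail parsing.
// Every site must implement the following methods:
// id(): returns the site's unique identifier, used on the command line to select the site to crawl.
// defaultStartUrl(): returns the site's default start URL for crawling.
// collectDetailUrls(): collects the site's detail-page URLs and returns them as a list of strings.
// parseDetail(): parses a detail page's HTML and returns a SkiResort instance.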
package com.ski.crawler.site;
import com.ski.crawler.model.SkiResort;
import com.ski.crawler.utils.CrawlerHttp;
import java.util.List;
/**
 * Contract for one crawlable site: collecting detail-page URLs from a list
 * page and parsing a detail page into a {@link SkiResort}.
 */
public interface CrawlerSite {
/** Returns the identifier of this site (the implementations in this package return a short lowercase name). */
String id();
/** Returns the URL used as the crawl starting point when the caller supplies none. */
String defaultStartUrl();
/**
 * Collects detail-page URLs starting from {@code startUrl}.
 *
 * @param startUrl page to start collecting from
 * @param limit    maximum number of URLs; the implementations in this package treat a non-positive value as "no limit"
 * @param http     HTTP helper used to download pages
 * @return the collected detail-page URLs
 * @throws Exception when collection fails
 */
List<String> collectDetailUrls(String startUrl, int limit, CrawlerHttp http) throws Exception;
/**
 * Parses the HTML of one detail page.
 *
 * @param sourceUrl URL the HTML was downloaded from
 * @param html      page markup
 * @return the parsed resort
 * @throws Exception when parsing fails
 */
SkiResort parseDetail(String sourceUrl, String html) throws Exception;
}

194
src/main/java/com/ski/crawler/site/SkimapOrgSite.java

@ -0,0 +1,194 @@
package com.ski.crawler.site;
import com.ski.crawler.model.SkiResort;
import com.ski.crawler.utils.CrawlerHttp;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * {@link CrawlerSite} for skimap.org: follows list pagination collecting
 * "/skiareas/view/" links and extracts name, breadcrumb location and
 * coordinates from a detail page.
 */
public class SkimapOrgSite implements CrawlerSite {
    /** "lat, lon" pair separated by a comma. */
    private static final Pattern LAT_LON_PATTERN = Pattern.compile("(-?\\d+(?:\\.\\d+)?)\\s*,\\s*(-?\\d+(?:\\.\\d+)?)");
    /** "lat;lon" pair as used by the geo.position meta tag; a comma is tolerated as well. */
    private static final Pattern GEO_POSITION_PATTERN = Pattern.compile("(-?\\d+(?:\\.\\d+)?)\\s*[;,]\\s*(-?\\d+(?:\\.\\d+)?)");

    @Override
    public String id() {
        return "skimap";
    }

    @Override
    public String defaultStartUrl() {
        return "https://skimap.org";
    }

    /**
     * Walks list pages via their "next" links and gathers absolute detail URLs
     * in discovery order without duplicates. A start URL that already is a
     * detail page is returned as the only result.
     *
     * @param limit maximum number of URLs; a non-positive value means no limit
     */
    @Override
    public List<String> collectDetailUrls(String startUrl, int limit, CrawlerHttp http) {
        Set<String> out = new LinkedHashSet<>();
        Set<String> visited = new LinkedHashSet<>();
        String page = startUrl;
        while (page != null && !page.isEmpty() && !visited.contains(page)) {
            visited.add(page);
            if (page.toLowerCase(Locale.ROOT).contains("/skiareas/view/")) {
                out.add(page);
                break;
            }
            Document doc = http.getDocument(page);
            for (Element a : doc.select("a[href]")) {
                String href = a.attr("href");
                if (href == null || href.isEmpty()) {
                    continue;
                }
                String abs = a.absUrl("href");
                if (abs == null || abs.isEmpty()) {
                    continue;
                }
                if (!abs.toLowerCase(Locale.ROOT).contains("/skiareas/view/")) {
                    continue;
                }
                out.add(abs);
                if (limit > 0 && out.size() >= limit) {
                    return new ArrayList<>(out);
                }
            }
            String next = findNext(doc);
            page = (next != null && !visited.contains(next)) ? next : null;
        }
        return new ArrayList<>(out);
    }

    /**
     * Parses a detail page: name from the first h1 (falling back to og:title),
     * country from the last breadcrumb link, region from the one before it,
     * and coordinates when they can be found.
     */
    @Override
    public SkiResort parseDetail(String sourceUrl, String html) {
        Document doc = org.jsoup.Jsoup.parse(html, sourceUrl);
        SkiResort resort = new SkiResort();
        String name = null;
        Element h1 = doc.selectFirst("h1");
        if (h1 != null) {
            name = clean(h1.text());
        }
        if (name == null || name.isEmpty()) {
            Element ogTitle = doc.selectFirst("meta[property=og:title]");
            if (ogTitle != null) {
                name = clean(ogTitle.attr("content"));
            }
        }
        if (name != null && !name.isEmpty()) {
            resort.setName(name);
        }
        List<String> crumbs = new ArrayList<>();
        for (Element a : doc.select(".breadcrumb a, nav.breadcrumb a, ol.breadcrumb a, ul.breadcrumb a")) {
            String t = clean(a.text());
            if (!t.isEmpty()) {
                crumbs.add(t);
            }
        }
        if (crumbs.size() >= 1) {
            resort.setCountry(crumbs.get(crumbs.size() - 1));
        }
        if (crumbs.size() >= 2) {
            resort.setRegion(crumbs.get(crumbs.size() - 2));
        }
        Double[] latLon = extractLatLon(doc);
        if (latLon != null) {
            resort.setLatitude(latLon[0]);
            resort.setLongitude(latLon[1]);
        }
        return resort;
    }

    /** Returns the absolute URL of the "next page" link, or null when there is none. */
    private String findNext(Document doc) {
        Element e = doc.selectFirst("a[rel=next], a.next, li.pagination-next a, a[aria-label=Next]");
        if (e != null) {
            String abs = e.absUrl("href");
            return abs == null || abs.isEmpty() ? null : abs;
        }
        for (Element a : doc.select("a[href]")) {
            String t = clean(a.text()).toLowerCase(Locale.ROOT);
            if (t.contains("next")) {
                String abs = a.absUrl("href");
                if (abs != null && !abs.isEmpty()) {
                    return abs;
                }
            }
        }
        return null;
    }

    /**
     * Looks for coordinates in, in order: the place:location latitude/longitude
     * meta pair, the geo.position meta tag ("lat;lon"), the latitude meta
     * content read as a "lat, lon" pair, and finally the page text.
     *
     * @return {@code {lat, lon}} or null when nothing usable is found
     */
    private Double[] extractLatLon(Document doc) {
        // The two meta kinds are selected separately so a geo.position tag that
        // appears first in the document cannot shadow the latitude tag.
        Element metaLat = doc.selectFirst("meta[property=place:location:latitude]");
        Element metaLon = doc.selectFirst("meta[property=place:location:longitude]");
        if (metaLat != null && metaLon != null) {
            Double lat = safeParseDouble(metaLat.attr("content"));
            Double lon = safeParseDouble(metaLon.attr("content"));
            if (lat != null && lon != null) {
                return new Double[]{lat, lon};
            }
        }
        Element geoPosition = doc.selectFirst("meta[name=geo.position]");
        if (geoPosition != null) {
            Double[] ll = parseLatLon(GEO_POSITION_PATTERN, geoPosition.attr("content"));
            if (ll != null) {
                return ll;
            }
        }
        if (metaLat != null) {
            Double[] ll = parseLatLon(metaLat.attr("content"));
            if (ll != null) {
                return ll;
            }
        }
        return parseLatLon(doc.text());
    }

    /** Returns the first in-range "lat, lon" pair found in the text, or null. */
    private Double[] parseLatLon(String text) {
        return parseLatLon(LAT_LON_PATTERN, text);
    }

    /**
     * Returns the first pair matched by {@code pattern} whose latitude lies in
     * [-90, 90] and longitude in [-180, 180]; out-of-range pairs are skipped.
     */
    private Double[] parseLatLon(Pattern pattern, String text) {
        if (text == null || text.isEmpty()) {
            return null;
        }
        Matcher m = pattern.matcher(text);
        while (m.find()) {
            Double lat = safeParseDouble(m.group(1));
            Double lon = safeParseDouble(m.group(2));
            if (lat == null || lon == null) {
                continue;
            }
            if (lat < -90 || lat > 90 || lon < -180 || lon > 180) {
                continue;
            }
            return new Double[]{lat, lon};
        }
        return null;
    }

    /** Replaces non-breaking spaces and trims; null becomes an empty string. */
    private String clean(String s) {
        if (s == null) {
            return "";
        }
        return s.replace('\u00A0', ' ').trim();
    }

    /** Lenient double parser: commas become dots, other non-numeric characters are dropped; null on failure. */
    private Double safeParseDouble(String s) {
        try {
            if (s == null) {
                return null;
            }
            String t = s.trim().replace(",", ".");
            t = t.replaceAll("[^0-9.\\-]", "");
            if (t.isEmpty()) {
                return null;
            }
            return Double.parseDouble(t);
        } catch (Exception e) {
            return null;
        }
    }
}

33
src/main/java/com/ski/crawler/site/SkiresortInfoSite.java

@ -0,0 +1,33 @@
package com.ski.crawler.site;
import com.ski.crawler.model.SkiResort;
import com.ski.crawler.parser.ResortDetailParser;
import com.ski.crawler.spider.ResortListSpider;
import com.ski.crawler.utils.CrawlerHttp;
import java.util.List;
/**
 * {@link CrawlerSite} for skiresort.info: URL collection is delegated to
 * {@link ResortListSpider}, detail parsing to {@link ResortDetailParser}.
 */
public class SkiresortInfoSite implements CrawlerSite {
    private final ResortDetailParser detailParser = new ResortDetailParser();

    @Override
    public String id() {
        return "skiresort";
    }

    @Override
    public String defaultStartUrl() {
        return "https://www.skiresort.info/ski-resorts/";
    }

    /**
     * Collects detail URLs with a fresh spider.
     *
     * @param limit maximum number of URLs; a non-positive value collects all
     */
    @Override
    public List<String> collectDetailUrls(String startUrl, int limit, CrawlerHttp http) throws Exception {
        ResortListSpider spider = new ResortListSpider(http);
        if (limit > 0) {
            return spider.fetchFirst(startUrl, limit);
        }
        return spider.fetchAll(startUrl);
    }

    /** Parses the detail HTML; {@code sourceUrl} is not used by this implementation. */
    @Override
    public SkiResort parseDetail(String sourceUrl, String html) throws Exception {
        return detailParser.parse(html);
    }
}

204
src/main/java/com/ski/crawler/site/WikipediaSite.java

@ -0,0 +1,204 @@
package com.ski.crawler.site;
import com.ski.crawler.model.SkiResort;
import com.ski.crawler.utils.CrawlerHttp;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * {@link CrawlerSite} for Wikipedia: collects article links from a list page
 * and extracts name, infobox country/location and coordinates from an article.
 */
public class WikipediaSite implements CrawlerSite {
    /** "lat; lon" pair. */
    private static final Pattern GEO_SEMI_PATTERN = Pattern.compile("(-?\\d+(?:\\.\\d+)?)\\s*;\\s*(-?\\d+(?:\\.\\d+)?)");
    /** "lat, lon" pair. */
    private static final Pattern GEO_COMMA_PATTERN = Pattern.compile("(-?\\d+(?:\\.\\d+)?)\\s*,\\s*(-?\\d+(?:\\.\\d+)?)");

    @Override
    public String id() {
        return "wikipedia";
    }

    @Override
    public String defaultStartUrl() {
        return "https://en.wikipedia.org/wiki/List_of_ski_areas_and_resorts";
    }

    /**
     * Collects absolute article URLs from the content area of {@code startUrl}.
     * Only "/wiki/" links without a colon are kept, "list_of_" pages are
     * excluded, and fragments are removed so each article appears once.
     *
     * @param limit maximum number of URLs; a non-positive value means no limit
     */
    @Override
    public List<String> collectDetailUrls(String startUrl, int limit, CrawlerHttp http) {
        Document doc = http.getDocument(startUrl);
        Element content = doc.selectFirst("#mw-content-text");
        if (content == null) {
            content = doc.body();
        }
        Set<String> out = new LinkedHashSet<>();
        for (Element a : content.select("a[href]")) {
            String href = a.attr("href");
            if (href == null || href.isEmpty()) {
                continue;
            }
            if (!href.startsWith("/wiki/")) {
                continue;
            }
            if (href.contains(":")) {
                continue;
            }
            String abs = a.absUrl("href");
            if (abs == null || abs.isEmpty()) {
                continue;
            }
            // Strip the fragment from the resolved URL itself; absUrl re-reads the
            // attribute, so trimming the local href copy would have no effect.
            int hash = abs.indexOf('#');
            if (hash >= 0) {
                abs = abs.substring(0, hash);
                if (abs.isEmpty()) {
                    continue;
                }
            }
            String lower = abs.toLowerCase(Locale.ROOT);
            if (lower.contains("list_of_")) {
                continue;
            }
            out.add(abs);
            if (limit > 0 && out.size() >= limit) {
                break;
            }
        }
        return new ArrayList<>(out);
    }

    /**
     * Parses an article: name from #firstHeading (or the first h1), country and
     * region from the infobox rows "Country" and "Location", plus coordinates.
     */
    @Override
    public SkiResort parseDetail(String sourceUrl, String html) {
        Document doc = org.jsoup.Jsoup.parse(html, sourceUrl);
        SkiResort resort = new SkiResort();
        Element h1 = doc.selectFirst("#firstHeading");
        if (h1 == null) {
            h1 = doc.selectFirst("h1");
        }
        if (h1 != null) {
            String name = clean(h1.text());
            if (!name.isEmpty()) {
                resort.setName(name);
            }
        }
        Element infobox = doc.selectFirst("table.infobox");
        if (infobox != null) {
            String country = extractInfoboxValue(infobox, "Country");
            if (country != null && !country.isEmpty()) {
                resort.setCountry(country);
            }
            String region = extractInfoboxValue(infobox, "Location");
            if (region != null && !region.isEmpty()) {
                resort.setRegion(region);
            }
        }
        Double[] latLon = extractLatLon(doc);
        if (latLon != null) {
            resort.setLatitude(latLon[0]);
            resort.setLongitude(latLon[1]);
        }
        return resort;
    }

    /**
     * Looks for coordinates in span.geo-dec, then span.geo, then the
     * place:location / geo.position meta tags.
     *
     * @return {@code {lat, lon}} or null when nothing usable is found
     */
    private Double[] extractLatLon(Document doc) {
        Element geoDec = doc.selectFirst("span.geo-dec");
        if (geoDec != null) {
            Double[] ll = parseLatLon(geoDec.text());
            if (ll != null) {
                return ll;
            }
        }
        Element geo = doc.selectFirst("span.geo");
        if (geo != null) {
            Double[] ll = parseLatLon(geo.text());
            if (ll != null) {
                return ll;
            }
        }
        Element metaLat = doc.selectFirst("meta[property=place:location:latitude], meta[name=geo.position]");
        Element metaLon = doc.selectFirst("meta[property=place:location:longitude]");
        if (metaLat != null && metaLon != null) {
            Double lat = safeParseDouble(metaLat.attr("content"));
            Double lon = safeParseDouble(metaLon.attr("content"));
            if (lat != null && lon != null) {
                return new Double[]{lat, lon};
            }
        }
        if (metaLat != null) {
            Double[] ll = parseLatLon(metaLat.attr("content"));
            if (ll != null) {
                return ll;
            }
        }
        return null;
    }

    /** Parses the first "lat; lon" pair, or failing that the first "lat, lon" pair; null when neither is present. */
    private Double[] parseLatLon(String text) {
        if (text == null || text.isEmpty()) {
            return null;
        }
        Matcher m1 = GEO_SEMI_PATTERN.matcher(text);
        if (m1.find()) {
            Double lat = safeParseDouble(m1.group(1));
            Double lon = safeParseDouble(m1.group(2));
            if (lat != null && lon != null) {
                return new Double[]{lat, lon};
            }
        }
        Matcher m2 = GEO_COMMA_PATTERN.matcher(text);
        if (m2.find()) {
            Double lat = safeParseDouble(m2.group(1));
            Double lon = safeParseDouble(m2.group(2));
            if (lat != null && lon != null) {
                return new Double[]{lat, lon};
            }
        }
        return null;
    }

    /**
     * Returns the cleaned cell text of the first infobox row whose header
     * equals {@code header} (case-insensitive), or null when the row is
     * missing or its cell is empty.
     */
    private String extractInfoboxValue(Element infobox, String header) {
        for (Element row : infobox.select("tr")) {
            Element th = row.selectFirst("th");
            Element td = row.selectFirst("td");
            if (th == null || td == null) {
                continue;
            }
            String key = clean(th.text());
            if (!header.equalsIgnoreCase(key)) {
                continue;
            }
            String value = clean(td.text());
            if (value.isEmpty()) {
                return null;
            }
            return value;
        }
        return null;
    }

    /** Replaces non-breaking spaces and trims; null becomes an empty string. */
    private String clean(String s) {
        if (s == null) {
            return "";
        }
        return s.replace('\u00A0', ' ').trim();
    }

    /** Lenient double parser: commas become dots, other non-numeric characters are dropped; null on failure. */
    private Double safeParseDouble(String s) {
        try {
            if (s == null) {
                return null;
            }
            String t = s.trim().replace(",", ".");
            t = t.replaceAll("[^0-9.\\-]", "");
            if (t.isEmpty()) {
                return null;
            }
            return Double.parseDouble(t);
        } catch (Exception e) {
            return null;
        }
    }
}

98
src/main/java/com/ski/crawler/spider/ResortListSpider.java

@ -0,0 +1,98 @@
// Collects the detail-page URLs of all ski resorts.
package com.ski.crawler.spider;
import com.ski.crawler.utils.CrawlerHttp;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import java.util.Set;
/**
 * List-page collector for skiresort.info. Starting from a list page it
 * downloads each page through {@link CrawlerHttp}, gathers links that point to
 * "/ski-resort/" detail pages, and follows the "next" link until there is none,
 * a page repeats, or the requested limit is reached.
 */
public class ResortListSpider {
    private final CrawlerHttp http;
    private final Random random = new Random();
    /** Detail URLs gathered by the most recent fetch. */
    private final LinkedList<String> queue = new LinkedList<>();

    public ResortListSpider(CrawlerHttp http) {
        this.http = http;
    }

    /** Collects every detail URL reachable from {@code startUrl} (no limit). */
    public List<String> fetchAll(String startUrl) throws IOException, InterruptedException {
        return fetchFirst(startUrl, -1);
    }

    /**
     * Collects detail URLs in discovery order, without duplicates.
     *
     * @param startUrl first list page
     * @param limit    maximum number of URLs; a non-positive value means no limit
     * @return a new list holding the collected URLs
     * @throws InterruptedException when interrupted during the pause between pages
     */
    public List<String> fetchFirst(String startUrl, int limit) throws IOException, InterruptedException {
        Set<String> visitedPages = new HashSet<>();
        // Insertion-ordered so a limited result is deterministic.
        Set<String> detailUrls = new java.util.LinkedHashSet<>();
        String page = startUrl;
        while (page != null && !visitedPages.contains(page)) {
            visitedPages.add(page);
            Document doc = http.getDocument(page);
            Elements links = doc.select("a[href]");
            for (Element a : links) {
                String href = a.attr("href");
                if (href == null || href.isEmpty()) {
                    continue;
                }
                if (href.startsWith("/ski-resort/") || href.startsWith("https://www.skiresort.info/ski-resort/")) {
                    String abs = a.absUrl("href");
                    if (!abs.isEmpty()) {
                        detailUrls.add(abs);
                        if (limit > 0 && detailUrls.size() >= limit) {
                            break;
                        }
                    }
                }
            }
            if (limit > 0 && detailUrls.size() >= limit) {
                break;
            }
            String next = findNext(doc);
            page = (next != null && !visitedPages.contains(next)) ? next : null;
            // Pause 2-4 s only when another page is actually going to be fetched.
            if (page != null) {
                Thread.sleep(2000 + random.nextInt(2001));
            }
        }
        queue.clear();
        queue.addAll(detailUrls);
        return new LinkedList<>(queue);
    }

    /** Returns the absolute URL of the "next page" link, or null when there is none or it cannot be resolved. */
    private String findNext(Document doc) {
        Element e = doc.selectFirst("a[rel=next], a.next, li.pagination-next a");
        if (e != null) {
            String abs = e.absUrl("href");
            // An unresolvable href yields "", which must not be fetched as a page.
            return abs.isEmpty() ? null : abs;
        }
        for (Element a : doc.select("a[href]")) {
            String t = a.text().toLowerCase(java.util.Locale.ROOT);
            if (t.contains("next") || t.contains("下一页") || t.contains("weiter")) {
                String abs = a.absUrl("href");
                if (!abs.isEmpty()) {
                    return abs;
                }
            }
        }
        return null;
    }

    /** Returns a copy of the URLs gathered by the most recent fetch. */
    public LinkedList<String> getQueue() {
        return new LinkedList<>(queue);
    }
}

19
src/main/java/com/ski/crawler/strategy/CrawlStrategy.java

@ -0,0 +1,19 @@
package com.ski.crawler.strategy;
import com.ski.crawler.exception.NetworkException;
import com.ski.crawler.exception.ParseException;
import com.ski.crawler.model.SkiResort;
import com.ski.crawler.utils.CrawlerHttp;
import java.util.List;
/**
 * Strategy for crawling one site: collecting detail-page URLs and parsing a
 * detail page into a {@link SkiResort}, with failures reported through the
 * project's typed exceptions.
 */
public interface CrawlStrategy {
/** Returns the identifier of the site handled by this strategy. */
String id();
/** Returns the URL used as the crawl starting point when the caller supplies none. */
String defaultStartUrl();
/**
 * Collects detail-page URLs starting from {@code startUrl}.
 *
 * @param startUrl page to start collecting from
 * @param limit    maximum number of URLs; the implementations in this package treat a non-positive value as "no limit"
 * @param http     HTTP helper used to download pages
 * @return the collected detail-page URLs
 * @throws NetworkException when collection fails
 */
List<String> collectDetailUrls(String startUrl, int limit, CrawlerHttp http) throws NetworkException;
/**
 * Parses the HTML of one detail page.
 *
 * @param sourceUrl URL the HTML was downloaded from
 * @param html      page markup
 * @return the parsed resort
 * @throws ParseException when parsing fails
 */
SkiResort parseDetail(String sourceUrl, String html) throws ParseException;
}

81
src/main/java/com/ski/crawler/strategy/SkiResortInfoStrategy.java

@ -0,0 +1,81 @@
package com.ski.crawler.strategy;
import com.ski.crawler.exception.NetworkException;
import com.ski.crawler.exception.ParseException;
import com.ski.crawler.model.SkiLift;
import com.ski.crawler.model.SkiResort;
import com.ski.crawler.model.SkiTicket;
import com.ski.crawler.model.SkiTrail;
import com.ski.crawler.parser.ResortDetailParser;
import com.ski.crawler.spider.ResortListSpider;
import com.ski.crawler.utils.CrawlerHttp;
import java.util.List;
/**
 * {@link CrawlStrategy} for skiresort.info. URL collection is delegated to
 * {@link ResortListSpider}; detail parsing is delegated to
 * {@link ResortDetailParser} and then summarized onto the resort's top-level
 * fields (slope count, lift count, ticket price range, currency).
 */
public class SkiResortInfoStrategy implements CrawlStrategy {
    private final ResortDetailParser detailParser = new ResortDetailParser();

    @Override
    public String id() {
        return "skiresort";
    }

    @Override
    public String defaultStartUrl() {
        return "https://www.skiresort.info/ski-resorts/";
    }

    /**
     * Collects detail URLs with a fresh spider.
     *
     * @param limit maximum number of URLs; a non-positive value collects all
     * @throws NetworkException wrapping any failure, including interruption
     */
    @Override
    public List<String> collectDetailUrls(String startUrl, int limit, CrawlerHttp http) throws NetworkException {
        try {
            ResortListSpider listSpider = new ResortListSpider(http);
            return limit > 0 ? listSpider.fetchFirst(startUrl, limit) : listSpider.fetchAll(startUrl);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers (and retry loops) can see the interruption.
            Thread.currentThread().interrupt();
            throw new NetworkException("Collect urls failed: " + e.getMessage(), e);
        } catch (Exception e) {
            throw new NetworkException("Collect urls failed: " + e.getMessage(), e);
        }
    }

    /**
     * Parses a detail page and copies summary values onto the resort.
     *
     * @throws ParseException wrapping any failure raised while parsing
     */
    @Override
    public SkiResort parseDetail(String sourceUrl, String html) throws ParseException {
        try {
            SkiResort resort = detailParser.parse(html);
            resort.setSourceUrl(sourceUrl);
            resort.setSourceSite(id());
            SkiTrail trail = resort.getSkiTrail();
            if (trail != null && trail.getTotalRuns() != null) {
                resort.setSlopeCount(trail.getTotalRuns());
            }
            SkiLift lift = resort.getSkiLift();
            if (lift != null && lift.getTotalLifts() != null) {
                resort.setLiftCount(lift.getTotalLifts());
            }
            SkiTicket ticket = resort.getSkiTicket();
            if (ticket != null) {
                Double a = ticket.getPriceAdult();
                Double c = ticket.getPriceChild();
                // The ticket currency only fills a gap; an existing resort currency wins.
                if (ticket.getCurrency() != null && resort.getCurrency() == null) {
                    resort.setCurrency(ticket.getCurrency());
                }
                // Price range spans whichever of the adult/child prices are present.
                Double min = null;
                Double max = null;
                if (a != null) {
                    min = a;
                    max = a;
                }
                if (c != null) {
                    min = min == null ? c : Math.min(min, c);
                    max = max == null ? c : Math.max(max, c);
                }
                resort.setTicketPriceMin(min);
                resort.setTicketPriceMax(max);
            }
            return resort;
        } catch (Exception e) {
            throw new ParseException("Parse detail failed: " + e.getMessage(), e);
        }
    }
}

199
src/main/java/com/ski/crawler/strategy/SkimapStrategy.java

@ -0,0 +1,199 @@
package com.ski.crawler.strategy;
import com.ski.crawler.exception.NetworkException;
import com.ski.crawler.exception.ParseException;
import com.ski.crawler.model.SkiResort;
import com.ski.crawler.utils.CrawlerHttp;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class SkimapStrategy implements CrawlStrategy {
private static final Pattern LAT_LON_PATTERN = Pattern.compile("(-?\\d+(?:\\.\\d+)?)\\s*,\\s*(-?\\d+(?:\\.\\d+)?)");
@Override
public String id() {
return "skimap";
}
@Override
public String defaultStartUrl() {
return "https://skimap.org";
}
@Override
public List<String> collectDetailUrls(String startUrl, int limit, CrawlerHttp http) throws NetworkException {
try {
Set<String> out = new LinkedHashSet<>();
Set<String> visited = new LinkedHashSet<>();
String page = startUrl;
while (page != null && !page.isEmpty() && !visited.contains(page)) {
visited.add(page);
if (page.toLowerCase(Locale.ROOT).contains("/skiareas/view/")) {
out.add(page);
break;
}
Document doc = http.getDocument(page);
for (Element a : doc.select("a[href]")) {
String abs = a.absUrl("href");
if (abs == null || abs.isEmpty()) {
continue;
}
String lower = abs.toLowerCase(Locale.ROOT);
if (!lower.contains("/skiareas/view/")) {
continue;
}
out.add(abs);
if (limit > 0 && out.size() >= limit) {
return new ArrayList<>(out);
}
}
String next = findNext(doc);
page = (next != null && !visited.contains(next)) ? next : null;
}
return new ArrayList<>(out);
} catch (Exception e) {
throw new NetworkException("Collect urls failed: " + e.getMessage(), e);
}
}
@Override
public SkiResort parseDetail(String sourceUrl, String html) throws ParseException {
try {
Document doc = org.jsoup.Jsoup.parse(html, sourceUrl);
SkiResort resort = new SkiResort();
resort.setSourceUrl(sourceUrl);
resort.setSourceSite(id());
String name = null;
Element h1 = doc.selectFirst("h1");
if (h1 != null) {
name = clean(h1.text());
}
if (name == null || name.isEmpty()) {
Element ogTitle = doc.selectFirst("meta[property=og:title]");
if (ogTitle != null) {
name = clean(ogTitle.attr("content"));
}
}
if (name != null && !name.isEmpty()) {
resort.setName(name);
}
List<String> crumbs = new ArrayList<>();
for (Element a : doc.select(".breadcrumb a, nav.breadcrumb a, ol.breadcrumb a, ul.breadcrumb a")) {
String t = clean(a.text());
if (!t.isEmpty()) {
crumbs.add(t);
}
}
if (crumbs.size() >= 1) {
resort.setCountry(crumbs.get(crumbs.size() - 1));
}
if (crumbs.size() >= 2) {
resort.setRegion(crumbs.get(crumbs.size() - 2));
}
Double[] latLon = extractLatLon(doc);
if (latLon != null) {
resort.setLatitude(latLon[0]);
resort.setLongitude(latLon[1]);
}
return resort;
} catch (Exception e) {
throw new ParseException("Parse detail failed: " + e.getMessage(), e);
}
}
/**
 * Finds the absolute URL of the "next page" link on a listing page.
 *
 * <p>A link marked up as a pager control wins; if one exists but has no usable
 * href the search stops and {@code null} is returned. Otherwise the first link
 * whose lower-cased text mentions "next" and has a usable href is returned.
 *
 * @param doc parsed listing page
 * @return absolute URL of the next page, or {@code null} when none is found
 */
private String findNext(Document doc) {
    Element pager = doc.selectFirst("a[rel=next], a.next, li.pagination-next a, a[aria-label=Next]");
    if (pager != null) {
        String target = pager.absUrl("href");
        if (target == null || target.isEmpty()) {
            return null;
        }
        return target;
    }
    for (Element link : doc.select("a[href]")) {
        String label = clean(link.text()).toLowerCase(Locale.ROOT);
        boolean mentionsNext = label.equals("next") || label.equals("next ›") || label.contains("next");
        if (!mentionsNext) {
            continue;
        }
        String target = link.absUrl("href");
        if (target != null && !target.isEmpty()) {
            return target;
        }
    }
    return null;
}
/**
 * Extracts a latitude/longitude pair from a detail page.
 *
 * <p>Tried in order: separate latitude and longitude meta tags, a combined
 * pair inside the latitude / {@code geo.position} meta content, and finally
 * the first plausible pair found anywhere in the page text.
 *
 * @param doc parsed detail page
 * @return {@code {lat, lon}} or {@code null} when nothing usable was found
 */
private Double[] extractLatLon(Document doc) {
    Element latMeta = doc.selectFirst("meta[property=place:location:latitude], meta[name=geo.position]");
    Element lonMeta = doc.selectFirst("meta[property=place:location:longitude]");
    if (latMeta != null) {
        String latContent = latMeta.attr("content");
        if (lonMeta != null) {
            Double lat = safeParseDouble(latContent);
            Double lon = safeParseDouble(lonMeta.attr("content"));
            if (lat != null && lon != null) {
                return new Double[]{lat, lon};
            }
        }
        // The same tag may carry both values (e.g. a geo.position style content).
        Double[] combined = parseLatLon(latContent);
        if (combined != null) {
            return combined;
        }
    }
    return parseLatLon(doc.text());
}
/**
 * Scans free text for the first coordinate pair that parses and lies within
 * latitude [-90, 90] and longitude [-180, 180].
 *
 * @param text text to scan; may be {@code null}
 * @return {@code {lat, lon}} of the first acceptable match, or {@code null}
 */
private Double[] parseLatLon(String text) {
    if (text == null || text.isEmpty()) {
        return null;
    }
    for (Matcher hit = LAT_LON_PATTERN.matcher(text); hit.find(); ) {
        Double lat = safeParseDouble(hit.group(1));
        Double lon = safeParseDouble(hit.group(2));
        if (lat != null && lon != null) {
            boolean outOfRange = lat < -90 || lat > 90 || lon < -180 || lon > 180;
            if (!outOfRange) {
                return new Double[]{lat, lon};
            }
        }
    }
    return null;
}
/**
 * Normalises scraped text: non-breaking spaces (U+00A0) become ordinary
 * spaces and surrounding whitespace is trimmed.
 *
 * @param s raw text; may be {@code null}
 * @return cleaned text, never {@code null} (empty string for {@code null} input)
 */
private String clean(String s) {
    return s == null ? "" : s.replace('\u00A0', ' ').trim();
}
/**
 * Leniently parses a decimal number: commas are treated as decimal points and
 * every character other than digits, '.' and '-' is discarded before parsing.
 *
 * @param s raw text; may be {@code null}
 * @return the parsed value, or {@code null} if nothing parseable remains
 */
private Double safeParseDouble(String s) {
    if (s == null) {
        return null;
    }
    String digits = s.trim().replace(",", ".").replaceAll("[^0-9.\\-]", "");
    if (digits.isEmpty()) {
        return null;
    }
    try {
        return Double.parseDouble(digits);
    } catch (NumberFormatException ex) {
        // e.g. several dots or a misplaced minus sign
        return null;
    }
}
}

244
src/main/java/com/ski/crawler/strategy/WikipediaStrategy.java

@ -0,0 +1,244 @@
package com.ski.crawler.strategy;
import com.ski.crawler.exception.NetworkException;
import com.ski.crawler.exception.ParseException;
import com.ski.crawler.model.SkiResort;
import com.ski.crawler.utils.CrawlerHttp;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Crawl strategy for English Wikipedia: collects article links from a list
 * page and reads name, country, location, elevations and coordinates from an
 * article's heading, infobox and geo markup.
 */
public class WikipediaStrategy implements CrawlStrategy {
    /** "lat; lon" as decimal numbers. */
    private static final Pattern GEO_SEMI_PATTERN = Pattern.compile("(-?\\d+(?:\\.\\d+)?)\\s*;\\s*(-?\\d+(?:\\.\\d+)?)");
    /** "lat, lon" as decimal numbers. */
    private static final Pattern GEO_COMMA_PATTERN = Pattern.compile("(-?\\d+(?:\\.\\d+)?)\\s*,\\s*(-?\\d+(?:\\.\\d+)?)");
    /**
     * A whole number of metres, optionally written with comma thousands
     * separators ("1,075 m"). The look-behind stops the match from starting in
     * the middle of a longer number, which would otherwise turn "1,075 m" into 75.
     */
    private static final Pattern INT_M_PATTERN =
            Pattern.compile("(?<![\\d.,])(\\d{1,3}(?:,\\d{3})+|\\d{2,5})\\s*m\\b", Pattern.CASE_INSENSITIVE);

    @Override
    public String id() {
        return "wikipedia";
    }

    @Override
    public String defaultStartUrl() {
        return "https://en.wikipedia.org/wiki/List_of_ski_areas_and_resorts";
    }

    /**
     * Collects article URLs linked from the content area of the start page.
     *
     * <p>Only {@code /wiki/} links without a namespace colon are kept, links to
     * other "List_of_" pages are dropped, and fragments are removed so that
     * several section links into one article yield a single URL.
     *
     * @param startUrl list page to scan
     * @param limit    maximum number of URLs to return; values &lt;= 0 mean no limit
     * @param http     HTTP helper used to fetch the page
     * @return distinct article URLs in document order
     * @throws NetworkException if fetching or scanning the page fails
     */
    @Override
    public List<String> collectDetailUrls(String startUrl, int limit, CrawlerHttp http) throws NetworkException {
        try {
            Document doc = http.getDocument(startUrl);
            Element content = doc.selectFirst("#mw-content-text");
            if (content == null) {
                content = doc.body();
            }
            Set<String> out = new LinkedHashSet<>();
            for (Element a : content.select("a[href]")) {
                String href = a.attr("href");
                if (href == null || href.isEmpty()) {
                    continue;
                }
                // Judge the link by its path only; a fragment is irrelevant to the filters.
                String path = stripFragment(href);
                if (!path.startsWith("/wiki/") || path.contains(":")) {
                    continue;
                }
                String abs = a.absUrl("href");
                if (abs == null || abs.isEmpty()) {
                    continue;
                }
                // Drop the fragment from the URL that is actually stored, so the
                // set de-duplicates "Page#A" and "Page#B".
                abs = stripFragment(abs);
                if (abs.isEmpty()) {
                    continue;
                }
                if (abs.toLowerCase(Locale.ROOT).contains("list_of_")) {
                    continue;
                }
                out.add(abs);
                if (limit > 0 && out.size() >= limit) {
                    break;
                }
            }
            return new ArrayList<>(out);
        } catch (Exception e) {
            throw new NetworkException("Collect urls failed: " + e.getMessage(), e);
        }
    }

    /**
     * Parses one article into a {@link SkiResort}.
     *
     * @param sourceUrl URL the HTML was fetched from (also used as the base URI)
     * @param html      raw article markup
     * @return the populated resort; fields that could not be found stay unset
     * @throws ParseException if anything goes wrong while parsing
     */
    @Override
    public SkiResort parseDetail(String sourceUrl, String html) throws ParseException {
        try {
            Document doc = org.jsoup.Jsoup.parse(html, sourceUrl);
            SkiResort resort = new SkiResort();
            resort.setSourceUrl(sourceUrl);
            resort.setSourceSite(id());
            Element h1 = doc.selectFirst("#firstHeading");
            if (h1 == null) {
                h1 = doc.selectFirst("h1");
            }
            if (h1 != null) {
                String name = clean(h1.text());
                if (!name.isEmpty()) {
                    resort.setName(name);
                }
            }
            Element infobox = doc.selectFirst("table.infobox");
            if (infobox != null) {
                String country = extractInfoboxValue(infobox, "Country");
                if (country != null && !country.isEmpty()) {
                    resort.setCountry(country);
                }
                String location = extractInfoboxValue(infobox, "Location");
                if (location != null && !location.isEmpty()) {
                    resort.setRegion(location);
                }
                Integer top = extractElevation(infobox, "Top elevation", "Highest elevation");
                Integer base = extractElevation(infobox, "Base elevation", "Lowest elevation");
                if (base != null) {
                    resort.setAltitudeMin(base);
                }
                if (top != null) {
                    resort.setAltitudeMax(top);
                }
            }
            Double[] latLon = extractLatLon(doc);
            if (latLon != null) {
                resort.setLatitude(latLon[0]);
                resort.setLongitude(latLon[1]);
            }
            return resort;
        } catch (Exception e) {
            throw new ParseException("Parse detail failed: " + e.getMessage(), e);
        }
    }

    /** Returns {@code url} without its {@code #fragment}, if it has one. */
    private static String stripFragment(String url) {
        int hash = url.indexOf('#');
        return hash >= 0 ? url.substring(0, hash) : url;
    }

    /**
     * Looks for coordinates in {@code span.geo-dec}, then {@code span.geo},
     * then the place/geo meta tags.
     *
     * @return {@code {lat, lon}} or {@code null} when none of the sources yields a pair
     */
    private Double[] extractLatLon(Document doc) {
        Element geoDec = doc.selectFirst("span.geo-dec");
        if (geoDec != null) {
            Double[] ll = parseLatLon(geoDec.text());
            if (ll != null) {
                return ll;
            }
        }
        Element geo = doc.selectFirst("span.geo");
        if (geo != null) {
            Double[] ll = parseLatLon(geo.text());
            if (ll != null) {
                return ll;
            }
        }
        Element metaLat = doc.selectFirst("meta[property=place:location:latitude], meta[name=geo.position]");
        Element metaLon = doc.selectFirst("meta[property=place:location:longitude]");
        if (metaLat != null && metaLon != null) {
            Double lat = safeParseDouble(metaLat.attr("content"));
            Double lon = safeParseDouble(metaLon.attr("content"));
            if (lat != null && lon != null) {
                return new Double[]{lat, lon};
            }
        }
        if (metaLat != null) {
            // The same tag may carry both values in one content string.
            Double[] ll = parseLatLon(metaLat.attr("content"));
            if (ll != null) {
                return ll;
            }
        }
        return null;
    }

    /**
     * Parses the first "lat; lon" pair in {@code text}, or failing that the
     * first "lat, lon" pair. No range check is applied here.
     */
    private Double[] parseLatLon(String text) {
        if (text == null || text.isEmpty()) {
            return null;
        }
        Matcher m1 = GEO_SEMI_PATTERN.matcher(text);
        if (m1.find()) {
            Double lat = safeParseDouble(m1.group(1));
            Double lon = safeParseDouble(m1.group(2));
            if (lat != null && lon != null) {
                return new Double[]{lat, lon};
            }
        }
        Matcher m2 = GEO_COMMA_PATTERN.matcher(text);
        if (m2.find()) {
            Double lat = safeParseDouble(m2.group(1));
            Double lon = safeParseDouble(m2.group(2));
            if (lat != null && lon != null) {
                return new Double[]{lat, lon};
            }
        }
        return null;
    }

    /**
     * Returns the first metre value found under any of the given infobox
     * headers, tried in the order supplied.
     *
     * @return elevation in metres, or {@code null} when no header yields one
     */
    private Integer extractElevation(Element infobox, String... headers) {
        for (String h : headers) {
            String v = extractInfoboxValue(infobox, h);
            if (v == null || v.isEmpty()) {
                continue;
            }
            Matcher m = INT_M_PATTERN.matcher(v);
            if (m.find()) {
                try {
                    // Remove thousands separators before parsing ("1,075" -> 1075).
                    return Integer.parseInt(m.group(1).replace(",", ""));
                } catch (NumberFormatException ignored) {
                    // Value does not fit an int; try the next header instead.
                }
            }
        }
        return null;
    }

    /**
     * Returns the cleaned cell text of the first infobox row whose header
     * equals {@code header} (case-insensitive).
     *
     * @return the value, or {@code null} if the row is missing or its cell is empty
     */
    private String extractInfoboxValue(Element infobox, String header) {
        for (Element row : infobox.select("tr")) {
            Element th = row.selectFirst("th");
            Element td = row.selectFirst("td");
            if (th == null || td == null) {
                continue;
            }
            String key = clean(th.text());
            if (!header.equalsIgnoreCase(key)) {
                continue;
            }
            String value = clean(td.text());
            if (value.isEmpty()) {
                return null;
            }
            return value;
        }
        return null;
    }

    /** Replaces non-breaking spaces (U+00A0) with spaces and trims; {@code null} becomes "". */
    private String clean(String s) {
        if (s == null) {
            return "";
        }
        return s.replace('\u00A0', ' ').trim();
    }

    /**
     * Leniently parses a decimal number: commas become decimal points and all
     * characters other than digits, '.' and '-' are discarded first.
     *
     * @return the parsed value, or {@code null} if nothing parseable remains
     */
    private Double safeParseDouble(String s) {
        try {
            if (s == null) {
                return null;
            }
            String t = s.trim().replace(",", ".");
            t = t.replaceAll("[^0-9.\\-]", "");
            if (t.isEmpty()) {
                return null;
            }
            return Double.parseDouble(t);
        } catch (NumberFormatException e) {
            return null;
        }
    }
}

74
src/main/java/com/ski/crawler/util/CliArgs.java

@ -0,0 +1,74 @@
package com.ski.crawler.util;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
/**
 * Minimal helpers for reading {@code --key=value}, {@code --key value} and
 * bare {@code --flag} style command-line options.
 */
public class CliArgs {
    /**
     * Parses options from {@code args}, beginning at {@code startIndex}.
     *
     * <p>Tokens that do not start with {@code --} are ignored unless they are
     * consumed as the value of the preceding option. Keys are lower-cased; a
     * flag without a value maps to {@code "true"}; a later occurrence of a key
     * replaces an earlier one.
     *
     * @param args       raw arguments; may be {@code null}
     * @param startIndex index of the first token to inspect; negative values
     *                   are treated as 0
     * @return mutable map of option name to value, never {@code null}
     */
    public static Map<String, String> parseOptions(String[] args, int startIndex) {
        Map<String, String> out = new HashMap<>();
        if (args == null) {
            return out;
        }
        // Clamp so a negative start index scans from the beginning instead of
        // failing with ArrayIndexOutOfBoundsException.
        for (int i = Math.max(0, startIndex); i < args.length; i++) {
            String a = args[i];
            if (a == null) {
                continue;
            }
            String t = a.trim();
            if (!t.startsWith("--")) {
                continue;
            }
            String body = t.substring(2);
            String key;
            String value;
            int eq = body.indexOf('=');
            if (eq >= 0) {
                key = body.substring(0, eq).trim().toLowerCase(Locale.ROOT);
                value = body.substring(eq + 1).trim();
            } else {
                key = body.trim().toLowerCase(Locale.ROOT);
                // "--key value": take the next token as the value unless it is another option.
                if (i + 1 < args.length && args[i + 1] != null && !args[i + 1].trim().startsWith("--")) {
                    value = args[++i].trim();
                } else {
                    value = "true";
                }
            }
            if (!key.isEmpty()) {
                out.put(key, value);
            }
        }
        return out;
    }

    /**
     * Parses an int, returning {@code def} for {@code null}, blank or
     * non-numeric input.
     */
    public static int parseInt(String v, int def) {
        if (v == null || v.trim().isEmpty()) {
            return def;
        }
        try {
            return Integer.parseInt(v.trim());
        } catch (NumberFormatException e) {
            return def;
        }
    }

    /**
     * Parses an int, returning {@code null} for {@code null}, blank or
     * non-numeric input.
     */
    public static Integer parseNullableInt(String v) {
        if (v == null || v.trim().isEmpty()) {
            return null;
        }
        try {
            return Integer.parseInt(v.trim());
        } catch (NumberFormatException e) {
            return null;
        }
    }

    /**
     * Returns {@code true} for "true", "1", "yes", "y" or "on"
     * (case-insensitive, surrounding whitespace ignored); {@code false} otherwise.
     */
    public static boolean parseBoolean(String v) {
        if (v == null) {
            return false;
        }
        String t = v.trim().toLowerCase(Locale.ROOT);
        return t.equals("true") || t.equals("1") || t.equals("yes") || t.equals("y") || t.equals("on");
    }
}

179
src/main/java/com/ski/crawler/util/ExcelUtil.java

@ -0,0 +1,179 @@
package com.ski.crawler.util;
import com.ski.crawler.model.SkiResort;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.Font;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import java.io.FileOutputStream;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
/**
 * Exports crawled resorts to an .xlsx workbook with one sheet per source site.
 */
public class ExcelUtil {
    /** Sheets that are always created, in this order, even when they have no rows. */
    private static final List<String> DEFAULT_SHEETS = Arrays.asList("skiresort", "wikipedia", "skimap");

    /**
     * Writes the resorts to {@code path}, grouped by normalised source site.
     *
     * <p>Resorts whose site is not one of the default sheets go to an extra
     * "other" sheet, which is only created when it has at least one resort.
     * {@code null} list entries are ignored.
     *
     * @param resorts resorts to export; may be {@code null}
     * @param path    destination file path
     * @throws Exception if the workbook cannot be built or written
     */
    public static void exportResortsBySiteToXlsx(List<SkiResort> resorts, String path) throws Exception {
        Map<String, List<SkiResort>> bySite = new LinkedHashMap<>();
        for (String s : DEFAULT_SHEETS) {
            bySite.put(s, new ArrayList<>());
        }
        List<SkiResort> other = new ArrayList<>();
        if (resorts != null) {
            for (SkiResort r : resorts) {
                if (r == null) {
                    // A null entry has nothing to write; routing it to "other"
                    // would create a sheet containing only the header row.
                    continue;
                }
                List<SkiResort> bucket = bySite.get(normalizeSite(r.getSourceSite()));
                if (bucket != null) {
                    bucket.add(r);
                } else {
                    other.add(r);
                }
            }
        }
        try (Workbook wb = new XSSFWorkbook()) {
            CellStyle headerStyle = createHeaderStyle(wb);
            for (Map.Entry<String, List<SkiResort>> e : bySite.entrySet()) {
                writeSheet(wb, headerStyle, e.getKey(), e.getValue());
            }
            if (!other.isEmpty()) {
                writeSheet(wb, headerStyle, "other", other);
            }
            try (FileOutputStream out = new FileOutputStream(path)) {
                wb.write(out);
            }
        }
    }

    /** Creates one sheet with a frozen header row, one row per resort and fixed column widths. */
    private static void writeSheet(Workbook wb, CellStyle headerStyle, String sheetName, List<SkiResort> rows) {
        Sheet sheet = wb.createSheet(safeSheetName(sheetName));
        sheet.createFreezePane(0, 1);
        int r = 0;
        Row header = sheet.createRow(r++);
        String[] cols = new String[]{
            "sourceSite", "name", "country", "region",
            "latitude", "longitude",
            "altitudeMin", "altitudeMax",
            "totalKm", "slopeCount", "liftCount",
            "ticketPriceMin", "ticketPriceMax", "currency",
            "overallScore",
            "url",
            "crawlTime"
        };
        for (int i = 0; i < cols.length; i++) {
            Cell c = header.createCell(i);
            c.setCellValue(cols[i]);
            c.setCellStyle(headerStyle);
        }
        if (rows != null) {
            for (SkiResort sr : rows) {
                if (sr == null) {
                    continue;
                }
                // Cell order must match the header array above.
                Row row = sheet.createRow(r++);
                int i = 0;
                setCell(row, i++, sr.getSourceSite());
                setCell(row, i++, sr.getName());
                setCell(row, i++, sr.getCountry());
                setCell(row, i++, sr.getRegion());
                setCell(row, i++, sr.getLatitude());
                setCell(row, i++, sr.getLongitude());
                setCell(row, i++, sr.getAltitudeMin());
                setCell(row, i++, sr.getAltitudeMax());
                setCell(row, i++, sr.getTotalKm());
                setCell(row, i++, sr.getSlopeCount());
                setCell(row, i++, sr.getLiftCount());
                setCell(row, i++, sr.getTicketPriceMin());
                setCell(row, i++, sr.getTicketPriceMax());
                setCell(row, i++, sr.getCurrency());
                setCell(row, i++, sr.getOverallScore());
                setCell(row, i++, sr.getSourceUrl());
                setCell(row, i, sr.getCrawledAt() == null ? null : sr.getCrawledAt().toString());
            }
        }
        // Widths in characters, one per column; setColumnWidth takes 1/256ths of a character.
        int[] widths = new int[]{
            12, 28, 14, 18,
            12, 12,
            12, 12,
            10, 10, 10,
            14, 14, 10,
            12,
            40,
            22
        };
        for (int i = 0; i < widths.length; i++) {
            sheet.setColumnWidth(i, Math.min(255, Math.max(8, widths[i])) * 256);
        }
    }

    /** Bold style used for the header row. */
    private static CellStyle createHeaderStyle(Workbook wb) {
        CellStyle style = wb.createCellStyle();
        Font font = wb.createFont();
        font.setBold(true);
        style.setFont(font);
        return style;
    }

    /**
     * Creates the cell at {@code col}. {@code null} leaves it blank, any
     * {@link Number} is written as a numeric cell, everything else as text.
     */
    private static void setCell(Row row, int col, Object v) {
        Cell cell = row.createCell(col);
        if (v == null) {
            return;
        }
        if (v instanceof Number) {
            cell.setCellValue(((Number) v).doubleValue());
            return;
        }
        cell.setCellValue(String.valueOf(v));
    }

    /** Trimmed, lower-cased site id; {@code null} becomes "". */
    private static String normalizeSite(String s) {
        if (s == null) {
            return "";
        }
        return s.trim().toLowerCase(Locale.ROOT);
    }

    /**
     * Makes a name usable as a sheet title: blank names become "sheet", the
     * characters {@code : \ / ? * [ ]} are replaced, and the result is cut to
     * 31 characters.
     */
    private static String safeSheetName(String name) {
        String n = name == null ? "sheet" : name.trim();
        if (n.isEmpty()) {
            n = "sheet";
        }
        n = n.replace(':', '-')
             .replace('\\', '-')
             .replace('/', '-')
             .replace('?', '-')
             .replace('*', '-')
             .replace('[', '(')
             .replace(']', ')');
        if (n.length() > 31) {
            n = n.substring(0, 31);
        }
        return n;
    }
}

43
src/main/java/com/ski/crawler/util/JsonUtil.java

@ -0,0 +1,43 @@
package com.ski.crawler.util;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
/**
 * Shared Jackson mapper plus small helpers for UTF-8 JSON Lines files.
 */
public class JsonUtil {
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /** Returns the single shared {@link ObjectMapper} instance. */
    public static ObjectMapper mapper() {
        return MAPPER;
    }

    /**
     * Opens {@code path} for writing as UTF-8 text. The caller must close the
     * returned writer.
     */
    public static BufferedWriter openJsonlWriter(String path) throws Exception {
        FileOutputStream bytes = new FileOutputStream(path);
        OutputStreamWriter utf8 = new OutputStreamWriter(bytes, StandardCharsets.UTF_8);
        return new BufferedWriter(utf8);
    }

    /**
     * Opens {@code path} for reading as UTF-8 text. The caller must close the
     * returned reader.
     */
    public static BufferedReader openJsonlReader(String path) throws Exception {
        FileInputStream bytes = new FileInputStream(path);
        InputStreamReader utf8 = new InputStreamReader(bytes, StandardCharsets.UTF_8);
        return new BufferedReader(utf8);
    }

    /**
     * Reads the whole file and returns its lines in order, leaving out lines
     * that are empty or whitespace-only. Kept lines are returned untrimmed.
     */
    public static List<String> readAllLines(String path) throws Exception {
        List<String> kept = new ArrayList<>();
        try (BufferedReader reader = openJsonlReader(path)) {
            for (String row = reader.readLine(); row != null; row = reader.readLine()) {
                if (row.trim().isEmpty()) {
                    continue;
                }
                kept.add(row);
            }
        }
        return kept;
    }
}

33
src/main/java/com/ski/crawler/util/RetryUtil.java

@ -0,0 +1,33 @@
package com.ski.crawler.util;
import com.ski.crawler.exception.NetworkException;
import java.util.concurrent.Callable;
public class RetryUtil {
public static <T> T retry(Callable<T> task, int maxAttempts, long baseSleepMs) throws Exception {
Exception last = null;
int attempts = Math.max(1, maxAttempts);
for (int i = 1; i <= attempts; i++) {
try {
return task.call();
} catch (Exception e) {
last = e;
if (i == attempts) {
throw e;
}
long sleep = baseSleepMs <= 0 ? 0 : baseSleepMs;
if (sleep > 0) {
try {
Thread.sleep(sleep);
} catch (InterruptedException ie) {
Thread.currentThread().interrupt();
throw new NetworkException("Retry interrupted", ie);
}
}
}
}
throw last == null ? new NetworkException("Retry failed") : last;
}
}

60
src/main/java/com/ski/crawler/util/ValidationUtil.java

@ -0,0 +1,60 @@
package com.ski.crawler.util;
import com.ski.crawler.model.SkiResort;
import java.util.Locale;
/**
 * Cleaning and validation helpers for {@link SkiResort} records.
 */
public class ValidationUtil {
    /**
     * Normalises a resort in place: text fields are trimmed (blank becomes
     * {@code null}), out-of-range coordinates and negative ticket prices are
     * cleared.
     *
     * @param r resort to clean; may be {@code null}
     * @return the same instance, or {@code null} when {@code r} is {@code null}
     */
    public static SkiResort clean(SkiResort r) {
        if (r == null) {
            return null;
        }
        r.setName(trimToNull(r.getName()));
        r.setCountry(trimToNull(r.getCountry()));
        r.setRegion(trimToNull(r.getRegion()));
        r.setSourceUrl(trimToNull(r.getSourceUrl()));
        r.setSourceSite(trimToNull(r.getSourceSite()));

        Double lat = r.getLatitude();
        if (lat != null && (lat < -90 || lat > 90)) {
            r.setLatitude(null);
        }
        Double lon = r.getLongitude();
        if (lon != null && (lon < -180 || lon > 180)) {
            r.setLongitude(null);
        }
        Double priceMin = r.getTicketPriceMin();
        if (priceMin != null && priceMin < 0) {
            r.setTicketPriceMin(null);
        }
        Double priceMax = r.getTicketPriceMax();
        if (priceMax != null && priceMax < 0) {
            r.setTicketPriceMax(null);
        }
        return r;
    }

    /**
     * Checks the mandatory fields of a resort.
     *
     * @throws IllegalArgumentException if the resort is {@code null}, or its
     *                                  source URL or name is missing or empty
     */
    public static void validate(SkiResort r) {
        if (r == null) {
            throw new IllegalArgumentException("SkiResort is null");
        }
        String url = r.getSourceUrl();
        if (url == null || url.isEmpty()) {
            throw new IllegalArgumentException("sourceUrl is empty");
        }
        String name = r.getName();
        if (name == null || name.isEmpty()) {
            throw new IllegalArgumentException("name is empty");
        }
    }

    /**
     * Builds a comparison key for a country name: non-breaking spaces become
     * spaces, the text is trimmed and lower-cased. {@code null} yields "".
     */
    public static String normalizeCountryKey(String country) {
        return country == null
                ? ""
                : country.replace('\u00A0', ' ').trim().toLowerCase(Locale.ROOT);
    }

    /** Trims (treating U+00A0 as a space) and maps blank or {@code null} input to {@code null}. */
    private static String trimToNull(String s) {
        if (s == null) {
            return null;
        }
        String trimmed = s.replace('\u00A0', ' ').trim();
        if (trimmed.isEmpty()) {
            return null;
        }
        return trimmed;
    }
}

52
src/main/java/com/ski/crawler/utils/CrawlerHttp.java

@ -0,0 +1,52 @@
//统一 HTTP 配置 :UA/代理/超时集中管理,避免各处硬编码
package com.ski.crawler.utils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.net.InetSocketAddress;
import java.net.Socket;
/**
 * Central place for HTTP settings (user agent, proxy, timeout) used when
 * fetching pages with jsoup.
 */
public class CrawlerHttp {
    private final String userAgent;
    private final String proxyHost;
    private final int proxyPort;
    private final boolean proxyEnabled;
    private final int timeoutMs;

    /**
     * @param userAgent    User-Agent header value sent with each request
     * @param proxyHost    proxy host name; may be {@code null} or empty
     * @param proxyPort    proxy port; only used when greater than 0
     * @param proxyEnabled whether the proxy should be used at all
     * @param timeoutMs    request timeout in milliseconds
     */
    public CrawlerHttp(String userAgent, String proxyHost, int proxyPort, boolean proxyEnabled, int timeoutMs) {
        this.userAgent = userAgent;
        this.proxyHost = proxyHost;
        this.proxyPort = proxyPort;
        this.proxyEnabled = proxyEnabled;
        this.timeoutMs = timeoutMs;
    }

    /**
     * Fetches and parses {@code url} with a GET request, following redirects.
     *
     * <p>The proxy is applied only when it is enabled, fully configured and
     * accepts a TCP connection within 300 ms; otherwise the request goes out
     * directly.
     *
     * @throws RuntimeException wrapping any failure of the request itself
     */
    public Document getDocument(String url) {
        org.jsoup.Connection request = Jsoup.connect(url)
                .userAgent(userAgent)
                .timeout(timeoutMs)
                .followRedirects(true);
        boolean proxyConfigured = proxyEnabled
                && proxyHost != null
                && !proxyHost.isEmpty()
                && proxyPort > 0;
        if (proxyConfigured && isProxyReachable(proxyHost, proxyPort, 300)) {
            request = request.proxy(proxyHost, proxyPort);
        }
        try {
            return request.get();
        } catch (Exception cause) {
            throw new RuntimeException(cause);
        }
    }

    /** Fetches {@code url} and returns the parsed document serialised back to HTML. */
    public String getHtml(String url) {
        Document fetched = getDocument(url);
        return fetched.outerHtml();
    }

    /** Returns whether a TCP connection to {@code host:port} succeeds within {@code timeoutMs}. */
    private boolean isProxyReachable(String host, int port, int timeoutMs) {
        try (Socket probe = new Socket()) {
            probe.connect(new InetSocketAddress(host, port), timeoutMs);
        } catch (Exception unreachable) {
            return false;
        }
        return true;
    }
}

27
src/main/java/com/ski/crawler/utils/HttpClientUtil.java

@ -0,0 +1,27 @@
//网络请求工具
package com.ski.crawler.utils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
/**
 * Thin wrapper around Apache HttpClient for simple GET requests.
 */
public class HttpClientUtil {
    /**
     * Performs a GET request and returns the response body decoded as UTF-8.
     * A new client is created and closed for every call.
     *
     * @param url address to fetch
     * @return the body text, or an empty string when the response has no entity
     * @throws IOException if the request or reading the body fails
     */
    public String get(String url) throws IOException {
        HttpGet getRequest = new HttpGet(url);
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse httpResponse = httpClient.execute(getRequest)) {
            HttpEntity payload = httpResponse.getEntity();
            return payload == null ? "" : EntityUtils.toString(payload, StandardCharsets.UTF_8);
        }
    }
}

336
src/main/java/com/ski/crawler/view/ConsoleView.java

@ -0,0 +1,336 @@
package com.ski.crawler.view;
import com.ski.crawler.model.SkiResort;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Console output for crawl results: a fixed-layout table of resorts on
 * standard output and a plain-text summary on standard error.
 */
public class ConsoleView {
    private final boolean color;
    private final int width;
    private final TablePrinter table;

    /**
     * @param width requested table width in display columns; values below 60 are raised to 60
     * @param color whether the header row is wrapped in ANSI colour codes
     */
    public ConsoleView(int width, boolean color) {
        this.width = Math.max(60, width);
        this.color = color;
        this.table = new TablePrinter(this.width, this.color);
    }

    /** Prints the table header once; later calls do nothing. */
    public void printHeader() {
        table.printHeader();
    }

    /** Prints one table row for {@code r}; a {@code null} resort prints nothing. */
    public void printResort(SkiResort r) {
        if (r == null) {
            return;
        }
        table.printRow(r);
    }

    /**
     * Prints the run summary to standard error.
     *
     * @param summary   key/value pairs printed as {@code key=value}; may be {@code null}
     * @param byCountry per-country counts; skipped when {@code null} or empty
     * @param failures  failure descriptions; skipped when {@code null} or empty
     */
    public void printSummary(Map<String, Object> summary, Map<String, Long> byCountry, List<String> failures) {
        System.err.println("---- summary ----");
        if (summary != null) {
            for (Map.Entry<String, Object> e : summary.entrySet()) {
                System.err.println(e.getKey() + "=" + (e.getValue() == null ? "" : e.getValue()));
            }
        }
        if (byCountry != null && !byCountry.isEmpty()) {
            System.err.println("by country:");
            for (Map.Entry<String, Long> e : byCountry.entrySet()) {
                System.err.println(" " + e.getKey() + ": " + e.getValue());
            }
        }
        if (failures != null && !failures.isEmpty()) {
            System.err.println("failures:");
            for (String f : failures) {
                System.err.println(" " + f);
            }
        }
    }

    /** Lays out and prints the resort table. */
    private static class TablePrinter {
        /** Separator between columns; its length is what the layout reserves per gap. */
        private static final String COLUMN_GAP = "  ";

        private final int width;
        private final boolean color;
        /** Column layout; depends only on the table width, so it is computed once. */
        private final List<Col> layout;
        private boolean headerPrinted;

        private TablePrinter(int width, boolean color) {
            this.width = width;
            this.color = color;
            this.layout = columns();
        }

        private void printHeader() {
            if (headerPrinted) {
                return;
            }
            headerPrinted = true;
            String line = formatRow(layout, new String[]{"SITE", "NAME", "COUNTRY", "REGION", "COORD", "ALT", "KM", "LIFTS", "PRICE", "SCORE", "URL"}, true);
            System.out.println(line);
            System.out.println(repeat("-", displayWidth(stripAnsi(line))));
        }

        private void printRow(SkiResort r) {
            String coord = formatCoord(r.getLatitude(), r.getLongitude());
            String alt = formatAlt(r.getAltitudeMin(), r.getAltitudeMax());
            String km = r.getTotalKm() == null ? "" : String.valueOf(r.getTotalKm());
            String lifts = r.getLiftCount() == null ? "" : String.valueOf(r.getLiftCount());
            String price = formatPrice(r.getTicketPriceMin(), r.getTicketPriceMax(), r.getCurrency());
            String score = r.getOverallScore() == null ? "" : r.getOverallScore().toPlainString();
            String line = formatRow(layout, new String[]{
                safe0(r.getSourceSite()),
                safe0(r.getName()),
                safe0(r.getCountry()),
                safe0(r.getRegion()),
                coord,
                alt,
                km,
                lifts,
                price,
                score,
                safe0(r.getSourceUrl())
            }, false);
            System.out.println(line);
        }

        /** Builds the column list (name, min width, max width, weight) and sizes it for the table width. */
        private List<Col> columns() {
            List<Col> cols = new ArrayList<>();
            cols.add(new Col("SITE", 6, 10, 1));
            cols.add(new Col("NAME", 10, 26, 3));
            cols.add(new Col("COUNTRY", 6, 14, 2));
            cols.add(new Col("REGION", 6, 16, 2));
            cols.add(new Col("COORD", 10, 22, 2));
            cols.add(new Col("ALT", 5, 12, 1));
            cols.add(new Col("KM", 2, 8, 1));
            cols.add(new Col("LIFTS", 4, 8, 1));
            cols.add(new Col("PRICE", 4, 14, 1));
            cols.add(new Col("SCORE", 4, 8, 1));
            cols.add(new Col("URL", 10, 200, 4));
            allocate(cols, width);
            return cols;
        }

        /** Truncates and pads each value to its column width and joins the cells with the column gap. */
        private String formatRow(List<Col> cols, String[] values, boolean header) {
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < cols.size() && i < values.length; i++) {
                Col c = cols.get(i);
                String v = values[i] == null ? "" : values[i];
                String cell = padRight(truncate(v, c.width), c.width);
                if (header && color) {
                    cell = Ansi.cyan(cell);
                }
                sb.append(cell);
                if (i != cols.size() - 1) {
                    sb.append(COLUMN_GAP);
                }
            }
            return sb.toString();
        }

        /**
         * Assigns a width to every column. Each starts at its minimum; any
         * space left after the gaps is handed out by weight (up to three
         * passes, never beyond a column's maximum), and a remainder is given
         * to columns from last to first.
         */
        private void allocate(List<Col> cols, int totalWidth) {
            int gaps = (cols.size() - 1) * COLUMN_GAP.length();
            int available = Math.max(20, totalWidth - gaps);
            int minSum = 0;
            for (Col c : cols) {
                c.width = c.min;
                minSum += c.width;
            }
            int remaining = available - minSum;
            if (remaining <= 0) {
                return;
            }
            int totalWeight = 0;
            for (Col c : cols) {
                totalWeight += c.weight;
            }
            for (int loop = 0; loop < 3 && remaining > 0; loop++) {
                boolean any = false;
                for (Col c : cols) {
                    if (remaining <= 0) {
                        break;
                    }
                    int maxAdd = c.max - c.width;
                    if (maxAdd <= 0) {
                        continue;
                    }
                    int add = Math.max(1, remaining * c.weight / Math.max(1, totalWeight));
                    add = Math.min(add, maxAdd);
                    c.width += add;
                    remaining -= add;
                    any = true;
                }
                if (!any) {
                    break;
                }
            }
            int idx = cols.size() - 1;
            while (remaining > 0 && idx >= 0) {
                Col c = cols.get(idx);
                int maxAdd = c.max - c.width;
                if (maxAdd > 0) {
                    int add = Math.min(maxAdd, remaining);
                    c.width += add;
                    remaining -= add;
                }
                idx--;
            }
        }

        /** "lat,lon" with five decimals, or "" when either value is missing. */
        private String formatCoord(Double lat, Double lon) {
            if (lat == null || lon == null) {
                return "";
            }
            // Locale.ROOT keeps '.' as the decimal separator whatever the default
            // locale is; a decimal comma would be ambiguous next to the pair separator.
            return String.format(java.util.Locale.ROOT, "%.5f,%.5f", lat, lon);
        }

        /** "min-maxm", a single "Nm", or "" when both values are missing. */
        private String formatAlt(Integer min, Integer max) {
            if (min == null && max == null) {
                return "";
            }
            if (min != null && max != null) {
                return min + "-" + max + "m";
            }
            if (min != null) {
                return min + "m";
            }
            return max + "m";
        }

        /** Price or price range without trailing zeros, prefixed by the currency when present. */
        private String formatPrice(Double min, Double max, String currency) {
            if (min == null && max == null) {
                return "";
            }
            String cur = currency == null ? "" : currency.trim();
            String range;
            if (min != null && max != null) {
                range = stripTrailingZeros(min) + "-" + stripTrailingZeros(max);
            } else if (min != null) {
                range = stripTrailingZeros(min);
            } else {
                range = stripTrailingZeros(max);
            }
            return cur.isEmpty() ? range : cur + " " + range;
        }

        private String stripTrailingZeros(Double v) {
            try {
                return BigDecimal.valueOf(v).stripTrailingZeros().toPlainString();
            } catch (Exception e) {
                // BigDecimal.valueOf rejects NaN and infinities; print those as-is.
                return String.valueOf(v);
            }
        }
    }

    /** One table column: width bounds, share of spare space, and the width finally assigned. */
    private static class Col {
        private final int min;
        private final int max;
        private final int weight;
        private int width;

        private Col(String name, int min, int max, int weight) {
            this.min = min;
            this.max = max;
            this.weight = weight;
        }
    }

    /** ANSI escape helpers. */
    private static class Ansi {
        private static String wrap(String code, String s) {
            return "\u001B[" + code + "m" + s + "\u001B[0m";
        }

        private static String cyan(String s) {
            return wrap("36", s);
        }
    }

    /** Replaces tabs with spaces and trims; {@code null} becomes "". */
    private static String safe0(String s) {
        if (s == null) {
            return "";
        }
        return s.replace('\t', ' ').trim();
    }

    /** Pads {@code s} with spaces up to {@code width} display columns; wider input is returned unchanged. */
    private static String padRight(String s, int width) {
        int w = displayWidth(s);
        if (w >= width) {
            return s;
        }
        StringBuilder sb = new StringBuilder(s);
        for (int i = 0; i < (width - w); i++) {
            sb.append(' ');
        }
        return sb.toString();
    }

    /**
     * Shortens {@code s} so that it occupies at most {@code width} display
     * columns. An ellipsis ("...") marks the cut when the column has room for
     * it; narrower columns get a plain cut so the result never overflows.
     */
    private static String truncate(String s, int width) {
        if (s == null) {
            return "";
        }
        if (displayWidth(s) <= width) {
            return s;
        }
        if (width <= 0) {
            return "";
        }
        String ell = "...";
        int ellWidth = displayWidth(ell);
        boolean useEllipsis = width >= ellWidth;
        int target = useEllipsis ? width - ellWidth : width;
        StringBuilder sb = new StringBuilder();
        int w = 0;
        for (int i = 0; i < s.length(); ) {
            int cp = s.codePointAt(i);
            String ch = new String(Character.toChars(cp));
            int cw = displayWidth(ch);
            if (w + cw > target) {
                break;
            }
            sb.append(ch);
            w += cw;
            i += Character.charCount(cp);
        }
        if (useEllipsis) {
            sb.append(ell);
        }
        return sb.toString();
    }

    /**
     * Estimates how many terminal columns {@code s} occupies. ANSI colour
     * sequences, control characters, combining marks and format characters
     * count as 0, East Asian wide characters as 2 and everything else
     * (including accented Latin letters) as 1.
     */
    private static int displayWidth(String s) {
        if (s == null || s.isEmpty()) {
            return 0;
        }
        int w = 0;
        for (int i = 0; i < s.length(); ) {
            int cp = s.codePointAt(i);
            if (cp == 27) {
                // Skip an escape sequence up to and including its terminating 'm'.
                int m = s.indexOf('m', i);
                if (m > i) {
                    i = m + 1;
                    continue;
                }
            }
            int step = Character.charCount(cp);
            if (cp <= 0x1F || (cp >= 0x7F && cp <= 0x9F)) {
                i += step;
                continue;
            }
            int type = Character.getType(cp);
            boolean zeroWidth = type == Character.NON_SPACING_MARK
                    || type == Character.ENCLOSING_MARK
                    || type == Character.FORMAT;
            if (!zeroWidth) {
                w += isWide(cp) ? 2 : 1;
            }
            i += step;
        }
        return w;
    }

    /**
     * Approximates the Unicode "wide"/"fullwidth" East Asian Width classes with
     * a handful of block ranges (Hangul, CJK, kana, fullwidth forms, common emoji).
     */
    private static boolean isWide(int cp) {
        return (cp >= 0x1100 && cp <= 0x115F)
                || (cp >= 0x2E80 && cp <= 0xA4CF)
                || (cp >= 0xAC00 && cp <= 0xD7A3)
                || (cp >= 0xF900 && cp <= 0xFAFF)
                || (cp >= 0xFE30 && cp <= 0xFE4F)
                || (cp >= 0xFF00 && cp <= 0xFF60)
                || (cp >= 0xFFE0 && cp <= 0xFFE6)
                || (cp >= 0x1F300 && cp <= 0x1F64F)
                || (cp >= 0x1F900 && cp <= 0x1F9FF)
                || (cp >= 0x20000 && cp <= 0x3FFFD);
    }

    /** Returns {@code s} repeated {@code n} times; "" when {@code n} is not positive. */
    private static String repeat(String s, int n) {
        if (n <= 0) {
            return "";
        }
        StringBuilder sb = new StringBuilder(n * s.length());
        for (int i = 0; i < n; i++) {
            sb.append(s);
        }
        return sb.toString();
    }

    /** Removes ANSI colour sequences; {@code null} becomes "". */
    private static String stripAnsi(String s) {
        if (s == null) {
            return "";
        }
        return s.replaceAll("\\u001B\\[[;\\d]*m", "");
    }
}

13
src/main/resources/logback.xml

@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Logback configuration: a single console appender attached to the root logger. -->
<configuration>
<!-- Console appender named CONSOLE; referenced by the root logger below. -->
<appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<!-- Line layout: time of day with milliseconds, level, thread name in brackets, logger name, message, newline. -->
<pattern>%d{HH:mm:ss.SSS} %-5level [%thread] %logger{36} - %msg%n</pattern>
</encoder>
</appender>
<!-- Root logger at INFO level, sending its events to CONSOLE. -->
<root level="INFO">
<appender-ref ref="CONSOLE"/>
</root>
</configuration>

BIN
target/classes/com/ski/crawler/Main.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/command/Command.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/command/CrawlCommand.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/command/ExportCommand.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/command/FilterCommand.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/command/HelpCommand.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/command/ListCommand.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/command/ResumeCommand$1.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/command/ResumeCommand.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/command/SitesCommand.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/command/StatsCommand.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/controller/CrawlerContext.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/controller/CrawlerController.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/exception/CrawlerException.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/exception/NetworkException.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/exception/ParseException.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/factory/StrategyFactory.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/model/SkiLift.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/model/SkiResort.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/model/SkiReview.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/model/SkiTicket.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/model/SkiTrail.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/parser/ResortDetailParser$Price.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/parser/ResortDetailParser.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/parser/ResortParser.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/repository/SkiResortRepository.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/service/ScraperService$CrawlReport.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/service/ScraperService.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/site/CrawlerSite.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/site/SkimapOrgSite.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/site/SkiresortInfoSite.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/site/WikipediaSite.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/spider/ResortListSpider.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/strategy/CrawlStrategy.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/strategy/SkiResortInfoStrategy.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/strategy/SkimapStrategy.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/strategy/WikipediaStrategy.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/util/CliArgs.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/util/ExcelUtil.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/util/JsonUtil.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/util/RetryUtil.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/util/ValidationUtil.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/utils/CrawlerHttp.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/utils/HttpClientUtil.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/view/ConsoleView$Ansi.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/view/ConsoleView$Col.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/view/ConsoleView$TablePrinter.class

Binary file not shown.

BIN
target/classes/com/ski/crawler/view/ConsoleView.class

Binary file not shown.

13
target/classes/logback.xml

@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Logback configuration: a single console appender attached to the root logger. -->
<!-- NOTE(review): this copy sits under target/classes and looks like a build output of src/main/resources/logback.xml; confirm before editing it by hand. -->
<configuration>
<!-- Console appender named CONSOLE; referenced by the root logger below. -->
<appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<!-- Line layout: time of day with milliseconds, level, thread name in brackets, logger name, message, newline. -->
<pattern>%d{HH:mm:ss.SSS} %-5level [%thread] %logger{36} - %msg%n</pattern>
</encoder>
</appender>
<!-- Root logger at INFO level, sending its events to CONSOLE. -->
<root level="INFO">
<appender-ref ref="CONSOLE"/>
</root>
</configuration>
Loading…
Cancel
Save