commit
8198b8c9b7
25 changed files with 1768 additions and 0 deletions
@ -0,0 +1,28 @@ |
|||||
|
# Maven |
||||
|
target/ |
||||
|
pom.xml.tag |
||||
|
pom.xml.releaseBackup |
||||
|
pom.xml.versionsBackup |
||||
|
pom.xml.next |
||||
|
release.properties |
||||
|
dependency-reduced-pom.xml |
||||
|
buildNumber.properties |
||||
|
.mvn/timing.properties |
||||
|
.mvn/wrapper/maven-wrapper.jar |
||||
|
|
||||
|
# IDE |
||||
|
.idea/ |
||||
|
*.iml |
||||
|
*.ipr |
||||
|
*.iws |
||||
|
.project |
||||
|
.classpath |
||||
|
.settings/ |
||||
|
.vscode/ |
||||
|
|
||||
|
# OS |
||||
|
.DS_Store |
||||
|
Thumbs.db |
||||
|
|
||||
|
# Logs |
||||
|
*.log |
||||
Binary file not shown.
@ -0,0 +1,61 @@ |
|||||
|
[ { |
||||
|
"title" : "A Light in the Attic", |
||||
|
"price" : "51.77" |
||||
|
}, { |
||||
|
"title" : "Tipping the Velvet", |
||||
|
"price" : "53.74" |
||||
|
}, { |
||||
|
"title" : "Soumission", |
||||
|
"price" : "50.10" |
||||
|
}, { |
||||
|
"title" : "Sharp Objects", |
||||
|
"price" : "47.82" |
||||
|
}, { |
||||
|
"title" : "Sapiens: A Brief History of Humankind", |
||||
|
"price" : "54.23" |
||||
|
}, { |
||||
|
"title" : "The Requiem Red", |
||||
|
"price" : "22.65" |
||||
|
}, { |
||||
|
"title" : "The Dirty Little Secrets of Getting Your Dream Job", |
||||
|
"price" : "33.34" |
||||
|
}, { |
||||
|
"title" : "The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull", |
||||
|
"price" : "17.93" |
||||
|
}, { |
||||
|
"title" : "The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics", |
||||
|
"price" : "22.60" |
||||
|
}, { |
||||
|
"title" : "The Black Maria", |
||||
|
"price" : "52.15" |
||||
|
}, { |
||||
|
"title" : "Starving Hearts (Triangular Trade Trilogy, #1)", |
||||
|
"price" : "13.99" |
||||
|
}, { |
||||
|
"title" : "Shakespeare's Sonnets", |
||||
|
"price" : "20.66" |
||||
|
}, { |
||||
|
"title" : "Set Me Free", |
||||
|
"price" : "17.46" |
||||
|
}, { |
||||
|
"title" : "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)", |
||||
|
"price" : "52.29" |
||||
|
}, { |
||||
|
"title" : "Rip it Up and Start Again", |
||||
|
"price" : "35.02" |
||||
|
}, { |
||||
|
"title" : "Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991", |
||||
|
"price" : "57.25" |
||||
|
}, { |
||||
|
"title" : "Olio", |
||||
|
"price" : "23.88" |
||||
|
}, { |
||||
|
"title" : "Mesaerion: The Best Science Fiction Stories 1800-1849", |
||||
|
"price" : "37.59" |
||||
|
}, { |
||||
|
"title" : "Libertarianism for Beginners", |
||||
|
"price" : "51.33" |
||||
|
}, { |
||||
|
"title" : "It's Only the Himalayas", |
||||
|
"price" : "45.17" |
||||
|
} ] |
||||
File diff suppressed because it is too large
@ -0,0 +1,31 @@ |
|||||
|
[ { |
||||
|
"text" : "“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”", |
||||
|
"author" : "Albert Einstein" |
||||
|
}, { |
||||
|
"text" : "“It is our choices, Harry, that show what we truly are, far more than our abilities.”", |
||||
|
"author" : "J.K. Rowling" |
||||
|
}, { |
||||
|
"text" : "“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”", |
||||
|
"author" : "Albert Einstein" |
||||
|
}, { |
||||
|
"text" : "“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”", |
||||
|
"author" : "Jane Austen" |
||||
|
}, { |
||||
|
"text" : "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”", |
||||
|
"author" : "Marilyn Monroe" |
||||
|
}, { |
||||
|
"text" : "“Try not to become a man of success. Rather become a man of value.”", |
||||
|
"author" : "Albert Einstein" |
||||
|
}, { |
||||
|
"text" : "“It is better to be hated for what you are than to be loved for what you are not.”", |
||||
|
"author" : "André Gide" |
||||
|
}, { |
||||
|
"text" : "“I have not failed. I've just found 10,000 ways that won't work.”", |
||||
|
"author" : "Thomas A. Edison" |
||||
|
}, { |
||||
|
"text" : "“A woman is like a tea bag; you never know how strong it is until it's in hot water.”", |
||||
|
"author" : "Eleanor Roosevelt" |
||||
|
}, { |
||||
|
"text" : "“A day without sunshine is like, you know, night.”", |
||||
|
"author" : "Steve Martin" |
||||
|
} ] |
||||
@ -0,0 +1,76 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" |
||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> |
||||
|
<modelVersion>4.0.0</modelVersion> |
||||
|
|
||||
|
<groupId>com.scraper</groupId> |
||||
|
<artifactId>web-scraper</artifactId> |
||||
|
<version>1.0-SNAPSHOT</version> |
||||
|
<packaging>jar</packaging> |
||||
|
|
||||
|
<name>Web Scraper</name> |
||||
|
<description>A web scraping application</description> |
||||
|
|
||||
|
<properties> |
||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> |
||||
|
<maven.compiler.source>11</maven.compiler.source> |
||||
|
<maven.compiler.target>11</maven.compiler.target> |
||||
|
</properties> |
||||
|
|
||||
|
<dependencies> |
||||
|
<dependency> |
||||
|
<groupId>org.jsoup</groupId> |
||||
|
<artifactId>jsoup</artifactId> |
||||
|
<version>1.17.2</version> |
||||
|
</dependency> |
||||
|
<dependency> |
||||
|
<groupId>org.apache.httpcomponents.client5</groupId> |
||||
|
<artifactId>httpclient5</artifactId> |
||||
|
<version>5.4.1</version> |
||||
|
</dependency> |
||||
|
<dependency> |
||||
|
<groupId>info.picocli</groupId> |
||||
|
<artifactId>picocli</artifactId> |
||||
|
<version>4.7.6</version> |
||||
|
</dependency> |
||||
|
<dependency> |
||||
|
<groupId>com.fasterxml.jackson.core</groupId> |
||||
|
<artifactId>jackson-databind</artifactId> |
||||
|
<version>2.17.2</version> |
||||
|
</dependency> |
||||
|
<dependency> |
||||
|
<groupId>com.fasterxml.jackson.core</groupId> |
||||
|
<artifactId>jackson-core</artifactId> |
||||
|
<version>2.17.2</version> |
||||
|
</dependency> |
||||
|
<dependency> |
||||
|
<groupId>com.fasterxml.jackson.core</groupId> |
||||
|
<artifactId>jackson-annotations</artifactId> |
||||
|
<version>2.17.2</version> |
||||
|
</dependency> |
||||
|
</dependencies> |
||||
|
|
||||
|
<build> |
||||
|
<plugins> |
||||
|
<plugin> |
||||
|
<groupId>org.apache.maven.plugins</groupId> |
||||
|
<artifactId>maven-compiler-plugin</artifactId> |
||||
|
<version>3.13.0</version> |
||||
|
<configuration> |
||||
|
<source>11</source> |
||||
|
<target>11</target> |
||||
|
</configuration> |
||||
|
</plugin> |
||||
|
<plugin> |
||||
|
<groupId>org.codehaus.mojo</groupId> |
||||
|
<artifactId>exec-maven-plugin</artifactId> |
||||
|
<version>3.1.0</version> |
||||
|
<configuration> |
||||
|
<mainClass>com.scraper.Main</mainClass> |
||||
|
<commandlineArgs>--site all --output ./output</commandlineArgs> |
||||
|
</configuration> |
||||
|
</plugin> |
||||
|
</plugins> |
||||
|
</build> |
||||
|
</project> |
||||
@ -0,0 +1,109 @@ |
|||||
|
package com.scraper; |
||||
|
|
||||
|
import com.scraper.command.CrawlerCommand; |
||||
|
import com.scraper.command.CrawlAllCommand; |
||||
|
import com.scraper.command.CrawlBooksCommand; |
||||
|
import com.scraper.command.CrawlCountriesCommand; |
||||
|
import com.scraper.command.CrawlQuotesCommand; |
||||
|
import com.scraper.exception.CrawlerException; |
||||
|
import com.scraper.exception.NetworkException; |
||||
|
import com.scraper.exception.ParseException; |
||||
|
import com.scraper.exception.StorageException; |
||||
|
import com.scraper.strategy.SiteABooksStrategy; |
||||
|
import com.scraper.strategy.SiteBQuotesStrategy; |
||||
|
import com.scraper.strategy.SiteCCountriesStrategy; |
||||
|
import picocli.CommandLine; |
||||
|
import picocli.CommandLine.Command; |
||||
|
import picocli.CommandLine.Option; |
||||
|
|
||||
|
|
||||
|
|
||||
|
/** |
||||
|
* 爬虫程序的主入口类,使用 Picocli 实现命令行解析。 |
||||
|
* 支持爬取书籍、名言、国家信息或全部内容,并保存为 JSON 文件。 |
||||
|
*/ |
||||
|
@Command(name = "webscraper", mixinStandardHelpOptions = true, version = "1.0", |
||||
|
description = "网页爬虫程序,支持爬取书籍、名言和国家信息。") |
||||
|
public class Main implements Runnable { |
||||
|
|
||||
|
/** |
||||
|
* 要爬取的网站类型,可选值:books、quotes、countries、all(不区分大小写),默认为 all |
||||
|
*/ |
||||
|
@Option(names = {"-s", "--site"}, description = "要爬取的网站类型:books、quotes、countries、all(默认:all)") |
||||
|
private String site = "all"; |
||||
|
|
||||
|
/** |
||||
|
* 输出目录,默认为 "./output" |
||||
|
*/ |
||||
|
@Option(names = {"-o", "--output"}, description = "输出目录(默认:./output)") |
||||
|
private String outputDir = "./output"; |
||||
|
|
||||
|
/** |
||||
|
* 主方法,程序入口 |
||||
|
* @param args 命令行参数 |
||||
|
*/ |
||||
|
public static void main(String[] args) { |
||||
|
int exitCode = new CommandLine(new Main()).execute(args); |
||||
|
System.exit(exitCode); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 执行业务逻辑 |
||||
|
*/ |
||||
|
@Override |
||||
|
public void run() { |
||||
|
try { |
||||
|
// 创建策略实例
|
||||
|
SiteABooksStrategy booksStrategy = new SiteABooksStrategy(); |
||||
|
SiteBQuotesStrategy quotesStrategy = new SiteBQuotesStrategy(); |
||||
|
SiteCCountriesStrategy countriesStrategy = new SiteCCountriesStrategy(); |
||||
|
|
||||
|
CrawlerCommand command; |
||||
|
|
||||
|
// 根据 --site 选项选择对应的命令
|
||||
|
switch (site.toLowerCase()) { |
||||
|
case "books": |
||||
|
command = new CrawlBooksCommand(booksStrategy, outputDir); |
||||
|
break; |
||||
|
case "quotes": |
||||
|
command = new CrawlQuotesCommand(quotesStrategy, outputDir); |
||||
|
break; |
||||
|
case "countries": |
||||
|
command = new CrawlCountriesCommand(countriesStrategy, outputDir); |
||||
|
break; |
||||
|
case "all": |
||||
|
default: |
||||
|
command = new CrawlAllCommand(booksStrategy, quotesStrategy, countriesStrategy, outputDir); |
||||
|
break; |
||||
|
} |
||||
|
|
||||
|
// 执行命令
|
||||
|
command.execute(); |
||||
|
|
||||
|
} catch (NetworkException e) { |
||||
|
System.err.println("网络错误:" + e.getMessage()); |
||||
|
if (e.getCause() != null) { |
||||
|
System.err.println("原因:" + e.getCause().getMessage()); |
||||
|
} |
||||
|
System.exit(1); |
||||
|
} catch (ParseException e) { |
||||
|
System.err.println("解析失败:" + e.getMessage()); |
||||
|
if (e.getCause() != null) { |
||||
|
System.err.println("原因:" + e.getCause().getMessage()); |
||||
|
} |
||||
|
System.exit(1); |
||||
|
} catch (StorageException e) { |
||||
|
System.err.println("存储异常:" + e.getMessage()); |
||||
|
if (e.getCause() != null) { |
||||
|
System.err.println("原因:" + e.getCause().getMessage()); |
||||
|
} |
||||
|
System.exit(1); |
||||
|
} catch (CrawlerException e) { |
||||
|
System.err.println("爬取异常:" + e.getMessage()); |
||||
|
if (e.getCause() != null) { |
||||
|
System.err.println("原因:" + e.getCause().getMessage()); |
||||
|
} |
||||
|
System.exit(1); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,45 @@ |
|||||
|
package com.scraper.command; |
||||
|
|
||||
|
import com.scraper.exception.CrawlerException; |
||||
|
import com.scraper.model.Book; |
||||
|
import com.scraper.model.Country; |
||||
|
import com.scraper.model.Quote; |
||||
|
import com.scraper.strategy.SiteABooksStrategy; |
||||
|
import com.scraper.strategy.SiteBQuotesStrategy; |
||||
|
import com.scraper.strategy.SiteCCountriesStrategy; |
||||
|
import com.scraper.view.ConsoleView; |
||||
|
import com.scraper.view.FileSaver; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class CrawlAllCommand implements CrawlerCommand { |
||||
|
private SiteABooksStrategy booksStrategy; |
||||
|
private SiteBQuotesStrategy quotesStrategy; |
||||
|
private SiteCCountriesStrategy countriesStrategy; |
||||
|
private String outputDir; |
||||
|
|
||||
|
public CrawlAllCommand(SiteABooksStrategy booksStrategy, SiteBQuotesStrategy quotesStrategy, SiteCCountriesStrategy countriesStrategy, String outputDir) { |
||||
|
this.booksStrategy = booksStrategy; |
||||
|
this.quotesStrategy = quotesStrategy; |
||||
|
this.countriesStrategy = countriesStrategy; |
||||
|
this.outputDir = outputDir; |
||||
|
} |
||||
|
|
||||
|
public CrawlAllCommand(SiteABooksStrategy booksStrategy, SiteBQuotesStrategy quotesStrategy, SiteCCountriesStrategy countriesStrategy) { |
||||
|
this(booksStrategy, quotesStrategy, countriesStrategy, "./output"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute() throws CrawlerException { |
||||
|
List<Book> books = booksStrategy.crawl("http://books.toscrape.com"); |
||||
|
ConsoleView.printBooks(books); |
||||
|
FileSaver.saveToJson(books, outputDir + "/books.json"); |
||||
|
|
||||
|
List<Quote> quotes = quotesStrategy.crawl("http://quotes.toscrape.com"); |
||||
|
ConsoleView.printQuotes(quotes); |
||||
|
FileSaver.saveToJson(quotes, outputDir + "/quotes.json"); |
||||
|
|
||||
|
List<Country> countries = countriesStrategy.crawl("https://www.scrapethissite.com/pages/simple/"); |
||||
|
ConsoleView.printCountries(countries); |
||||
|
FileSaver.saveToJson(countries, outputDir + "/countries.json"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,29 @@ |
|||||
|
package com.scraper.command; |
||||
|
|
||||
|
import com.scraper.exception.CrawlerException; |
||||
|
import com.scraper.model.Book; |
||||
|
import com.scraper.strategy.SiteABooksStrategy; |
||||
|
import com.scraper.view.ConsoleView; |
||||
|
import com.scraper.view.FileSaver; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class CrawlBooksCommand implements CrawlerCommand { |
||||
|
private SiteABooksStrategy strategy; |
||||
|
private String outputDir; |
||||
|
|
||||
|
public CrawlBooksCommand(SiteABooksStrategy strategy, String outputDir) { |
||||
|
this.strategy = strategy; |
||||
|
this.outputDir = outputDir; |
||||
|
} |
||||
|
|
||||
|
public CrawlBooksCommand(SiteABooksStrategy strategy) { |
||||
|
this(strategy, "./output"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute() throws CrawlerException { |
||||
|
List<Book> books = strategy.crawl("http://books.toscrape.com"); |
||||
|
ConsoleView.printBooks(books); |
||||
|
FileSaver.saveToJson(books, outputDir + "/books.json"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,29 @@ |
|||||
|
package com.scraper.command; |
||||
|
|
||||
|
import com.scraper.exception.CrawlerException; |
||||
|
import com.scraper.model.Country; |
||||
|
import com.scraper.strategy.SiteCCountriesStrategy; |
||||
|
import com.scraper.view.ConsoleView; |
||||
|
import com.scraper.view.FileSaver; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class CrawlCountriesCommand implements CrawlerCommand { |
||||
|
private SiteCCountriesStrategy strategy; |
||||
|
private String outputDir; |
||||
|
|
||||
|
public CrawlCountriesCommand(SiteCCountriesStrategy strategy, String outputDir) { |
||||
|
this.strategy = strategy; |
||||
|
this.outputDir = outputDir; |
||||
|
} |
||||
|
|
||||
|
public CrawlCountriesCommand(SiteCCountriesStrategy strategy) { |
||||
|
this(strategy, "./output"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute() throws CrawlerException { |
||||
|
List<Country> countries = strategy.crawl("https://www.scrapethissite.com/pages/simple/"); |
||||
|
ConsoleView.printCountries(countries); |
||||
|
FileSaver.saveToJson(countries, outputDir + "/countries.json"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,29 @@ |
|||||
|
package com.scraper.command; |
||||
|
|
||||
|
import com.scraper.exception.CrawlerException; |
||||
|
import com.scraper.model.Quote; |
||||
|
import com.scraper.strategy.SiteBQuotesStrategy; |
||||
|
import com.scraper.view.ConsoleView; |
||||
|
import com.scraper.view.FileSaver; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class CrawlQuotesCommand implements CrawlerCommand { |
||||
|
private SiteBQuotesStrategy strategy; |
||||
|
private String outputDir; |
||||
|
|
||||
|
public CrawlQuotesCommand(SiteBQuotesStrategy strategy, String outputDir) { |
||||
|
this.strategy = strategy; |
||||
|
this.outputDir = outputDir; |
||||
|
} |
||||
|
|
||||
|
public CrawlQuotesCommand(SiteBQuotesStrategy strategy) { |
||||
|
this(strategy, "./output"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute() throws CrawlerException { |
||||
|
List<Quote> quotes = strategy.crawl("http://quotes.toscrape.com"); |
||||
|
ConsoleView.printQuotes(quotes); |
||||
|
FileSaver.saveToJson(quotes, outputDir + "/quotes.json"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,7 @@ |
|||||
|
package com.scraper.command; |
||||
|
|
||||
|
import com.scraper.exception.CrawlerException; |
||||
|
|
||||
|
public interface CrawlerCommand { |
||||
|
void execute() throws CrawlerException; |
||||
|
} |
||||
@ -0,0 +1,7 @@ |
|||||
|
package com.scraper.exception; |
||||
|
|
||||
|
/**
 * Checked base type for every failure raised by the scraper.
 * The class is abstract, so callers always deal with a concrete subclass.
 */
public abstract class CrawlerException extends Exception {

    /**
     * Protected because an abstract class can only be constructed through a subclass.
     *
     * @param message description of the failure
     * @param cause   underlying exception; may be {@code null}
     */
    protected CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }
}
||||
@ -0,0 +1,7 @@ |
|||||
|
package com.scraper.exception; |
||||
|
|
||||
|
public class NetworkException extends CrawlerException { |
||||
|
public NetworkException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,7 @@ |
|||||
|
package com.scraper.exception; |
||||
|
|
||||
|
public class ParseException extends CrawlerException { |
||||
|
public ParseException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,7 @@ |
|||||
|
package com.scraper.exception; |
||||
|
|
||||
|
public class StorageException extends CrawlerException { |
||||
|
public StorageException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,24 @@ |
|||||
|
package com.scraper.model; |
||||
|
|
||||
|
/**
 * A scraped book: its title and its price, both kept as text.
 */
public class Book {
    private String title;
    private String price;

    /**
     * @param title book title
     * @param price price text
     */
    public Book(String title, String price) {
        this.title = title;
        this.price = price;
    }

    /** @return the book title */
    public String getTitle() {
        return this.title;
    }

    /** @return the price text */
    public String getPrice() {
        return this.price;
    }

    /** @return a debug representation of the form {@code Book{title='…', price='…'}} */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("Book{title='");
        sb.append(title);
        sb.append("', price='");
        sb.append(price);
        sb.append("'}");
        return sb.toString();
    }
}
||||
@ -0,0 +1,30 @@ |
|||||
|
package com.scraper.model; |
||||
|
|
||||
|
/**
 * A scraped country record: name, capital and population, all kept as text.
 */
public class Country {
    private String name;
    private String capital;
    private String population;

    /**
     * @param name       country name
     * @param capital    capital city
     * @param population population text
     */
    public Country(String name, String capital, String population) {
        this.name = name;
        this.capital = capital;
        this.population = population;
    }

    /** @return the country name */
    public String getName() {
        return this.name;
    }

    /** @return the capital city */
    public String getCapital() {
        return this.capital;
    }

    /** @return the population text */
    public String getPopulation() {
        return this.population;
    }

    /** @return a debug representation of the form {@code Country{name='…', capital='…', population='…'}} */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("Country{name='");
        sb.append(name);
        sb.append("', capital='");
        sb.append(capital);
        sb.append("', population='");
        sb.append(population);
        sb.append("'}");
        return sb.toString();
    }
}
||||
@ -0,0 +1,24 @@ |
|||||
|
package com.scraper.model; |
||||
|
|
||||
|
/**
 * A scraped quotation: its text and the name of its author.
 */
public class Quote {
    private String text;
    private String author;

    /**
     * @param text   quotation text
     * @param author author name
     */
    public Quote(String text, String author) {
        this.text = text;
        this.author = author;
    }

    /** @return the quotation text */
    public String getText() {
        return this.text;
    }

    /** @return the author name */
    public String getAuthor() {
        return this.author;
    }

    /** @return a debug representation of the form {@code Quote{text='…', author='…'}} */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("Quote{text='");
        sb.append(text);
        sb.append("', author='");
        sb.append(author);
        sb.append("'}");
        return sb.toString();
    }
}
||||
@ -0,0 +1,8 @@ |
|||||
|
package com.scraper.strategy; |
||||
|
|
||||
|
import com.scraper.exception.CrawlerException; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public interface CrawlStrategy<T> { |
||||
|
List<T> crawl(String url) throws CrawlerException; |
||||
|
} |
||||
@ -0,0 +1,51 @@ |
|||||
|
package com.scraper.strategy; |
||||
|
|
||||
|
import com.scraper.exception.CrawlerException; |
||||
|
import com.scraper.exception.NetworkException; |
||||
|
import com.scraper.exception.ParseException; |
||||
|
import com.scraper.model.Book; |
||||
|
import org.apache.hc.client5.http.classic.methods.HttpGet; |
||||
|
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; |
||||
|
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; |
||||
|
import org.apache.hc.client5.http.impl.classic.HttpClients; |
||||
|
|
||||
|
import org.apache.hc.core5.http.io.entity.EntityUtils; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class SiteABooksStrategy implements CrawlStrategy<Book> { |
||||
|
@Override |
||||
|
public List<Book> crawl(String url) throws CrawlerException { |
||||
|
System.out.println("正在爬取 [http://books.toscrape.com]..."); |
||||
|
List<Book> books = new ArrayList<>(); |
||||
|
|
||||
|
try (CloseableHttpClient httpClient = HttpClients.createDefault()) { |
||||
|
HttpGet httpGet = new HttpGet(url); |
||||
|
|
||||
|
try (CloseableHttpResponse response = httpClient.execute(httpGet)) { |
||||
|
String html = EntityUtils.toString(response.getEntity()); |
||||
|
Document doc = Jsoup.parse(html); |
||||
|
Elements productPods = doc.select(".product_pod"); |
||||
|
|
||||
|
for (Element pod : productPods) { |
||||
|
String title = pod.select("h3 > a").attr("title"); |
||||
|
String priceText = pod.select(".price_color").text(); |
||||
|
String price = priceText.replace("£", ""); |
||||
|
books.add(new Book(title, price)); |
||||
|
} |
||||
|
} catch (org.apache.hc.core5.http.ParseException e) { |
||||
|
throw new ParseException("解析响应内容失败", e); |
||||
|
} |
||||
|
} catch (IOException e) { |
||||
|
throw new NetworkException("网络请求失败", e); |
||||
|
} |
||||
|
|
||||
|
return books; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,50 @@ |
|||||
|
package com.scraper.strategy; |
||||
|
|
||||
|
import com.scraper.exception.CrawlerException; |
||||
|
import com.scraper.exception.NetworkException; |
||||
|
import com.scraper.exception.ParseException; |
||||
|
import com.scraper.model.Quote; |
||||
|
import org.apache.hc.client5.http.classic.methods.HttpGet; |
||||
|
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; |
||||
|
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; |
||||
|
import org.apache.hc.client5.http.impl.classic.HttpClients; |
||||
|
|
||||
|
import org.apache.hc.core5.http.io.entity.EntityUtils; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class SiteBQuotesStrategy implements CrawlStrategy<Quote> { |
||||
|
@Override |
||||
|
public List<Quote> crawl(String url) throws CrawlerException { |
||||
|
System.out.println("正在爬取 [http://quotes.toscrape.com]..."); |
||||
|
List<Quote> quotes = new ArrayList<>(); |
||||
|
|
||||
|
try (CloseableHttpClient httpClient = HttpClients.createDefault()) { |
||||
|
HttpGet httpGet = new HttpGet(url); |
||||
|
|
||||
|
try (CloseableHttpResponse response = httpClient.execute(httpGet)) { |
||||
|
String html = EntityUtils.toString(response.getEntity()); |
||||
|
Document doc = Jsoup.parse(html); |
||||
|
Elements quoteElements = doc.select(".quote"); |
||||
|
|
||||
|
for (Element quoteEl : quoteElements) { |
||||
|
String text = quoteEl.select(".text").text(); |
||||
|
String author = quoteEl.select(".author").text(); |
||||
|
quotes.add(new Quote(text, author)); |
||||
|
} |
||||
|
} catch (org.apache.hc.core5.http.ParseException e) { |
||||
|
throw new ParseException("解析响应内容失败", e); |
||||
|
} |
||||
|
} catch (IOException e) { |
||||
|
throw new NetworkException("网络请求失败", e); |
||||
|
} |
||||
|
|
||||
|
return quotes; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,51 @@ |
|||||
|
package com.scraper.strategy; |
||||
|
|
||||
|
import com.scraper.exception.CrawlerException; |
||||
|
import com.scraper.exception.NetworkException; |
||||
|
import com.scraper.exception.ParseException; |
||||
|
import com.scraper.model.Country; |
||||
|
import org.apache.hc.client5.http.classic.methods.HttpGet; |
||||
|
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; |
||||
|
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; |
||||
|
import org.apache.hc.client5.http.impl.classic.HttpClients; |
||||
|
|
||||
|
import org.apache.hc.core5.http.io.entity.EntityUtils; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class SiteCCountriesStrategy implements CrawlStrategy<Country> { |
||||
|
@Override |
||||
|
public List<Country> crawl(String url) throws CrawlerException { |
||||
|
System.out.println("正在爬取 [https://www.scrapethissite.com/pages/simple/]..."); |
||||
|
List<Country> countries = new ArrayList<>(); |
||||
|
|
||||
|
try (CloseableHttpClient httpClient = HttpClients.createDefault()) { |
||||
|
HttpGet httpGet = new HttpGet(url); |
||||
|
|
||||
|
try (CloseableHttpResponse response = httpClient.execute(httpGet)) { |
||||
|
String html = EntityUtils.toString(response.getEntity()); |
||||
|
Document doc = Jsoup.parse(html); |
||||
|
Elements countryElements = doc.select(".country"); |
||||
|
|
||||
|
for (Element countryEl : countryElements) { |
||||
|
String name = countryEl.select(".country-name").text().trim(); |
||||
|
String capital = countryEl.select(".country-capital").text().trim(); |
||||
|
String population = countryEl.select(".country-population").text().trim(); |
||||
|
countries.add(new Country(name, capital, population)); |
||||
|
} |
||||
|
} catch (org.apache.hc.core5.http.ParseException e) { |
||||
|
throw new ParseException("解析响应内容失败", e); |
||||
|
} |
||||
|
} catch (IOException e) { |
||||
|
throw new NetworkException("网络请求失败", e); |
||||
|
} |
||||
|
|
||||
|
return countries; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,26 @@ |
|||||
|
package com.scraper.view; |
||||
|
|
||||
|
import com.scraper.model.Book; |
||||
|
import com.scraper.model.Country; |
||||
|
import com.scraper.model.Quote; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class ConsoleView { |
||||
|
public static void printBooks(List<Book> books) { |
||||
|
for (Book book : books) { |
||||
|
System.out.println("书名: 《" + book.getTitle() + "》, 价格: £" + book.getPrice()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static void printQuotes(List<Quote> quotes) { |
||||
|
for (Quote quote : quotes) { |
||||
|
System.out.println("\"" + quote.getText() + "\" —— " + quote.getAuthor()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static void printCountries(List<Country> countries) { |
||||
|
for (Country country : countries) { |
||||
|
System.out.println("国家: " + country.getName() + ", 首都: " + country.getCapital() + ", 人口: " + country.getPopulation()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,31 @@ |
|||||
|
package com.scraper.view; |
||||
|
|
||||
|
import com.fasterxml.jackson.databind.ObjectMapper; |
||||
|
import com.scraper.exception.StorageException; |
||||
|
import java.io.File; |
||||
|
import java.io.IOException; |
||||
|
import java.nio.file.Files; |
||||
|
import java.nio.file.Path; |
||||
|
import java.nio.file.Paths; |
||||
|
import java.util.List; |
||||
|
import java.util.function.Function; |
||||
|
|
||||
|
public class FileSaver { |
||||
|
public static void saveToJson(Object data, String filePath) throws StorageException { |
||||
|
try { |
||||
|
Path path = Paths.get(filePath); |
||||
|
Path parentDir = path.getParent(); |
||||
|
if (parentDir != null) { |
||||
|
Files.createDirectories(parentDir); |
||||
|
} |
||||
|
ObjectMapper mapper = new ObjectMapper(); |
||||
|
mapper.writerWithDefaultPrettyPrinter().writeValue(new File(filePath), data); |
||||
|
} catch (IOException e) { |
||||
|
throw new StorageException("无法写入 JSON 文件: " + filePath, e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public static void saveToCsv(List<?> items, String filePath, String[] headers, Function<Object, String> rowMapper) throws StorageException { |
||||
|
throw new UnsupportedOperationException("CSV 保存功能暂未实现"); |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue