commit
8198b8c9b7
25 changed files with 1768 additions and 0 deletions
@ -0,0 +1,28 @@ |
|||
# Maven |
|||
target/ |
|||
pom.xml.tag |
|||
pom.xml.releaseBackup |
|||
pom.xml.versionsBackup |
|||
pom.xml.next |
|||
release.properties |
|||
dependency-reduced-pom.xml |
|||
buildNumber.properties |
|||
.mvn/timing.properties |
|||
.mvn/wrapper/maven-wrapper.jar |
|||
|
|||
# IDE |
|||
.idea/ |
|||
*.iml |
|||
*.ipr |
|||
*.iws |
|||
.project |
|||
.classpath |
|||
.settings/ |
|||
.vscode/ |
|||
|
|||
# OS |
|||
.DS_Store |
|||
Thumbs.db |
|||
|
|||
# Logs |
|||
*.log |
|||
Binary file not shown.
@ -0,0 +1,61 @@ |
|||
[ { |
|||
"title" : "A Light in the Attic", |
|||
"price" : "51.77" |
|||
}, { |
|||
"title" : "Tipping the Velvet", |
|||
"price" : "53.74" |
|||
}, { |
|||
"title" : "Soumission", |
|||
"price" : "50.10" |
|||
}, { |
|||
"title" : "Sharp Objects", |
|||
"price" : "47.82" |
|||
}, { |
|||
"title" : "Sapiens: A Brief History of Humankind", |
|||
"price" : "54.23" |
|||
}, { |
|||
"title" : "The Requiem Red", |
|||
"price" : "22.65" |
|||
}, { |
|||
"title" : "The Dirty Little Secrets of Getting Your Dream Job", |
|||
"price" : "33.34" |
|||
}, { |
|||
"title" : "The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull", |
|||
"price" : "17.93" |
|||
}, { |
|||
"title" : "The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics", |
|||
"price" : "22.60" |
|||
}, { |
|||
"title" : "The Black Maria", |
|||
"price" : "52.15" |
|||
}, { |
|||
"title" : "Starving Hearts (Triangular Trade Trilogy, #1)", |
|||
"price" : "13.99" |
|||
}, { |
|||
"title" : "Shakespeare's Sonnets", |
|||
"price" : "20.66" |
|||
}, { |
|||
"title" : "Set Me Free", |
|||
"price" : "17.46" |
|||
}, { |
|||
"title" : "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)", |
|||
"price" : "52.29" |
|||
}, { |
|||
"title" : "Rip it Up and Start Again", |
|||
"price" : "35.02" |
|||
}, { |
|||
"title" : "Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991", |
|||
"price" : "57.25" |
|||
}, { |
|||
"title" : "Olio", |
|||
"price" : "23.88" |
|||
}, { |
|||
"title" : "Mesaerion: The Best Science Fiction Stories 1800-1849", |
|||
"price" : "37.59" |
|||
}, { |
|||
"title" : "Libertarianism for Beginners", |
|||
"price" : "51.33" |
|||
}, { |
|||
"title" : "It's Only the Himalayas", |
|||
"price" : "45.17" |
|||
} ] |
|||
File diff suppressed because it is too large
@ -0,0 +1,31 @@ |
|||
[ { |
|||
"text" : "“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”", |
|||
"author" : "Albert Einstein" |
|||
}, { |
|||
"text" : "“It is our choices, Harry, that show what we truly are, far more than our abilities.”", |
|||
"author" : "J.K. Rowling" |
|||
}, { |
|||
"text" : "“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”", |
|||
"author" : "Albert Einstein" |
|||
}, { |
|||
"text" : "“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”", |
|||
"author" : "Jane Austen" |
|||
}, { |
|||
"text" : "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”", |
|||
"author" : "Marilyn Monroe" |
|||
}, { |
|||
"text" : "“Try not to become a man of success. Rather become a man of value.”", |
|||
"author" : "Albert Einstein" |
|||
}, { |
|||
"text" : "“It is better to be hated for what you are than to be loved for what you are not.”", |
|||
"author" : "André Gide" |
|||
}, { |
|||
"text" : "“I have not failed. I've just found 10,000 ways that won't work.”", |
|||
"author" : "Thomas A. Edison" |
|||
}, { |
|||
"text" : "“A woman is like a tea bag; you never know how strong it is until it's in hot water.”", |
|||
"author" : "Eleanor Roosevelt" |
|||
}, { |
|||
"text" : "“A day without sunshine is like, you know, night.”", |
|||
"author" : "Steve Martin" |
|||
} ] |
|||
@ -0,0 +1,76 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project xmlns="http://maven.apache.org/POM/4.0.0" |
|||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
|||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> |
|||
<modelVersion>4.0.0</modelVersion> |
|||
|
|||
<groupId>com.scraper</groupId> |
|||
<artifactId>web-scraper</artifactId> |
|||
<version>1.0-SNAPSHOT</version> |
|||
<packaging>jar</packaging> |
|||
|
|||
<name>Web Scraper</name> |
|||
<description>A web scraping application</description> |
|||
|
|||
<properties> |
|||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> |
|||
<maven.compiler.source>11</maven.compiler.source> |
|||
<maven.compiler.target>11</maven.compiler.target> |
|||
</properties> |
|||
|
|||
<dependencies> |
|||
<dependency> |
|||
<groupId>org.jsoup</groupId> |
|||
<artifactId>jsoup</artifactId> |
|||
<version>1.17.2</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.apache.httpcomponents.client5</groupId> |
|||
<artifactId>httpclient5</artifactId> |
|||
<version>5.4.1</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>info.picocli</groupId> |
|||
<artifactId>picocli</artifactId> |
|||
<version>4.7.6</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>com.fasterxml.jackson.core</groupId> |
|||
<artifactId>jackson-databind</artifactId> |
|||
<version>2.17.2</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>com.fasterxml.jackson.core</groupId> |
|||
<artifactId>jackson-core</artifactId> |
|||
<version>2.17.2</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>com.fasterxml.jackson.core</groupId> |
|||
<artifactId>jackson-annotations</artifactId> |
|||
<version>2.17.2</version> |
|||
</dependency> |
|||
</dependencies> |
|||
|
|||
<build> |
|||
<plugins> |
|||
<plugin> |
|||
<groupId>org.apache.maven.plugins</groupId> |
|||
<artifactId>maven-compiler-plugin</artifactId> |
|||
<version>3.13.0</version> |
|||
<configuration> |
|||
<source>11</source> |
|||
<target>11</target> |
|||
</configuration> |
|||
</plugin> |
|||
<plugin> |
|||
<groupId>org.codehaus.mojo</groupId> |
|||
<artifactId>exec-maven-plugin</artifactId> |
|||
<version>3.1.0</version> |
|||
<configuration> |
|||
<mainClass>com.scraper.Main</mainClass> |
|||
<commandlineArgs>--site all --output ./output</commandlineArgs> |
|||
</configuration> |
|||
</plugin> |
|||
</plugins> |
|||
</build> |
|||
</project> |
|||
@ -0,0 +1,109 @@ |
|||
package com.scraper; |
|||
|
|||
import com.scraper.command.CrawlerCommand; |
|||
import com.scraper.command.CrawlAllCommand; |
|||
import com.scraper.command.CrawlBooksCommand; |
|||
import com.scraper.command.CrawlCountriesCommand; |
|||
import com.scraper.command.CrawlQuotesCommand; |
|||
import com.scraper.exception.CrawlerException; |
|||
import com.scraper.exception.NetworkException; |
|||
import com.scraper.exception.ParseException; |
|||
import com.scraper.exception.StorageException; |
|||
import com.scraper.strategy.SiteABooksStrategy; |
|||
import com.scraper.strategy.SiteBQuotesStrategy; |
|||
import com.scraper.strategy.SiteCCountriesStrategy; |
|||
import picocli.CommandLine; |
|||
import picocli.CommandLine.Command; |
|||
import picocli.CommandLine.Option; |
|||
|
|||
|
|||
|
|||
/** |
|||
* 爬虫程序的主入口类,使用 Picocli 实现命令行解析。 |
|||
* 支持爬取书籍、名言、国家信息或全部内容,并保存为 JSON 文件。 |
|||
*/ |
|||
@Command(name = "webscraper", mixinStandardHelpOptions = true, version = "1.0", |
|||
description = "网页爬虫程序,支持爬取书籍、名言和国家信息。") |
|||
public class Main implements Runnable { |
|||
|
|||
/** |
|||
* 要爬取的网站类型,可选值:books、quotes、countries、all(不区分大小写),默认为 all |
|||
*/ |
|||
@Option(names = {"-s", "--site"}, description = "要爬取的网站类型:books、quotes、countries、all(默认:all)") |
|||
private String site = "all"; |
|||
|
|||
/** |
|||
* 输出目录,默认为 "./output" |
|||
*/ |
|||
@Option(names = {"-o", "--output"}, description = "输出目录(默认:./output)") |
|||
private String outputDir = "./output"; |
|||
|
|||
/** |
|||
* 主方法,程序入口 |
|||
* @param args 命令行参数 |
|||
*/ |
|||
public static void main(String[] args) { |
|||
int exitCode = new CommandLine(new Main()).execute(args); |
|||
System.exit(exitCode); |
|||
} |
|||
|
|||
/** |
|||
* 执行业务逻辑 |
|||
*/ |
|||
@Override |
|||
public void run() { |
|||
try { |
|||
// 创建策略实例
|
|||
SiteABooksStrategy booksStrategy = new SiteABooksStrategy(); |
|||
SiteBQuotesStrategy quotesStrategy = new SiteBQuotesStrategy(); |
|||
SiteCCountriesStrategy countriesStrategy = new SiteCCountriesStrategy(); |
|||
|
|||
CrawlerCommand command; |
|||
|
|||
// 根据 --site 选项选择对应的命令
|
|||
switch (site.toLowerCase()) { |
|||
case "books": |
|||
command = new CrawlBooksCommand(booksStrategy, outputDir); |
|||
break; |
|||
case "quotes": |
|||
command = new CrawlQuotesCommand(quotesStrategy, outputDir); |
|||
break; |
|||
case "countries": |
|||
command = new CrawlCountriesCommand(countriesStrategy, outputDir); |
|||
break; |
|||
case "all": |
|||
default: |
|||
command = new CrawlAllCommand(booksStrategy, quotesStrategy, countriesStrategy, outputDir); |
|||
break; |
|||
} |
|||
|
|||
// 执行命令
|
|||
command.execute(); |
|||
|
|||
} catch (NetworkException e) { |
|||
System.err.println("网络错误:" + e.getMessage()); |
|||
if (e.getCause() != null) { |
|||
System.err.println("原因:" + e.getCause().getMessage()); |
|||
} |
|||
System.exit(1); |
|||
} catch (ParseException e) { |
|||
System.err.println("解析失败:" + e.getMessage()); |
|||
if (e.getCause() != null) { |
|||
System.err.println("原因:" + e.getCause().getMessage()); |
|||
} |
|||
System.exit(1); |
|||
} catch (StorageException e) { |
|||
System.err.println("存储异常:" + e.getMessage()); |
|||
if (e.getCause() != null) { |
|||
System.err.println("原因:" + e.getCause().getMessage()); |
|||
} |
|||
System.exit(1); |
|||
} catch (CrawlerException e) { |
|||
System.err.println("爬取异常:" + e.getMessage()); |
|||
if (e.getCause() != null) { |
|||
System.err.println("原因:" + e.getCause().getMessage()); |
|||
} |
|||
System.exit(1); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,45 @@ |
|||
package com.scraper.command; |
|||
|
|||
import com.scraper.exception.CrawlerException; |
|||
import com.scraper.model.Book; |
|||
import com.scraper.model.Country; |
|||
import com.scraper.model.Quote; |
|||
import com.scraper.strategy.SiteABooksStrategy; |
|||
import com.scraper.strategy.SiteBQuotesStrategy; |
|||
import com.scraper.strategy.SiteCCountriesStrategy; |
|||
import com.scraper.view.ConsoleView; |
|||
import com.scraper.view.FileSaver; |
|||
import java.util.List; |
|||
|
|||
public class CrawlAllCommand implements CrawlerCommand { |
|||
private SiteABooksStrategy booksStrategy; |
|||
private SiteBQuotesStrategy quotesStrategy; |
|||
private SiteCCountriesStrategy countriesStrategy; |
|||
private String outputDir; |
|||
|
|||
public CrawlAllCommand(SiteABooksStrategy booksStrategy, SiteBQuotesStrategy quotesStrategy, SiteCCountriesStrategy countriesStrategy, String outputDir) { |
|||
this.booksStrategy = booksStrategy; |
|||
this.quotesStrategy = quotesStrategy; |
|||
this.countriesStrategy = countriesStrategy; |
|||
this.outputDir = outputDir; |
|||
} |
|||
|
|||
public CrawlAllCommand(SiteABooksStrategy booksStrategy, SiteBQuotesStrategy quotesStrategy, SiteCCountriesStrategy countriesStrategy) { |
|||
this(booksStrategy, quotesStrategy, countriesStrategy, "./output"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute() throws CrawlerException { |
|||
List<Book> books = booksStrategy.crawl("http://books.toscrape.com"); |
|||
ConsoleView.printBooks(books); |
|||
FileSaver.saveToJson(books, outputDir + "/books.json"); |
|||
|
|||
List<Quote> quotes = quotesStrategy.crawl("http://quotes.toscrape.com"); |
|||
ConsoleView.printQuotes(quotes); |
|||
FileSaver.saveToJson(quotes, outputDir + "/quotes.json"); |
|||
|
|||
List<Country> countries = countriesStrategy.crawl("https://www.scrapethissite.com/pages/simple/"); |
|||
ConsoleView.printCountries(countries); |
|||
FileSaver.saveToJson(countries, outputDir + "/countries.json"); |
|||
} |
|||
} |
|||
@ -0,0 +1,29 @@ |
|||
package com.scraper.command; |
|||
|
|||
import com.scraper.exception.CrawlerException; |
|||
import com.scraper.model.Book; |
|||
import com.scraper.strategy.SiteABooksStrategy; |
|||
import com.scraper.view.ConsoleView; |
|||
import com.scraper.view.FileSaver; |
|||
import java.util.List; |
|||
|
|||
public class CrawlBooksCommand implements CrawlerCommand { |
|||
private SiteABooksStrategy strategy; |
|||
private String outputDir; |
|||
|
|||
public CrawlBooksCommand(SiteABooksStrategy strategy, String outputDir) { |
|||
this.strategy = strategy; |
|||
this.outputDir = outputDir; |
|||
} |
|||
|
|||
public CrawlBooksCommand(SiteABooksStrategy strategy) { |
|||
this(strategy, "./output"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute() throws CrawlerException { |
|||
List<Book> books = strategy.crawl("http://books.toscrape.com"); |
|||
ConsoleView.printBooks(books); |
|||
FileSaver.saveToJson(books, outputDir + "/books.json"); |
|||
} |
|||
} |
|||
@ -0,0 +1,29 @@ |
|||
package com.scraper.command; |
|||
|
|||
import com.scraper.exception.CrawlerException; |
|||
import com.scraper.model.Country; |
|||
import com.scraper.strategy.SiteCCountriesStrategy; |
|||
import com.scraper.view.ConsoleView; |
|||
import com.scraper.view.FileSaver; |
|||
import java.util.List; |
|||
|
|||
public class CrawlCountriesCommand implements CrawlerCommand { |
|||
private SiteCCountriesStrategy strategy; |
|||
private String outputDir; |
|||
|
|||
public CrawlCountriesCommand(SiteCCountriesStrategy strategy, String outputDir) { |
|||
this.strategy = strategy; |
|||
this.outputDir = outputDir; |
|||
} |
|||
|
|||
public CrawlCountriesCommand(SiteCCountriesStrategy strategy) { |
|||
this(strategy, "./output"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute() throws CrawlerException { |
|||
List<Country> countries = strategy.crawl("https://www.scrapethissite.com/pages/simple/"); |
|||
ConsoleView.printCountries(countries); |
|||
FileSaver.saveToJson(countries, outputDir + "/countries.json"); |
|||
} |
|||
} |
|||
@ -0,0 +1,29 @@ |
|||
package com.scraper.command; |
|||
|
|||
import com.scraper.exception.CrawlerException; |
|||
import com.scraper.model.Quote; |
|||
import com.scraper.strategy.SiteBQuotesStrategy; |
|||
import com.scraper.view.ConsoleView; |
|||
import com.scraper.view.FileSaver; |
|||
import java.util.List; |
|||
|
|||
public class CrawlQuotesCommand implements CrawlerCommand { |
|||
private SiteBQuotesStrategy strategy; |
|||
private String outputDir; |
|||
|
|||
public CrawlQuotesCommand(SiteBQuotesStrategy strategy, String outputDir) { |
|||
this.strategy = strategy; |
|||
this.outputDir = outputDir; |
|||
} |
|||
|
|||
public CrawlQuotesCommand(SiteBQuotesStrategy strategy) { |
|||
this(strategy, "./output"); |
|||
} |
|||
|
|||
@Override |
|||
public void execute() throws CrawlerException { |
|||
List<Quote> quotes = strategy.crawl("http://quotes.toscrape.com"); |
|||
ConsoleView.printQuotes(quotes); |
|||
FileSaver.saveToJson(quotes, outputDir + "/quotes.json"); |
|||
} |
|||
} |
|||
@ -0,0 +1,7 @@ |
|||
package com.scraper.command; |
|||
|
|||
import com.scraper.exception.CrawlerException; |
|||
|
|||
public interface CrawlerCommand { |
|||
void execute() throws CrawlerException; |
|||
} |
|||
@ -0,0 +1,7 @@ |
|||
package com.scraper.exception; |
|||
|
|||
/**
 * Abstract base type of the checked exceptions declared by this project.
 */
public abstract class CrawlerException extends Exception {

    /**
     * @param message description of the failure
     * @param cause   underlying exception; may be {@code null}
     */
    public CrawlerException(final String message, final Throwable cause) {
        super(message, cause);
    }
}
|||
@ -0,0 +1,7 @@ |
|||
package com.scraper.exception; |
|||
|
|||
public class NetworkException extends CrawlerException { |
|||
public NetworkException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,7 @@ |
|||
package com.scraper.exception; |
|||
|
|||
public class ParseException extends CrawlerException { |
|||
public ParseException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,7 @@ |
|||
package com.scraper.exception; |
|||
|
|||
public class StorageException extends CrawlerException { |
|||
public StorageException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,24 @@ |
|||
package com.scraper.model; |
|||
|
|||
/**
 * Immutable value holding a book's title and price, both kept as text.
 */
public class Book {
    // Assigned once in the constructor and never reassigned, hence final.
    private final String title;
    private final String price;

    /**
     * @param title book title
     * @param price price as text
     */
    public Book(String title, String price) {
        this.title = title;
        this.price = price;
    }

    /** @return the title passed to the constructor */
    public String getTitle() {
        return title;
    }

    /** @return the price text passed to the constructor */
    public String getPrice() {
        return price;
    }

    /** @return a debug representation of the form {@code Book{title='..', price='..'}} */
    @Override
    public String toString() {
        return "Book{title='" + title + "', price='" + price + "'}";
    }
}
|||
@ -0,0 +1,30 @@ |
|||
package com.scraper.model; |
|||
|
|||
/**
 * Immutable value holding a country's name, capital and population, all kept
 * as text.
 */
public class Country {
    // Assigned once in the constructor and never reassigned, hence final.
    private final String name;
    private final String capital;
    private final String population;

    /**
     * @param name       country name
     * @param capital    capital city
     * @param population population as text
     */
    public Country(String name, String capital, String population) {
        this.name = name;
        this.capital = capital;
        this.population = population;
    }

    /** @return the name passed to the constructor */
    public String getName() {
        return name;
    }

    /** @return the capital passed to the constructor */
    public String getCapital() {
        return capital;
    }

    /** @return the population text passed to the constructor */
    public String getPopulation() {
        return population;
    }

    /** @return a debug representation listing name, capital and population */
    @Override
    public String toString() {
        return "Country{name='" + name + "', capital='" + capital + "', population='" + population + "'}";
    }
}
|||
@ -0,0 +1,24 @@ |
|||
package com.scraper.model; |
|||
|
|||
/**
 * Immutable value holding a quotation and the name of its author.
 */
public class Quote {
    // Assigned once in the constructor and never reassigned, hence final.
    private final String text;
    private final String author;

    /**
     * @param text   quotation text
     * @param author author name
     */
    public Quote(String text, String author) {
        this.text = text;
        this.author = author;
    }

    /** @return the quotation text passed to the constructor */
    public String getText() {
        return text;
    }

    /** @return the author passed to the constructor */
    public String getAuthor() {
        return author;
    }

    /** @return a debug representation of the form {@code Quote{text='..', author='..'}} */
    @Override
    public String toString() {
        return "Quote{text='" + text + "', author='" + author + "'}";
    }
}
|||
@ -0,0 +1,8 @@ |
|||
package com.scraper.strategy; |
|||
|
|||
import com.scraper.exception.CrawlerException; |
|||
import java.util.List; |
|||
|
|||
public interface CrawlStrategy<T> { |
|||
List<T> crawl(String url) throws CrawlerException; |
|||
} |
|||
@ -0,0 +1,51 @@ |
|||
package com.scraper.strategy; |
|||
|
|||
import com.scraper.exception.CrawlerException; |
|||
import com.scraper.exception.NetworkException; |
|||
import com.scraper.exception.ParseException; |
|||
import com.scraper.model.Book; |
|||
import org.apache.hc.client5.http.classic.methods.HttpGet; |
|||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; |
|||
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; |
|||
import org.apache.hc.client5.http.impl.classic.HttpClients; |
|||
|
|||
import org.apache.hc.core5.http.io.entity.EntityUtils; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class SiteABooksStrategy implements CrawlStrategy<Book> { |
|||
@Override |
|||
public List<Book> crawl(String url) throws CrawlerException { |
|||
System.out.println("正在爬取 [http://books.toscrape.com]..."); |
|||
List<Book> books = new ArrayList<>(); |
|||
|
|||
try (CloseableHttpClient httpClient = HttpClients.createDefault()) { |
|||
HttpGet httpGet = new HttpGet(url); |
|||
|
|||
try (CloseableHttpResponse response = httpClient.execute(httpGet)) { |
|||
String html = EntityUtils.toString(response.getEntity()); |
|||
Document doc = Jsoup.parse(html); |
|||
Elements productPods = doc.select(".product_pod"); |
|||
|
|||
for (Element pod : productPods) { |
|||
String title = pod.select("h3 > a").attr("title"); |
|||
String priceText = pod.select(".price_color").text(); |
|||
String price = priceText.replace("£", ""); |
|||
books.add(new Book(title, price)); |
|||
} |
|||
} catch (org.apache.hc.core5.http.ParseException e) { |
|||
throw new ParseException("解析响应内容失败", e); |
|||
} |
|||
} catch (IOException e) { |
|||
throw new NetworkException("网络请求失败", e); |
|||
} |
|||
|
|||
return books; |
|||
} |
|||
} |
|||
@ -0,0 +1,50 @@ |
|||
package com.scraper.strategy; |
|||
|
|||
import com.scraper.exception.CrawlerException; |
|||
import com.scraper.exception.NetworkException; |
|||
import com.scraper.exception.ParseException; |
|||
import com.scraper.model.Quote; |
|||
import org.apache.hc.client5.http.classic.methods.HttpGet; |
|||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; |
|||
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; |
|||
import org.apache.hc.client5.http.impl.classic.HttpClients; |
|||
|
|||
import org.apache.hc.core5.http.io.entity.EntityUtils; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class SiteBQuotesStrategy implements CrawlStrategy<Quote> { |
|||
@Override |
|||
public List<Quote> crawl(String url) throws CrawlerException { |
|||
System.out.println("正在爬取 [http://quotes.toscrape.com]..."); |
|||
List<Quote> quotes = new ArrayList<>(); |
|||
|
|||
try (CloseableHttpClient httpClient = HttpClients.createDefault()) { |
|||
HttpGet httpGet = new HttpGet(url); |
|||
|
|||
try (CloseableHttpResponse response = httpClient.execute(httpGet)) { |
|||
String html = EntityUtils.toString(response.getEntity()); |
|||
Document doc = Jsoup.parse(html); |
|||
Elements quoteElements = doc.select(".quote"); |
|||
|
|||
for (Element quoteEl : quoteElements) { |
|||
String text = quoteEl.select(".text").text(); |
|||
String author = quoteEl.select(".author").text(); |
|||
quotes.add(new Quote(text, author)); |
|||
} |
|||
} catch (org.apache.hc.core5.http.ParseException e) { |
|||
throw new ParseException("解析响应内容失败", e); |
|||
} |
|||
} catch (IOException e) { |
|||
throw new NetworkException("网络请求失败", e); |
|||
} |
|||
|
|||
return quotes; |
|||
} |
|||
} |
|||
@ -0,0 +1,51 @@ |
|||
package com.scraper.strategy; |
|||
|
|||
import com.scraper.exception.CrawlerException; |
|||
import com.scraper.exception.NetworkException; |
|||
import com.scraper.exception.ParseException; |
|||
import com.scraper.model.Country; |
|||
import org.apache.hc.client5.http.classic.methods.HttpGet; |
|||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; |
|||
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; |
|||
import org.apache.hc.client5.http.impl.classic.HttpClients; |
|||
|
|||
import org.apache.hc.core5.http.io.entity.EntityUtils; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class SiteCCountriesStrategy implements CrawlStrategy<Country> { |
|||
@Override |
|||
public List<Country> crawl(String url) throws CrawlerException { |
|||
System.out.println("正在爬取 [https://www.scrapethissite.com/pages/simple/]..."); |
|||
List<Country> countries = new ArrayList<>(); |
|||
|
|||
try (CloseableHttpClient httpClient = HttpClients.createDefault()) { |
|||
HttpGet httpGet = new HttpGet(url); |
|||
|
|||
try (CloseableHttpResponse response = httpClient.execute(httpGet)) { |
|||
String html = EntityUtils.toString(response.getEntity()); |
|||
Document doc = Jsoup.parse(html); |
|||
Elements countryElements = doc.select(".country"); |
|||
|
|||
for (Element countryEl : countryElements) { |
|||
String name = countryEl.select(".country-name").text().trim(); |
|||
String capital = countryEl.select(".country-capital").text().trim(); |
|||
String population = countryEl.select(".country-population").text().trim(); |
|||
countries.add(new Country(name, capital, population)); |
|||
} |
|||
} catch (org.apache.hc.core5.http.ParseException e) { |
|||
throw new ParseException("解析响应内容失败", e); |
|||
} |
|||
} catch (IOException e) { |
|||
throw new NetworkException("网络请求失败", e); |
|||
} |
|||
|
|||
return countries; |
|||
} |
|||
} |
|||
@ -0,0 +1,26 @@ |
|||
package com.scraper.view; |
|||
|
|||
import com.scraper.model.Book; |
|||
import com.scraper.model.Country; |
|||
import com.scraper.model.Quote; |
|||
import java.util.List; |
|||
|
|||
public class ConsoleView { |
|||
public static void printBooks(List<Book> books) { |
|||
for (Book book : books) { |
|||
System.out.println("书名: 《" + book.getTitle() + "》, 价格: £" + book.getPrice()); |
|||
} |
|||
} |
|||
|
|||
public static void printQuotes(List<Quote> quotes) { |
|||
for (Quote quote : quotes) { |
|||
System.out.println("\"" + quote.getText() + "\" —— " + quote.getAuthor()); |
|||
} |
|||
} |
|||
|
|||
public static void printCountries(List<Country> countries) { |
|||
for (Country country : countries) { |
|||
System.out.println("国家: " + country.getName() + ", 首都: " + country.getCapital() + ", 人口: " + country.getPopulation()); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,31 @@ |
|||
package com.scraper.view; |
|||
|
|||
import com.fasterxml.jackson.databind.ObjectMapper; |
|||
import com.scraper.exception.StorageException; |
|||
import java.io.File; |
|||
import java.io.IOException; |
|||
import java.nio.file.Files; |
|||
import java.nio.file.Path; |
|||
import java.nio.file.Paths; |
|||
import java.util.List; |
|||
import java.util.function.Function; |
|||
|
|||
public class FileSaver { |
|||
public static void saveToJson(Object data, String filePath) throws StorageException { |
|||
try { |
|||
Path path = Paths.get(filePath); |
|||
Path parentDir = path.getParent(); |
|||
if (parentDir != null) { |
|||
Files.createDirectories(parentDir); |
|||
} |
|||
ObjectMapper mapper = new ObjectMapper(); |
|||
mapper.writerWithDefaultPrettyPrinter().writeValue(new File(filePath), data); |
|||
} catch (IOException e) { |
|||
throw new StorageException("无法写入 JSON 文件: " + filePath, e); |
|||
} |
|||
} |
|||
|
|||
public static void saveToCsv(List<?> items, String filePath, String[] headers, Function<Object, String> rowMapper) throws StorageException { |
|||
throw new UnsupportedOperationException("CSV 保存功能暂未实现"); |
|||
} |
|||
} |
|||
Loading…
Reference in new issue