You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

156 lines
6.6 KiB

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Contract for a book record: every implementation must expose a title, an author,
 * an image link, a current price, an original price and a discount figure.
 */
interface Bookable {
    /** @return the book's title */
    String getTitle();

    /** @return the book's author text */
    String getAuthor();

    /** @return the link to the book's cover image */
    String getImageUrl();

    /** @return the current price */
    double getPrice();

    /** @return the original (pre-discount) price */
    double getOriginalPrice();

    /** @return the discount figure attached to the book */
    double getDiscount();
}
//泛型爬虫接口,T必须是Bookable的子类,
interface Crawler<T extends Bookable> {
List<T> crawlPage(String url) throws IOException; //单页爬取
List<T> crawlAllPages(int startPage, int endPage); //批量爬取
default void printResults(List<T> items) {
items.forEach(System.out::println); //打印所有图书信息
} //default能用但不强制用且可以override修改的方法
}
/**
 * Abstract base class for books. It implements {@link Bookable} by storing each
 * property in a protected field and returning it from the matching getter.
 */
abstract class AbstractBook implements Bookable {
    protected String title;
    protected String imageUrl;
    protected String author;
    protected double price;
    protected double originalPrice;
    protected double discount;

    /** @return the stored title */
    @Override
    public String getTitle() {
        return title;
    }

    /** @return the stored current price */
    @Override
    public double getPrice() {
        return price;
    }

    /** @return the stored original price */
    @Override
    public double getOriginalPrice() {
        return originalPrice;
    }

    /** @return the stored discount figure */
    @Override
    public double getDiscount() {
        return discount;
    }

    /** @return the stored image link */
    @Override
    public String getImageUrl() {
        return imageUrl;
    }

    /** @return the stored author text */
    @Override
    public String getAuthor() {
        return author;
    }

    /**
     * Renders the book as a single line for console output. The image link is
     * not part of the rendering.
     */
    @Override
    public String toString() {
        final String template =
                "Book{title='%s', price=%.2f, originalPrice=%.2f, discount=%.1f, author='%s'}";
        return String.format(template, title, price, originalPrice, discount, author);
    }
}
/**
 * Concrete book type. It inherits all storage and getters from {@link AbstractBook}
 * and only contributes constructors.
 */
class Book extends AbstractBook {
    /** Creates a book whose fields keep their Java default values. */
    public Book() {
        super();
    }

    /**
     * Creates a fully populated book; each argument is copied into the inherited
     * field of the same name.
     *
     * @param title         book title
     * @param price         current price
     * @param originalPrice original price
     * @param discount      discount figure
     * @param imageUrl      link to the cover image
     * @param author        author text
     */
    public Book(String title, double price, double originalPrice, double discount, String imageUrl, String author) {
        super();
        // Text fields first, then the numeric ones.
        this.title = title;
        this.author = author;
        this.imageUrl = imageUrl;
        this.price = price;
        this.originalPrice = originalPrice;
        this.discount = discount;
    }
}
/**
 * Abstract base crawler implementing {@link Crawler}. It supplies the multi-page
 * loop plus shared helpers for fetching a document and parsing price and discount
 * text; subclasses only have to implement {@code crawlPage}.
 */
abstract class AbstractCrawler<T extends Bookable> implements Crawler<T> {
    /** URL template for a listing page; {@code %d} is filled with the page number. */
    protected static final String BASE_URL = "http://bang.dangdang.com/books/bestsellers/%d";

    /** Matches the first plain decimal number in a string (e.g. "22.50" or ".5"). */
    private static final Pattern FIRST_NUMBER = Pattern.compile("\\d+(?:\\.\\d+)?|\\.\\d+");

    /** Captures the number written directly before the discount marker "折". */
    private static final Pattern DISCOUNT_NUMBER =
            Pattern.compile("(\\d+(?:\\.\\d+)?|\\.\\d+)\\s*折");

    /**
     * Crawls every page from {@code startPage} to {@code endPage}, both inclusive.
     * A page that fails with an {@link IOException} is reported on standard error
     * and skipped, so one bad page does not abort the whole run.
     *
     * @param startPage first page number to crawl
     * @param endPage   last page number to crawl
     * @return all items from the pages that could be crawled, in page order
     */
    @Override
    public List<T> crawlAllPages(int startPage, int endPage) {
        List<T> allItems = new ArrayList<>();
        for (int page = startPage; page <= endPage; page++) {
            try {
                List<T> items = crawlPage(String.format(BASE_URL, page));
                allItems.addAll(items);
                System.out.println("Page " + page + ": " + items.size() + " items");
            } catch (IOException e) {
                System.err.println("Error: " + e.getMessage());
            }
        }
        return allItems;
    }

    /**
     * Downloads and parses the document at {@code url} with a 10 second timeout
     * and a browser-like user agent string.
     *
     * @param url address of the page to fetch
     * @return the parsed document
     * @throws IOException if the request fails
     */
    protected Document fetchDoc(String url) throws IOException {
        return Jsoup.connect(url).timeout(10000).userAgent("Mozilla/5.0").get();
    }

    /**
     * Extracts a price from free-form text such as {@code "¥22.50"}.
     *
     * <p>Only the first number in the text is used. Stripping every non-digit
     * character and parsing the remainder (the previous approach) turned text
     * holding two prices, e.g. {@code "¥22.50 ¥19.99"}, into the unparseable
     * {@code "22.5019.99"} and silently produced 0.0.
     *
     * @param text raw price text; may be {@code null}
     * @return the first number found, or 0.0 when there is none
     */
    protected double parsePrice(String text) {
        if (text == null) {
            return 0.0;
        }
        // Remove thousands separators so "1,234.50" is still read as one number.
        Matcher number = FIRST_NUMBER.matcher(text.replace(",", ""));
        // The pattern only matches ASCII digits with an optional fraction, which
        // Double.parseDouble always accepts, so no exception handling is needed.
        return number.find() ? Double.parseDouble(number.group()) : 0.0;
    }

    /**
     * Extracts a discount from text that uses the "折" marker, e.g. {@code "7.5折"}
     * or {@code "(7.5折)"}. Any surrounding decoration (brackets of any kind,
     * whitespace, extra words) is ignored because only the number directly in
     * front of the marker is read.
     *
     * @param text raw discount text; may be {@code null}
     * @return the number preceding "折", or 10.0 when the text carries no such number
     */
    protected double parseDiscount(String text) {
        if (text != null) {
            Matcher discount = DISCOUNT_NUMBER.matcher(text);
            if (discount.find()) {
                return Double.parseDouble(discount.group(1));
            }
        }
        return 10.0;
    }
}
/**
 * Concrete crawler producing {@link Book} objects. It extends
 * {@code AbstractCrawler<Book>}, scrapes one listing page at a time and can dump
 * the collected books to a CSV file.
 */
class BookCrawler extends AbstractCrawler<Book> {
    /** List items whose title is shorter than this are skipped. */
    private static final int MIN_TITLE_LENGTH = 10;

    /**
     * Scrapes every {@code <li>} of the page at {@code url} and converts those that
     * look like book entries into {@link Book} objects.
     *
     * <p>Numeric fields (price, original price, discount) are read from the first
     * matching element that has text. Reading the combined text of all matches
     * glues several numbers together when an item carries more than one price,
     * which then cannot be parsed.
     *
     * @param url address of the listing page
     * @return the books found on the page, in document order
     * @throws IOException if the page cannot be fetched
     */
    @Override
    public List<Book> crawlPage(String url) throws IOException {
        List<Book> books = new ArrayList<>();
        Document doc = fetchDoc(url);
        for (Element item : doc.select("li")) {
            // Elements.attr yields "" (never null) when nothing matches.
            String title = item.select("a[title]").attr("title");
            // NOTE(review): the length threshold filters out non-book <li> elements,
            // but it also drops genuine books with short titles - consider a
            // structural check instead once the page layout is confirmed.
            if (title.length() < MIN_TITLE_LENGTH) {
                continue;
            }

            String priceText = firstText(item, "span.price_n", ".price", "[class*=price]");
            if (priceText.isEmpty()) {
                priceText = "0";
            }
            String originalPriceText = firstText(item, "span.price_r");
            if (originalPriceText.isEmpty()) {
                // No separate original price: treat the book as sold at list price.
                originalPriceText = priceText;
            }
            String discountText = firstText(item, "span.discount", "[class*=discount]");

            String imageUrl = item.select("img").attr("src");
            if (imageUrl.isEmpty()) {
                imageUrl = item.select("img").attr("data-original");
            }

            // Author keeps the combined text of all matches.
            String author = item.select("span.author").text();
            if (author.isEmpty()) {
                author = item.select("[class*=author]").text();
            }

            books.add(new Book(
                    title,
                    parsePrice(priceText),
                    parsePrice(originalPriceText),
                    parseDiscount(discountText),
                    imageUrl,
                    author));
        }
        return books;
    }

    /**
     * Returns the text of the first element with non-empty text, trying the
     * selectors in the given order.
     *
     * @return that text, or "" when no selector yields any text
     */
    private static String firstText(Element scope, String... selectors) {
        for (String selector : selectors) {
            for (Element match : scope.select(selector)) {
                String text = match.text();
                if (!text.isEmpty()) {
                    return text;
                }
            }
        }
        return "";
    }

    /**
     * Program entry point: crawls pages 1 to 8, writes the result to
     * {@code books_data.txt} and prints the total number of books.
     */
    public static void main(String[] args) {
        Crawler<Book> crawler = new BookCrawler();
        List<Book> books = crawler.crawlAllPages(1, 8);
        saveToFile(books, "books_data.txt");
        System.out.println("Total: " + books.size() + " books");
    }

    /**
     * Writes the books to {@code filename} as UTF-8 CSV with a header row.
     *
     * <p>Text fields are quoted when they contain a comma, a quote or a line
     * break, so such values cannot shift or split columns. Numbers are formatted
     * with {@link Locale#ROOT} so the decimal separator is always a dot, whatever
     * the default locale is. Failures are reported on standard error.
     *
     * @param books    the books to write
     * @param filename path of the file to create or overwrite
     */
    private static void saveToFile(List<Book> books, String filename) {
        try (PrintWriter w = new PrintWriter(
                new OutputStreamWriter(new FileOutputStream(filename), StandardCharsets.UTF_8))) {
            w.println("Title,Price,OriginalPrice,Discount,ImageUrl,Author");
            for (Book b : books) {
                w.printf(Locale.ROOT, "%s,%.2f,%.2f,%.1f,%s,%s%n",
                        csvField(b.getTitle()), b.getPrice(), b.getOriginalPrice(),
                        b.getDiscount(), csvField(b.getImageUrl()), csvField(b.getAuthor()));
            }
            // PrintWriter swallows write errors; checkError() is the only way to see them.
            if (w.checkError()) {
                System.err.println("Save error: failed writing " + filename);
            }
        } catch (IOException e) {
            System.err.println("Save error: " + e.getMessage());
        }
    }

    /**
     * Encodes one CSV field: wraps it in double quotes and doubles embedded
     * quotes when it contains a comma, a quote or a line break.
     *
     * @return the encoded field; {@code null} becomes an empty field
     */
    private static String csvField(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuoting = value.indexOf(',') >= 0
                || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0
                || value.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return value;
        }
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }
}