import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.util.ArrayList; import java.util.List; //Bookable接口,约束图书必须提供书名、价格、原价、折扣、图片链接、作者 interface Bookable { String getTitle(); double getPrice(); double getOriginalPrice(); double getDiscount(); String getImageUrl(); String getAuthor(); } //泛型爬虫接口,T必须是Bookable的子类, interface Crawler { List crawlPage(String url) throws IOException; //单页爬取 List crawlAllPages(int startPage, int endPage); //批量爬取 default void printResults(List items) { items.forEach(System.out::println); //打印所有图书信息 } //default能用但不强制用且可以override修改的方法 } //定义图书抽象父类AbstractBook,实现Bookable接口 abstract class AbstractBook implements Bookable { //实现图书规范接口 protected String title, imageUrl, author; protected double price, originalPrice, discount; @Override public String getTitle() { return title; } //get规定必须实现的方法,即返回图书书名 @Override public double getPrice() { return price; } @Override public double getOriginalPrice() { return originalPrice; } @Override public double getDiscount() { return discount; } @Override public String getImageUrl() { return imageUrl; } @Override public String getAuthor() { return author; } //格式化输出图书信息,方便控制台打印查看 @Override public String toString() { return String.format("Book{title='%s', price=%.2f, originalPrice=%.2f, discount=%.1f, author='%s'}", title, price, originalPrice, discount, author); } } //定义图书子类Book,继承AbstractBook,实现Bookable接口 class Book extends AbstractBook { public Book() {} public Book(String title, double price, double originalPrice, double discount, String imageUrl, String author) { this.title = title; //把传入参数精准赋值给创建的图书对象的title this.price = price; this.originalPrice = originalPrice; this.discount = discount; this.imageUrl = imageUrl; this.author = author; } } //定义爬虫图书抽象父类AbstractCrawler,实现Crawler接口 abstract class AbstractCrawler implements Crawler { protected static final String BASE_URL = "http://bang.dangdang.com/books/bestsellers/%d"; @Override public List crawlAllPages(int startPage, int endPage) { List allItems = new ArrayList<>(); for (int page = startPage; page <= endPage; page++) { try { List items = crawlPage(String.format(BASE_URL, page)); allItems.addAll(items); System.out.println("Page " + page + ": " + items.size() + " items"); } catch (IOException e) { System.err.println("Error: " + e.getMessage()); } } return allItems; } protected Document fetchDoc(String url) throws IOException { return Jsoup.connect(url).timeout(10000).userAgent("Mozilla/5.0").get(); } protected double parsePrice(String text) { try { return Double.parseDouble(text.replaceAll("[^0-9.]", "")); } catch (Exception e) { return 0.0; } } protected double parseDiscount(String text) { try { if (text.contains("折")) { String discount = text.replace("折", "").replace("(", "").replace(")", ""); return Double.parseDouble(discount); } } catch (Exception e) {} return 10.0; } } //定义图书爬虫子类BookCrawler,继承AbstractCrawler,实现Crawler接口 class BookCrawler extends AbstractCrawler { @Override public List crawlPage(String url) throws IOException { List books = new ArrayList<>(); Document doc = fetchDoc(url); Elements bookElements = doc.select("li"); for (Element e : bookElements) { String title = e.select("a[title]").attr("title"); if (title == null || title.isEmpty() || title.length() < 10) continue; String priceText = e.select("span.price_n").text(); if (priceText.isEmpty()) priceText = e.select(".price").text(); if (priceText.isEmpty()) priceText = e.select("[class*=price]").text(); if (priceText.isEmpty()) priceText = "0"; String originalPriceText = e.select("span.price_r").text(); if (originalPriceText.isEmpty()) originalPriceText = priceText; String discountText = e.select("span.discount").text(); if (discountText.isEmpty()) discountText = e.select("[class*=discount]").text(); String imageUrl = e.select("img").attr("src"); if (imageUrl.isEmpty()) imageUrl = e.select("img").attr("data-original"); String author = e.select("span.author").text(); if (author.isEmpty()) author = e.select("[class*=author]").text(); Book book = new Book( title, parsePrice(priceText), parsePrice(originalPriceText), parseDiscount(discountText), imageUrl, author ); books.add(book); } return books; } //程序入口main方法,创建BookCrawler实例,爬取1-8页图书数据,保存到books_data.txt文件 public static void main(String[] args) { Crawler crawler = new BookCrawler(); List books = crawler.crawlAllPages(1, 8); saveToFile(books, "books_data.txt"); System.out.println("Total: " + books.size() + " books"); } //定义保存图书数据到文件的方法saveToFile,将图书数据保存到文件books_data.txt中 private static void saveToFile(List books, String filename) { try (PrintWriter w = new PrintWriter(new OutputStreamWriter(new FileOutputStream(filename), "UTF-8"))) { w.println("Title,Price,OriginalPrice,Discount,ImageUrl,Author"); books.forEach(b -> w.printf("%s,%.2f,%.2f,%.1f,%s,%s%n", b.getTitle(), b.getPrice(), b.getOriginalPrice(), b.getDiscount(), b.getImageUrl(), b.getAuthor())); } catch (IOException e) { System.err.println("Save error: " + e.getMessage()); } } }