import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.util.ArrayList; import java.util.List; interface Bookable { String getTitle(); double getPrice(); double getOriginalPrice(); double getDiscount(); String getImageUrl(); String getAuthor(); } interface Crawler { List crawlPage(String url) throws IOException; List crawlAllPages(int startPage, int endPage); default void printResults(List items) { items.forEach(System.out::println); } } abstract class AbstractBook implements Bookable { protected String title, imageUrl, author; protected double price, originalPrice, discount; @Override public String getTitle() { return title; } @Override public double getPrice() { return price; } @Override public double getOriginalPrice() { return originalPrice; } @Override public double getDiscount() { return discount; } @Override public String getImageUrl() { return imageUrl; } @Override public String getAuthor() { return author; } @Override public String toString() { return String.format("Book{title='%s', price=%.2f, originalPrice=%.2f, discount=%.1f, author='%s'}", title, price, originalPrice, discount, author); } } class Product extends AbstractBook { public Product() {} public Product(String title, double price, double originalPrice, double discount, String imageUrl, String seller) { this.title = title; this.price = price; this.originalPrice = originalPrice; this.discount = discount; this.imageUrl = imageUrl; this.author = seller; } } class Ticket extends AbstractBook { public Ticket() {} public Ticket(String title, double price, double originalPrice, double discount, String imageUrl, String performer) { this.title = title; this.price = price; this.originalPrice = originalPrice; this.discount = discount; this.imageUrl = imageUrl; this.author = performer; } } abstract class AbstractCrawler implements Crawler { protected Document fetchDoc(String url) throws IOException { return Jsoup.connect(url).timeout(15000).userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36").get(); } protected double parsePrice(String text) { try { return Double.parseDouble(text.replaceAll("[^0-9.]", "")); } catch (Exception e) { return 0.0; } } protected double parseDiscount(String text) { try { if (text.contains("折")) { String discount = text.replace("折", "").replace("(", "").replace(")", ""); return Double.parseDouble(discount); } } catch (Exception e) {} return 10.0; } @Override public List crawlAllPages(int startPage, int endPage) { List allItems = new ArrayList<>(); for (int page = startPage; page <= endPage; page++) { try { List items = crawlPage(String.format(getBaseUrl(), page)); allItems.addAll(items); System.out.println("Page " + page + ": " + items.size() + " items"); } catch (IOException e) { System.err.println("Error crawling page " + page + ": " + e.getMessage()); } } return allItems; } protected abstract String getBaseUrl(); } class JDProductCrawler extends AbstractCrawler { private static final String BASE_URL = "https://list.jd.com/list.html?cat=670,671,672&page="; @Override protected String getBaseUrl() { return BASE_URL; } @Override public List crawlAllPages(int startPage, int endPage) { List allItems = new ArrayList<>(); for (int page = startPage; page <= endPage; page++) { try { List items = crawlPage(BASE_URL + page); allItems.addAll(items); System.out.println("Page " + page + ": " + items.size() + " items"); } catch (IOException e) { System.err.println("Error crawling page " + page + ": " + e.getMessage()); } } return allItems; } @Override public List crawlPage(String url) throws IOException { List products = new ArrayList<>(); Document doc = fetchDoc(url); Elements items = doc.select("li[data-sku]"); if (items.isEmpty()) { items = doc.select("div[data-sku]"); } if (items.isEmpty()) { items = doc.select("div.item"); } for (Element e : items) { String title = e.select("a[title]").attr("title"); if (title.isEmpty()) { title = e.select("h3").text(); } if (title.isEmpty()) { title = e.select(".name").text(); } if (title == null || title.isEmpty() || title.length() < 5) continue; String priceText = e.select(".price strong").text(); if (priceText.isEmpty()) priceText = e.select(".price").text(); if (priceText.isEmpty()) priceText = e.select("[class*=price]").text(); String originalPriceText = e.select(".origin-price").text(); if (originalPriceText.isEmpty()) originalPriceText = e.select(".price del").text(); if (originalPriceText.isEmpty()) originalPriceText = priceText; String imageUrl = e.select("img").attr("src"); if (imageUrl.isEmpty()) imageUrl = e.select("img").attr("data-lazy-img"); if (imageUrl.isEmpty()) imageUrl = e.select("img").attr("data-src"); String seller = e.select(".shop-name").text(); if (seller.isEmpty()) seller = e.select(".store-name").text(); if (seller.isEmpty()) seller = "未知商家"; Product product = new Product( title, parsePrice(priceText), parsePrice(originalPriceText), parseDiscount(""), imageUrl, seller ); products.add(product); } return products; } } class DamaiTicketCrawler extends AbstractCrawler { private static final String BASE_URL = "https://www.damai.cn/projectlist.html?page="; @Override protected String getBaseUrl() { return BASE_URL; } @Override public List crawlAllPages(int startPage, int endPage) { List allItems = new ArrayList<>(); for (int page = startPage; page <= endPage; page++) { try { List items = crawlPage(BASE_URL + page); allItems.addAll(items); System.out.println("Page " + page + ": " + items.size() + " items"); } catch (IOException e) { System.err.println("Error crawling page " + page + ": " + e.getMessage()); } } return allItems; } @Override public List crawlPage(String url) throws IOException { List tickets = new ArrayList<>(); Document doc = fetchDoc(url); Elements items = doc.select(".project-item"); if (items.isEmpty()) { items = doc.select(".ticket-item"); } if (items.isEmpty()) { items = doc.select(".item"); } for (Element e : items) { String title = e.select(".title").text(); if (title.isEmpty()) { title = e.select("a[title]").attr("title"); } if (title.isEmpty()) { title = e.select("h3").text(); } if (title == null || title.isEmpty() || title.length() < 5) continue; String priceText = e.select(".price").text(); if (priceText.isEmpty()) priceText = e.select(".price-info").text(); if (priceText.isEmpty()) continue; String imageUrl = e.select("img").attr("src"); if (imageUrl.isEmpty()) imageUrl = e.select("img").attr("data-src"); String performer = e.select(".actor").text(); if (performer.isEmpty()) performer = e.select(".tag").text(); if (performer.isEmpty()) performer = e.select(".artist").text(); if (performer.isEmpty()) performer = "未知演出方"; Ticket ticket = new Ticket( title, parsePrice(priceText), parsePrice(priceText), 10.0, imageUrl, performer ); tickets.add(ticket); } return tickets; } } class DangDangProductCrawler extends AbstractCrawler { private static final String BASE_URL = "http://bang.dangdang.com/books/bestsellers/%d"; @Override protected String getBaseUrl() { return BASE_URL; } @Override public List crawlPage(String url) throws IOException { List products = new ArrayList<>(); Document doc = fetchDoc(url); Elements bookElements = doc.select("li"); for (Element e : bookElements) { String title = e.select("a[title]").attr("title"); if (title == null || title.isEmpty() || title.length() < 10) continue; String priceText = e.select("span.price_n").text(); if (priceText.isEmpty()) priceText = e.select(".price").text(); if (priceText.isEmpty()) priceText = e.select("[class*=price]").text(); if (priceText.isEmpty()) priceText = "0"; String originalPriceText = e.select("span.price_r").text(); if (originalPriceText.isEmpty()) originalPriceText = priceText; String discountText = e.select("span.discount").text(); if (discountText.isEmpty()) discountText = e.select("[class*=discount]").text(); String imageUrl = e.select("img").attr("src"); if (imageUrl.isEmpty()) imageUrl = e.select("img").attr("data-original"); String seller = e.select("span.author").text(); if (seller.isEmpty()) seller = e.select("[class*=author]").text(); if (seller.isEmpty()) seller = "当当自营"; Product product = new Product( title, parsePrice(priceText), parsePrice(originalPriceText), parseDiscount(discountText), imageUrl, seller ); products.add(product); } return products; } } class SimpleTicketCrawler extends AbstractCrawler { private static final String BASE_URL = "https://www.maoyan.com/"; @Override protected String getBaseUrl() { return BASE_URL; } @Override public List crawlAllPages(int startPage, int endPage) { List allItems = new ArrayList<>(); try { List items = crawlPage(BASE_URL); if (items.isEmpty()) { System.out.println("票务网站爬取结果为空,使用模拟数据"); allItems.addAll(getMockTickets()); } else { allItems.addAll(items); } System.out.println("Page 1: " + allItems.size() + " items"); } catch (IOException e) { System.err.println("票务网站访问失败,使用模拟数据: " + e.getMessage()); allItems.addAll(getMockTickets()); System.out.println("Page 1: " + allItems.size() + " items (模拟数据)"); } return allItems; } private List getMockTickets() { List tickets = new ArrayList<>(); tickets.add(new Ticket("周杰伦2024世界巡回演唱会-北京", 1280.00, 1680.00, 7.6, "https://example.com/jay.jpg", "周杰伦")); tickets.add(new Ticket("开心麻花《乌龙山伯爵》", 180.00, 380.00, 4.7, "https://example.com/mahua.jpg", "开心麻花团队")); tickets.add(new Ticket("国家大剧院歌剧《白毛女》", 380.00, 580.00, 6.6, "https://example.com/opera.jpg", "国家大剧院")); tickets.add(new Ticket("德云社相声专场", 280.00, 480.00, 5.8, "https://example.com/deyun.jpg", "德云社")); tickets.add(new Ticket("儿童剧《冰雪奇缘》", 120.00, 200.00, 6.0, "https://example.com/frozen.jpg", "儿童艺术剧院")); tickets.add(new Ticket("音乐会《蓝色多瑙河》", 260.00, 420.00, 6.2, "https://example.com/music.jpg", "北京交响乐团")); tickets.add(new Ticket("话剧《雷雨》", 220.00, 380.00, 5.8, "https://example.com/thunder.jpg", "北京人民艺术剧院")); tickets.add(new Ticket("魔术表演《惊天魔盗团》", 380.00, 580.00, 6.6, "https://example.com/magic.jpg", "魔术大师联盟")); tickets.add(new Ticket("脱口秀《吐槽大会现场版》", 180.00, 300.00, 6.0, "https://example.com/talkshow.jpg", "笑果文化")); tickets.add(new Ticket("体育赛事:CBA总决赛", 580.00, 880.00, 6.6, "https://example.com/cba.jpg", "CBA联盟")); return tickets; } @Override public List crawlPage(String url) throws IOException { List tickets = new ArrayList<>(); Document doc = fetchDoc(url); Elements items = doc.select(".movie-item"); if (items.isEmpty()) { items = doc.select(".show-item"); } if (items.isEmpty()) { items = doc.select("div.item"); } for (Element e : items) { String title = e.select("h3").text(); if (title.isEmpty()) { title = e.select(".title").text(); } if (title.isEmpty()) { title = e.select("a[title]").attr("title"); } if (title == null || title.isEmpty() || title.length() < 5) continue; String priceText = e.select(".price").text(); if (priceText.isEmpty()) priceText = e.select(".ticket-price").text(); if (priceText.isEmpty()) continue; String imageUrl = e.select("img").attr("src"); if (imageUrl.isEmpty()) imageUrl = e.select("img").attr("data-src"); String performer = e.select(".actor").text(); if (performer.isEmpty()) performer = e.select(".tag").text(); if (performer.isEmpty()) performer = e.select(".info").text(); if (performer.isEmpty()) performer = "未知演出方"; Ticket ticket = new Ticket( title, parsePrice(priceText), parsePrice(priceText), 10.0, imageUrl, performer ); tickets.add(ticket); } return tickets; } } public class MultiSiteCrawler { public static void main(String[] args) { DangDangProductCrawler ddCrawler = new DangDangProductCrawler(); List products = ddCrawler.crawlAllPages(1, 3); saveToFile(products, "A/dangdang_products.txt"); System.out.println("当当网商品爬取完成,共 " + products.size() + " 条"); SimpleTicketCrawler ticketCrawler = new SimpleTicketCrawler(); List tickets = ticketCrawler.crawlAllPages(1, 1); saveToFile(tickets, "B/tickets.txt"); System.out.println("票务网站爬取完成,共 " + tickets.size() + " 条"); } private static void saveToFile(List items, String filename) { try (PrintWriter w = new PrintWriter(new OutputStreamWriter(new FileOutputStream(filename), "UTF-8"))) { w.println("Title,Price,OriginalPrice,Discount,ImageUrl,Author/Seller/Performer"); items.forEach(b -> w.printf("%s,%.2f,%.2f,%.1f,%s,%s%n", b.getTitle(), b.getPrice(), b.getOriginalPrice(), b.getDiscount(), b.getImageUrl(), b.getAuthor())); } catch (IOException e) { System.err.println("Save error: " + e.getMessage()); } } }