You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

420 lines
16 KiB

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Read-only view of one scraped listing entry (a product or a ticket in this file).
 */
interface Bookable {

    /** @return the display title of the entry */
    String getTitle();

    /** @return the current selling price */
    double getPrice();

    /** @return the listed price before any discount */
    double getOriginalPrice();

    /** @return the discount figure; the crawlers in this file use 10.0 when none was found */
    double getDiscount();

    /** @return the URL of the entry's image */
    String getImageUrl();

    /** @return the associated party; implementations in this file store an author, seller or performer here */
    String getAuthor();
}
interface Crawler<T extends Bookable> {
List<T> crawlPage(String url) throws IOException;
List<T> crawlAllPages(int startPage, int endPage);
default void printResults(List<T> items) {
items.forEach(System.out::println);
}
}
/**
 * Base implementation of {@link Bookable} that simply stores each value in a protected field.
 * Subclasses assign the fields directly.
 */
abstract class AbstractBook implements Bookable {
    protected String title;
    protected String imageUrl;
    protected String author;
    protected double price;
    protected double originalPrice;
    protected double discount;

    @Override
    public String getTitle() {
        return this.title;
    }

    @Override
    public double getPrice() {
        return this.price;
    }

    @Override
    public double getOriginalPrice() {
        return this.originalPrice;
    }

    @Override
    public double getDiscount() {
        return this.discount;
    }

    @Override
    public String getImageUrl() {
        return this.imageUrl;
    }

    @Override
    public String getAuthor() {
        return this.author;
    }

    /**
     * Renders title, prices, discount and author in a single line; the image URL is not included.
     */
    @Override
    public String toString() {
        String template = "Book{title='%s', price=%.2f, originalPrice=%.2f, discount=%.1f, author='%s'}";
        return String.format(template, this.title, this.price, this.originalPrice, this.discount, this.author);
    }
}
/**
 * A scraped product. The seller is kept in the inherited {@code author} field.
 */
class Product extends AbstractBook {

    /** Creates a product with every field left at its default value. */
    public Product() {
    }

    /**
     * Creates a fully populated product.
     *
     * @param title         the product title
     * @param price         the current price
     * @param originalPrice the listed price
     * @param discount      the discount figure
     * @param imageUrl      the product image URL
     * @param seller        the seller, exposed through {@link #getAuthor()}
     */
    public Product(String title, double price, double originalPrice, double discount, String imageUrl, String seller) {
        // Text fields first, then the numeric ones.
        this.title = title;
        this.imageUrl = imageUrl;
        this.author = seller;
        this.price = price;
        this.originalPrice = originalPrice;
        this.discount = discount;
    }
}
/**
 * A scraped ticket. The performer is kept in the inherited {@code author} field.
 */
class Ticket extends AbstractBook {

    /** Creates a ticket with every field left at its default value. */
    public Ticket() {
    }

    /**
     * Creates a fully populated ticket.
     *
     * @param title         the event title
     * @param price         the current price
     * @param originalPrice the listed price
     * @param discount      the discount figure
     * @param imageUrl      the event image URL
     * @param performer     the performer, exposed through {@link #getAuthor()}
     */
    public Ticket(String title, double price, double originalPrice, double discount, String imageUrl, String performer) {
        // Text fields first, then the numeric ones.
        this.title = title;
        this.imageUrl = imageUrl;
        this.author = performer;
        this.price = price;
        this.originalPrice = originalPrice;
        this.discount = discount;
    }
}
/**
 * Shared plumbing for the site crawlers: page fetching, price/discount text parsing and a
 * default page-range loop.
 *
 * @param <T> the concrete entry type produced by the crawler
 */
abstract class AbstractCrawler<T extends Bookable> implements Crawler<T> {

    /**
     * First decimal number in a string: digits with optional thousands separators and an optional
     * fraction ("1,280.00", "25", "12.5"), or a bare fraction (".5").
     */
    private static final Pattern FIRST_NUMBER =
            Pattern.compile("\\d+(?:,\\d{3})*(?:\\.\\d+)?|\\.\\d+");

    /** Discount value reported when no discount could be parsed. */
    private static final double NO_DISCOUNT = 10.0;

    /**
     * Downloads and parses the page at {@code url} with a 15-second timeout and a browser-like
     * user agent.
     *
     * @param url the page address
     * @return the parsed document
     * @throws IOException if the request fails
     */
    protected Document fetchDoc(String url) throws IOException {
        return Jsoup.connect(url)
                .timeout(15000)
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
                .get();
    }

    /**
     * Extracts the first number found in {@code text}.
     *
     * <p>Only the first number is used so that text holding several prices ("¥25.00 ¥50.00") or a
     * range ("¥180-1280") yields the first price instead of an unparsable or concatenated value.
     * Thousands separators are removed before parsing.
     *
     * @param text raw price text; may be {@code null} or empty
     * @return the parsed price, or {@code 0.0} when no number is present
     */
    protected double parsePrice(String text) {
        if (text == null) {
            return 0.0;
        }
        Matcher number = FIRST_NUMBER.matcher(text);
        if (!number.find()) {
            return 0.0;
        }
        try {
            return Double.parseDouble(number.group().replace(",", ""));
        } catch (NumberFormatException e) {
            // Not expected for text matched by FIRST_NUMBER; keep the documented fallback.
            return 0.0;
        }
    }

    /**
     * Parses a discount label such as "7.5折" or "(7.5折)".
     *
     * @param text raw discount text; may be {@code null} or empty
     * @return the number in front of "折", or {@code 10.0} when the text has no "折" marker or the
     *         remainder is not a number
     */
    protected double parseDiscount(String text) {
        if (text == null || !text.contains("折")) {
            return NO_DISCOUNT;
        }
        String cleaned = text.replace("折", "").replace("(", "").replace(")", "");
        try {
            return Double.parseDouble(cleaned);
        } catch (NumberFormatException e) {
            // Unparsable label: treat as "no discount" rather than failing the whole page.
            return NO_DISCOUNT;
        }
    }

    /**
     * Crawls pages {@code startPage} through {@code endPage} inclusive. Each page URL is produced by
     * {@code String.format(getBaseUrl(), page)}, so the base URL must be a format template (for
     * example one containing {@code %d}) for the page number to appear in it. A page that fails
     * with an {@link IOException} is reported on standard error and skipped.
     *
     * @param startPage the first page number
     * @param endPage   the last page number
     * @return all entries collected from the pages that could be fetched
     */
    @Override
    public List<T> crawlAllPages(int startPage, int endPage) {
        List<T> allItems = new ArrayList<>();
        for (int page = startPage; page <= endPage; page++) {
            try {
                List<T> items = crawlPage(String.format(getBaseUrl(), page));
                allItems.addAll(items);
                System.out.println("Page " + page + ": " + items.size() + " items");
            } catch (IOException e) {
                System.err.println("Error crawling page " + page + ": " + e.getMessage());
            }
        }
        return allItems;
    }

    /**
     * @return the URL template used by {@link #crawlAllPages(int, int)} to build page addresses
     */
    protected abstract String getBaseUrl();
}
/**
 * Crawler for a JD category listing. Page URLs are formed by appending the page number to
 * {@code BASE_URL}.
 */
class JDProductCrawler extends AbstractCrawler<Product> {
    private static final String BASE_URL = "https://list.jd.com/list.html?cat=670,671,672&page=";

    /** @return the listing URL prefix to which the page number is appended */
    @Override
    protected String getBaseUrl() {
        return BASE_URL;
    }

    /**
     * Crawls pages {@code startPage} through {@code endPage} inclusive, logging the item count of
     * each page and skipping pages that fail with an {@link IOException}.
     */
    @Override
    public List<Product> crawlAllPages(int startPage, int endPage) {
        List<Product> collected = new ArrayList<>();
        int page = startPage;
        while (page <= endPage) {
            try {
                List<Product> pageItems = crawlPage(BASE_URL + page);
                collected.addAll(pageItems);
                System.out.println("Page " + page + ": " + pageItems.size() + " items");
            } catch (IOException ex) {
                System.err.println("Error crawling page " + page + ": " + ex.getMessage());
            }
            page++;
        }
        return collected;
    }

    /**
     * Extracts products from one listing page. Each value is read from the first selector that
     * yields non-empty text; entries whose title is shorter than five characters are skipped.
     *
     * @param url the page address
     * @return the products found on the page
     * @throws IOException if the page cannot be fetched
     */
    @Override
    public List<Product> crawlPage(String url) throws IOException {
        Document page = fetchDoc(url);

        // Try progressively looser container selectors until one matches.
        Elements nodes = page.select("li[data-sku]");
        if (nodes.isEmpty()) {
            nodes = page.select("div[data-sku]");
        }
        if (nodes.isEmpty()) {
            nodes = page.select("div.item");
        }

        List<Product> found = new ArrayList<>();
        for (Element node : nodes) {
            String name = node.select("a[title]").attr("title");
            if (name.isEmpty()) {
                name = node.select("h3").text();
            }
            if (name.isEmpty()) {
                name = node.select(".name").text();
            }
            // Covers the empty case as well: an empty title has length 0.
            if (name.length() < 5) {
                continue;
            }

            String current = node.select(".price strong").text();
            if (current.isEmpty()) {
                current = node.select(".price").text();
            }
            if (current.isEmpty()) {
                current = node.select("[class*=price]").text();
            }

            // Without a separate list price, the current price doubles as the original one.
            String listed = node.select(".origin-price").text();
            if (listed.isEmpty()) {
                listed = node.select(".price del").text();
            }
            if (listed.isEmpty()) {
                listed = current;
            }

            Elements images = node.select("img");
            String picture = images.attr("src");
            if (picture.isEmpty()) {
                picture = images.attr("data-lazy-img");
            }
            if (picture.isEmpty()) {
                picture = images.attr("data-src");
            }

            String shop = node.select(".shop-name").text();
            if (shop.isEmpty()) {
                shop = node.select(".store-name").text();
            }
            if (shop.isEmpty()) {
                shop = "未知商家";
            }

            double currentPrice = parsePrice(current);
            double listedPrice = parsePrice(listed);
            // No discount text is scraped here; parseDiscount("") supplies the default figure.
            double discount = parseDiscount("");
            found.add(new Product(name, currentPrice, listedPrice, discount, picture, shop));
        }
        return found;
    }
}
/**
 * Crawler for the Damai project list. Page URLs are formed by appending the page number to
 * {@code BASE_URL}.
 */
class DamaiTicketCrawler extends AbstractCrawler<Ticket> {
    private static final String BASE_URL = "https://www.damai.cn/projectlist.html?page=";

    /** @return the listing URL prefix to which the page number is appended */
    @Override
    protected String getBaseUrl() {
        return BASE_URL;
    }

    /**
     * Crawls pages {@code startPage} through {@code endPage} inclusive, logging the item count of
     * each page and skipping pages that fail with an {@link IOException}.
     */
    @Override
    public List<Ticket> crawlAllPages(int startPage, int endPage) {
        List<Ticket> collected = new ArrayList<>();
        int page = startPage;
        while (page <= endPage) {
            try {
                List<Ticket> pageItems = crawlPage(BASE_URL + page);
                collected.addAll(pageItems);
                System.out.println("Page " + page + ": " + pageItems.size() + " items");
            } catch (IOException ex) {
                System.err.println("Error crawling page " + page + ": " + ex.getMessage());
            }
            page++;
        }
        return collected;
    }

    /**
     * Extracts tickets from one listing page. Entries whose title is shorter than five characters,
     * or for which no price text is found, are skipped. The original price is set equal to the
     * current price and the discount is fixed at 10.0.
     *
     * @param url the page address
     * @return the tickets found on the page
     * @throws IOException if the page cannot be fetched
     */
    @Override
    public List<Ticket> crawlPage(String url) throws IOException {
        Document page = fetchDoc(url);

        // Try progressively looser container selectors until one matches.
        Elements nodes = page.select(".project-item");
        if (nodes.isEmpty()) {
            nodes = page.select(".ticket-item");
        }
        if (nodes.isEmpty()) {
            nodes = page.select(".item");
        }

        List<Ticket> found = new ArrayList<>();
        for (Element node : nodes) {
            String name = node.select(".title").text();
            if (name.isEmpty()) {
                name = node.select("a[title]").attr("title");
            }
            if (name.isEmpty()) {
                name = node.select("h3").text();
            }
            // Covers the empty case as well: an empty title has length 0.
            if (name.length() < 5) {
                continue;
            }

            String priceLabel = node.select(".price").text();
            if (priceLabel.isEmpty()) {
                priceLabel = node.select(".price-info").text();
            }
            if (priceLabel.isEmpty()) {
                continue;
            }

            Elements images = node.select("img");
            String picture = images.attr("src");
            if (picture.isEmpty()) {
                picture = images.attr("data-src");
            }

            String act = node.select(".actor").text();
            if (act.isEmpty()) {
                act = node.select(".tag").text();
            }
            if (act.isEmpty()) {
                act = node.select(".artist").text();
            }
            if (act.isEmpty()) {
                act = "未知演出方";
            }

            double currentPrice = parsePrice(priceLabel);
            double listedPrice = parsePrice(priceLabel);
            found.add(new Ticket(name, currentPrice, listedPrice, 10.0, picture, act));
        }
        return found;
    }
}
/**
 * Crawler for the DangDang bestseller list. {@code BASE_URL} is a format template whose
 * {@code %d} is filled with the page number by the inherited {@code crawlAllPages}.
 */
class DangDangProductCrawler extends AbstractCrawler<Product> {
    private static final String BASE_URL = "http://bang.dangdang.com/books/bestsellers/%d";

    /** @return the URL template containing a {@code %d} placeholder for the page number */
    @Override
    protected String getBaseUrl() {
        return BASE_URL;
    }

    /**
     * Extracts products from one bestseller page by scanning every {@code li} element. Elements
     * whose {@code a[title]} title is shorter than ten characters are skipped.
     *
     * @param url the page address
     * @return the products found on the page
     * @throws IOException if the page cannot be fetched
     */
    @Override
    public List<Product> crawlPage(String url) throws IOException {
        Document page = fetchDoc(url);
        List<Product> found = new ArrayList<>();

        for (Element node : page.select("li")) {
            String name = node.select("a[title]").attr("title");
            // Covers the empty case as well: an empty title has length 0.
            if (name.length() < 10) {
                continue;
            }

            String current = node.select("span.price_n").text();
            if (current.isEmpty()) {
                current = node.select(".price").text();
            }
            if (current.isEmpty()) {
                current = node.select("[class*=price]").text();
            }
            if (current.isEmpty()) {
                current = "0";
            }

            // Without a separate list price, the current price doubles as the original one.
            String listed = node.select("span.price_r").text();
            if (listed.isEmpty()) {
                listed = current;
            }

            String discountLabel = node.select("span.discount").text();
            if (discountLabel.isEmpty()) {
                discountLabel = node.select("[class*=discount]").text();
            }

            Elements images = node.select("img");
            String picture = images.attr("src");
            if (picture.isEmpty()) {
                picture = images.attr("data-original");
            }

            String writer = node.select("span.author").text();
            if (writer.isEmpty()) {
                writer = node.select("[class*=author]").text();
            }
            if (writer.isEmpty()) {
                writer = "当当自营";
            }

            double currentPrice = parsePrice(current);
            double listedPrice = parsePrice(listed);
            double discount = parseDiscount(discountLabel);
            found.add(new Product(name, currentPrice, listedPrice, discount, picture, writer));
        }
        return found;
    }
}
/**
 * Crawler for the Maoyan home page that falls back to a fixed set of mock tickets when the
 * page yields nothing or cannot be fetched.
 */
class SimpleTicketCrawler extends AbstractCrawler<Ticket> {
    private static final String BASE_URL = "https://www.maoyan.com/";

    /** @return the single URL this crawler fetches */
    @Override
    protected String getBaseUrl() {
        return BASE_URL;
    }

    /**
     * Fetches {@code BASE_URL} once; {@code startPage} and {@code endPage} are not used. When the
     * page produces no tickets, or the request fails with an {@link IOException}, the mock tickets
     * are returned instead.
     */
    @Override
    public List<Ticket> crawlAllPages(int startPage, int endPage) {
        List<Ticket> result = new ArrayList<>();
        try {
            List<Ticket> scraped = crawlPage(BASE_URL);
            if (!scraped.isEmpty()) {
                result.addAll(scraped);
            } else {
                System.out.println("票务网站爬取结果为空,使用模拟数据");
                result.addAll(getMockTickets());
            }
            System.out.println("Page 1: " + result.size() + " items");
        } catch (IOException ex) {
            System.err.println("票务网站访问失败,使用模拟数据: " + ex.getMessage());
            result.addAll(getMockTickets());
            System.out.println("Page 1: " + result.size() + " items (模拟数据)");
        }
        return result;
    }

    /** Builds the ten hard-coded placeholder tickets used when live crawling gives no data. */
    private List<Ticket> getMockTickets() {
        List<Ticket> mocks = new ArrayList<>();
        mocks.add(new Ticket(
                "周杰伦2024世界巡回演唱会-北京", 1280.00, 1680.00, 7.6, "https://example.com/jay.jpg", "周杰伦"));
        mocks.add(new Ticket(
                "开心麻花《乌龙山伯爵》", 180.00, 380.00, 4.7, "https://example.com/mahua.jpg", "开心麻花团队"));
        mocks.add(new Ticket(
                "国家大剧院歌剧《白毛女》", 380.00, 580.00, 6.6, "https://example.com/opera.jpg", "国家大剧院"));
        mocks.add(new Ticket(
                "德云社相声专场", 280.00, 480.00, 5.8, "https://example.com/deyun.jpg", "德云社"));
        mocks.add(new Ticket(
                "儿童剧《冰雪奇缘》", 120.00, 200.00, 6.0, "https://example.com/frozen.jpg", "儿童艺术剧院"));
        mocks.add(new Ticket(
                "音乐会《蓝色多瑙河》", 260.00, 420.00, 6.2, "https://example.com/music.jpg", "北京交响乐团"));
        mocks.add(new Ticket(
                "话剧《雷雨》", 220.00, 380.00, 5.8, "https://example.com/thunder.jpg", "北京人民艺术剧院"));
        mocks.add(new Ticket(
                "魔术表演《惊天魔盗团》", 380.00, 580.00, 6.6, "https://example.com/magic.jpg", "魔术大师联盟"));
        mocks.add(new Ticket(
                "脱口秀《吐槽大会现场版》", 180.00, 300.00, 6.0, "https://example.com/talkshow.jpg", "笑果文化"));
        mocks.add(new Ticket(
                "体育赛事:CBA总决赛", 580.00, 880.00, 6.6, "https://example.com/cba.jpg", "CBA联盟"));
        return mocks;
    }

    /**
     * Extracts tickets from one page. Entries whose title is shorter than five characters, or for
     * which no price text is found, are skipped. The original price is set equal to the current
     * price and the discount is fixed at 10.0.
     *
     * @param url the page address
     * @return the tickets found on the page
     * @throws IOException if the page cannot be fetched
     */
    @Override
    public List<Ticket> crawlPage(String url) throws IOException {
        Document page = fetchDoc(url);

        // Try progressively looser container selectors until one matches.
        Elements nodes = page.select(".movie-item");
        if (nodes.isEmpty()) {
            nodes = page.select(".show-item");
        }
        if (nodes.isEmpty()) {
            nodes = page.select("div.item");
        }

        List<Ticket> found = new ArrayList<>();
        for (Element node : nodes) {
            String name = node.select("h3").text();
            if (name.isEmpty()) {
                name = node.select(".title").text();
            }
            if (name.isEmpty()) {
                name = node.select("a[title]").attr("title");
            }
            // Covers the empty case as well: an empty title has length 0.
            if (name.length() < 5) {
                continue;
            }

            String priceLabel = node.select(".price").text();
            if (priceLabel.isEmpty()) {
                priceLabel = node.select(".ticket-price").text();
            }
            if (priceLabel.isEmpty()) {
                continue;
            }

            Elements images = node.select("img");
            String picture = images.attr("src");
            if (picture.isEmpty()) {
                picture = images.attr("data-src");
            }

            String act = node.select(".actor").text();
            if (act.isEmpty()) {
                act = node.select(".tag").text();
            }
            if (act.isEmpty()) {
                act = node.select(".info").text();
            }
            if (act.isEmpty()) {
                act = "未知演出方";
            }

            double currentPrice = parsePrice(priceLabel);
            double listedPrice = parsePrice(priceLabel);
            found.add(new Ticket(name, currentPrice, listedPrice, 10.0, picture, act));
        }
        return found;
    }
}
/**
 * Entry point: crawls the DangDang bestseller pages and the ticket site, then writes each result
 * set to a CSV-style text file.
 */
public class MultiSiteCrawler {

    /**
     * Crawls DangDang pages 1-3 into {@code A/dangdang_products.txt} and the ticket site into
     * {@code B/tickets.txt}, printing a summary line for each.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        DangDangProductCrawler ddCrawler = new DangDangProductCrawler();
        List<Product> products = ddCrawler.crawlAllPages(1, 3);
        saveToFile(products, "A/dangdang_products.txt");
        System.out.println("当当网商品爬取完成,共 " + products.size() + " 条");

        SimpleTicketCrawler ticketCrawler = new SimpleTicketCrawler();
        List<Ticket> tickets = ticketCrawler.crawlAllPages(1, 1);
        saveToFile(tickets, "B/tickets.txt");
        System.out.println("票务网站爬取完成,共 " + tickets.size() + " 条");
    }

    /**
     * Writes {@code items} to {@code filename} as UTF-8 CSV with a header row.
     *
     * <p>Missing parent directories are created first, so saving into "A/..." works on a fresh
     * checkout. Text fields are quoted when they contain a comma, quote or line break, and numbers
     * are formatted with {@link Locale#ROOT} so the decimal separator is always a dot. Failures are
     * reported on standard error; nothing is thrown.
     *
     * @param items    the entries to write
     * @param filename the destination path
     */
    private static void saveToFile(List<? extends Bookable> items, String filename) {
        try {
            Path parent = Paths.get(filename).getParent();
            if (parent != null) {
                Files.createDirectories(parent);
            }
            try (PrintWriter w = new PrintWriter(
                    new OutputStreamWriter(new FileOutputStream(filename), StandardCharsets.UTF_8))) {
                w.println("Title,Price,OriginalPrice,Discount,ImageUrl,Author/Seller/Performer");
                for (Bookable b : items) {
                    w.printf(Locale.ROOT, "%s,%.2f,%.2f,%.1f,%s,%s%n",
                            csvField(b.getTitle()), b.getPrice(), b.getOriginalPrice(),
                            b.getDiscount(), csvField(b.getImageUrl()), csvField(b.getAuthor()));
                }
                // PrintWriter never throws on write failures; it only records them.
                if (w.checkError()) {
                    System.err.println("Save error: failed while writing " + filename);
                }
            }
        } catch (IOException e) {
            System.err.println("Save error: " + e.getMessage());
        }
    }

    /**
     * Escapes one CSV field following RFC 4180: a value containing a comma, double quote, CR or LF
     * is wrapped in double quotes with embedded quotes doubled; other values are returned as-is.
     * A {@code null} value is written as the text "null", matching {@code %s} formatting.
     */
    private static String csvField(String value) {
        String text = String.valueOf(value);
        boolean needsQuoting = text.indexOf(',') >= 0
                || text.indexOf('"') >= 0
                || text.indexOf('\n') >= 0
                || text.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return text;
        }
        return "\"" + text.replace("\"", "\"\"") + "\"";
    }
}