You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
420 lines
16 KiB
420 lines
16 KiB
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
|
|
|
|
/**
 * Read-only accessors shared by every item type handled in this file.
 */
interface Bookable {

    /** @return the item's title */
    String getTitle();

    /** @return the current price */
    double getPrice();

    /** @return the price before any reduction */
    double getOriginalPrice();

    /** @return the discount figure attached to the item */
    double getDiscount();

    /** @return the URL of the item's image */
    String getImageUrl();

    /** @return the author (implementations may store a seller or performer here) */
    String getAuthor();
}
|
|
|
|
interface Crawler<T extends Bookable> {
|
|
List<T> crawlPage(String url) throws IOException;
|
|
List<T> crawlAllPages(int startPage, int endPage);
|
|
default void printResults(List<T> items) {
|
|
items.forEach(System.out::println);
|
|
}
|
|
}
|
|
|
|
abstract class AbstractBook implements Bookable {
|
|
protected String title, imageUrl, author;
|
|
protected double price, originalPrice, discount;
|
|
|
|
@Override public String getTitle() { return title; }
|
|
@Override public double getPrice() { return price; }
|
|
@Override public double getOriginalPrice() { return originalPrice; }
|
|
@Override public double getDiscount() { return discount; }
|
|
@Override public String getImageUrl() { return imageUrl; }
|
|
@Override public String getAuthor() { return author; }
|
|
|
|
@Override
|
|
public String toString() {
|
|
return String.format("Book{title='%s', price=%.2f, originalPrice=%.2f, discount=%.1f, author='%s'}",
|
|
title, price, originalPrice, discount, author);
|
|
}
|
|
}
|
|
|
|
class Product extends AbstractBook {
|
|
public Product() {}
|
|
public Product(String title, double price, double originalPrice, double discount, String imageUrl, String seller) {
|
|
this.title = title;
|
|
this.price = price;
|
|
this.originalPrice = originalPrice;
|
|
this.discount = discount;
|
|
this.imageUrl = imageUrl;
|
|
this.author = seller;
|
|
}
|
|
}
|
|
|
|
class Ticket extends AbstractBook {
|
|
public Ticket() {}
|
|
public Ticket(String title, double price, double originalPrice, double discount, String imageUrl, String performer) {
|
|
this.title = title;
|
|
this.price = price;
|
|
this.originalPrice = originalPrice;
|
|
this.discount = discount;
|
|
this.imageUrl = imageUrl;
|
|
this.author = performer;
|
|
}
|
|
}
|
|
|
|
abstract class AbstractCrawler<T extends Bookable> implements Crawler<T> {
|
|
protected Document fetchDoc(String url) throws IOException {
|
|
return Jsoup.connect(url).timeout(15000).userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36").get();
|
|
}
|
|
|
|
protected double parsePrice(String text) {
|
|
try { return Double.parseDouble(text.replaceAll("[^0-9.]", "")); }
|
|
catch (Exception e) { return 0.0; }
|
|
}
|
|
|
|
protected double parseDiscount(String text) {
|
|
try {
|
|
if (text.contains("折")) {
|
|
String discount = text.replace("折", "").replace("(", "").replace(")", "");
|
|
return Double.parseDouble(discount);
|
|
}
|
|
} catch (Exception e) {}
|
|
return 10.0;
|
|
}
|
|
|
|
@Override
|
|
public List<T> crawlAllPages(int startPage, int endPage) {
|
|
List<T> allItems = new ArrayList<>();
|
|
for (int page = startPage; page <= endPage; page++) {
|
|
try {
|
|
List<T> items = crawlPage(String.format(getBaseUrl(), page));
|
|
allItems.addAll(items);
|
|
System.out.println("Page " + page + ": " + items.size() + " items");
|
|
} catch (IOException e) {
|
|
System.err.println("Error crawling page " + page + ": " + e.getMessage());
|
|
}
|
|
}
|
|
return allItems;
|
|
}
|
|
|
|
protected abstract String getBaseUrl();
|
|
}
|
|
|
|
class JDProductCrawler extends AbstractCrawler<Product> {
|
|
private static final String BASE_URL = "https://list.jd.com/list.html?cat=670,671,672&page=";
|
|
|
|
@Override
|
|
protected String getBaseUrl() {
|
|
return BASE_URL;
|
|
}
|
|
|
|
@Override
|
|
public List<Product> crawlAllPages(int startPage, int endPage) {
|
|
List<Product> allItems = new ArrayList<>();
|
|
for (int page = startPage; page <= endPage; page++) {
|
|
try {
|
|
List<Product> items = crawlPage(BASE_URL + page);
|
|
allItems.addAll(items);
|
|
System.out.println("Page " + page + ": " + items.size() + " items");
|
|
} catch (IOException e) {
|
|
System.err.println("Error crawling page " + page + ": " + e.getMessage());
|
|
}
|
|
}
|
|
return allItems;
|
|
}
|
|
|
|
@Override
|
|
public List<Product> crawlPage(String url) throws IOException {
|
|
List<Product> products = new ArrayList<>();
|
|
Document doc = fetchDoc(url);
|
|
Elements items = doc.select("li[data-sku]");
|
|
if (items.isEmpty()) {
|
|
items = doc.select("div[data-sku]");
|
|
}
|
|
if (items.isEmpty()) {
|
|
items = doc.select("div.item");
|
|
}
|
|
|
|
for (Element e : items) {
|
|
String title = e.select("a[title]").attr("title");
|
|
if (title.isEmpty()) {
|
|
title = e.select("h3").text();
|
|
}
|
|
if (title.isEmpty()) {
|
|
title = e.select(".name").text();
|
|
}
|
|
if (title == null || title.isEmpty() || title.length() < 5) continue;
|
|
|
|
String priceText = e.select(".price strong").text();
|
|
if (priceText.isEmpty()) priceText = e.select(".price").text();
|
|
if (priceText.isEmpty()) priceText = e.select("[class*=price]").text();
|
|
|
|
String originalPriceText = e.select(".origin-price").text();
|
|
if (originalPriceText.isEmpty()) originalPriceText = e.select(".price del").text();
|
|
if (originalPriceText.isEmpty()) originalPriceText = priceText;
|
|
|
|
String imageUrl = e.select("img").attr("src");
|
|
if (imageUrl.isEmpty()) imageUrl = e.select("img").attr("data-lazy-img");
|
|
if (imageUrl.isEmpty()) imageUrl = e.select("img").attr("data-src");
|
|
|
|
String seller = e.select(".shop-name").text();
|
|
if (seller.isEmpty()) seller = e.select(".store-name").text();
|
|
if (seller.isEmpty()) seller = "未知商家";
|
|
|
|
Product product = new Product(
|
|
title,
|
|
parsePrice(priceText),
|
|
parsePrice(originalPriceText),
|
|
parseDiscount(""),
|
|
imageUrl,
|
|
seller
|
|
);
|
|
products.add(product);
|
|
}
|
|
return products;
|
|
}
|
|
}
|
|
|
|
class DamaiTicketCrawler extends AbstractCrawler<Ticket> {
|
|
private static final String BASE_URL = "https://www.damai.cn/projectlist.html?page=";
|
|
|
|
@Override
|
|
protected String getBaseUrl() {
|
|
return BASE_URL;
|
|
}
|
|
|
|
@Override
|
|
public List<Ticket> crawlAllPages(int startPage, int endPage) {
|
|
List<Ticket> allItems = new ArrayList<>();
|
|
for (int page = startPage; page <= endPage; page++) {
|
|
try {
|
|
List<Ticket> items = crawlPage(BASE_URL + page);
|
|
allItems.addAll(items);
|
|
System.out.println("Page " + page + ": " + items.size() + " items");
|
|
} catch (IOException e) {
|
|
System.err.println("Error crawling page " + page + ": " + e.getMessage());
|
|
}
|
|
}
|
|
return allItems;
|
|
}
|
|
|
|
@Override
|
|
public List<Ticket> crawlPage(String url) throws IOException {
|
|
List<Ticket> tickets = new ArrayList<>();
|
|
Document doc = fetchDoc(url);
|
|
Elements items = doc.select(".project-item");
|
|
if (items.isEmpty()) {
|
|
items = doc.select(".ticket-item");
|
|
}
|
|
if (items.isEmpty()) {
|
|
items = doc.select(".item");
|
|
}
|
|
|
|
for (Element e : items) {
|
|
String title = e.select(".title").text();
|
|
if (title.isEmpty()) {
|
|
title = e.select("a[title]").attr("title");
|
|
}
|
|
if (title.isEmpty()) {
|
|
title = e.select("h3").text();
|
|
}
|
|
if (title == null || title.isEmpty() || title.length() < 5) continue;
|
|
|
|
String priceText = e.select(".price").text();
|
|
if (priceText.isEmpty()) priceText = e.select(".price-info").text();
|
|
if (priceText.isEmpty()) continue;
|
|
|
|
String imageUrl = e.select("img").attr("src");
|
|
if (imageUrl.isEmpty()) imageUrl = e.select("img").attr("data-src");
|
|
|
|
String performer = e.select(".actor").text();
|
|
if (performer.isEmpty()) performer = e.select(".tag").text();
|
|
if (performer.isEmpty()) performer = e.select(".artist").text();
|
|
if (performer.isEmpty()) performer = "未知演出方";
|
|
|
|
Ticket ticket = new Ticket(
|
|
title,
|
|
parsePrice(priceText),
|
|
parsePrice(priceText),
|
|
10.0,
|
|
imageUrl,
|
|
performer
|
|
);
|
|
tickets.add(ticket);
|
|
}
|
|
return tickets;
|
|
}
|
|
}
|
|
|
|
class DangDangProductCrawler extends AbstractCrawler<Product> {
|
|
private static final String BASE_URL = "http://bang.dangdang.com/books/bestsellers/%d";
|
|
|
|
@Override
|
|
protected String getBaseUrl() {
|
|
return BASE_URL;
|
|
}
|
|
|
|
@Override
|
|
public List<Product> crawlPage(String url) throws IOException {
|
|
List<Product> products = new ArrayList<>();
|
|
Document doc = fetchDoc(url);
|
|
Elements bookElements = doc.select("li");
|
|
|
|
for (Element e : bookElements) {
|
|
String title = e.select("a[title]").attr("title");
|
|
if (title == null || title.isEmpty() || title.length() < 10) continue;
|
|
|
|
String priceText = e.select("span.price_n").text();
|
|
if (priceText.isEmpty()) priceText = e.select(".price").text();
|
|
if (priceText.isEmpty()) priceText = e.select("[class*=price]").text();
|
|
if (priceText.isEmpty()) priceText = "0";
|
|
|
|
String originalPriceText = e.select("span.price_r").text();
|
|
if (originalPriceText.isEmpty()) originalPriceText = priceText;
|
|
|
|
String discountText = e.select("span.discount").text();
|
|
if (discountText.isEmpty()) discountText = e.select("[class*=discount]").text();
|
|
|
|
String imageUrl = e.select("img").attr("src");
|
|
if (imageUrl.isEmpty()) imageUrl = e.select("img").attr("data-original");
|
|
|
|
String seller = e.select("span.author").text();
|
|
if (seller.isEmpty()) seller = e.select("[class*=author]").text();
|
|
if (seller.isEmpty()) seller = "当当自营";
|
|
|
|
Product product = new Product(
|
|
title,
|
|
parsePrice(priceText),
|
|
parsePrice(originalPriceText),
|
|
parseDiscount(discountText),
|
|
imageUrl,
|
|
seller
|
|
);
|
|
products.add(product);
|
|
}
|
|
return products;
|
|
}
|
|
}
|
|
|
|
class SimpleTicketCrawler extends AbstractCrawler<Ticket> {
|
|
private static final String BASE_URL = "https://www.maoyan.com/";
|
|
|
|
@Override
|
|
protected String getBaseUrl() {
|
|
return BASE_URL;
|
|
}
|
|
|
|
@Override
|
|
public List<Ticket> crawlAllPages(int startPage, int endPage) {
|
|
List<Ticket> allItems = new ArrayList<>();
|
|
try {
|
|
List<Ticket> items = crawlPage(BASE_URL);
|
|
if (items.isEmpty()) {
|
|
System.out.println("票务网站爬取结果为空,使用模拟数据");
|
|
allItems.addAll(getMockTickets());
|
|
} else {
|
|
allItems.addAll(items);
|
|
}
|
|
System.out.println("Page 1: " + allItems.size() + " items");
|
|
} catch (IOException e) {
|
|
System.err.println("票务网站访问失败,使用模拟数据: " + e.getMessage());
|
|
allItems.addAll(getMockTickets());
|
|
System.out.println("Page 1: " + allItems.size() + " items (模拟数据)");
|
|
}
|
|
return allItems;
|
|
}
|
|
|
|
private List<Ticket> getMockTickets() {
|
|
List<Ticket> tickets = new ArrayList<>();
|
|
tickets.add(new Ticket("周杰伦2024世界巡回演唱会-北京", 1280.00, 1680.00, 7.6, "https://example.com/jay.jpg", "周杰伦"));
|
|
tickets.add(new Ticket("开心麻花《乌龙山伯爵》", 180.00, 380.00, 4.7, "https://example.com/mahua.jpg", "开心麻花团队"));
|
|
tickets.add(new Ticket("国家大剧院歌剧《白毛女》", 380.00, 580.00, 6.6, "https://example.com/opera.jpg", "国家大剧院"));
|
|
tickets.add(new Ticket("德云社相声专场", 280.00, 480.00, 5.8, "https://example.com/deyun.jpg", "德云社"));
|
|
tickets.add(new Ticket("儿童剧《冰雪奇缘》", 120.00, 200.00, 6.0, "https://example.com/frozen.jpg", "儿童艺术剧院"));
|
|
tickets.add(new Ticket("音乐会《蓝色多瑙河》", 260.00, 420.00, 6.2, "https://example.com/music.jpg", "北京交响乐团"));
|
|
tickets.add(new Ticket("话剧《雷雨》", 220.00, 380.00, 5.8, "https://example.com/thunder.jpg", "北京人民艺术剧院"));
|
|
tickets.add(new Ticket("魔术表演《惊天魔盗团》", 380.00, 580.00, 6.6, "https://example.com/magic.jpg", "魔术大师联盟"));
|
|
tickets.add(new Ticket("脱口秀《吐槽大会现场版》", 180.00, 300.00, 6.0, "https://example.com/talkshow.jpg", "笑果文化"));
|
|
tickets.add(new Ticket("体育赛事:CBA总决赛", 580.00, 880.00, 6.6, "https://example.com/cba.jpg", "CBA联盟"));
|
|
return tickets;
|
|
}
|
|
|
|
@Override
|
|
public List<Ticket> crawlPage(String url) throws IOException {
|
|
List<Ticket> tickets = new ArrayList<>();
|
|
Document doc = fetchDoc(url);
|
|
Elements items = doc.select(".movie-item");
|
|
if (items.isEmpty()) {
|
|
items = doc.select(".show-item");
|
|
}
|
|
if (items.isEmpty()) {
|
|
items = doc.select("div.item");
|
|
}
|
|
|
|
for (Element e : items) {
|
|
String title = e.select("h3").text();
|
|
if (title.isEmpty()) {
|
|
title = e.select(".title").text();
|
|
}
|
|
if (title.isEmpty()) {
|
|
title = e.select("a[title]").attr("title");
|
|
}
|
|
if (title == null || title.isEmpty() || title.length() < 5) continue;
|
|
|
|
String priceText = e.select(".price").text();
|
|
if (priceText.isEmpty()) priceText = e.select(".ticket-price").text();
|
|
if (priceText.isEmpty()) continue;
|
|
|
|
String imageUrl = e.select("img").attr("src");
|
|
if (imageUrl.isEmpty()) imageUrl = e.select("img").attr("data-src");
|
|
|
|
String performer = e.select(".actor").text();
|
|
if (performer.isEmpty()) performer = e.select(".tag").text();
|
|
if (performer.isEmpty()) performer = e.select(".info").text();
|
|
if (performer.isEmpty()) performer = "未知演出方";
|
|
|
|
Ticket ticket = new Ticket(
|
|
title,
|
|
parsePrice(priceText),
|
|
parsePrice(priceText),
|
|
10.0,
|
|
imageUrl,
|
|
performer
|
|
);
|
|
tickets.add(ticket);
|
|
}
|
|
return tickets;
|
|
}
|
|
}
|
|
|
|
public class MultiSiteCrawler {
|
|
public static void main(String[] args) {
|
|
DangDangProductCrawler ddCrawler = new DangDangProductCrawler();
|
|
List<Product> products = ddCrawler.crawlAllPages(1, 3);
|
|
saveToFile(products, "A/dangdang_products.txt");
|
|
System.out.println("当当网商品爬取完成,共 " + products.size() + " 条");
|
|
|
|
SimpleTicketCrawler ticketCrawler = new SimpleTicketCrawler();
|
|
List<Ticket> tickets = ticketCrawler.crawlAllPages(1, 1);
|
|
saveToFile(tickets, "B/tickets.txt");
|
|
System.out.println("票务网站爬取完成,共 " + tickets.size() + " 条");
|
|
}
|
|
|
|
private static void saveToFile(List<? extends Bookable> items, String filename) {
|
|
try (PrintWriter w = new PrintWriter(new OutputStreamWriter(new FileOutputStream(filename), "UTF-8"))) {
|
|
w.println("Title,Price,OriginalPrice,Discount,ImageUrl,Author/Seller/Performer");
|
|
items.forEach(b -> w.printf("%s,%.2f,%.2f,%.1f,%s,%s%n",
|
|
b.getTitle(), b.getPrice(), b.getOriginalPrice(),
|
|
b.getDiscount(), b.getImageUrl(), b.getAuthor()));
|
|
} catch (IOException e) {
|
|
System.err.println("Save error: " + e.getMessage());
|
|
}
|
|
}
|
|
}
|