import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL; import java.util.ArrayList; import java.util.List; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; // 爬虫基类 abstract class Spider { protected String baseUrl; public Spider(String baseUrl) { this.baseUrl = baseUrl; } // 发送HTTP请求获取页面内容 protected Document fetchPage(String url) throws IOException { return Jsoup.connect(url) .userAgent("Mozilla/5.0") .timeout(10000) .get(); } // 抽象方法:爬取数据 public abstract List crawl() throws IOException; // 抽象方法:解析页面 protected abstract List parsePage(Document doc); // 重载方法:解析页面(用于原生HTML解析) protected List parsePage(String html) { // 默认实现,子类可以选择性重写 return new ArrayList<>(); } } // 电影数据模型 class Movie { private String title; private String rating; private String url; public Movie(String title, String rating, String url) { this.title = title; this.rating = rating; this.url = url; } @Override public String toString() { return "Movie{" + "title='" + title + '\'' + ", rating='" + rating + '\'' + ", url='" + url + '\'' + '}'; } } // 书籍数据模型 class Book { private String title; private String author; private String rating; private String url; public Book(String title, String author, String rating, String url) { this.title = title; this.author = author; this.rating = rating; this.url = url; } @Override public String toString() { return "Book{" + "title='" + title + '\'' + ", author='" + author + '\'' + ", rating='" + rating + '\'' + ", url='" + url + '\'' + '}'; } } // 天气数据模型 class Weather { private String province; private String city; private String temperature; private String weatherCondition; private String url; public Weather(String province, String city, String temperature, String weatherCondition, String url) { this.province = province; this.city = city; this.temperature = temperature; this.weatherCondition = weatherCondition; this.url = url; } @Override public String toString() { return "Weather{" + "province='" + province + '\'' + ", city='" + city + '\'' + ", temperature='" + temperature + '\'' + ", weatherCondition='" + weatherCondition + '\'' + ", url='" + url + '\'' + '}'; } } // 豆瓣电影爬虫 - 爬取Top250 class DoubanMovieSpider extends Spider { public DoubanMovieSpider() { super("https://movie.douban.com/top250"); } @Override public List crawl() throws IOException { List allMovies = new ArrayList<>(); // 豆瓣Top250有10页,每页25部电影 for (int start = 0; start < 250; start += 25) { String pageUrl = baseUrl + "?start=" + start; System.out.println("正在爬取第 " + (start / 25 + 1) + " 页..."); Document doc = fetchPage(pageUrl); List movies = parsePage(doc); allMovies.addAll(movies); // 添加延迟,避免请求过快 try { Thread.sleep(1000); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } } return allMovies; } @Override protected List parsePage(Document doc) { List movies = new ArrayList<>(); // 使用Jsoup选择器解析页面 Elements movieItems = doc.select(".info"); for (Element item : movieItems) { // 提取标题 Element titleElement = item.selectFirst(".title"); if (titleElement != null) { String title = titleElement.text().trim(); // 提取评分 Element ratingElement = item.selectFirst(".rating_num"); String rating = ""; if (ratingElement != null) { rating = ratingElement.text().trim(); } // 提取链接 Element linkElement = item.selectFirst("a"); String url = ""; if (linkElement != null) { url = linkElement.attr("href"); } movies.add(new Movie(title, rating, url)); } } return movies; } } // 豆瓣读书爬虫 - 爬取评分前100 class DoubanBookSpider extends Spider { public DoubanBookSpider() { super("https://book.douban.com/top250"); } @Override public List crawl() throws IOException { List allBooks = new ArrayList<>(); // 豆瓣读书Top250,每页25本,爬取前4页(100本) for (int start = 0; start < 100; start += 25) { String pageUrl = baseUrl + "?start=" + start; System.out.println("正在爬取第 " + (start / 25 + 1) + " 页..."); Document doc = fetchPage(pageUrl); List books = parsePage(doc); allBooks.addAll(books); // 添加延迟,避免请求过快 try { Thread.sleep(1000); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } } return allBooks; } @Override protected List parsePage(Document doc) { List books = new ArrayList<>(); // 使用Jsoup选择器解析页面 Elements bookItems = doc.select("tr.item"); for (Element item : bookItems) { // 提取链接 Element linkElement = item.selectFirst(".nbg"); if (linkElement != null) { String url = linkElement.attr("href"); // 提取标题 Element titleElement = linkElement.selectFirst("img"); String title = ""; if (titleElement != null) { title = titleElement.attr("alt").trim(); } // 提取作者信息 Element authorElement = item.selectFirst(".pl"); String author = ""; if (authorElement != null) { author = authorElement.text().trim(); } // 提取评分 Element ratingElement = item.selectFirst(".rating_nums"); String rating = ""; if (ratingElement != null) { rating = ratingElement.text().trim(); } books.add(new Book(title, author, rating, url)); } } return books; } } // 中国天气网爬虫 - 爬取各省份天气 class WeatherSpider extends Spider { // 中国各省份代码映射 private static final String[] PROVINCES = { "北京", "上海", "天津", "重庆", "河北", "山西", "辽宁", "吉林", "黑龙江", "江苏", "浙江", "安徽", "福建", "江西", "山东", "河南", "湖北", "湖南", "广东", "海南", "四川", "贵州", "云南", "陕西", "甘肃", "青海", "台湾", "内蒙古", "广西", "西藏", "宁夏", "新疆", "香港", "澳门" }; public WeatherSpider() { super("http://www.weather.com.cn"); } @Override public List crawl() throws IOException { List allWeather = new ArrayList<>(); System.out.println("正在爬取中国各省份天气数据..."); // 爬取主要城市天气(使用中国天气网的API) String[] majorCities = { "101010100", // 北京 "101020100", // 上海 "101030100", // 天津 "101040100", // 重庆 "101050101", // 哈尔滨 "101060101", // 长春 "101070101", // 沈阳 "101080101", // 呼和浩特 "101090101", // 石家庄 "101100101", // 太原 "101110101", // 西安 "101120101", // 济南 "101130101", // 乌鲁木齐 "101140101", // 拉萨 "101150101", // 西宁 "101160101", // 兰州 "101170101", // 银川 "101180101", // 郑州 "101190101", // 南京 "101200101", // 武汉 "101210101", // 杭州 "101220101", // 合肥 "101230101", // 福州 "101240101", // 南昌 "101250101", // 长沙 "101260101", // 贵阳 "101270101", // 成都 "101280101", // 广州 "101290101", // 昆明 "101300101", // 南宁 "101310101", // 海口 "101320101", // 香港 "101330101", // 澳门 "101340101" // 台北 }; String[] cityNames = { "北京", "上海", "天津", "重庆", "哈尔滨", "长春", "沈阳", "呼和浩特", "石家庄", "太原", "西安", "济南", "乌鲁木齐", "拉萨", "西宁", "兰州", "银川", "郑州", "南京", "武汉", "杭州", "合肥", "福州", "南昌", "长沙", "贵阳", "成都", "广州", "昆明", "南宁", "海口", "香港", "澳门", "台北" }; for (int i = 0; i < majorCities.length; i++) { String cityCode = majorCities[i]; String cityName = cityNames[i]; String province = getProvinceByCity(cityName); try { String weatherUrl = "http://www.weather.com.cn/weather1d/" + cityCode + ".shtml"; Document doc = fetchPage(weatherUrl); Weather weather = parseWeatherPage(doc, province, cityName, weatherUrl); if (weather != null) { allWeather.add(weather); System.out.println("已获取 " + cityName + " 的天气数据"); } // 添加延迟,避免请求过快 Thread.sleep(500); } catch (Exception e) { System.err.println("获取 " + cityName + " 天气数据失败: " + e.getMessage()); } } return allWeather; } private String getProvinceByCity(String city) { // 简化的省份映射 if (city.equals("北京") || city.equals("上海") || city.equals("天津") || city.equals("重庆")) { return city; } else if (city.equals("哈尔滨") || city.equals("长春") || city.equals("沈阳")) { return "东北"; } else if (city.equals("呼和浩特") || city.equals("石家庄") || city.equals("太原")) { return "华北"; } else if (city.equals("西安") || city.equals("兰州") || city.equals("西宁") || city.equals("银川")) { return "西北"; } else if (city.equals("济南") || city.equals("郑州") || city.equals("南京")) { return "华东"; } else if (city.equals("武汉") || city.equals("长沙") || city.equals("南昌")) { return "华中"; } else if (city.equals("杭州") || city.equals("合肥") || city.equals("福州")) { return "华东"; } else if (city.equals("成都") || city.equals("贵阳") || city.equals("昆明")) { return "西南"; } else if (city.equals("广州") || city.equals("南宁") || city.equals("海口")) { return "华南"; } else if (city.equals("乌鲁木齐") || city.equals("拉萨")) { return "西北"; } else if (city.equals("香港") || city.equals("澳门") || city.equals("台北")) { return "港澳台"; } return "其他"; } private Weather parseWeatherPage(Document doc, String province, String city, String url) { try { // 提取温度 Element tempElement = doc.selectFirst(".tem"); String temperature = ""; if (tempElement != null) { temperature = tempElement.text().trim(); } // 提取天气状况 Element weatherElement = doc.selectFirst(".wea"); String weatherCondition = ""; if (weatherElement != null) { weatherCondition = weatherElement.text().trim(); } return new Weather(province, city, temperature, weatherCondition, url); } catch (Exception e) { System.err.println("解析 " + city + " 天气数据失败: " + e.getMessage()); return null; } } @Override protected List parsePage(Document doc) { // 这个方法在crawl中被直接调用,不需要实现 return new ArrayList<>(); } } // 测试类 public class SpiderFramework { public static void main(String[] args) { List spiders = new ArrayList<>(); spiders.add(new DoubanMovieSpider()); spiders.add(new DoubanBookSpider()); spiders.add(new WeatherSpider()); for (Spider spider : spiders) { try { System.out.println("\n爬取 " + spider.getClass().getSimpleName() + " 数据:"); List data = spider.crawl(); System.out.println("共爬取 " + data.size() + " 条数据"); // 只显示前5条数据 int displayCount = Math.min(5, data.size()); System.out.println("显示前 " + displayCount + " 条:"); for (int i = 0; i < displayCount; i++) { System.out.println(data.get(i)); } } catch (IOException e) { System.err.println("爬取失败: " + e.getMessage()); } } } }