From 42ab1db5c878e84cd284d0d225648c650f7f1b82 Mon Sep 17 00:00:00 2001 From: ZhengShiyi <1980003269@qq.com> Date: Thu, 9 Apr 2026 19:47:15 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E6=88=90=E7=88=AC=E8=99=AB=E9=A1=B9?= =?UTF-8?q?=E7=9B=AE=E4=BB=A3=E7=A0=81=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 继承+多态爬虫/SpiderFramework.java | 422 +++++++++++++++++++++++ 1 file changed, 422 insertions(+) create mode 100644 继承+多态爬虫/SpiderFramework.java diff --git a/继承+多态爬虫/SpiderFramework.java b/继承+多态爬虫/SpiderFramework.java new file mode 100644 index 0000000..c9bc8ad --- /dev/null +++ b/继承+多态爬虫/SpiderFramework.java @@ -0,0 +1,422 @@ +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.HttpURLConnection; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +// 爬虫基类 +abstract class Spider { + protected String baseUrl; + + public Spider(String baseUrl) { + this.baseUrl = baseUrl; + } + + // 发送HTTP请求获取页面内容 + protected Document fetchPage(String url) throws IOException { + return Jsoup.connect(url) + .userAgent("Mozilla/5.0") + .timeout(10000) + .get(); + } + + // 抽象方法:爬取数据 + public abstract List crawl() throws IOException; + + // 抽象方法:解析页面 + protected abstract List parsePage(Document doc); + + // 重载方法:解析页面(用于原生HTML解析) + protected List parsePage(String html) { + // 默认实现,子类可以选择性重写 + return new ArrayList<>(); + } +} + +// 电影数据模型 +class Movie { + private String title; + private String rating; + private String url; + + public Movie(String title, String rating, String url) { + this.title = title; + this.rating = rating; + this.url = url; + } + + @Override + public String toString() { + return "Movie{" + + "title='" + title + '\'' + + ", rating='" + rating + '\'' + + ", url='" + url + '\'' + + '}'; + } +} + +// 书籍数据模型 +class Book { + private String title; + private String author; + private String rating; + private String url; + + public Book(String title, String author, String rating, String url) { + this.title = title; + this.author = author; + this.rating = rating; + this.url = url; + } + + @Override + public String toString() { + return "Book{" + + "title='" + title + '\'' + + ", author='" + author + '\'' + + ", rating='" + rating + '\'' + + ", url='" + url + '\'' + + '}'; + } +} + +// 天气数据模型 +class Weather { + private String province; + private String city; + private String temperature; + private String weatherCondition; + private String url; + + public Weather(String province, String city, String temperature, String weatherCondition, String url) { + this.province = province; + this.city = city; + this.temperature = temperature; + this.weatherCondition = weatherCondition; + this.url = url; + } + + @Override + public String toString() { + return "Weather{" + + "province='" + province + '\'' + + ", city='" + city + '\'' + + ", temperature='" + temperature + '\'' + + ", weatherCondition='" + weatherCondition + '\'' + + ", url='" + url + '\'' + + '}'; + } +} + +// 豆瓣电影爬虫 - 爬取Top250 +class DoubanMovieSpider extends Spider { + public DoubanMovieSpider() { + super("https://movie.douban.com/top250"); + } + + @Override + public List crawl() throws IOException { + List allMovies = new ArrayList<>(); + + // 豆瓣Top250有10页,每页25部电影 + for (int start = 0; start < 250; start += 25) { + String pageUrl = baseUrl + "?start=" + start; + System.out.println("正在爬取第 " + (start / 25 + 1) + " 页..."); + Document doc = fetchPage(pageUrl); + List movies = parsePage(doc); + allMovies.addAll(movies); + + // 添加延迟,避免请求过快 + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + + return allMovies; + } + + @Override + protected List parsePage(Document doc) { + List movies = new ArrayList<>(); + + // 使用Jsoup选择器解析页面 + Elements movieItems = doc.select(".info"); + for (Element item : movieItems) { + // 提取标题 + Element titleElement = item.selectFirst(".title"); + if (titleElement != null) { + String title = titleElement.text().trim(); + + // 提取评分 + Element ratingElement = item.selectFirst(".rating_num"); + String rating = ""; + if (ratingElement != null) { + rating = ratingElement.text().trim(); + } + + // 提取链接 + Element linkElement = item.selectFirst("a"); + String url = ""; + if (linkElement != null) { + url = linkElement.attr("href"); + } + + movies.add(new Movie(title, rating, url)); + } + } + + return movies; + } +} + +// 豆瓣读书爬虫 - 爬取评分前100 +class DoubanBookSpider extends Spider { + public DoubanBookSpider() { + super("https://book.douban.com/top250"); + } + + @Override + public List crawl() throws IOException { + List allBooks = new ArrayList<>(); + + // 豆瓣读书Top250,每页25本,爬取前4页(100本) + for (int start = 0; start < 100; start += 25) { + String pageUrl = baseUrl + "?start=" + start; + System.out.println("正在爬取第 " + (start / 25 + 1) + " 页..."); + Document doc = fetchPage(pageUrl); + List books = parsePage(doc); + allBooks.addAll(books); + + // 添加延迟,避免请求过快 + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + + return allBooks; + } + + @Override + protected List parsePage(Document doc) { + List books = new ArrayList<>(); + + // 使用Jsoup选择器解析页面 + Elements bookItems = doc.select("tr.item"); + for (Element item : bookItems) { + // 提取链接 + Element linkElement = item.selectFirst(".nbg"); + if (linkElement != null) { + String url = linkElement.attr("href"); + + // 提取标题 + Element titleElement = linkElement.selectFirst("img"); + String title = ""; + if (titleElement != null) { + title = titleElement.attr("alt").trim(); + } + + // 提取作者信息 + Element authorElement = item.selectFirst(".pl"); + String author = ""; + if (authorElement != null) { + author = authorElement.text().trim(); + } + + // 提取评分 + Element ratingElement = item.selectFirst(".rating_nums"); + String rating = ""; + if (ratingElement != null) { + rating = ratingElement.text().trim(); + } + + books.add(new Book(title, author, rating, url)); + } + } + + return books; + } +} + +// 中国天气网爬虫 - 爬取各省份天气 +class WeatherSpider extends Spider { + // 中国各省份代码映射 + private static final String[] PROVINCES = { + "北京", "上海", "天津", "重庆", "河北", "山西", "辽宁", "吉林", "黑龙江", + "江苏", "浙江", "安徽", "福建", "江西", "山东", "河南", "湖北", "湖南", + "广东", "海南", "四川", "贵州", "云南", "陕西", "甘肃", "青海", "台湾", + "内蒙古", "广西", "西藏", "宁夏", "新疆", "香港", "澳门" + }; + + public WeatherSpider() { + super("http://www.weather.com.cn"); + } + + @Override + public List crawl() throws IOException { + List allWeather = new ArrayList<>(); + + System.out.println("正在爬取中国各省份天气数据..."); + + // 爬取主要城市天气(使用中国天气网的API) + String[] majorCities = { + "101010100", // 北京 + "101020100", // 上海 + "101030100", // 天津 + "101040100", // 重庆 + "101050101", // 哈尔滨 + "101060101", // 长春 + "101070101", // 沈阳 + "101080101", // 呼和浩特 + "101090101", // 石家庄 + "101100101", // 太原 + "101110101", // 西安 + "101120101", // 济南 + "101130101", // 乌鲁木齐 + "101140101", // 拉萨 + "101150101", // 西宁 + "101160101", // 兰州 + "101170101", // 银川 + "101180101", // 郑州 + "101190101", // 南京 + "101200101", // 武汉 + "101210101", // 杭州 + "101220101", // 合肥 + "101230101", // 福州 + "101240101", // 南昌 + "101250101", // 长沙 + "101260101", // 贵阳 + "101270101", // 成都 + "101280101", // 广州 + "101290101", // 昆明 + "101300101", // 南宁 + "101310101", // 海口 + "101320101", // 香港 + "101330101", // 澳门 + "101340101" // 台北 + }; + + String[] cityNames = { + "北京", "上海", "天津", "重庆", "哈尔滨", "长春", "沈阳", "呼和浩特", + "石家庄", "太原", "西安", "济南", "乌鲁木齐", "拉萨", "西宁", "兰州", + "银川", "郑州", "南京", "武汉", "杭州", "合肥", "福州", "南昌", + "长沙", "贵阳", "成都", "广州", "昆明", "南宁", "海口", "香港", "澳门", "台北" + }; + + for (int i = 0; i < majorCities.length; i++) { + String cityCode = majorCities[i]; + String cityName = cityNames[i]; + String province = getProvinceByCity(cityName); + + try { + String weatherUrl = "http://www.weather.com.cn/weather1d/" + cityCode + ".shtml"; + Document doc = fetchPage(weatherUrl); + Weather weather = parseWeatherPage(doc, province, cityName, weatherUrl); + if (weather != null) { + allWeather.add(weather); + System.out.println("已获取 " + cityName + " 的天气数据"); + } + + // 添加延迟,避免请求过快 + Thread.sleep(500); + } catch (Exception e) { + System.err.println("获取 " + cityName + " 天气数据失败: " + e.getMessage()); + } + } + + return allWeather; + } + + private String getProvinceByCity(String city) { + // 简化的省份映射 + if (city.equals("北京") || city.equals("上海") || city.equals("天津") || city.equals("重庆")) { + return city; + } else if (city.equals("哈尔滨") || city.equals("长春") || city.equals("沈阳")) { + return "东北"; + } else if (city.equals("呼和浩特") || city.equals("石家庄") || city.equals("太原")) { + return "华北"; + } else if (city.equals("西安") || city.equals("兰州") || city.equals("西宁") || city.equals("银川")) { + return "西北"; + } else if (city.equals("济南") || city.equals("郑州") || city.equals("南京")) { + return "华东"; + } else if (city.equals("武汉") || city.equals("长沙") || city.equals("南昌")) { + return "华中"; + } else if (city.equals("杭州") || city.equals("合肥") || city.equals("福州")) { + return "华东"; + } else if (city.equals("成都") || city.equals("贵阳") || city.equals("昆明")) { + return "西南"; + } else if (city.equals("广州") || city.equals("南宁") || city.equals("海口")) { + return "华南"; + } else if (city.equals("乌鲁木齐") || city.equals("拉萨")) { + return "西北"; + } else if (city.equals("香港") || city.equals("澳门") || city.equals("台北")) { + return "港澳台"; + } + return "其他"; + } + + private Weather parseWeatherPage(Document doc, String province, String city, String url) { + try { + // 提取温度 + Element tempElement = doc.selectFirst(".tem"); + String temperature = ""; + if (tempElement != null) { + temperature = tempElement.text().trim(); + } + + // 提取天气状况 + Element weatherElement = doc.selectFirst(".wea"); + String weatherCondition = ""; + if (weatherElement != null) { + weatherCondition = weatherElement.text().trim(); + } + + return new Weather(province, city, temperature, weatherCondition, url); + } catch (Exception e) { + System.err.println("解析 " + city + " 天气数据失败: " + e.getMessage()); + return null; + } + } + + @Override + protected List parsePage(Document doc) { + // 这个方法在crawl中被直接调用,不需要实现 + return new ArrayList<>(); + } +} + +// 测试类 +public class SpiderFramework { + public static void main(String[] args) { + List spiders = new ArrayList<>(); + spiders.add(new DoubanMovieSpider()); + spiders.add(new DoubanBookSpider()); + spiders.add(new WeatherSpider()); + + for (Spider spider : spiders) { + try { + System.out.println("\n爬取 " + spider.getClass().getSimpleName() + " 数据:"); + List data = spider.crawl(); + System.out.println("共爬取 " + data.size() + " 条数据"); + + // 只显示前5条数据 + int displayCount = Math.min(5, data.size()); + System.out.println("显示前 " + displayCount + " 条:"); + for (int i = 0; i < displayCount; i++) { + System.out.println(data.get(i)); + } + } catch (IOException e) { + System.err.println("爬取失败: " + e.getMessage()); + } + } + } +}