diff --git a/继承+多态爬虫/SpiderFramework.java b/继承+多态爬虫/SpiderFramework.java deleted file mode 100644 index c9bc8ad..0000000 --- a/继承+多态爬虫/SpiderFramework.java +++ /dev/null @@ -1,422 +0,0 @@ -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.net.HttpURLConnection; -import java.net.URL; -import java.util.ArrayList; -import java.util.List; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; - -// 爬虫基类 -abstract class Spider { - protected String baseUrl; - - public Spider(String baseUrl) { - this.baseUrl = baseUrl; - } - - // 发送HTTP请求获取页面内容 - protected Document fetchPage(String url) throws IOException { - return Jsoup.connect(url) - .userAgent("Mozilla/5.0") - .timeout(10000) - .get(); - } - - // 抽象方法:爬取数据 - public abstract List crawl() throws IOException; - - // 抽象方法:解析页面 - protected abstract List parsePage(Document doc); - - // 重载方法:解析页面(用于原生HTML解析) - protected List parsePage(String html) { - // 默认实现,子类可以选择性重写 - return new ArrayList<>(); - } -} - -// 电影数据模型 -class Movie { - private String title; - private String rating; - private String url; - - public Movie(String title, String rating, String url) { - this.title = title; - this.rating = rating; - this.url = url; - } - - @Override - public String toString() { - return "Movie{" + - "title='" + title + '\'' + - ", rating='" + rating + '\'' + - ", url='" + url + '\'' + - '}'; - } -} - -// 书籍数据模型 -class Book { - private String title; - private String author; - private String rating; - private String url; - - public Book(String title, String author, String rating, String url) { - this.title = title; - this.author = author; - this.rating = rating; - this.url = url; - } - - @Override - public String toString() { - return "Book{" + - "title='" + title + '\'' + - ", author='" + author + '\'' + - ", rating='" + rating + '\'' + - ", url='" + url + '\'' + - '}'; - } -} - -// 天气数据模型 -class Weather { - private String province; - private String city; - private String temperature; - private String weatherCondition; - private String url; - - public Weather(String province, String city, String temperature, String weatherCondition, String url) { - this.province = province; - this.city = city; - this.temperature = temperature; - this.weatherCondition = weatherCondition; - this.url = url; - } - - @Override - public String toString() { - return "Weather{" + - "province='" + province + '\'' + - ", city='" + city + '\'' + - ", temperature='" + temperature + '\'' + - ", weatherCondition='" + weatherCondition + '\'' + - ", url='" + url + '\'' + - '}'; - } -} - -// 豆瓣电影爬虫 - 爬取Top250 -class DoubanMovieSpider extends Spider { - public DoubanMovieSpider() { - super("https://movie.douban.com/top250"); - } - - @Override - public List crawl() throws IOException { - List allMovies = new ArrayList<>(); - - // 豆瓣Top250有10页,每页25部电影 - for (int start = 0; start < 250; start += 25) { - String pageUrl = baseUrl + "?start=" + start; - System.out.println("正在爬取第 " + (start / 25 + 1) + " 页..."); - Document doc = fetchPage(pageUrl); - List movies = parsePage(doc); - allMovies.addAll(movies); - - // 添加延迟,避免请求过快 - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - } - } - - return allMovies; - } - - @Override - protected List parsePage(Document doc) { - List movies = new ArrayList<>(); - - // 使用Jsoup选择器解析页面 - Elements movieItems = doc.select(".info"); - for (Element item : movieItems) { - // 提取标题 - Element titleElement = item.selectFirst(".title"); - if (titleElement != null) { - String title = titleElement.text().trim(); - - // 提取评分 - Element ratingElement = item.selectFirst(".rating_num"); - String rating = ""; - if (ratingElement != null) { - rating = ratingElement.text().trim(); - } - - // 提取链接 - Element linkElement = item.selectFirst("a"); - String url = ""; - if (linkElement != null) { - url = linkElement.attr("href"); - } - - movies.add(new Movie(title, rating, url)); - } - } - - return movies; - } -} - -// 豆瓣读书爬虫 - 爬取评分前100 -class DoubanBookSpider extends Spider { - public DoubanBookSpider() { - super("https://book.douban.com/top250"); - } - - @Override - public List crawl() throws IOException { - List allBooks = new ArrayList<>(); - - // 豆瓣读书Top250,每页25本,爬取前4页(100本) - for (int start = 0; start < 100; start += 25) { - String pageUrl = baseUrl + "?start=" + start; - System.out.println("正在爬取第 " + (start / 25 + 1) + " 页..."); - Document doc = fetchPage(pageUrl); - List books = parsePage(doc); - allBooks.addAll(books); - - // 添加延迟,避免请求过快 - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - } - } - - return allBooks; - } - - @Override - protected List parsePage(Document doc) { - List books = new ArrayList<>(); - - // 使用Jsoup选择器解析页面 - Elements bookItems = doc.select("tr.item"); - for (Element item : bookItems) { - // 提取链接 - Element linkElement = item.selectFirst(".nbg"); - if (linkElement != null) { - String url = linkElement.attr("href"); - - // 提取标题 - Element titleElement = linkElement.selectFirst("img"); - String title = ""; - if (titleElement != null) { - title = titleElement.attr("alt").trim(); - } - - // 提取作者信息 - Element authorElement = item.selectFirst(".pl"); - String author = ""; - if (authorElement != null) { - author = authorElement.text().trim(); - } - - // 提取评分 - Element ratingElement = item.selectFirst(".rating_nums"); - String rating = ""; - if (ratingElement != null) { - rating = ratingElement.text().trim(); - } - - books.add(new Book(title, author, rating, url)); - } - } - - return books; - } -} - -// 中国天气网爬虫 - 爬取各省份天气 -class WeatherSpider extends Spider { - // 中国各省份代码映射 - private static final String[] PROVINCES = { - "北京", "上海", "天津", "重庆", "河北", "山西", "辽宁", "吉林", "黑龙江", - "江苏", "浙江", "安徽", "福建", "江西", "山东", "河南", "湖北", "湖南", - "广东", "海南", "四川", "贵州", "云南", "陕西", "甘肃", "青海", "台湾", - "内蒙古", "广西", "西藏", "宁夏", "新疆", "香港", "澳门" - }; - - public WeatherSpider() { - super("http://www.weather.com.cn"); - } - - @Override - public List crawl() throws IOException { - List allWeather = new ArrayList<>(); - - System.out.println("正在爬取中国各省份天气数据..."); - - // 爬取主要城市天气(使用中国天气网的API) - String[] majorCities = { - "101010100", // 北京 - "101020100", // 上海 - "101030100", // 天津 - "101040100", // 重庆 - "101050101", // 哈尔滨 - "101060101", // 长春 - "101070101", // 沈阳 - "101080101", // 呼和浩特 - "101090101", // 石家庄 - "101100101", // 太原 - "101110101", // 西安 - "101120101", // 济南 - "101130101", // 乌鲁木齐 - "101140101", // 拉萨 - "101150101", // 西宁 - "101160101", // 兰州 - "101170101", // 银川 - "101180101", // 郑州 - "101190101", // 南京 - "101200101", // 武汉 - "101210101", // 杭州 - "101220101", // 合肥 - "101230101", // 福州 - "101240101", // 南昌 - "101250101", // 长沙 - "101260101", // 贵阳 - "101270101", // 成都 - "101280101", // 广州 - "101290101", // 昆明 - "101300101", // 南宁 - "101310101", // 海口 - "101320101", // 香港 - "101330101", // 澳门 - "101340101" // 台北 - }; - - String[] cityNames = { - "北京", "上海", "天津", "重庆", "哈尔滨", "长春", "沈阳", "呼和浩特", - "石家庄", "太原", "西安", "济南", "乌鲁木齐", "拉萨", "西宁", "兰州", - "银川", "郑州", "南京", "武汉", "杭州", "合肥", "福州", "南昌", - "长沙", "贵阳", "成都", "广州", "昆明", "南宁", "海口", "香港", "澳门", "台北" - }; - - for (int i = 0; i < majorCities.length; i++) { - String cityCode = majorCities[i]; - String cityName = cityNames[i]; - String province = getProvinceByCity(cityName); - - try { - String weatherUrl = "http://www.weather.com.cn/weather1d/" + cityCode + ".shtml"; - Document doc = fetchPage(weatherUrl); - Weather weather = parseWeatherPage(doc, province, cityName, weatherUrl); - if (weather != null) { - allWeather.add(weather); - System.out.println("已获取 " + cityName + " 的天气数据"); - } - - // 添加延迟,避免请求过快 - Thread.sleep(500); - } catch (Exception e) { - System.err.println("获取 " + cityName + " 天气数据失败: " + e.getMessage()); - } - } - - return allWeather; - } - - private String getProvinceByCity(String city) { - // 简化的省份映射 - if (city.equals("北京") || city.equals("上海") || city.equals("天津") || city.equals("重庆")) { - return city; - } else if (city.equals("哈尔滨") || city.equals("长春") || city.equals("沈阳")) { - return "东北"; - } else if (city.equals("呼和浩特") || city.equals("石家庄") || city.equals("太原")) { - return "华北"; - } else if (city.equals("西安") || city.equals("兰州") || city.equals("西宁") || city.equals("银川")) { - return "西北"; - } else if (city.equals("济南") || city.equals("郑州") || city.equals("南京")) { - return "华东"; - } else if (city.equals("武汉") || city.equals("长沙") || city.equals("南昌")) { - return "华中"; - } else if (city.equals("杭州") || city.equals("合肥") || city.equals("福州")) { - return "华东"; - } else if (city.equals("成都") || city.equals("贵阳") || city.equals("昆明")) { - return "西南"; - } else if (city.equals("广州") || city.equals("南宁") || city.equals("海口")) { - return "华南"; - } else if (city.equals("乌鲁木齐") || city.equals("拉萨")) { - return "西北"; - } else if (city.equals("香港") || city.equals("澳门") || city.equals("台北")) { - return "港澳台"; - } - return "其他"; - } - - private Weather parseWeatherPage(Document doc, String province, String city, String url) { - try { - // 提取温度 - Element tempElement = doc.selectFirst(".tem"); - String temperature = ""; - if (tempElement != null) { - temperature = tempElement.text().trim(); - } - - // 提取天气状况 - Element weatherElement = doc.selectFirst(".wea"); - String weatherCondition = ""; - if (weatherElement != null) { - weatherCondition = weatherElement.text().trim(); - } - - return new Weather(province, city, temperature, weatherCondition, url); - } catch (Exception e) { - System.err.println("解析 " + city + " 天气数据失败: " + e.getMessage()); - return null; - } - } - - @Override - protected List parsePage(Document doc) { - // 这个方法在crawl中被直接调用,不需要实现 - return new ArrayList<>(); - } -} - -// 测试类 -public class SpiderFramework { - public static void main(String[] args) { - List spiders = new ArrayList<>(); - spiders.add(new DoubanMovieSpider()); - spiders.add(new DoubanBookSpider()); - spiders.add(new WeatherSpider()); - - for (Spider spider : spiders) { - try { - System.out.println("\n爬取 " + spider.getClass().getSimpleName() + " 数据:"); - List data = spider.crawl(); - System.out.println("共爬取 " + data.size() + " 条数据"); - - // 只显示前5条数据 - int displayCount = Math.min(5, data.size()); - System.out.println("显示前 " + displayCount + " 条:"); - for (int i = 0; i < displayCount; i++) { - System.out.println(data.get(i)); - } - } catch (IOException e) { - System.err.println("爬取失败: " + e.getMessage()); - } - } - } -}