1 changed files with 422 additions and 0 deletions
@ -0,0 +1,422 @@ |
|||
import java.io.BufferedReader; |
|||
import java.io.IOException; |
|||
import java.io.InputStreamReader; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.URL; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
// 爬虫基类
|
|||
abstract class Spider { |
|||
protected String baseUrl; |
|||
|
|||
public Spider(String baseUrl) { |
|||
this.baseUrl = baseUrl; |
|||
} |
|||
|
|||
// 发送HTTP请求获取页面内容
|
|||
protected Document fetchPage(String url) throws IOException { |
|||
return Jsoup.connect(url) |
|||
.userAgent("Mozilla/5.0") |
|||
.timeout(10000) |
|||
.get(); |
|||
} |
|||
|
|||
// 抽象方法:爬取数据
|
|||
public abstract List<?> crawl() throws IOException; |
|||
|
|||
// 抽象方法:解析页面
|
|||
protected abstract List<?> parsePage(Document doc); |
|||
|
|||
// 重载方法:解析页面(用于原生HTML解析)
|
|||
protected List<?> parsePage(String html) { |
|||
// 默认实现,子类可以选择性重写
|
|||
return new ArrayList<>(); |
|||
} |
|||
} |
|||
|
|||
// Movie data model: one entry from the Douban Top 250 list.
class Movie {
    // Fields are final: instances are immutable value holders built once by the spider.
    private final String title;   // movie title as shown on the list page
    private final String rating;  // rating as scraped text, e.g. "9.7"
    private final String url;     // detail-page URL

    public Movie(String title, String rating, String url) {
        this.title = title;
        this.rating = rating;
        this.url = url;
    }

    // Accessors added so consumers can read the data instead of parsing toString().
    public String getTitle() {
        return title;
    }

    public String getRating() {
        return rating;
    }

    public String getUrl() {
        return url;
    }

    @Override
    public String toString() {
        return "Movie{" +
                "title='" + title + '\'' +
                ", rating='" + rating + '\'' +
                ", url='" + url + '\'' +
                '}';
    }
}
|||
|
|||
// Book data model: one entry from the Douban Books Top 250 list.
class Book {
    // Fields are final: instances are immutable value holders built once by the spider.
    private final String title;   // book title (taken from the cover image's alt text)
    private final String author;  // raw author/publisher line, e.g. "作者 / 出版社 / 年份"
    private final String rating;  // rating as scraped text, e.g. "9.0"
    private final String url;     // detail-page URL

    public Book(String title, String author, String rating, String url) {
        this.title = title;
        this.author = author;
        this.rating = rating;
        this.url = url;
    }

    // Accessors added so consumers can read the data instead of parsing toString().
    public String getTitle() {
        return title;
    }

    public String getAuthor() {
        return author;
    }

    public String getRating() {
        return rating;
    }

    public String getUrl() {
        return url;
    }

    @Override
    public String toString() {
        return "Book{" +
                "title='" + title + '\'' +
                ", author='" + author + '\'' +
                ", rating='" + rating + '\'' +
                ", url='" + url + '\'' +
                '}';
    }
}
|||
|
|||
// Weather data model: one city's conditions scraped from weather.com.cn.
class Weather {
    // Fields are final: instances are immutable value holders built once by the spider.
    private final String province;          // coarse region label (see WeatherSpider.getProvinceByCity)
    private final String city;              // city name
    private final String temperature;       // temperature as scraped text
    private final String weatherCondition;  // condition as scraped text, e.g. sunny/cloudy
    private final String url;               // source page URL

    public Weather(String province, String city, String temperature, String weatherCondition, String url) {
        this.province = province;
        this.city = city;
        this.temperature = temperature;
        this.weatherCondition = weatherCondition;
        this.url = url;
    }

    // Accessors added so consumers can read the data instead of parsing toString().
    public String getProvince() {
        return province;
    }

    public String getCity() {
        return city;
    }

    public String getTemperature() {
        return temperature;
    }

    public String getWeatherCondition() {
        return weatherCondition;
    }

    public String getUrl() {
        return url;
    }

    @Override
    public String toString() {
        return "Weather{" +
                "province='" + province + '\'' +
                ", city='" + city + '\'' +
                ", temperature='" + temperature + '\'' +
                ", weatherCondition='" + weatherCondition + '\'' +
                ", url='" + url + '\'' +
                '}';
    }
}
|||
|
|||
// 豆瓣电影爬虫 - 爬取Top250
|
|||
class DoubanMovieSpider extends Spider { |
|||
public DoubanMovieSpider() { |
|||
super("https://movie.douban.com/top250"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Movie> crawl() throws IOException { |
|||
List<Movie> allMovies = new ArrayList<>(); |
|||
|
|||
// 豆瓣Top250有10页,每页25部电影
|
|||
for (int start = 0; start < 250; start += 25) { |
|||
String pageUrl = baseUrl + "?start=" + start; |
|||
System.out.println("正在爬取第 " + (start / 25 + 1) + " 页..."); |
|||
Document doc = fetchPage(pageUrl); |
|||
List<Movie> movies = parsePage(doc); |
|||
allMovies.addAll(movies); |
|||
|
|||
// 添加延迟,避免请求过快
|
|||
try { |
|||
Thread.sleep(1000); |
|||
} catch (InterruptedException e) { |
|||
Thread.currentThread().interrupt(); |
|||
} |
|||
} |
|||
|
|||
return allMovies; |
|||
} |
|||
|
|||
@Override |
|||
protected List<Movie> parsePage(Document doc) { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
|
|||
// 使用Jsoup选择器解析页面
|
|||
Elements movieItems = doc.select(".info"); |
|||
for (Element item : movieItems) { |
|||
// 提取标题
|
|||
Element titleElement = item.selectFirst(".title"); |
|||
if (titleElement != null) { |
|||
String title = titleElement.text().trim(); |
|||
|
|||
// 提取评分
|
|||
Element ratingElement = item.selectFirst(".rating_num"); |
|||
String rating = ""; |
|||
if (ratingElement != null) { |
|||
rating = ratingElement.text().trim(); |
|||
} |
|||
|
|||
// 提取链接
|
|||
Element linkElement = item.selectFirst("a"); |
|||
String url = ""; |
|||
if (linkElement != null) { |
|||
url = linkElement.attr("href"); |
|||
} |
|||
|
|||
movies.add(new Movie(title, rating, url)); |
|||
} |
|||
} |
|||
|
|||
return movies; |
|||
} |
|||
} |
|||
|
|||
// 豆瓣读书爬虫 - 爬取评分前100
|
|||
class DoubanBookSpider extends Spider { |
|||
public DoubanBookSpider() { |
|||
super("https://book.douban.com/top250"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Book> crawl() throws IOException { |
|||
List<Book> allBooks = new ArrayList<>(); |
|||
|
|||
// 豆瓣读书Top250,每页25本,爬取前4页(100本)
|
|||
for (int start = 0; start < 100; start += 25) { |
|||
String pageUrl = baseUrl + "?start=" + start; |
|||
System.out.println("正在爬取第 " + (start / 25 + 1) + " 页..."); |
|||
Document doc = fetchPage(pageUrl); |
|||
List<Book> books = parsePage(doc); |
|||
allBooks.addAll(books); |
|||
|
|||
// 添加延迟,避免请求过快
|
|||
try { |
|||
Thread.sleep(1000); |
|||
} catch (InterruptedException e) { |
|||
Thread.currentThread().interrupt(); |
|||
} |
|||
} |
|||
|
|||
return allBooks; |
|||
} |
|||
|
|||
@Override |
|||
protected List<Book> parsePage(Document doc) { |
|||
List<Book> books = new ArrayList<>(); |
|||
|
|||
// 使用Jsoup选择器解析页面
|
|||
Elements bookItems = doc.select("tr.item"); |
|||
for (Element item : bookItems) { |
|||
// 提取链接
|
|||
Element linkElement = item.selectFirst(".nbg"); |
|||
if (linkElement != null) { |
|||
String url = linkElement.attr("href"); |
|||
|
|||
// 提取标题
|
|||
Element titleElement = linkElement.selectFirst("img"); |
|||
String title = ""; |
|||
if (titleElement != null) { |
|||
title = titleElement.attr("alt").trim(); |
|||
} |
|||
|
|||
// 提取作者信息
|
|||
Element authorElement = item.selectFirst(".pl"); |
|||
String author = ""; |
|||
if (authorElement != null) { |
|||
author = authorElement.text().trim(); |
|||
} |
|||
|
|||
// 提取评分
|
|||
Element ratingElement = item.selectFirst(".rating_nums"); |
|||
String rating = ""; |
|||
if (ratingElement != null) { |
|||
rating = ratingElement.text().trim(); |
|||
} |
|||
|
|||
books.add(new Book(title, author, rating, url)); |
|||
} |
|||
} |
|||
|
|||
return books; |
|||
} |
|||
} |
|||
|
|||
// 中国天气网爬虫 - 爬取各省份天气
|
|||
class WeatherSpider extends Spider { |
|||
// 中国各省份代码映射
|
|||
private static final String[] PROVINCES = { |
|||
"北京", "上海", "天津", "重庆", "河北", "山西", "辽宁", "吉林", "黑龙江", |
|||
"江苏", "浙江", "安徽", "福建", "江西", "山东", "河南", "湖北", "湖南", |
|||
"广东", "海南", "四川", "贵州", "云南", "陕西", "甘肃", "青海", "台湾", |
|||
"内蒙古", "广西", "西藏", "宁夏", "新疆", "香港", "澳门" |
|||
}; |
|||
|
|||
public WeatherSpider() { |
|||
super("http://www.weather.com.cn"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Weather> crawl() throws IOException { |
|||
List<Weather> allWeather = new ArrayList<>(); |
|||
|
|||
System.out.println("正在爬取中国各省份天气数据..."); |
|||
|
|||
// 爬取主要城市天气(使用中国天气网的API)
|
|||
String[] majorCities = { |
|||
"101010100", // 北京
|
|||
"101020100", // 上海
|
|||
"101030100", // 天津
|
|||
"101040100", // 重庆
|
|||
"101050101", // 哈尔滨
|
|||
"101060101", // 长春
|
|||
"101070101", // 沈阳
|
|||
"101080101", // 呼和浩特
|
|||
"101090101", // 石家庄
|
|||
"101100101", // 太原
|
|||
"101110101", // 西安
|
|||
"101120101", // 济南
|
|||
"101130101", // 乌鲁木齐
|
|||
"101140101", // 拉萨
|
|||
"101150101", // 西宁
|
|||
"101160101", // 兰州
|
|||
"101170101", // 银川
|
|||
"101180101", // 郑州
|
|||
"101190101", // 南京
|
|||
"101200101", // 武汉
|
|||
"101210101", // 杭州
|
|||
"101220101", // 合肥
|
|||
"101230101", // 福州
|
|||
"101240101", // 南昌
|
|||
"101250101", // 长沙
|
|||
"101260101", // 贵阳
|
|||
"101270101", // 成都
|
|||
"101280101", // 广州
|
|||
"101290101", // 昆明
|
|||
"101300101", // 南宁
|
|||
"101310101", // 海口
|
|||
"101320101", // 香港
|
|||
"101330101", // 澳门
|
|||
"101340101" // 台北
|
|||
}; |
|||
|
|||
String[] cityNames = { |
|||
"北京", "上海", "天津", "重庆", "哈尔滨", "长春", "沈阳", "呼和浩特", |
|||
"石家庄", "太原", "西安", "济南", "乌鲁木齐", "拉萨", "西宁", "兰州", |
|||
"银川", "郑州", "南京", "武汉", "杭州", "合肥", "福州", "南昌", |
|||
"长沙", "贵阳", "成都", "广州", "昆明", "南宁", "海口", "香港", "澳门", "台北" |
|||
}; |
|||
|
|||
for (int i = 0; i < majorCities.length; i++) { |
|||
String cityCode = majorCities[i]; |
|||
String cityName = cityNames[i]; |
|||
String province = getProvinceByCity(cityName); |
|||
|
|||
try { |
|||
String weatherUrl = "http://www.weather.com.cn/weather1d/" + cityCode + ".shtml"; |
|||
Document doc = fetchPage(weatherUrl); |
|||
Weather weather = parseWeatherPage(doc, province, cityName, weatherUrl); |
|||
if (weather != null) { |
|||
allWeather.add(weather); |
|||
System.out.println("已获取 " + cityName + " 的天气数据"); |
|||
} |
|||
|
|||
// 添加延迟,避免请求过快
|
|||
Thread.sleep(500); |
|||
} catch (Exception e) { |
|||
System.err.println("获取 " + cityName + " 天气数据失败: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
return allWeather; |
|||
} |
|||
|
|||
private String getProvinceByCity(String city) { |
|||
// 简化的省份映射
|
|||
if (city.equals("北京") || city.equals("上海") || city.equals("天津") || city.equals("重庆")) { |
|||
return city; |
|||
} else if (city.equals("哈尔滨") || city.equals("长春") || city.equals("沈阳")) { |
|||
return "东北"; |
|||
} else if (city.equals("呼和浩特") || city.equals("石家庄") || city.equals("太原")) { |
|||
return "华北"; |
|||
} else if (city.equals("西安") || city.equals("兰州") || city.equals("西宁") || city.equals("银川")) { |
|||
return "西北"; |
|||
} else if (city.equals("济南") || city.equals("郑州") || city.equals("南京")) { |
|||
return "华东"; |
|||
} else if (city.equals("武汉") || city.equals("长沙") || city.equals("南昌")) { |
|||
return "华中"; |
|||
} else if (city.equals("杭州") || city.equals("合肥") || city.equals("福州")) { |
|||
return "华东"; |
|||
} else if (city.equals("成都") || city.equals("贵阳") || city.equals("昆明")) { |
|||
return "西南"; |
|||
} else if (city.equals("广州") || city.equals("南宁") || city.equals("海口")) { |
|||
return "华南"; |
|||
} else if (city.equals("乌鲁木齐") || city.equals("拉萨")) { |
|||
return "西北"; |
|||
} else if (city.equals("香港") || city.equals("澳门") || city.equals("台北")) { |
|||
return "港澳台"; |
|||
} |
|||
return "其他"; |
|||
} |
|||
|
|||
private Weather parseWeatherPage(Document doc, String province, String city, String url) { |
|||
try { |
|||
// 提取温度
|
|||
Element tempElement = doc.selectFirst(".tem"); |
|||
String temperature = ""; |
|||
if (tempElement != null) { |
|||
temperature = tempElement.text().trim(); |
|||
} |
|||
|
|||
// 提取天气状况
|
|||
Element weatherElement = doc.selectFirst(".wea"); |
|||
String weatherCondition = ""; |
|||
if (weatherElement != null) { |
|||
weatherCondition = weatherElement.text().trim(); |
|||
} |
|||
|
|||
return new Weather(province, city, temperature, weatherCondition, url); |
|||
} catch (Exception e) { |
|||
System.err.println("解析 " + city + " 天气数据失败: " + e.getMessage()); |
|||
return null; |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
protected List<Weather> parsePage(Document doc) { |
|||
// 这个方法在crawl中被直接调用,不需要实现
|
|||
return new ArrayList<>(); |
|||
} |
|||
} |
|||
|
|||
// 测试类
|
|||
public class SpiderFramework { |
|||
public static void main(String[] args) { |
|||
List<Spider> spiders = new ArrayList<>(); |
|||
spiders.add(new DoubanMovieSpider()); |
|||
spiders.add(new DoubanBookSpider()); |
|||
spiders.add(new WeatherSpider()); |
|||
|
|||
for (Spider spider : spiders) { |
|||
try { |
|||
System.out.println("\n爬取 " + spider.getClass().getSimpleName() + " 数据:"); |
|||
List<?> data = spider.crawl(); |
|||
System.out.println("共爬取 " + data.size() + " 条数据"); |
|||
|
|||
// 只显示前5条数据
|
|||
int displayCount = Math.min(5, data.size()); |
|||
System.out.println("显示前 " + displayCount + " 条:"); |
|||
for (int i = 0; i < displayCount; i++) { |
|||
System.out.println(data.get(i)); |
|||
} |
|||
} catch (IOException e) { |
|||
System.err.println("爬取失败: " + e.getMessage()); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
Loading…
Reference in new issue