上传文件至 'project/project'

3 weeks ago · 6ad244c8bd
1 changed files with 229 additions and 0 deletions
--- a/project/project/Main.java
+++ b/project/project/Main.java
@@ -0,0 +1,229 @@
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import java.io.BufferedWriter;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.time.LocalDate;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Scanner;
+
+/**
+ * Value object for a single crawled news entry.
+ *
+ * <p>Every field is assigned once by the constructor and exposed through a
+ * read-only accessor.
+ */
+class Article {
+    private final String title;
+    private final String content;
+    private final String url;
+    private final String author;
+    private final LocalDate publishDate;
+
+    /**
+     * Creates an article from its five attributes.
+     *
+     * @param title       headline text
+     * @param content     body text
+     * @param url         link to the item
+     * @param author      label printed as the source by {@link #toString()}
+     * @param publishDate date associated with the item
+     */
+    public Article(String title, String content, String url, String author, LocalDate publishDate) {
+        this.title = title;
+        this.content = content;
+        this.url = url;
+        this.author = author;
+        this.publishDate = publishDate;
+    }
+
+    /** @return the headline text */
+    public String getTitle() {
+        return title;
+    }
+
+    /** @return the body text */
+    public String getContent() {
+        return content;
+    }
+
+    /** @return the link to the item */
+    public String getUrl() {
+        return url;
+    }
+
+    /** @return the source label */
+    public String getAuthor() {
+        return author;
+    }
+
+    /** @return the date associated with the item */
+    public LocalDate getPublishDate() {
+        return publishDate;
+    }
+
+    /**
+     * Renders title, source, date and link on one line separated by {@code " | "}.
+     * The content field is not part of the output.
+     */
+    @Override
+    public String toString() {
+        StringBuilder line = new StringBuilder();
+        line.append("标题：").append(title);
+        line.append(" | 来源：").append(author);
+        line.append(" | 日期：").append(publishDate);
+        line.append(" | 链接：").append(url);
+        return line.toString();
+    }
+}
+
+/**
+ * Strategy contract for crawling: each implementation knows how to collect
+ * articles from one source.
+ */
+@FunctionalInterface
+interface CrawlStrategy {
+    /**
+     * Runs one crawl and converts what it finds into articles.
+     *
+     * @return the articles collected by this run
+     * @throws Exception if the crawl fails
+     */
+    List<Article> crawl() throws Exception;
+}
+
+/**
+ * Crawls the Baidu real-time hot-search board.
+ */
+class BaiduStrategy implements CrawlStrategy {
+    private static final String BOARD_URL = "https://top.baidu.com/board?tab=realtime";
+    private static final int MAX_ITEMS = 8;
+
+    /**
+     * Reads the first {@code MAX_ITEMS} board entries (fewer if the page has fewer).
+     *
+     * <p>Each entry's link is resolved against the page URL, so relative links
+     * become absolute and links that are already absolute are kept unchanged
+     * instead of having the board host glued in front of them. An entry with no
+     * link yields an empty URL.
+     *
+     * @return the collected articles, labelled with the Baidu source name and today's date
+     * @throws Exception if the board page cannot be fetched
+     */
+    @Override
+    public List<Article> crawl() throws Exception {
+        List<Article> list = new ArrayList<Article>();
+        Document doc = Jsoup.connect(BOARD_URL)
+                .userAgent("Mozilla/5.0")
+                .timeout(6000).get();
+        Elements items = doc.select(".category-wrap_iQLoo");
+        int limit = Math.min(MAX_ITEMS, items.size());
+        for (int i = 0; i < limit; i++) {
+            Element item = items.get(i);
+            String title = item.select(".c-single-text-ellipsis").text();
+            // The "abs:" prefix asks jsoup for the absolute form of the attribute,
+            // resolved against the document's base URI.
+            String fullUrl = item.select("a").attr("abs:href");
+            list.add(new Article(title, "", fullUrl, "百度热搜", LocalDate.now()));
+        }
+        return list;
+    }
+}
+
+/**
+ * Crawls the People's Daily Online home page.
+ */
+class PeopleStrategy implements CrawlStrategy {
+    private static final int MAX_ARTICLES = 5;
+
+    /**
+     * Scans the page's anchors in document order and keeps the first
+     * {@code MAX_ARTICLES} whose text is longer than six characters and whose
+     * {@code href} starts with {@code http}.
+     *
+     * <p>The limit applies to accepted articles, not to anchors inspected, so
+     * rejected anchors at the top of the page do not use up the quota.
+     *
+     * @return the collected articles, labelled with the People's Daily source name and today's date
+     * @throws Exception if the page cannot be fetched
+     */
+    @Override
+    public List<Article> crawl() throws Exception {
+        List<Article> list = new ArrayList<Article>();
+        Document doc = Jsoup.connect("https://www.people.com.cn/")
+                .userAgent("Mozilla/5.0")
+                .timeout(6000).get();
+        Elements links = doc.select("a[href]");
+        for (Element link : links) {
+            if (list.size() >= MAX_ARTICLES) {
+                break;
+            }
+            String title = link.text().trim();
+            String url = link.attr("href");
+            if (title.length() > 6 && url.startsWith("http")) {
+                list.add(new Article(title, "", url, "人民网", LocalDate.now()));
+            }
+        }
+        return list;
+    }
+}
+
+/**
+ * Crawls the Sina news home page.
+ */
+class SinaStrategy implements CrawlStrategy {
+    /**
+     * Scans the page's anchors in document order and keeps the first five whose
+     * text is longer than eight characters and whose {@code href} starts with
+     * {@code http}.
+     *
+     * @return at most five articles, labelled with the Sina source name and today's date
+     * @throws Exception if the page cannot be fetched
+     */
+    @Override
+    public List<Article> crawl() throws Exception {
+        Document page = Jsoup.connect("https://news.sina.com.cn/")
+                .userAgent("Mozilla/5.0")
+                .timeout(6000)
+                .get();
+        Elements anchors = page.select("a[href]");
+        List<Article> collected = new ArrayList<Article>();
+        for (int idx = 0; idx < anchors.size() && collected.size() < 5; idx++) {
+            String text = anchors.get(idx).text().trim();
+            String href = anchors.get(idx).attr("href");
+            boolean tooShort = text.length() <= 8;
+            if (tooShort || !href.startsWith("http")) {
+                continue;
+            }
+            collected.add(new Article(text, "", href, "新浪新闻", LocalDate.now()));
+        }
+        return collected;
+    }
+}
+
+/**
+ * Maps a site key to its crawl strategy. Written without newer language
+ * features so it builds on JDK 8 and JDK 11.
+ */
+class StrategyFactory {
+    /**
+     * Returns a new strategy for the given site key, compared case-insensitively.
+     *
+     * @param type one of {@code baidu}, {@code people} or {@code sina}
+     * @return a fresh strategy instance for that site
+     * @throws IllegalArgumentException if the key is {@code null} or not one of the three above
+     */
+    public static CrawlStrategy getStrategy(String type) {
+        if ("baidu".equalsIgnoreCase(type)) {
+            return new BaiduStrategy();
+        }
+        if ("people".equalsIgnoreCase(type)) {
+            return new PeopleStrategy();
+        }
+        if ("sina".equalsIgnoreCase(type)) {
+            return new SinaStrategy();
+        }
+        throw new IllegalArgumentException("不支持的站点类型");
+    }
+}
+
+/**
+ * Command-pattern contract: a unit of work that can be executed.
+ */
+@FunctionalInterface
+interface Command {
+    /**
+     * Performs the command's work.
+     *
+     * @throws Exception if the work fails
+     */
+    void execute() throws Exception;
+}
+
+/**
+ * Command that runs one crawl strategy, appends the results to a shared list
+ * and prints each result to standard output.
+ */
+class CrawlCommand implements Command {
+    private final CrawlStrategy strategy;
+    private final List<Article> globalData;
+
+    /**
+     * @param strategy   the crawl to run
+     * @param globalData list that receives every article the crawl returns
+     */
+    public CrawlCommand(CrawlStrategy strategy, List<Article> globalData) {
+        this.strategy = strategy;
+        this.globalData = globalData;
+    }
+
+    /**
+     * Runs the strategy, stores its articles in the shared list, then prints
+     * them one per line.
+     *
+     * @throws Exception whatever the strategy's crawl throws
+     */
+    @Override
+    public void execute() throws Exception {
+        List<Article> fetched = strategy.crawl();
+        globalData.addAll(fetched);
+        for (int pos = 0; pos < fetched.size(); pos++) {
+            System.out.println(fetched.get(pos));
+        }
+    }
+}
+/**
+ * Interactive command-line front end for the crawler.
+ *
+ * <p>Reads one command per line from standard input: a site key crawls that
+ * site, {@code all} crawls every site, {@code save} writes everything collected
+ * so far to a JSON file, and {@code exit} ends the session. Failures are
+ * reported by layer (dispatch, network, persistence, unknown) and never end
+ * the session.
+ */
+class CrawlController {
+    /** Site keys accepted on the command line, in the order used by {@code all}. */
+    private static final String[] SITES = {"baidu", "people", "sina"};
+    private static final String OUTPUT_FILE = "news_data.json";
+
+    private final List<Article> allNews = new ArrayList<Article>();
+
+    /**
+     * Runs the read-dispatch loop until {@code exit} is entered or standard
+     * input reaches end of file. The scanner (and with it standard input) is
+     * closed on the way out.
+     */
+    public void runCLI() {
+        Scanner scanner = new Scanner(System.in);
+        try {
+            System.out.println("===== 新闻爬虫CLI终端 =====");
+            System.out.println("可用指令：baidu / people / sina / all / save / exit");
+            while (true) {
+                System.out.print("\n请输入指令：");
+                // End of input (closed pipe, Ctrl-D) is treated like "exit";
+                // calling nextLine() here would throw NoSuchElementException.
+                if (!scanner.hasNextLine()) {
+                    System.out.println("程序已退出");
+                    return;
+                }
+                String input = scanner.nextLine().trim();
+                if ("exit".equalsIgnoreCase(input)) {
+                    System.out.println("程序已退出");
+                    return;
+                }
+                try {
+                    dispatch(input);
+                } catch (Exception e) {
+                    reportFailure(e);
+                }
+            }
+        } finally {
+            scanner.close();
+        }
+    }
+
+    /** Routes one trimmed input line to the matching action. */
+    private void dispatch(String input) throws Exception {
+        if ("all".equalsIgnoreCase(input)) {
+            crawlAll();
+            return;
+        }
+        if ("save".equalsIgnoreCase(input)) {
+            saveToFile();
+            return;
+        }
+        for (String site : SITES) {
+            if (site.equalsIgnoreCase(input)) {
+                crawlSite(site);
+                return;
+            }
+        }
+        System.out.println("无效指令，请重新输入");
+    }
+
+    /** Crawls a single site and appends its articles to the collected list. */
+    private void crawlSite(String site) throws Exception {
+        new CrawlCommand(StrategyFactory.getStrategy(site), allNews).execute();
+    }
+
+    /**
+     * Crawls every known site in order. A failure on one site is reported,
+     * prefixed with the site key, and the remaining sites are still crawled.
+     */
+    private void crawlAll() {
+        System.out.println("--- 开始批量爬取全部3个站点 ---");
+        for (String site : SITES) {
+            try {
+                crawlSite(site);
+            } catch (Exception e) {
+                System.out.print("[" + site + "] ");
+                reportFailure(e);
+            }
+        }
+    }
+
+    /**
+     * Prints a layer-specific message for a failure raised while handling a command.
+     * The timeout check comes before the general I/O check because
+     * {@code SocketTimeoutException} is itself an {@code IOException}.
+     */
+    private void reportFailure(Exception e) {
+        if (e instanceof IllegalArgumentException) {
+            System.out.println("业务调度异常：" + e.getMessage());
+        } else if (e instanceof java.net.SocketTimeoutException) {
+            System.out.println("网络层异常：网站连接超时，爬取失败");
+        } else if (e instanceof IOException) {
+            // File errors are handled inside saveToFile, so an IOException that
+            // reaches this point comes from fetching a page.
+            System.out.println("网络层异常：网页数据拉取失败");
+        } else {
+            System.out.println("系统未知异常：" + e.getMessage());
+        }
+    }
+
+    /**
+     * Writes every collected article to {@code news_data.json} as UTF-8 JSON.
+     * Success is announced only after the writer has been flushed and closed;
+     * any I/O failure is reported as a persistence-layer error.
+     */
+    private void saveToFile() {
+        try {
+            try (Writer writer = new BufferedWriter(new OutputStreamWriter(
+                    new FileOutputStream(OUTPUT_FILE), StandardCharsets.UTF_8))) {
+                writeJson(writer);
+            }
+            System.out.println("全部新闻数据已成功保存到项目根目录 news_data.json");
+        } catch (IOException e) {
+            System.out.println("持久层异常：本地文件保存失败：" + e.getMessage());
+        }
+    }
+
+    /** Writes the {@code newsList} document, one article object per line. */
+    private void writeJson(Writer writer) throws IOException {
+        writer.write("{\n\"newsList\":[\n");
+        for (int i = 0; i < allNews.size(); i++) {
+            if (i > 0) {
+                writer.write(",\n");
+            }
+            writer.write(toJson(allNews.get(i)));
+        }
+        writer.write("\n]\n}");
+    }
+
+    /** Serializes one article as a JSON object with title, source, date and url. */
+    private static String toJson(Article a) {
+        return "{\"title\":\"" + escapeJson(a.getTitle())
+                + "\",\"source\":\"" + escapeJson(a.getAuthor())
+                + "\",\"date\":\"" + escapeJson(a.getPublishDate())
+                + "\",\"url\":\"" + escapeJson(a.getUrl()) + "\"}";
+    }
+
+    /**
+     * Returns the value's string form with the characters escaped that may not
+     * appear raw inside a JSON string: the double quote, the backslash and
+     * control characters below U+0020. A {@code null} value is rendered as the
+     * text {@code null}.
+     */
+    private static String escapeJson(Object value) {
+        String text = String.valueOf(value);
+        StringBuilder out = new StringBuilder(text.length() + 16);
+        for (int i = 0; i < text.length(); i++) {
+            char c = text.charAt(i);
+            switch (c) {
+                case '"':
+                    out.append("\\\"");
+                    break;
+                case '\\':
+                    out.append("\\\\");
+                    break;
+                case '\n':
+                    out.append("\\n");
+                    break;
+                case '\r':
+                    out.append("\\r");
+                    break;
+                case '\t':
+                    out.append("\\t");
+                    break;
+                case '\b':
+                    out.append("\\b");
+                    break;
+                case '\f':
+                    out.append("\\f");
+                    break;
+                default:
+                    if (c < 0x20) {
+                        out.append(String.format("\\u%04x", (int) c));
+                    } else {
+                        out.append(c);
+                    }
+            }
+        }
+        return out.toString();
+    }
+}
+/**
+ * Program entry point; lives in {@code Main.java}.
+ */
+public class Main {
+    /**
+     * Starts the interactive crawler console.
+     *
+     * @param args command-line arguments (not used)
+     */
+    public static void main(String[] args) {
+        CrawlController controller = new CrawlController();
+        controller.runCLI();
+    }
+}