diff --git a/project/project/Main.java b/project/project/Main.java new file mode 100644 index 0000000..59d472e --- /dev/null +++ b/project/project/Main.java @@ -0,0 +1,229 @@ +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.select.Elements; +import java.io.FileWriter; +import java.io.IOException; +import java.time.LocalDate; +import java.util.ArrayList; +import java.util.List; +import java.util.Scanner; + +// 实体类 +class Article { + private String title; + private String content; + private String url; + private String author; + private LocalDate publishDate; + + public Article(String title, String content, String url, String author, LocalDate publishDate) { + this.title = title; + this.content = content; + this.url = url; + this.author = author; + this.publishDate = publishDate; + } + + public String getTitle() { return title; } + public String getContent() { return content; } + public String getUrl() { return url; } + public String getAuthor() { return author; } + public LocalDate getPublishDate() { return publishDate; } + + @Override + public String toString() { + return "标题:" + title + + " | 来源:" + author + + " | 日期:" + publishDate + + " | 链接:" + url; + } +} + +// 策略接口 +interface CrawlStrategy { + List
crawl() throws Exception; +} + +// 百度热搜 +class BaiduStrategy implements CrawlStrategy { + @Override + public List
crawl() throws Exception { + List
list = new ArrayList
(); + Document doc = Jsoup.connect("https://top.baidu.com/board?tab=realtime") + .userAgent("Mozilla/5.0") + .timeout(6000).get(); + Elements items = doc.select(".category-wrap_iQLoo"); + for (int i = 0; i < 8 && i < items.size(); i++) { + String title = items.get(i).select(".c-single-text-ellipsis").text(); + String fullUrl = "https://top.baidu.com" + items.get(i).select("a").attr("href"); + list.add(new Article(title, "", fullUrl, "百度热搜", LocalDate.now())); + } + return list; + } +} + +// 人民网 +class PeopleStrategy implements CrawlStrategy { + @Override + public List
crawl() throws Exception { + List
list = new ArrayList
(); + Document doc = Jsoup.connect("https://www.people.com.cn/") + .userAgent("Mozilla/5.0") + .timeout(6000).get(); + Elements links = doc.select("a[href]"); + for (int i = 0; i < 5 && i < links.size(); i++) { + String title = links.get(i).text().trim(); + String url = links.get(i).attr("href"); + if (title.length() > 6 && url.startsWith("http")) { + list.add(new Article(title, "", url, "人民网", LocalDate.now())); + } + } + return list; + } +} + +// 新浪新闻 +class SinaStrategy implements CrawlStrategy { + @Override + public List
crawl() throws Exception { + List
list = new ArrayList
(); + Document doc = Jsoup.connect("https://news.sina.com.cn/") + .userAgent("Mozilla/5.0") + .timeout(6000).get(); + Elements links = doc.select("a[href]"); + int count = 0; + for (int i = 0; i < links.size(); i++) { + if(count >= 5) break; + String title = links.get(i).text().trim(); + String url = links.get(i).attr("href"); + if (title.length() > 8 && url.startsWith("http")) { + list.add(new Article(title, "", url, "新浪新闻", LocalDate.now())); + count++; + } + } + return list; + } +} + +// 策略工厂 兼容JDK8/11 +class StrategyFactory { + public static CrawlStrategy getStrategy(String type) { + if ("baidu".equalsIgnoreCase(type)) { + return new BaiduStrategy(); + } else if ("people".equalsIgnoreCase(type)) { + return new PeopleStrategy(); + } else if ("sina".equalsIgnoreCase(type)) { + return new SinaStrategy(); + } else { + throw new IllegalArgumentException("不支持的站点类型"); + } + } +} + +// 命令模式 +interface Command { + void execute() throws Exception; +} + +class CrawlCommand implements Command { + private CrawlStrategy strategy; + private List
globalData; + + public CrawlCommand(CrawlStrategy strategy, List
globalData) { + this.strategy = strategy; + this.globalData = globalData; + } + + @Override + public void execute() throws Exception { + List
data = strategy.crawl(); + globalData.addAll(data); + for(Article a : data){ + System.out.println(a); + } + } +} +// CLI控制器(已加入分层异常处理) +class CrawlController { + private List
allNews = new ArrayList
(); + public void runCLI() { + Scanner scanner = new Scanner(System.in); + System.out.println("===== 新闻爬虫CLI终端 ====="); + System.out.println("可用指令:baidu / people / sina / all / save / exit"); + while(true) { + System.out.print("\n请输入指令:"); + String input = scanner.nextLine().trim(); + try { + if("baidu".equalsIgnoreCase(input)){ + new CrawlCommand(StrategyFactory.getStrategy("baidu"), allNews).execute(); + }else if("people".equalsIgnoreCase(input)){ + new CrawlCommand(StrategyFactory.getStrategy("people"), allNews).execute(); + }else if("sina".equalsIgnoreCase(input)){ + new CrawlCommand(StrategyFactory.getStrategy("sina"), allNews).execute(); + }else if("all".equalsIgnoreCase(input)){ + crawlAll(); + }else if("save".equalsIgnoreCase(input)){ + saveToFile(); + }else if("exit".equalsIgnoreCase(input)){ + System.out.println("程序已退出"); + scanner.close(); + return; + }else{ + System.out.println("无效指令,请重新输入"); + } + // 分层异常捕获 + // 1. 业务层异常:参数、站点类型非法 + } catch (IllegalArgumentException e) { + System.out.println("业务调度异常:" + e.getMessage()); + // 2. 网络层异常:连接超时 + } catch (java.net.SocketTimeoutException e) { + System.out.println("网络层异常:网站连接超时,爬取失败"); + // 3. IO异常:区分网络请求 / 本地文件读写 + } catch (java.io.IOException e) { + if(e.getMessage().contains("news_data")){ + System.out.println("持久层异常:本地文件保存失败"); + }else{ + System.out.println("网络层异常:网页数据拉取失败"); + } + // 4. 全局兜底异常 + } catch (Exception e) { + System.out.println("系统未知异常:" + e.getMessage()); + } + } + } + private void crawlAll() throws Exception { + System.out.println("--- 开始批量爬取全部3个站点 ---"); + new CrawlCommand(StrategyFactory.getStrategy("baidu"), allNews).execute(); + new CrawlCommand(StrategyFactory.getStrategy("people"), allNews).execute(); + new CrawlCommand(StrategyFactory.getStrategy("sina"), allNews).execute(); + } + private void saveToFile() { + FileWriter writer = null; + try { + writer = new FileWriter("news_data.json"); + writer.write("{\n\"newsList\":[\n"); + for (int i = 0; i < allNews.size(); i++) { + Article a = allNews.get(i); + String json = "{\"title\":\""+a.getTitle()+"\",\"source\":\""+a.getAuthor()+"\",\"date\":\""+a.getPublishDate()+"\",\"url\":\""+a.getUrl()+"\"}"; + writer.write(json); + if(i != allNews.size()-1) writer.write(",\n"); + } + writer.write("\n]\n}"); + System.out.println("全部新闻数据已成功保存到项目根目录 news_data.json"); + } catch (IOException e) { + System.out.println("文件保存失败:" + e.getMessage()); + } finally { + if(writer != null){ + try { + writer.close(); + } catch (IOException ignored) {} + } + } + } +} +// 主类 文件名 Main.java +public class Main { + public static void main(String[] args) { + new CrawlController().runCLI(); + } +} \ No newline at end of file