1 changed files with 229 additions and 0 deletions
@ -0,0 +1,229 @@ |
|||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import java.io.FileWriter; |
||||
|
import java.io.IOException; |
||||
|
import java.time.LocalDate; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
import java.util.Scanner; |
||||
|
|
||||
|
// 实体类
|
||||
|
class Article { |
||||
|
private String title; |
||||
|
private String content; |
||||
|
private String url; |
||||
|
private String author; |
||||
|
private LocalDate publishDate; |
||||
|
|
||||
|
public Article(String title, String content, String url, String author, LocalDate publishDate) { |
||||
|
this.title = title; |
||||
|
this.content = content; |
||||
|
this.url = url; |
||||
|
this.author = author; |
||||
|
this.publishDate = publishDate; |
||||
|
} |
||||
|
|
||||
|
public String getTitle() { return title; } |
||||
|
public String getContent() { return content; } |
||||
|
public String getUrl() { return url; } |
||||
|
public String getAuthor() { return author; } |
||||
|
public LocalDate getPublishDate() { return publishDate; } |
||||
|
|
||||
|
@Override |
||||
|
public String toString() { |
||||
|
return "标题:" + title + |
||||
|
" | 来源:" + author + |
||||
|
" | 日期:" + publishDate + |
||||
|
" | 链接:" + url; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 策略接口
|
||||
|
interface CrawlStrategy { |
||||
|
List<Article> crawl() throws Exception; |
||||
|
} |
||||
|
|
||||
|
// 百度热搜
|
||||
|
class BaiduStrategy implements CrawlStrategy { |
||||
|
@Override |
||||
|
public List<Article> crawl() throws Exception { |
||||
|
List<Article> list = new ArrayList<Article>(); |
||||
|
Document doc = Jsoup.connect("https://top.baidu.com/board?tab=realtime") |
||||
|
.userAgent("Mozilla/5.0") |
||||
|
.timeout(6000).get(); |
||||
|
Elements items = doc.select(".category-wrap_iQLoo"); |
||||
|
for (int i = 0; i < 8 && i < items.size(); i++) { |
||||
|
String title = items.get(i).select(".c-single-text-ellipsis").text(); |
||||
|
String fullUrl = "https://top.baidu.com" + items.get(i).select("a").attr("href"); |
||||
|
list.add(new Article(title, "", fullUrl, "百度热搜", LocalDate.now())); |
||||
|
} |
||||
|
return list; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 人民网
|
||||
|
class PeopleStrategy implements CrawlStrategy { |
||||
|
@Override |
||||
|
public List<Article> crawl() throws Exception { |
||||
|
List<Article> list = new ArrayList<Article>(); |
||||
|
Document doc = Jsoup.connect("https://www.people.com.cn/") |
||||
|
.userAgent("Mozilla/5.0") |
||||
|
.timeout(6000).get(); |
||||
|
Elements links = doc.select("a[href]"); |
||||
|
for (int i = 0; i < 5 && i < links.size(); i++) { |
||||
|
String title = links.get(i).text().trim(); |
||||
|
String url = links.get(i).attr("href"); |
||||
|
if (title.length() > 6 && url.startsWith("http")) { |
||||
|
list.add(new Article(title, "", url, "人民网", LocalDate.now())); |
||||
|
} |
||||
|
} |
||||
|
return list; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 新浪新闻
|
||||
|
class SinaStrategy implements CrawlStrategy { |
||||
|
@Override |
||||
|
public List<Article> crawl() throws Exception { |
||||
|
List<Article> list = new ArrayList<Article>(); |
||||
|
Document doc = Jsoup.connect("https://news.sina.com.cn/") |
||||
|
.userAgent("Mozilla/5.0") |
||||
|
.timeout(6000).get(); |
||||
|
Elements links = doc.select("a[href]"); |
||||
|
int count = 0; |
||||
|
for (int i = 0; i < links.size(); i++) { |
||||
|
if(count >= 5) break; |
||||
|
String title = links.get(i).text().trim(); |
||||
|
String url = links.get(i).attr("href"); |
||||
|
if (title.length() > 8 && url.startsWith("http")) { |
||||
|
list.add(new Article(title, "", url, "新浪新闻", LocalDate.now())); |
||||
|
count++; |
||||
|
} |
||||
|
} |
||||
|
return list; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 策略工厂 兼容JDK8/11
|
||||
|
class StrategyFactory { |
||||
|
public static CrawlStrategy getStrategy(String type) { |
||||
|
if ("baidu".equalsIgnoreCase(type)) { |
||||
|
return new BaiduStrategy(); |
||||
|
} else if ("people".equalsIgnoreCase(type)) { |
||||
|
return new PeopleStrategy(); |
||||
|
} else if ("sina".equalsIgnoreCase(type)) { |
||||
|
return new SinaStrategy(); |
||||
|
} else { |
||||
|
throw new IllegalArgumentException("不支持的站点类型"); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 命令模式
|
||||
|
/**
 * Command-pattern contract: a unit of work that can be run on demand.
 */
@FunctionalInterface
interface Command {

    /**
     * Runs the command.
     *
     * @throws Exception if the underlying work fails
     */
    void execute() throws Exception;
}
||||
|
|
||||
|
class CrawlCommand implements Command { |
||||
|
private CrawlStrategy strategy; |
||||
|
private List<Article> globalData; |
||||
|
|
||||
|
public CrawlCommand(CrawlStrategy strategy, List<Article> globalData) { |
||||
|
this.strategy = strategy; |
||||
|
this.globalData = globalData; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute() throws Exception { |
||||
|
List<Article> data = strategy.crawl(); |
||||
|
globalData.addAll(data); |
||||
|
for(Article a : data){ |
||||
|
System.out.println(a); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
// CLI控制器(已加入分层异常处理)
|
||||
|
class CrawlController { |
||||
|
private List<Article> allNews = new ArrayList<Article>(); |
||||
|
public void runCLI() { |
||||
|
Scanner scanner = new Scanner(System.in); |
||||
|
System.out.println("===== 新闻爬虫CLI终端 ====="); |
||||
|
System.out.println("可用指令:baidu / people / sina / all / save / exit"); |
||||
|
while(true) { |
||||
|
System.out.print("\n请输入指令:"); |
||||
|
String input = scanner.nextLine().trim(); |
||||
|
try { |
||||
|
if("baidu".equalsIgnoreCase(input)){ |
||||
|
new CrawlCommand(StrategyFactory.getStrategy("baidu"), allNews).execute(); |
||||
|
}else if("people".equalsIgnoreCase(input)){ |
||||
|
new CrawlCommand(StrategyFactory.getStrategy("people"), allNews).execute(); |
||||
|
}else if("sina".equalsIgnoreCase(input)){ |
||||
|
new CrawlCommand(StrategyFactory.getStrategy("sina"), allNews).execute(); |
||||
|
}else if("all".equalsIgnoreCase(input)){ |
||||
|
crawlAll(); |
||||
|
}else if("save".equalsIgnoreCase(input)){ |
||||
|
saveToFile(); |
||||
|
}else if("exit".equalsIgnoreCase(input)){ |
||||
|
System.out.println("程序已退出"); |
||||
|
scanner.close(); |
||||
|
return; |
||||
|
}else{ |
||||
|
System.out.println("无效指令,请重新输入"); |
||||
|
} |
||||
|
// 分层异常捕获
|
||||
|
// 1. 业务层异常:参数、站点类型非法
|
||||
|
} catch (IllegalArgumentException e) { |
||||
|
System.out.println("业务调度异常:" + e.getMessage()); |
||||
|
// 2. 网络层异常:连接超时
|
||||
|
} catch (java.net.SocketTimeoutException e) { |
||||
|
System.out.println("网络层异常:网站连接超时,爬取失败"); |
||||
|
// 3. IO异常:区分网络请求 / 本地文件读写
|
||||
|
} catch (java.io.IOException e) { |
||||
|
if(e.getMessage().contains("news_data")){ |
||||
|
System.out.println("持久层异常:本地文件保存失败"); |
||||
|
}else{ |
||||
|
System.out.println("网络层异常:网页数据拉取失败"); |
||||
|
} |
||||
|
// 4. 全局兜底异常
|
||||
|
} catch (Exception e) { |
||||
|
System.out.println("系统未知异常:" + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
private void crawlAll() throws Exception { |
||||
|
System.out.println("--- 开始批量爬取全部3个站点 ---"); |
||||
|
new CrawlCommand(StrategyFactory.getStrategy("baidu"), allNews).execute(); |
||||
|
new CrawlCommand(StrategyFactory.getStrategy("people"), allNews).execute(); |
||||
|
new CrawlCommand(StrategyFactory.getStrategy("sina"), allNews).execute(); |
||||
|
} |
||||
|
private void saveToFile() { |
||||
|
FileWriter writer = null; |
||||
|
try { |
||||
|
writer = new FileWriter("news_data.json"); |
||||
|
writer.write("{\n\"newsList\":[\n"); |
||||
|
for (int i = 0; i < allNews.size(); i++) { |
||||
|
Article a = allNews.get(i); |
||||
|
String json = "{\"title\":\""+a.getTitle()+"\",\"source\":\""+a.getAuthor()+"\",\"date\":\""+a.getPublishDate()+"\",\"url\":\""+a.getUrl()+"\"}"; |
||||
|
writer.write(json); |
||||
|
if(i != allNews.size()-1) writer.write(",\n"); |
||||
|
} |
||||
|
writer.write("\n]\n}"); |
||||
|
System.out.println("全部新闻数据已成功保存到项目根目录 news_data.json"); |
||||
|
} catch (IOException e) { |
||||
|
System.out.println("文件保存失败:" + e.getMessage()); |
||||
|
} finally { |
||||
|
if(writer != null){ |
||||
|
try { |
||||
|
writer.close(); |
||||
|
} catch (IOException ignored) {} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
// 主类 文件名 Main.java
|
||||
|
public class Main { |
||||
|
public static void main(String[] args) { |
||||
|
new CrawlController().runCLI(); |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue