1 changed files with 229 additions and 0 deletions
@ -0,0 +1,229 @@ |
|||
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.LocalDate;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
|||
|
|||
// 实体类
|
|||
class Article { |
|||
private String title; |
|||
private String content; |
|||
private String url; |
|||
private String author; |
|||
private LocalDate publishDate; |
|||
|
|||
public Article(String title, String content, String url, String author, LocalDate publishDate) { |
|||
this.title = title; |
|||
this.content = content; |
|||
this.url = url; |
|||
this.author = author; |
|||
this.publishDate = publishDate; |
|||
} |
|||
|
|||
public String getTitle() { return title; } |
|||
public String getContent() { return content; } |
|||
public String getUrl() { return url; } |
|||
public String getAuthor() { return author; } |
|||
public LocalDate getPublishDate() { return publishDate; } |
|||
|
|||
@Override |
|||
public String toString() { |
|||
return "标题:" + title + |
|||
" | 来源:" + author + |
|||
" | 日期:" + publishDate + |
|||
" | 链接:" + url; |
|||
} |
|||
} |
|||
|
|||
// 策略接口
|
|||
interface CrawlStrategy { |
|||
List<Article> crawl() throws Exception; |
|||
} |
|||
|
|||
// 百度热搜
|
|||
class BaiduStrategy implements CrawlStrategy { |
|||
@Override |
|||
public List<Article> crawl() throws Exception { |
|||
List<Article> list = new ArrayList<Article>(); |
|||
Document doc = Jsoup.connect("https://top.baidu.com/board?tab=realtime") |
|||
.userAgent("Mozilla/5.0") |
|||
.timeout(6000).get(); |
|||
Elements items = doc.select(".category-wrap_iQLoo"); |
|||
for (int i = 0; i < 8 && i < items.size(); i++) { |
|||
String title = items.get(i).select(".c-single-text-ellipsis").text(); |
|||
String fullUrl = "https://top.baidu.com" + items.get(i).select("a").attr("href"); |
|||
list.add(new Article(title, "", fullUrl, "百度热搜", LocalDate.now())); |
|||
} |
|||
return list; |
|||
} |
|||
} |
|||
|
|||
// 人民网
|
|||
class PeopleStrategy implements CrawlStrategy { |
|||
@Override |
|||
public List<Article> crawl() throws Exception { |
|||
List<Article> list = new ArrayList<Article>(); |
|||
Document doc = Jsoup.connect("https://www.people.com.cn/") |
|||
.userAgent("Mozilla/5.0") |
|||
.timeout(6000).get(); |
|||
Elements links = doc.select("a[href]"); |
|||
for (int i = 0; i < 5 && i < links.size(); i++) { |
|||
String title = links.get(i).text().trim(); |
|||
String url = links.get(i).attr("href"); |
|||
if (title.length() > 6 && url.startsWith("http")) { |
|||
list.add(new Article(title, "", url, "人民网", LocalDate.now())); |
|||
} |
|||
} |
|||
return list; |
|||
} |
|||
} |
|||
|
|||
// 新浪新闻
|
|||
class SinaStrategy implements CrawlStrategy { |
|||
@Override |
|||
public List<Article> crawl() throws Exception { |
|||
List<Article> list = new ArrayList<Article>(); |
|||
Document doc = Jsoup.connect("https://news.sina.com.cn/") |
|||
.userAgent("Mozilla/5.0") |
|||
.timeout(6000).get(); |
|||
Elements links = doc.select("a[href]"); |
|||
int count = 0; |
|||
for (int i = 0; i < links.size(); i++) { |
|||
if(count >= 5) break; |
|||
String title = links.get(i).text().trim(); |
|||
String url = links.get(i).attr("href"); |
|||
if (title.length() > 8 && url.startsWith("http")) { |
|||
list.add(new Article(title, "", url, "新浪新闻", LocalDate.now())); |
|||
count++; |
|||
} |
|||
} |
|||
return list; |
|||
} |
|||
} |
|||
|
|||
// 策略工厂 兼容JDK8/11
|
|||
class StrategyFactory { |
|||
public static CrawlStrategy getStrategy(String type) { |
|||
if ("baidu".equalsIgnoreCase(type)) { |
|||
return new BaiduStrategy(); |
|||
} else if ("people".equalsIgnoreCase(type)) { |
|||
return new PeopleStrategy(); |
|||
} else if ("sina".equalsIgnoreCase(type)) { |
|||
return new SinaStrategy(); |
|||
} else { |
|||
throw new IllegalArgumentException("不支持的站点类型"); |
|||
} |
|||
} |
|||
} |
|||
|
|||
/**
 * Command pattern: a unit of work that can be run on demand.
 */
interface Command {

    /**
     * Runs this command.
     *
     * @throws Exception if the command fails
     */
    void execute() throws Exception;
}
|||
|
|||
class CrawlCommand implements Command { |
|||
private CrawlStrategy strategy; |
|||
private List<Article> globalData; |
|||
|
|||
public CrawlCommand(CrawlStrategy strategy, List<Article> globalData) { |
|||
this.strategy = strategy; |
|||
this.globalData = globalData; |
|||
} |
|||
|
|||
@Override |
|||
public void execute() throws Exception { |
|||
List<Article> data = strategy.crawl(); |
|||
globalData.addAll(data); |
|||
for(Article a : data){ |
|||
System.out.println(a); |
|||
} |
|||
} |
|||
} |
|||
// CLI控制器(已加入分层异常处理)
|
|||
class CrawlController { |
|||
private List<Article> allNews = new ArrayList<Article>(); |
|||
public void runCLI() { |
|||
Scanner scanner = new Scanner(System.in); |
|||
System.out.println("===== 新闻爬虫CLI终端 ====="); |
|||
System.out.println("可用指令:baidu / people / sina / all / save / exit"); |
|||
while(true) { |
|||
System.out.print("\n请输入指令:"); |
|||
String input = scanner.nextLine().trim(); |
|||
try { |
|||
if("baidu".equalsIgnoreCase(input)){ |
|||
new CrawlCommand(StrategyFactory.getStrategy("baidu"), allNews).execute(); |
|||
}else if("people".equalsIgnoreCase(input)){ |
|||
new CrawlCommand(StrategyFactory.getStrategy("people"), allNews).execute(); |
|||
}else if("sina".equalsIgnoreCase(input)){ |
|||
new CrawlCommand(StrategyFactory.getStrategy("sina"), allNews).execute(); |
|||
}else if("all".equalsIgnoreCase(input)){ |
|||
crawlAll(); |
|||
}else if("save".equalsIgnoreCase(input)){ |
|||
saveToFile(); |
|||
}else if("exit".equalsIgnoreCase(input)){ |
|||
System.out.println("程序已退出"); |
|||
scanner.close(); |
|||
return; |
|||
}else{ |
|||
System.out.println("无效指令,请重新输入"); |
|||
} |
|||
// 分层异常捕获
|
|||
// 1. 业务层异常:参数、站点类型非法
|
|||
} catch (IllegalArgumentException e) { |
|||
System.out.println("业务调度异常:" + e.getMessage()); |
|||
// 2. 网络层异常:连接超时
|
|||
} catch (java.net.SocketTimeoutException e) { |
|||
System.out.println("网络层异常:网站连接超时,爬取失败"); |
|||
// 3. IO异常:区分网络请求 / 本地文件读写
|
|||
} catch (java.io.IOException e) { |
|||
if(e.getMessage().contains("news_data")){ |
|||
System.out.println("持久层异常:本地文件保存失败"); |
|||
}else{ |
|||
System.out.println("网络层异常:网页数据拉取失败"); |
|||
} |
|||
// 4. 全局兜底异常
|
|||
} catch (Exception e) { |
|||
System.out.println("系统未知异常:" + e.getMessage()); |
|||
} |
|||
} |
|||
} |
|||
private void crawlAll() throws Exception { |
|||
System.out.println("--- 开始批量爬取全部3个站点 ---"); |
|||
new CrawlCommand(StrategyFactory.getStrategy("baidu"), allNews).execute(); |
|||
new CrawlCommand(StrategyFactory.getStrategy("people"), allNews).execute(); |
|||
new CrawlCommand(StrategyFactory.getStrategy("sina"), allNews).execute(); |
|||
} |
|||
private void saveToFile() { |
|||
FileWriter writer = null; |
|||
try { |
|||
writer = new FileWriter("news_data.json"); |
|||
writer.write("{\n\"newsList\":[\n"); |
|||
for (int i = 0; i < allNews.size(); i++) { |
|||
Article a = allNews.get(i); |
|||
String json = "{\"title\":\""+a.getTitle()+"\",\"source\":\""+a.getAuthor()+"\",\"date\":\""+a.getPublishDate()+"\",\"url\":\""+a.getUrl()+"\"}"; |
|||
writer.write(json); |
|||
if(i != allNews.size()-1) writer.write(",\n"); |
|||
} |
|||
writer.write("\n]\n}"); |
|||
System.out.println("全部新闻数据已成功保存到项目根目录 news_data.json"); |
|||
} catch (IOException e) { |
|||
System.out.println("文件保存失败:" + e.getMessage()); |
|||
} finally { |
|||
if(writer != null){ |
|||
try { |
|||
writer.close(); |
|||
} catch (IOException ignored) {} |
|||
} |
|||
} |
|||
} |
|||
} |
|||
// 主类 文件名 Main.java
|
|||
public class Main { |
|||
public static void main(String[] args) { |
|||
new CrawlController().runCLI(); |
|||
} |
|||
} |
|||
Loading…
Reference in new issue