Browse Source

上传文件至 'project/project'

main
dengxitong 3 weeks ago
parent
commit
6ad244c8bd
  1. 229
      project/project/Main.java

229
project/project/Main.java

@ -0,0 +1,229 @@
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.LocalDate;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
// 实体类
class Article {
private String title;
private String content;
private String url;
private String author;
private LocalDate publishDate;
public Article(String title, String content, String url, String author, LocalDate publishDate) {
this.title = title;
this.content = content;
this.url = url;
this.author = author;
this.publishDate = publishDate;
}
public String getTitle() { return title; }
public String getContent() { return content; }
public String getUrl() { return url; }
public String getAuthor() { return author; }
public LocalDate getPublishDate() { return publishDate; }
@Override
public String toString() {
return "标题:" + title +
" | 来源:" + author +
" | 日期:" + publishDate +
" | 链接:" + url;
}
}
/**
 * Strategy interface: the contract for a site-specific crawler.
 */
interface CrawlStrategy {
/**
 * Performs one crawl.
 *
 * @return the articles produced by this crawl
 * @throws Exception if the implementation fails; the interface does not narrow the type
 */
List<Article> crawl() throws Exception;
}
/**
 * Crawl strategy for the Baidu real-time hot-search board.
 *
 * <p>Fetches the board page and turns the first {@link #MAX_ITEMS} matching entries into
 * {@link Article}s with an empty content, the fixed source label and today's date.
 */
class BaiduStrategy implements CrawlStrategy {
    /** Page that is fetched on every crawl. */
    private static final String BOARD_URL = "https://top.baidu.com/board?tab=realtime";

    /** Upper bound on the number of entries converted per crawl. */
    private static final int MAX_ITEMS = 8;

    /**
     * Downloads the board and extracts title and link of the leading entries.
     *
     * @return at most {@link #MAX_ITEMS} articles, in page order; empty if nothing matched
     * @throws Exception if the page cannot be fetched (propagated from jsoup)
     */
    @Override
    public List<Article> crawl() throws Exception {
        List<Article> list = new ArrayList<Article>();
        Document doc = Jsoup.connect(BOARD_URL)
                .userAgent("Mozilla/5.0")
                .timeout(6000)
                .get();
        Elements items = doc.select(".category-wrap_iQLoo");
        int limit = Math.min(MAX_ITEMS, items.size());
        for (int i = 0; i < limit; i++) {
            Elements titleNodes = items.get(i).select(".c-single-text-ellipsis");
            Elements anchors = items.get(i).select("a");
            String title = titleNodes.text();
            // Let jsoup resolve the link against the page URL instead of blindly prepending
            // the host: prepending turns an already-absolute href into
            // "https://top.baidu.comhttps://..." while relative hrefs still resolve correctly.
            // NOTE(review): the board appears to emit absolute hrefs today - confirm on a live page.
            String fullUrl = anchors.attr("abs:href");
            list.add(new Article(title, "", fullUrl, "百度热搜", LocalDate.now()));
        }
        return list;
    }
}
/**
 * Crawl strategy for the People's Daily Online front page.
 *
 * <p>Scans the anchors of the page in document order and keeps the first
 * {@link #MAX_ITEMS} whose text is longer than {@link #MIN_TITLE_LENGTH} characters and whose
 * href starts with {@code http}.
 */
class PeopleStrategy implements CrawlStrategy {
    /** Number of accepted articles after which scanning stops. */
    private static final int MAX_ITEMS = 5;

    /** Anchor texts of this length or shorter are not treated as headlines. */
    private static final int MIN_TITLE_LENGTH = 6;

    /**
     * Downloads the front page and collects the leading headline links.
     *
     * @return at most {@link #MAX_ITEMS} articles, in page order; empty if nothing qualified
     * @throws Exception if the page cannot be fetched (propagated from jsoup)
     */
    @Override
    public List<Article> crawl() throws Exception {
        List<Article> list = new ArrayList<Article>();
        Document doc = Jsoup.connect("https://www.people.com.cn/")
                .userAgent("Mozilla/5.0")
                .timeout(6000)
                .get();
        Elements links = doc.select("a[href]");
        // Bound the number of ACCEPTED articles, not the number of anchors inspected.
        // Limiting the scan to the first five anchors made the filter below reject most or
        // all of them, so the crawl returned few or no articles.
        for (int i = 0; i < links.size() && list.size() < MAX_ITEMS; i++) {
            String title = links.get(i).text().trim();
            String url = links.get(i).attr("href");
            if (title.length() > MIN_TITLE_LENGTH && url.startsWith("http")) {
                list.add(new Article(title, "", url, "人民网", LocalDate.now()));
            }
        }
        return list;
    }
}
/**
 * Crawl strategy for the Sina News front page.
 *
 * <p>Walks the anchors of the page in document order and keeps the first five whose text is
 * longer than eight characters and whose href starts with {@code http}.
 */
class SinaStrategy implements CrawlStrategy {
    /**
     * Downloads the front page and collects the leading headline links.
     *
     * @return at most five articles, in page order; empty if nothing qualified
     * @throws Exception if the page cannot be fetched (propagated from jsoup)
     */
    @Override
    public List<Article> crawl() throws Exception {
        Document page = Jsoup.connect("https://news.sina.com.cn/")
                .userAgent("Mozilla/5.0")
                .timeout(6000)
                .get();
        Elements anchors = page.select("a[href]");
        List<Article> accepted = new ArrayList<Article>();
        int index = 0;
        // The size of the result list doubles as the counter of accepted headlines.
        while (index < anchors.size() && accepted.size() < 5) {
            String headline = anchors.get(index).text().trim();
            String href = anchors.get(index).attr("href");
            index++;
            boolean looksLikeHeadline = headline.length() > 8;
            if (!looksLikeHeadline || !href.startsWith("http")) {
                continue;
            }
            accepted.add(new Article(headline, "", href, "新浪新闻", LocalDate.now()));
        }
        return accepted;
    }
}
/**
 * Strategy factory; written without newer language features to stay compatible with JDK 8/11.
 */
class StrategyFactory {
    /**
     * Maps a site key to a fresh strategy instance. Matching ignores case.
     *
     * @param type one of {@code baidu}, {@code people} or {@code sina}
     * @return a new strategy for the requested site
     * @throws IllegalArgumentException if {@code type} is {@code null} or not a known key
     */
    public static CrawlStrategy getStrategy(String type) {
        if ("baidu".equalsIgnoreCase(type)) {
            return new BaiduStrategy();
        }
        if ("people".equalsIgnoreCase(type)) {
            return new PeopleStrategy();
        }
        if ("sina".equalsIgnoreCase(type)) {
            return new SinaStrategy();
        }
        // Nothing matched, which includes a null key.
        throw new IllegalArgumentException("不支持的站点类型");
    }
}
/**
 * Command pattern: a unit of work the CLI can run.
 */
interface Command {
/**
 * Runs the command.
 *
 * @throws Exception if the command fails; the interface does not narrow the type
 */
void execute() throws Exception;
}
/**
 * Command that runs one crawl strategy, appends its results to a caller-supplied list and
 * prints each result to standard output.
 *
 * <p>Both collaborators are fixed at construction time, so the fields are {@code final}.
 */
class CrawlCommand implements Command {
    private final CrawlStrategy strategy;
    private final List<Article> globalData;

    /**
     * @param strategy   crawler to run when the command executes
     * @param globalData list that receives the crawled articles; it is mutated by
     *                   {@link #execute()}
     */
    public CrawlCommand(CrawlStrategy strategy, List<Article> globalData) {
        this.strategy = strategy;
        this.globalData = globalData;
    }

    /**
     * Crawls, stores all results in the shared list, then prints them one per line.
     *
     * @throws Exception whatever the strategy throws; nothing is stored or printed in that case
     */
    @Override
    public void execute() throws Exception {
        List<Article> data = strategy.crawl();
        globalData.addAll(data);
        for (Article article : data) {
            System.out.println(article);
        }
    }
}
/**
 * CLI controller with layered exception handling.
 *
 * <p>Reads commands from standard input in a loop, runs the matching crawl or save action and
 * reports failures by category (dispatch, timeout, other network I/O, anything else) without
 * leaving the loop. Crawled articles accumulate in memory for the lifetime of the controller.
 */
class CrawlController {
    /** File written by the {@code save} command, relative to the working directory. */
    private static final String OUTPUT_FILE = "news_data.json";

    /** Site keys understood by {@link StrategyFactory}, in the order used by {@code all}. */
    private static final String[] SITES = {"baidu", "people", "sina"};

    /** Every article crawled so far, in crawl order. */
    private final List<Article> allNews = new ArrayList<Article>();

    /**
     * Runs the interactive loop until the user enters {@code exit} or standard input ends.
     */
    public void runCLI() {
        Scanner scanner = new Scanner(System.in);
        System.out.println("===== 新闻爬虫CLI终端 =====");
        System.out.println("可用指令:baidu / people / sina / all / save / exit");
        while (true) {
            System.out.print("\n请输入指令:");
            // End of input (Ctrl-D, exhausted pipe): nextLine() would throw
            // NoSuchElementException outside any handler, so treat it like "exit".
            if (!scanner.hasNextLine()) {
                break;
            }
            String input = scanner.nextLine().trim();
            if ("exit".equalsIgnoreCase(input)) {
                break;
            }
            try {
                dispatch(input);
            } catch (IllegalArgumentException e) {
                // 1. Dispatch layer: illegal argument / unsupported site type.
                System.out.println("业务调度异常:" + e.getMessage());
            } catch (java.net.SocketTimeoutException e) {
                // 2. Network layer: connection or read timed out.
                System.out.println("网络层异常:网站连接超时,爬取失败");
            } catch (IOException e) {
                // 3. Network layer: any other I/O failure while fetching a page.
                // File I/O never reaches this handler because saveToFile() reports its own
                // errors, so the message is not inspected here; doing so also risked a
                // NullPointerException for exceptions without a message.
                System.out.println("网络层异常:网页数据拉取失败");
            } catch (Exception e) {
                // 4. Catch-all so one failed command never terminates the session.
                System.out.println("系统未知异常:" + e.getMessage());
            }
        }
        System.out.println("程序已退出");
        scanner.close();
    }

    /** Routes one trimmed input line to the matching action; unknown input is only reported. */
    private void dispatch(String input) throws Exception {
        for (String site : SITES) {
            if (site.equalsIgnoreCase(input)) {
                crawlSite(site);
                return;
            }
        }
        if ("all".equalsIgnoreCase(input)) {
            crawlAll();
        } else if ("save".equalsIgnoreCase(input)) {
            saveToFile();
        } else {
            System.out.println("无效指令,请重新输入");
        }
    }

    /** Crawls a single site and appends its articles to {@link #allNews}. */
    private void crawlSite(String site) throws Exception {
        new CrawlCommand(StrategyFactory.getStrategy(site), allNews).execute();
    }

    /** Crawls every site in {@link #SITES} in order; the first failure aborts the batch. */
    private void crawlAll() throws Exception {
        System.out.println("--- 开始批量爬取全部3个站点 ---");
        for (String site : SITES) {
            crawlSite(site);
        }
    }

    /**
     * Writes all collected articles to {@link #OUTPUT_FILE} as a JSON document encoded in
     * UTF-8, overwriting any previous file. Failures are reported on standard output and do
     * not propagate.
     */
    private void saveToFile() {
        // try-with-resources closes (and thereby flushes) the writer before success is
        // announced, and routes a failing close() into the error branch instead of hiding it.
        try (BufferedWriter writer =
                Files.newBufferedWriter(Paths.get(OUTPUT_FILE), StandardCharsets.UTF_8)) {
            writer.write("{\n\"newsList\":[\n");
            for (int i = 0; i < allNews.size(); i++) {
                if (i > 0) {
                    writer.write(",\n");
                }
                writer.write(toJson(allNews.get(i)));
            }
            writer.write("\n]\n}");
        } catch (IOException e) {
            System.out.println("文件保存失败:" + e.getMessage());
            return;
        }
        System.out.println("全部新闻数据已成功保存到项目根目录 news_data.json");
    }

    /** Serializes one article as a JSON object with title, source, date and url members. */
    private static String toJson(Article a) {
        return "{\"title\":\"" + escapeJson(String.valueOf(a.getTitle()))
                + "\",\"source\":\"" + escapeJson(String.valueOf(a.getAuthor()))
                + "\",\"date\":\"" + escapeJson(String.valueOf(a.getPublishDate()))
                + "\",\"url\":\"" + escapeJson(String.valueOf(a.getUrl()))
                + "\"}";
    }

    /**
     * Escapes a string for use inside a JSON string literal: quote, backslash and control
     * characters below U+0020 are replaced by their escape sequences. Crawled headlines
     * routinely contain quotes, which would otherwise produce an unparsable file.
     */
    private static String escapeJson(String raw) {
        StringBuilder out = new StringBuilder(raw.length() + 8);
        for (int i = 0; i < raw.length(); i++) {
            char c = raw.charAt(i);
            switch (c) {
                case '"':
                    out.append("\\\"");
                    break;
                case '\\':
                    out.append("\\\\");
                    break;
                case '\n':
                    out.append("\\n");
                    break;
                case '\r':
                    out.append("\\r");
                    break;
                case '\t':
                    out.append("\\t");
                    break;
                case '\b':
                    out.append("\\b");
                    break;
                case '\f':
                    out.append("\\f");
                    break;
                default:
                    if (c < 0x20) {
                        out.append(String.format("\\u%04x", (int) c));
                    } else {
                        out.append(c);
                    }
            }
        }
        return out.toString();
    }
}
/**
 * Main class; the source file is named Main.java.
 */
public class Main {
    /**
     * Program entry point: creates the controller and hands control to its CLI loop.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        CrawlController controller = new CrawlController();
        controller.runCLI();
    }
}
Loading…
Cancel
Save