You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

79 lines
2.7 KiB

package com.cctv.news.command;
import com.cctv.news.view.OutputView;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Command that downloads the CCTV world-news index page and records the article links found on it.
 *
 * <p>Every article is stored as a {@code "title - url"} string. Entries are appended to the list
 * supplied at construction time, so whoever owns that list sees the results. Progress and errors
 * are reported through the supplied {@link OutputView}.
 */
public class CrawlCommand implements Command {
    /** Page that is fetched on every run. */
    private static final String NEWS_URL = "https://news.cctv.com/world/";
    /** Browser-like user agent sent with the request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";
    /** Connect/read timeout passed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;
    /** In fallback mode, link text must be strictly longer than this to count as a headline. */
    private static final int FALLBACK_MIN_TEXT_LENGTH = 10;
    /** In fallback mode, stop after this many articles have been collected. */
    private static final int FALLBACK_MAX_ARTICLES = 10;

    private final OutputView view;
    private final List<String> articles;

    /**
     * @param view     sink for progress and error messages
     * @param articles list that receives one {@code "title - url"} entry per crawled article
     */
    public CrawlCommand(OutputView view, List<String> articles) {
        this.view = view;
        this.articles = articles;
    }

    @Override
    public String getName() {
        return "crawl";
    }

    @Override
    public String getHelp() {
        return "crawl - 爬取央视新闻";
    }

    /**
     * Fetches the news page, extracts headline links and appends them to the article list.
     *
     * <p>Targeted headline selectors are tried first; if they yield nothing, every link on the
     * page is scanned with a looser heuristic. An {@link IOException} raised while fetching is
     * reported through the view rather than propagated.
     *
     * @param args command arguments; not used by this command
     */
    @Override
    public void execute(String[] args) {
        view.showMessage("开始爬取央视新闻...");
        try {
            Document doc = Jsoup.connect(NEWS_URL)
                    .userAgent(USER_AGENT)
                    .timeout(TIMEOUT_MILLIS)
                    .get();

            List<String> newArticles = new ArrayList<>();
            collectHeadlines(doc, newArticles);
            if (newArticles.isEmpty()) {
                view.showMessage("未能获取到文章,尝试其他选择器...");
                collectFallbackLinks(doc, newArticles);
            }

            view.showMessage("成功爬取 " + newArticles.size() + " 篇文章。");
            view.showMessage("使用 list 命令可以查看已抓取的文章。");
        } catch (IOException e) {
            view.showError("爬取失败: " + e.getMessage());
        }
    }

    /** Collects articles from the dedicated headline selectors. */
    private void collectHeadlines(Document doc, List<String> foundThisRun) {
        Elements newsItems = doc.select("a.item-title");
        if (newsItems.isEmpty()) {
            newsItems = doc.select("h1 a, h2 a, .news-title");
        }
        for (Element item : newsItems) {
            String title = item.text();
            // absUrl resolves relative and protocol-relative hrefs against the page URL and
            // returns "" when no absolute URL can be built, so such links are kept or skipped
            // correctly instead of being silently dropped by the "http" prefix check.
            String url = item.absUrl("href");
            if (!title.isEmpty() && url.startsWith("http")) {
                addArticle(title, url, foundThisRun);
            }
        }
    }

    /** Scans every link on the page, keeping up to a fixed number that look like CCTV headlines. */
    private void collectFallbackLinks(Document doc, List<String> foundThisRun) {
        for (Element link : doc.select("a[href]")) {
            String text = link.text().trim();
            String href = link.absUrl("href");
            if (text.length() > FALLBACK_MIN_TEXT_LENGTH && href.contains("cctv.com")) {
                addArticle(text, href, foundThisRun);
                if (foundThisRun.size() >= FALLBACK_MAX_ARTICLES) {
                    break;
                }
            }
        }
    }

    /**
     * Records one article, ignoring repeats: a link seen earlier in this run is skipped, and an
     * entry already present in the shared list (e.g. from a previous run) is not appended again.
     */
    private void addArticle(String title, String url, List<String> foundThisRun) {
        String articleInfo = title + " - " + url;
        if (foundThisRun.contains(articleInfo)) {
            return;
        }
        foundThisRun.add(articleInfo);
        if (!articles.contains(articleInfo)) {
            articles.add(articleInfo);
        }
    }
}