You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
79 lines
2.7 KiB
79 lines
2.7 KiB
package com.cctv.news.command;
|
|
|
|
import com.cctv.news.view.OutputView;
|
|
import org.jsoup.Jsoup;
|
|
import org.jsoup.nodes.Document;
|
|
import org.jsoup.nodes.Element;
|
|
import org.jsoup.select.Elements;
|
|
|
|
import java.io.IOException;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
|
|
public class CrawlCommand implements Command {
|
|
private final OutputView view;
|
|
private final List<String> articles;
|
|
|
|
public CrawlCommand(OutputView view, List<String> articles) {
|
|
this.view = view;
|
|
this.articles = articles;
|
|
}
|
|
|
|
@Override
|
|
public String getName() {
|
|
return "crawl";
|
|
}
|
|
|
|
@Override
|
|
public String getHelp() {
|
|
return "crawl - 爬取央视新闻";
|
|
}
|
|
|
|
@Override
|
|
public void execute(String[] args) {
|
|
view.showMessage("开始爬取央视新闻...");
|
|
try {
|
|
Document doc = Jsoup.connect("https://news.cctv.com/world/")
|
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
|
|
.timeout(10000)
|
|
.get();
|
|
|
|
Elements newsItems = doc.select("a.item-title");
|
|
if (newsItems.isEmpty()) {
|
|
newsItems = doc.select("h1 a, h2 a, .news-title");
|
|
}
|
|
|
|
List<String> newArticles = new ArrayList<>();
|
|
for (Element item : newsItems) {
|
|
String title = item.text();
|
|
String url = item.attr("href");
|
|
if (!title.isEmpty() && url.startsWith("http")) {
|
|
String articleInfo = title + " - " + url;
|
|
newArticles.add(articleInfo);
|
|
articles.add(articleInfo);
|
|
}
|
|
}
|
|
|
|
if (newArticles.isEmpty()) {
|
|
view.showMessage("未能获取到文章,尝试其他选择器...");
|
|
Elements allLinks = doc.select("a[href]");
|
|
for (Element link : allLinks) {
|
|
String text = link.text().trim();
|
|
String href = link.attr("href");
|
|
if (text.length() > 10 && href.contains("cctv.com")) {
|
|
String articleInfo = text + " - " + href;
|
|
newArticles.add(articleInfo);
|
|
articles.add(articleInfo);
|
|
if (newArticles.size() >= 10) break;
|
|
}
|
|
}
|
|
}
|
|
|
|
view.showMessage("成功爬取 " + newArticles.size() + " 篇文章。");
|
|
view.showMessage("使用 list 命令可以查看已抓取的文章。");
|
|
|
|
} catch (IOException e) {
|
|
view.showError("爬取失败: " + e.getMessage());
|
|
}
|
|
}
|
|
}
|
|
|