Browse Source

上传文件至 '新闻网爬虫'

main
peisishuang 2 months ago
parent
commit
51b8414b09
  1. 7
      新闻网爬虫/Command.java
  2. 79
      新闻网爬虫/CrawlCommand.java
  3. 27
      新闻网爬虫/ExitCommand.java
  4. 33
      新闻网爬虫/Main.java

7
新闻网爬虫/Command.java

@@ -0,0 +1,7 @@
package com.cctv.news.command;
/**
 * Contract for a single named command of the console application.
 *
 * <p>The implementations visible alongside this interface return a short lowercase keyword from
 * {@link #getName()} (for example {@code "crawl"} or {@code "exit"}) and a one-line
 * {@code "name - description"} string from {@link #getHelp()}.
 */
public interface Command {

    /**
     * Returns the name of this command.
     *
     * @return the command name
     */
    String getName();

    /**
     * Returns the help text describing this command.
     *
     * @return the help text
     */
    String getHelp();

    /**
     * Runs the command.
     *
     * @param args arguments for the command; how they are derived from user input is decided by
     *             the caller and is not visible here
     */
    void execute(String[] args);
}

79
新闻网爬虫/CrawlCommand.java

@@ -0,0 +1,79 @@
package com.cctv.news.command;
import com.cctv.news.view.OutputView;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Command that fetches the CCTV world-news page, extracts headline links from it and appends
 * them to the article list supplied at construction time.
 *
 * <p>Every stored entry is a single string of the form {@code "<title> - <url>"}.
 */
public class CrawlCommand implements Command {

    /** Page fetched on every crawl. */
    private static final String NEWS_URL = "https://news.cctv.com/world/";

    /** Browser-like user agent sent with the request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

    /** Connection/read timeout handed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Maximum number of entries collected by the generic-link fallback. */
    private static final int FALLBACK_LIMIT = 10;

    /** Fallback links must have link text strictly longer than this many characters. */
    private static final int FALLBACK_MIN_TEXT_LENGTH = 10;

    private final OutputView view;
    private final List<String> articles;

    /**
     * Creates the command.
     *
     * @param view     sink for progress and error messages
     * @param articles list that crawled entries are appended to; it is mutated by
     *                 {@link #execute(String[])}
     */
    public CrawlCommand(OutputView view, List<String> articles) {
        this.view = view;
        this.articles = articles;
    }

    @Override
    public String getName() {
        return "crawl";
    }

    @Override
    public String getHelp() {
        return "crawl - 爬取央视新闻";
    }

    /**
     * Downloads the news page and stores the articles found on it.
     *
     * <p>Headline selectors are tried first; if they produce nothing, a generic scan over all
     * links is used instead. Entries that are already present in the article list are not added
     * a second time, so running the command repeatedly does not duplicate the list. An
     * {@link IOException} raised while fetching the page is reported through the view rather
     * than propagated.
     *
     * @param args ignored by this command
     */
    @Override
    public void execute(String[] args) {
        view.showMessage("开始爬取央视新闻...");
        try {
            Document doc = Jsoup.connect(NEWS_URL)
                    .userAgent(USER_AGENT)
                    .timeout(TIMEOUT_MILLIS)
                    .get();

            List<String> crawled = extractHeadlines(doc);
            if (crawled.isEmpty()) {
                view.showMessage("未能获取到文章,尝试其他选择器...");
                crawled = extractFallbackLinks(doc);
            }

            // Only store entries not seen before; the reported count is still what this crawl found.
            for (String articleInfo : crawled) {
                if (!articles.contains(articleInfo)) {
                    articles.add(articleInfo);
                }
            }

            view.showMessage("成功爬取 " + crawled.size() + " 篇文章。");
            view.showMessage("使用 list 命令可以查看已抓取的文章。");
        } catch (IOException e) {
            view.showError("爬取失败: " + e.getMessage());
        }
    }

    /** Collects "title - url" entries from the headline selectors, without duplicates. */
    private static List<String> extractHeadlines(Document doc) {
        Elements newsItems = doc.select("a.item-title");
        if (newsItems.isEmpty()) {
            newsItems = doc.select("h1 a, h2 a, .news-title");
        }
        List<String> found = new ArrayList<>();
        for (Element item : newsItems) {
            String title = item.text();
            // absUrl resolves relative and protocol-relative hrefs against the page URL, so such
            // links are no longer discarded by the http check below.
            String url = item.absUrl("href");
            if (!title.isEmpty() && url.startsWith("http")) {
                addUnique(found, title + " - " + url);
            }
        }
        return found;
    }

    /** Collects up to {@link #FALLBACK_LIMIT} entries from any cctv.com link with long enough text. */
    private static List<String> extractFallbackLinks(Document doc) {
        List<String> found = new ArrayList<>();
        for (Element link : doc.select("a[href]")) {
            String text = link.text().trim();
            String href = link.absUrl("href");
            if (text.length() > FALLBACK_MIN_TEXT_LENGTH && href.contains("cctv.com")) {
                addUnique(found, text + " - " + href);
                if (found.size() >= FALLBACK_LIMIT) {
                    break;
                }
            }
        }
        return found;
    }

    /** Appends {@code entry} to {@code target} unless an equal entry is already present. */
    private static void addUnique(List<String> target, String entry) {
        if (!target.contains(entry)) {
            target.add(entry);
        }
    }
}

27
新闻网爬虫/ExitCommand.java

@@ -0,0 +1,27 @@
package com.cctv.news.command;
import com.cctv.news.view.OutputView;
/**
 * Command that prints a farewell message and then terminates the JVM.
 */
public class ExitCommand implements Command {

    private static final String NAME = "exit";
    private static final String HELP = "exit - 退出程序";
    private static final String FAREWELL = "感谢使用央视新闻爬虫,再见!";

    /** Exit status passed to {@link System#exit(int)}. */
    private static final int EXIT_STATUS = 0;

    private final OutputView view;

    /**
     * Creates the command.
     *
     * @param view sink that the farewell message is written to
     */
    public ExitCommand(OutputView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return NAME;
    }

    @Override
    public String getHelp() {
        return HELP;
    }

    /**
     * Shows the farewell message and exits the process with status 0; this method does not
     * return normally.
     *
     * @param args ignored by this command
     */
    @Override
    public void execute(String[] args) {
        view.showMessage(FAREWELL);
        System.exit(EXIT_STATUS);
    }
}

33
新闻网爬虫/Main.java

@@ -0,0 +1,33 @@
package com.cctv.news;
import com.cctv.news.command.*;
import com.cctv.news.controller.CommandController;
import com.cctv.news.view.ConsoleView;
import com.cctv.news.view.OutputView;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
/**
 * Entry point of the console application: registers the available commands and then runs a
 * read-and-dispatch loop over standard input.
 */
public class Main {

    /**
     * Builds the view, the shared article list and the command controller, then passes each
     * line read from standard input to the controller.
     *
     * <p>The loop ends when standard input is exhausted (for example on Ctrl-D or when piped
     * input runs out) instead of failing with a {@link java.util.NoSuchElementException}.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        OutputView view = new ConsoleView();
        // Shared between ListCommand and CrawlCommand: both receive this same list instance.
        List<String> articles = new ArrayList<>();

        CommandController controller = new CommandController(view);
        controller.registerCommand(new HelpCommand(view));
        controller.registerCommand(new ListCommand(view, articles));
        controller.registerCommand(new CrawlCommand(view, articles));
        controller.registerCommand(new ExitCommand(view));

        view.showWelcome();
        view.showMessage("输入 help 查看可用命令");

        try (Scanner scanner = new Scanner(System.in)) {
            while (true) {
                view.showPrompt();
                // nextLine() throws once the stream is at EOF, so check before reading.
                if (!scanner.hasNextLine()) {
                    break;
                }
                String input = scanner.nextLine();
                controller.executeCommand(input);
            }
        }
    }
}
Loading…
Cancel
Save