From 17cb76a469a5a4ff0dce4e9832adaa4bc38f79e3 Mon Sep 17 00:00:00 2001
From: wangyandi <3512851994@qq.com>
Date: Sun, 31 May 2026 13:52:53 +0800
Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0Stategy=E5=8C=85?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../org/example/strategy/BaiduStrategy.java   | 63 ++++++++++++++++++
 .../org/example/strategy/BingStrategy.java    | 66 +++++++++++++++++++
 .../org/example/strategy/CrawlerStrategy.java | 27 ++++++++
 .../org/example/strategy/CsdnStrategy.java    | 64 ++++++++++++++++++
 4 files changed, 220 insertions(+)
 create mode 100644 project/src/main/java/org/example/strategy/BaiduStrategy.java
 create mode 100644 project/src/main/java/org/example/strategy/BingStrategy.java
 create mode 100644 project/src/main/java/org/example/strategy/CrawlerStrategy.java
 create mode 100644 project/src/main/java/org/example/strategy/CsdnStrategy.java

diff --git a/project/src/main/java/org/example/strategy/BaiduStrategy.java b/project/src/main/java/org/example/strategy/BaiduStrategy.java
new file mode 100644
index 0000000..35e36a8
--- /dev/null
+++ b/project/src/main/java/org/example/strategy/BaiduStrategy.java
@@ -0,0 +1,63 @@
+package org.example.strategy;
+
+import org.example.exception.CrawlerException;
+import org.example.model.Article;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.IOException;
+import java.net.URLEncoder;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Crawler strategy that scrapes result titles and links from a Baidu search page.
+ */
+public class BaiduStrategy implements CrawlerStrategy {
+
+    /**
+     * Searches Baidu for the keyword and collects every selected result link
+     * that has both a non-empty title and a non-empty href.
+     *
+     * @param keyword search keyword; URL-encoded before it is placed in the query string
+     * @return the collected articles in the order they were selected (possibly empty)
+     * @throws CrawlerException if the keyword is null or an I/O error occurs while fetching
+     */
+    @Override
+    public List<Article> crawl(String keyword) throws CrawlerException {
+        if (keyword == null) {
+            throw new CrawlerException("百度爬取失败: keyword is null");
+        }
+        List<Article> articles = new ArrayList<>();
+        try {
+            // URL-encode the keyword so '&', '#', '+', spaces and non-ASCII text stay in wd.
+            String url = "https://www.baidu.com/s?wd=" + URLEncoder.encode(keyword, "UTF-8");
+
+            Document document = Jsoup.connect(url)
+                    .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
+                    .timeout(10000)
+                    .get();
+            Elements elements = document.select("h3.t > a");
+
+            for (Element element : elements) {
+                String title = element.text();
+                String href = element.attr("href");
+                if (!title.isEmpty() && !href.isEmpty()) {
+                    articles.add(new Article(title, href));
+                }
+            }
+
+        } catch (IOException e) {
+            throw new CrawlerException("百度爬取失败: " + e.getMessage());
+        }
+        return articles;
+    }
+
+    /** Returns the name of this strategy, {@code "Baidu"}. */
+    @Override
+    public String getName() {
+        return "Baidu";
+    }
+}
\ No newline at end of file
diff --git a/project/src/main/java/org/example/strategy/BingStrategy.java b/project/src/main/java/org/example/strategy/BingStrategy.java
new file mode 100644
index 0000000..475a87d
--- /dev/null
+++ b/project/src/main/java/org/example/strategy/BingStrategy.java
@@ -0,0 +1,66 @@
+package org.example.strategy;
+
+import org.example.exception.CrawlerException;
+import org.example.model.Article;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.IOException;
+import java.net.URLEncoder;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Crawler strategy that scrapes result titles and links from a cn.bing.com search page.
+ */
+public class BingStrategy implements CrawlerStrategy {
+
+    /**
+     * Searches Bing for the keyword and collects every selected result link
+     * that has both a non-empty title and a non-empty href.
+     *
+     * @param keyword search keyword; URL-encoded before it is placed in the query string
+     * @return the collected articles in the order they were selected (possibly empty)
+     * @throws CrawlerException if the keyword is null or an I/O error occurs while fetching
+     */
+    @Override
+    public List<Article> crawl(String keyword) throws CrawlerException {
+        if (keyword == null) {
+            throw new CrawlerException("必应爬取失败: keyword is null");
+        }
+        List<Article> articles = new ArrayList<>();
+        try {
+            // 1. Build the Bing search URL; the keyword is URL-encoded so it stays inside q.
+            String url = "https://cn.bing.com/search?q=" + URLEncoder.encode(keyword, "UTF-8");
+
+            // 2. Send the request.
+            Document document = Jsoup.connect(url)
+                    .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
+                    .timeout(10000)
+                    .get();
+
+            // 3. Parse the content (Bing results are usually under li.b_algo h2 > a).
+            Elements elements = document.select("li.b_algo h2 > a");
+
+            for (Element element : elements) {
+                String title = element.text();
+                String href = element.attr("href");
+                if (!title.isEmpty() && !href.isEmpty()) {
+                    articles.add(new Article(title, href));
+                }
+            }
+
+        } catch (IOException e) {
+            throw new CrawlerException("必应爬取失败: " + e.getMessage());
+        }
+        return articles;
+    }
+
+    /** Returns the name of this strategy, {@code "Bing"}. */
+    @Override
+    public String getName() {
+        return "Bing";
+    }
+}
\ No newline at end of file
diff --git a/project/src/main/java/org/example/strategy/CrawlerStrategy.java b/project/src/main/java/org/example/strategy/CrawlerStrategy.java
new file mode 100644
index 0000000..0dd9be2
--- /dev/null
+++ b/project/src/main/java/org/example/strategy/CrawlerStrategy.java
@@ -0,0 +1,27 @@
+package org.example.strategy;
+
+import org.example.exception.CrawlerException;
+import org.example.model.Article;
+
+import java.util.List;
+
+/**
+ * Crawler strategy interface; each implementation scrapes one search source.
+ */
+public interface CrawlerStrategy { // an abstract class would also work, as long as the method signatures match
+
+    /**
+     * @return the name identifying this strategy
+     */
+    String getName();
+
+    /**
+     * Crawls this strategy's source for the given keyword.
+     * Every concrete strategy (Baidu, Bing, etc.) must implement this method.
+     *
+     * @param keyword the search keyword
+     * @return the list of crawled articles
+     * @throws CrawlerException if crawling fails
+     */
+    List<Article> crawl(String keyword) throws CrawlerException;
+}
\ No newline at end of file
diff --git a/project/src/main/java/org/example/strategy/CsdnStrategy.java b/project/src/main/java/org/example/strategy/CsdnStrategy.java
new file mode 100644
index 0000000..efe7c63
--- /dev/null
+++ b/project/src/main/java/org/example/strategy/CsdnStrategy.java
@@ -0,0 +1,64 @@
+package org.example.strategy;
+
+import org.example.exception.CrawlerException;
+import org.example.model.Article;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.IOException;
+import java.net.URLEncoder;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Crawler strategy that scrapes result titles and links from a CSDN search page.
+ */
+public class CsdnStrategy implements CrawlerStrategy {
+
+    /**
+     * Searches CSDN for the keyword and collects every selected result link
+     * that has both a non-empty title and a non-empty href.
+     *
+     * @param keyword search keyword; URL-encoded before it is placed in the query string
+     * @return the collected articles in the order they were selected (possibly empty)
+     * @throws CrawlerException if the keyword is null or an I/O error occurs while fetching
+     */
+    @Override
+    public List<Article> crawl(String keyword) throws CrawlerException {
+        if (keyword == null) {
+            throw new CrawlerException("CSDN 爬取失败: keyword is null");
+        }
+        List<Article> articles = new ArrayList<>();
+        try {
+            // URL-encode the keyword so an '&' or '#' in it cannot spill into the t/u parameters.
+            String url = "https://so.csdn.net/so/search?q=" + URLEncoder.encode(keyword, "UTF-8") + "&t=&u=";
+
+            Document document = Jsoup.connect(url)
+                    .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
+                    .timeout(10000)
+                    .get();
+
+            Elements elements = document.select(".search-list .list-con dl dd a");
+
+            for (Element element : elements) {
+                String title = element.text();
+                String href = element.attr("href");
+                if (!title.isEmpty() && !href.isEmpty()) {
+                    articles.add(new Article(title, href));
+                }
+            }
+
+        } catch (IOException e) {
+            throw new CrawlerException("CSDN 爬取失败: " + e.getMessage());
+        }
+        return articles;
+    }
+
+    /** Returns the name of this strategy, {@code "CSDN"}. */
+    @Override
+    public String getName() {
+        return "CSDN";
+    }
+}
\ No newline at end of file