4 changed files with 166 additions and 0 deletions
@ -0,0 +1,46 @@ |
|||||
|
package org.example.strategy; |
||||
|
|
||||
|
import org.example.exception.CrawlerException; |
||||
|
import org.example.model.Article; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class BaiduStrategy implements CrawlerStrategy { |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> crawl(String keyword) throws CrawlerException { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
try { |
||||
|
String url = "https://www.baidu.com/s?wd=" + keyword; |
||||
|
|
||||
|
Document document = Jsoup.connect(url) |
||||
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") |
||||
|
.timeout(10000) |
||||
|
.get(); |
||||
|
Elements elements = document.select("h3.t > a"); |
||||
|
|
||||
|
for (Element element : elements) { |
||||
|
String title = element.text(); |
||||
|
String href = element.attr("href"); |
||||
|
if (!title.isEmpty() && !href.isEmpty()) { |
||||
|
articles.add(new Article(title, href)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
} catch (IOException e) { |
||||
|
throw new CrawlerException("百度爬取失败: " + e.getMessage()); |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "Baidu"; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,50 @@ |
|||||
|
package org.example.strategy; |
||||
|
|
||||
|
import org.example.exception.CrawlerException; |
||||
|
import org.example.model.Article; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class BingStrategy implements CrawlerStrategy { |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> crawl(String keyword) throws CrawlerException { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
try { |
||||
|
// 1. 构造必应搜索 URL
|
||||
|
String url = "https://cn.bing.com/search?q=" + keyword; |
||||
|
|
||||
|
// 2. 发送请求
|
||||
|
Document document = Jsoup.connect(url) |
||||
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") |
||||
|
.timeout(10000) |
||||
|
.get(); |
||||
|
|
||||
|
// 3. 解析内容 (必应结果通常在 li.b_algo h2 > a)
|
||||
|
Elements elements = document.select("li.b_algo h2 > a"); |
||||
|
|
||||
|
for (Element element : elements) { |
||||
|
String title = element.text(); |
||||
|
String href = element.attr("href"); |
||||
|
if (!title.isEmpty() && !href.isEmpty()) { |
||||
|
articles.add(new Article(title, href)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
} catch (IOException e) { |
||||
|
throw new CrawlerException("必应爬取失败: " + e.getMessage()); |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "Bing"; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,23 @@ |
|||||
|
package org.example.strategy; |
||||
|
|
||||
|
import org.example.exception.CrawlerException; |
||||
|
import org.example.model.Article; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
/** |
||||
|
* 爬虫策略接口 (或抽象类) |
||||
|
*/ |
||||
|
public interface CrawlerStrategy { // 如果是 abstract class 也可以,只要方法定义一致
|
||||
|
|
||||
|
String getName(); |
||||
|
|
||||
|
/** |
||||
|
* 【关键】在这里添加 crawl 方法! |
||||
|
* 这是所有具体策略(百度、必应等)必须实现的方法。 |
||||
|
* @param keyword 搜索关键词 |
||||
|
* @return 爬取到的文章列表 |
||||
|
* @throws CrawlerException 爬取失败时抛出 |
||||
|
*/ |
||||
|
List<Article> crawl(String keyword) throws CrawlerException; |
||||
|
} |
||||
@ -0,0 +1,47 @@ |
|||||
|
package org.example.strategy; |
||||
|
|
||||
|
import org.example.exception.CrawlerException; |
||||
|
import org.example.model.Article; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class CsdnStrategy implements CrawlerStrategy { |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> crawl(String keyword) throws CrawlerException { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
try { |
||||
|
String url = "https://so.csdn.net/so/search?q=" + keyword + "&t=&u="; |
||||
|
|
||||
|
Document document = Jsoup.connect(url) |
||||
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") |
||||
|
.timeout(10000) |
||||
|
.get(); |
||||
|
|
||||
|
Elements elements = document.select(".search-list .list-con dl dd a"); |
||||
|
|
||||
|
for (Element element : elements) { |
||||
|
String title = element.text(); |
||||
|
String href = element.attr("href"); |
||||
|
if (!title.isEmpty() && !href.isEmpty()) { |
||||
|
articles.add(new Article(title, href)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
} catch (IOException e) { |
||||
|
throw new CrawlerException("CSDN 爬取失败: " + e.getMessage()); |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String getName() { |
||||
|
return "CSDN"; |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue