4 changed files with 166 additions and 0 deletions
@ -0,0 +1,46 @@ |
|||
package org.example.strategy; |
|||
|
|||
import org.example.exception.CrawlerException; |
|||
import org.example.model.Article; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class BaiduStrategy implements CrawlerStrategy { |
|||
|
|||
@Override |
|||
public List<Article> crawl(String keyword) throws CrawlerException { |
|||
List<Article> articles = new ArrayList<>(); |
|||
try { |
|||
String url = "https://www.baidu.com/s?wd=" + keyword; |
|||
|
|||
Document document = Jsoup.connect(url) |
|||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") |
|||
.timeout(10000) |
|||
.get(); |
|||
Elements elements = document.select("h3.t > a"); |
|||
|
|||
for (Element element : elements) { |
|||
String title = element.text(); |
|||
String href = element.attr("href"); |
|||
if (!title.isEmpty() && !href.isEmpty()) { |
|||
articles.add(new Article(title, href)); |
|||
} |
|||
} |
|||
|
|||
} catch (IOException e) { |
|||
throw new CrawlerException("百度爬取失败: " + e.getMessage()); |
|||
} |
|||
return articles; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "Baidu"; |
|||
} |
|||
} |
|||
@ -0,0 +1,50 @@ |
|||
package org.example.strategy; |
|||
|
|||
import org.example.exception.CrawlerException; |
|||
import org.example.model.Article; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class BingStrategy implements CrawlerStrategy { |
|||
|
|||
@Override |
|||
public List<Article> crawl(String keyword) throws CrawlerException { |
|||
List<Article> articles = new ArrayList<>(); |
|||
try { |
|||
// 1. 构造必应搜索 URL
|
|||
String url = "https://cn.bing.com/search?q=" + keyword; |
|||
|
|||
// 2. 发送请求
|
|||
Document document = Jsoup.connect(url) |
|||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") |
|||
.timeout(10000) |
|||
.get(); |
|||
|
|||
// 3. 解析内容 (必应结果通常在 li.b_algo h2 > a)
|
|||
Elements elements = document.select("li.b_algo h2 > a"); |
|||
|
|||
for (Element element : elements) { |
|||
String title = element.text(); |
|||
String href = element.attr("href"); |
|||
if (!title.isEmpty() && !href.isEmpty()) { |
|||
articles.add(new Article(title, href)); |
|||
} |
|||
} |
|||
|
|||
} catch (IOException e) { |
|||
throw new CrawlerException("必应爬取失败: " + e.getMessage()); |
|||
} |
|||
return articles; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "Bing"; |
|||
} |
|||
} |
|||
@ -0,0 +1,23 @@ |
|||
package org.example.strategy; |
|||
|
|||
import org.example.exception.CrawlerException; |
|||
import org.example.model.Article; |
|||
|
|||
import java.util.List; |
|||
|
|||
/** |
|||
* 爬虫策略接口 (或抽象类) |
|||
*/ |
|||
public interface CrawlerStrategy { // 如果是 abstract class 也可以,只要方法定义一致
|
|||
|
|||
String getName(); |
|||
|
|||
/** |
|||
* 【关键】在这里添加 crawl 方法! |
|||
* 这是所有具体策略(百度、必应等)必须实现的方法。 |
|||
* @param keyword 搜索关键词 |
|||
* @return 爬取到的文章列表 |
|||
* @throws CrawlerException 爬取失败时抛出 |
|||
*/ |
|||
List<Article> crawl(String keyword) throws CrawlerException; |
|||
} |
|||
@ -0,0 +1,47 @@ |
|||
package org.example.strategy; |
|||
|
|||
import org.example.exception.CrawlerException; |
|||
import org.example.model.Article; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class CsdnStrategy implements CrawlerStrategy { |
|||
|
|||
@Override |
|||
public List<Article> crawl(String keyword) throws CrawlerException { |
|||
List<Article> articles = new ArrayList<>(); |
|||
try { |
|||
String url = "https://so.csdn.net/so/search?q=" + keyword + "&t=&u="; |
|||
|
|||
Document document = Jsoup.connect(url) |
|||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36") |
|||
.timeout(10000) |
|||
.get(); |
|||
|
|||
Elements elements = document.select(".search-list .list-con dl dd a"); |
|||
|
|||
for (Element element : elements) { |
|||
String title = element.text(); |
|||
String href = element.attr("href"); |
|||
if (!title.isEmpty() && !href.isEmpty()) { |
|||
articles.add(new Article(title, href)); |
|||
} |
|||
} |
|||
|
|||
} catch (IOException e) { |
|||
throw new CrawlerException("CSDN 爬取失败: " + e.getMessage()); |
|||
} |
|||
return articles; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "CSDN"; |
|||
} |
|||
} |
|||
Loading…
Reference in new issue