Browse Source

上传 Strategy 包

main
wangyandi 3 weeks ago
parent
commit
17cb76a469
  1. 46
      project/src/main/java/org/example/strategy/BaiduStrategy.java
  2. 50
      project/src/main/java/org/example/strategy/BingStrategy.java
  3. 23
      project/src/main/java/org/example/strategy/CrawlerStrategy.java
  4. 47
      project/src/main/java/org/example/strategy/CsdnStrategy.java

46
project/src/main/java/org/example/strategy/BaiduStrategy.java

@ -0,0 +1,46 @@
package org.example.strategy;
import org.example.exception.CrawlerException;
import org.example.model.Article;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * {@link CrawlerStrategy} that collects result titles and links from a Baidu web search.
 */
public class BaiduStrategy implements CrawlerStrategy {

    /** Search endpoint; the keyword is sent separately as the {@code wd} query parameter. */
    private static final String SEARCH_URL = "https://www.baidu.com/s";

    /**
     * Searches Baidu for the keyword and collects every matched result link.
     *
     * @param keyword search term; must not be null or blank
     * @return one {@link Article} per matched link that has both non-empty text and a non-empty
     *         {@code href}; empty when nothing matched
     * @throws CrawlerException if the keyword is null or blank, or the request fails with an
     *         {@link IOException}
     */
    @Override
    public List<Article> crawl(String keyword) throws CrawlerException {
        if (keyword == null || keyword.trim().isEmpty()) {
            throw new CrawlerException("百度爬取失败: 关键词不能为空");
        }
        List<Article> articles = new ArrayList<>();
        try {
            // Hand the keyword to data() instead of concatenating it into the URL: jsoup
            // URL-encodes GET parameters, so '&', '#', '+', spaces and non-ASCII text in the
            // keyword can no longer corrupt the query string.
            Document document = Jsoup.connect(SEARCH_URL)
                    .data("wd", keyword)
                    .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
                    .timeout(10000)
                    .get();
            Elements elements = document.select("h3.t > a");
            for (Element element : elements) {
                String title = element.text();
                String href = element.attr("href");
                // Skip anchors that lack either visible text or a target.
                if (!title.isEmpty() && !href.isEmpty()) {
                    articles.add(new Article(title, href));
                }
            }
        } catch (IOException e) {
            throw new CrawlerException("百度爬取失败: " + e.getMessage());
        }
        return articles;
    }

    /**
     * @return the display name of this strategy, {@code "Baidu"}
     */
    @Override
    public String getName() {
        return "Baidu";
    }
}

50
project/src/main/java/org/example/strategy/BingStrategy.java

@ -0,0 +1,50 @@
package org.example.strategy;
import org.example.exception.CrawlerException;
import org.example.model.Article;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * {@link CrawlerStrategy} that collects result titles and links from a Bing (cn.bing.com) search.
 */
public class BingStrategy implements CrawlerStrategy {

    /** Search endpoint; the keyword is sent separately as the {@code q} query parameter. */
    private static final String SEARCH_URL = "https://cn.bing.com/search";

    /**
     * Searches Bing for the keyword and collects every matched result link.
     *
     * @param keyword search term; must not be null or blank
     * @return one {@link Article} per matched link that has both non-empty text and a non-empty
     *         {@code href}; empty when nothing matched
     * @throws CrawlerException if the keyword is null or blank, or the request fails with an
     *         {@link IOException}
     */
    @Override
    public List<Article> crawl(String keyword) throws CrawlerException {
        if (keyword == null || keyword.trim().isEmpty()) {
            throw new CrawlerException("必应爬取失败: 关键词不能为空");
        }
        List<Article> articles = new ArrayList<>();
        try {
            // 1. Build and send the search request. The keyword goes through data() rather than
            //    string concatenation so that jsoup URL-encodes it; otherwise '&', '#', '+',
            //    spaces or non-ASCII text in the keyword would corrupt the query string.
            Document document = Jsoup.connect(SEARCH_URL)
                    .data("q", keyword)
                    .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
                    .timeout(10000)
                    .get();
            // 2. Parse the response (Bing results are usually found under li.b_algo h2 > a).
            Elements elements = document.select("li.b_algo h2 > a");
            for (Element element : elements) {
                String title = element.text();
                String href = element.attr("href");
                // Skip anchors that lack either visible text or a target.
                if (!title.isEmpty() && !href.isEmpty()) {
                    articles.add(new Article(title, href));
                }
            }
        } catch (IOException e) {
            throw new CrawlerException("必应爬取失败: " + e.getMessage());
        }
        return articles;
    }

    /**
     * @return the display name of this strategy, {@code "Bing"}
     */
    @Override
    public String getName() {
        return "Bing";
    }
}

23
project/src/main/java/org/example/strategy/CrawlerStrategy.java

@ -0,0 +1,23 @@
package org.example.strategy;
import org.example.exception.CrawlerException;
import org.example.model.Article;
import java.util.List;
/**
 * Contract for a crawler strategy: a named source that can be searched by keyword.
 */
public interface CrawlerStrategy {

    /**
     * Runs a search for the given keyword. Every concrete strategy must implement this method.
     *
     * @param keyword the search keyword
     * @return the list of articles that were crawled
     * @throws CrawlerException if crawling fails
     */
    List<Article> crawl(String keyword) throws CrawlerException;

    /**
     * @return the name identifying this strategy
     */
    String getName();
}

47
project/src/main/java/org/example/strategy/CsdnStrategy.java

@ -0,0 +1,47 @@
package org.example.strategy;
import org.example.exception.CrawlerException;
import org.example.model.Article;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * {@link CrawlerStrategy} that collects result titles and links from a CSDN site search.
 */
public class CsdnStrategy implements CrawlerStrategy {

    /** Search endpoint; {@code q}, {@code t} and {@code u} are sent as separate query parameters. */
    private static final String SEARCH_URL = "https://so.csdn.net/so/search";

    /**
     * Searches CSDN for the keyword and collects every matched result link.
     *
     * @param keyword search term; must not be null or blank
     * @return one {@link Article} per matched link that has both non-empty text and a non-empty
     *         {@code href}; empty when nothing matched
     * @throws CrawlerException if the keyword is null or blank, or the request fails with an
     *         {@link IOException}
     */
    @Override
    public List<Article> crawl(String keyword) throws CrawlerException {
        if (keyword == null || keyword.trim().isEmpty()) {
            throw new CrawlerException("CSDN 爬取失败: 关键词不能为空");
        }
        List<Article> articles = new ArrayList<>();
        try {
            // Hand the keyword to data() instead of concatenating it into the URL: jsoup
            // URL-encodes GET parameters, so '&', '#', '+', spaces and non-ASCII text in the
            // keyword can no longer corrupt the query string. The empty "t" and "u" parameters
            // are kept so the request still carries q=...&t=&u= as before.
            Document document = Jsoup.connect(SEARCH_URL)
                    .data("q", keyword)
                    .data("t", "")
                    .data("u", "")
                    .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
                    .timeout(10000)
                    .get();
            Elements elements = document.select(".search-list .list-con dl dd a");
            for (Element element : elements) {
                String title = element.text();
                String href = element.attr("href");
                // Skip anchors that lack either visible text or a target.
                if (!title.isEmpty() && !href.isEmpty()) {
                    articles.add(new Article(title, href));
                }
            }
        } catch (IOException e) {
            throw new CrawlerException("CSDN 爬取失败: " + e.getMessage());
        }
        return articles;
    }

    /**
     * @return the display name of this strategy, {@code "CSDN"}
     */
    @Override
    public String getName() {
        return "CSDN";
    }
}
Loading…
Cancel
Save