1 changed files with 153 additions and 0 deletions
@ -0,0 +1,153 @@ |
|||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import java.time.LocalDate; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class Article { |
||||
|
private String title; |
||||
|
private String content; |
||||
|
private String url; |
||||
|
private String author; |
||||
|
private LocalDate publishDate; |
||||
|
|
||||
|
public Article(String title, String content, String url, String author, LocalDate publishDate) { |
||||
|
this.title = title; |
||||
|
this.content = content; |
||||
|
this.url = url; |
||||
|
this.author = author; |
||||
|
this.publishDate = publishDate; |
||||
|
} |
||||
|
|
||||
|
public String getAuthor() { return author; } |
||||
|
public void setAuthor(String author) { this.author = author; } |
||||
|
|
||||
|
public LocalDate getPublishDate() { return publishDate; } |
||||
|
public void setPublishDate(LocalDate publishDate) { this.publishDate = publishDate; } |
||||
|
|
||||
|
@Override |
||||
|
public String toString() { |
||||
|
return "标题:" + title + |
||||
|
" | 作者:" + author + |
||||
|
" | 发布日期:" + publishDate + |
||||
|
" | 链接:" + url; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
interface CrawlStrategy { |
||||
|
List<Article> crawl(); |
||||
|
} |
||||
|
|
||||
|
class BaiduCrawl implements CrawlStrategy { |
||||
|
@Override |
||||
|
public List<Article> crawl() { |
||||
|
List<Article> articleList = new ArrayList<>(); |
||||
|
try { |
||||
|
Document doc = Jsoup.connect("https://top.baidu.com/board?tab=realtime") |
||||
|
.userAgent("Mozilla/5.0") |
||||
|
.timeout(5000) |
||||
|
.get(); |
||||
|
Elements items = doc.getElementsByClass("category-wrap_iQLoo horizontal_1eKyQ"); |
||||
|
int count = 0; |
||||
|
for (Element item : items) { |
||||
|
if (count >= 8) break; |
||||
|
String title = item.getElementsByClass("c-single-text-ellipsis").text(); |
||||
|
String link = item.select("a").attr("href"); |
||||
|
String fullUrl = "https://top.baidu.com" + link; |
||||
|
Article article = new Article(title, "", fullUrl, "百度热搜", LocalDate.now()); |
||||
|
articleList.add(article); |
||||
|
count++; |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
System.out.println("百度热搜爬取失败"); |
||||
|
} |
||||
|
return articleList; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
class PeopleCrawl implements CrawlStrategy { |
||||
|
@Override |
||||
|
public List<Article> crawl() { |
||||
|
List<Article> articleList = new ArrayList<>(); |
||||
|
try { |
||||
|
Document doc = Jsoup.connect("http://www.people.com.cn/GB/59476/review/20260514.html") |
||||
|
.userAgent("Mozilla/5.0") |
||||
|
.timeout(5000) |
||||
|
.get(); |
||||
|
Elements links = doc.select("a[href^=http://politics.people.com.cn]"); |
||||
|
int count = 0; |
||||
|
for (Element link : links) { |
||||
|
if (count >= 5) break; |
||||
|
String title = link.text(); |
||||
|
String url = link.attr("href"); |
||||
|
if (!title.isEmpty()) { |
||||
|
Article article = new Article(title, "", url, "人民网", LocalDate.now()); |
||||
|
articleList.add(article); |
||||
|
count++; |
||||
|
} |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
System.out.println("人民网新闻爬取失败"); |
||||
|
} |
||||
|
return articleList; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
class StrategyFactory { |
||||
|
public static CrawlStrategy getCrawlStrategy(String type) { |
||||
|
if ("baidu".equalsIgnoreCase(type)) { |
||||
|
return new BaiduCrawl(); |
||||
|
} else if ("people".equalsIgnoreCase(type)) { |
||||
|
return new PeopleCrawl(); |
||||
|
} |
||||
|
return null; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
class ArticleRepository { |
||||
|
private final List<Article> allArticles = new ArrayList<>(); |
||||
|
|
||||
|
public void saveArticles(List<Article> list) { |
||||
|
allArticles.addAll(list); |
||||
|
} |
||||
|
|
||||
|
public List<Article> getAllArticles() { |
||||
|
return allArticles; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** A unit of work that can be triggered without knowing what it does. */
interface Command {

    /** Performs the work this command stands for. */
    void execute();
}
||||
|
|
||||
|
class CrawlCommand implements Command { |
||||
|
private final CrawlStrategy strategy; |
||||
|
private final ArticleRepository repository; |
||||
|
|
||||
|
public CrawlCommand(CrawlStrategy strategy, ArticleRepository repository) { |
||||
|
this.strategy = strategy; |
||||
|
this.repository = repository; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute() { |
||||
|
List<Article> articles = strategy.crawl(); |
||||
|
repository.saveArticles(articles); |
||||
|
articles.forEach(System.out::println); |
||||
|
} |
||||
|
} |
||||
|
class CrawlMain { |
||||
|
public static void main(String[] args) { |
||||
|
ArticleRepository repository = new ArticleRepository(); |
||||
|
System.out.println("===== 百度实时热搜 ====="); |
||||
|
Command baiduCmd = new CrawlCommand(StrategyFactory.getCrawlStrategy("baidu"), repository); |
||||
|
baiduCmd.execute(); |
||||
|
System.out.println("\n===== 人民网头条 ====="); |
||||
|
Command peopleCmd = new CrawlCommand(StrategyFactory.getCrawlStrategy("people"), repository); |
||||
|
peopleCmd.execute(); |
||||
|
System.out.println("\n===== 全部新闻汇总 ====="); |
||||
|
repository.getAllArticles().forEach(System.out::println); |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue