1 changed files with 153 additions and 0 deletions
@ -0,0 +1,153 @@ |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import java.time.LocalDate; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class Article { |
|||
private String title; |
|||
private String content; |
|||
private String url; |
|||
private String author; |
|||
private LocalDate publishDate; |
|||
|
|||
public Article(String title, String content, String url, String author, LocalDate publishDate) { |
|||
this.title = title; |
|||
this.content = content; |
|||
this.url = url; |
|||
this.author = author; |
|||
this.publishDate = publishDate; |
|||
} |
|||
|
|||
public String getAuthor() { return author; } |
|||
public void setAuthor(String author) { this.author = author; } |
|||
|
|||
public LocalDate getPublishDate() { return publishDate; } |
|||
public void setPublishDate(LocalDate publishDate) { this.publishDate = publishDate; } |
|||
|
|||
@Override |
|||
public String toString() { |
|||
return "标题:" + title + |
|||
" | 作者:" + author + |
|||
" | 发布日期:" + publishDate + |
|||
" | 链接:" + url; |
|||
} |
|||
} |
|||
|
|||
interface CrawlStrategy { |
|||
List<Article> crawl(); |
|||
} |
|||
|
|||
class BaiduCrawl implements CrawlStrategy { |
|||
@Override |
|||
public List<Article> crawl() { |
|||
List<Article> articleList = new ArrayList<>(); |
|||
try { |
|||
Document doc = Jsoup.connect("https://top.baidu.com/board?tab=realtime") |
|||
.userAgent("Mozilla/5.0") |
|||
.timeout(5000) |
|||
.get(); |
|||
Elements items = doc.getElementsByClass("category-wrap_iQLoo horizontal_1eKyQ"); |
|||
int count = 0; |
|||
for (Element item : items) { |
|||
if (count >= 8) break; |
|||
String title = item.getElementsByClass("c-single-text-ellipsis").text(); |
|||
String link = item.select("a").attr("href"); |
|||
String fullUrl = "https://top.baidu.com" + link; |
|||
Article article = new Article(title, "", fullUrl, "百度热搜", LocalDate.now()); |
|||
articleList.add(article); |
|||
count++; |
|||
} |
|||
} catch (Exception e) { |
|||
System.out.println("百度热搜爬取失败"); |
|||
} |
|||
return articleList; |
|||
} |
|||
} |
|||
|
|||
class PeopleCrawl implements CrawlStrategy { |
|||
@Override |
|||
public List<Article> crawl() { |
|||
List<Article> articleList = new ArrayList<>(); |
|||
try { |
|||
Document doc = Jsoup.connect("http://www.people.com.cn/GB/59476/review/20260514.html") |
|||
.userAgent("Mozilla/5.0") |
|||
.timeout(5000) |
|||
.get(); |
|||
Elements links = doc.select("a[href^=http://politics.people.com.cn]"); |
|||
int count = 0; |
|||
for (Element link : links) { |
|||
if (count >= 5) break; |
|||
String title = link.text(); |
|||
String url = link.attr("href"); |
|||
if (!title.isEmpty()) { |
|||
Article article = new Article(title, "", url, "人民网", LocalDate.now()); |
|||
articleList.add(article); |
|||
count++; |
|||
} |
|||
} |
|||
} catch (Exception e) { |
|||
System.out.println("人民网新闻爬取失败"); |
|||
} |
|||
return articleList; |
|||
} |
|||
} |
|||
|
|||
class StrategyFactory { |
|||
public static CrawlStrategy getCrawlStrategy(String type) { |
|||
if ("baidu".equalsIgnoreCase(type)) { |
|||
return new BaiduCrawl(); |
|||
} else if ("people".equalsIgnoreCase(type)) { |
|||
return new PeopleCrawl(); |
|||
} |
|||
return null; |
|||
} |
|||
} |
|||
|
|||
class ArticleRepository { |
|||
private final List<Article> allArticles = new ArrayList<>(); |
|||
|
|||
public void saveArticles(List<Article> list) { |
|||
allArticles.addAll(list); |
|||
} |
|||
|
|||
public List<Article> getAllArticles() { |
|||
return allArticles; |
|||
} |
|||
} |
|||
|
|||
/**
 * A unit of work that can be run without arguments and yields no result.
 */
interface Command {

    /** Runs this command. */
    void execute();
}
|||
|
|||
class CrawlCommand implements Command { |
|||
private final CrawlStrategy strategy; |
|||
private final ArticleRepository repository; |
|||
|
|||
public CrawlCommand(CrawlStrategy strategy, ArticleRepository repository) { |
|||
this.strategy = strategy; |
|||
this.repository = repository; |
|||
} |
|||
|
|||
@Override |
|||
public void execute() { |
|||
List<Article> articles = strategy.crawl(); |
|||
repository.saveArticles(articles); |
|||
articles.forEach(System.out::println); |
|||
} |
|||
} |
|||
class CrawlMain { |
|||
public static void main(String[] args) { |
|||
ArticleRepository repository = new ArticleRepository(); |
|||
System.out.println("===== 百度实时热搜 ====="); |
|||
Command baiduCmd = new CrawlCommand(StrategyFactory.getCrawlStrategy("baidu"), repository); |
|||
baiduCmd.execute(); |
|||
System.out.println("\n===== 人民网头条 ====="); |
|||
Command peopleCmd = new CrawlCommand(StrategyFactory.getCrawlStrategy("people"), repository); |
|||
peopleCmd.execute(); |
|||
System.out.println("\n===== 全部新闻汇总 ====="); |
|||
repository.getAllArticles().forEach(System.out::println); |
|||
} |
|||
} |
|||
Loading…
Reference in new issue