You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
87 lines
2.5 KiB
87 lines
2.5 KiB
package com.example.crawler;
|
|
|
|
import com.example.exception.CrawlerException;
|
|
import com.example.exception.ParseException;
|
|
import com.example.model.News;
|
|
import com.example.util.FileUtil;
|
|
import com.example.util.HttpUtil;
|
|
import org.jsoup.Jsoup;
|
|
import org.jsoup.nodes.Document;
|
|
import org.jsoup.nodes.Element;
|
|
import org.jsoup.select.Elements;
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.HashSet;
|
|
import java.util.List;
|
|
import java.util.Set;
|
|
|
|
public class NewsCrawler implements Crawler<News> {
|
|
|
|
private List<News> newsList = new ArrayList<>();
|
|
|
|
@Override
|
|
public void crawl() throws CrawlerException {
|
|
newsList.clear();
|
|
String url = "https://news.sina.com.cn/";
|
|
|
|
System.out.println("正在爬取新浪新闻...");
|
|
|
|
try {
|
|
String html = HttpUtil.getHtmlContent(url);
|
|
extractNews(html);
|
|
} catch (Exception e) {
|
|
throw new CrawlerException("爬取新闻失败", e);
|
|
}
|
|
|
|
System.out.println("新闻爬取完成,共获取 " + newsList.size() + " 条新闻");
|
|
}
|
|
|
|
private void extractNews(String html) throws CrawlerException {
|
|
try {
|
|
Document doc = Jsoup.parse(html);
|
|
Elements items = doc.select("a[href*=sina.com.cn]");
|
|
|
|
Set<String> seenTitles = new HashSet<>();
|
|
|
|
for (Element item : items) {
|
|
String title = item.text().trim();
|
|
String link = item.attr("href");
|
|
|
|
if (title.length() > 5 && title.length() < 100 && !seenTitles.contains(title)) {
|
|
News news = new News();
|
|
news.setTitle(title);
|
|
news.setLink(link);
|
|
|
|
newsList.add(news);
|
|
seenTitles.add(title);
|
|
}
|
|
|
|
if (newsList.size() >= 50) {
|
|
break;
|
|
}
|
|
}
|
|
} catch (Exception e) {
|
|
throw new ParseException("解析新闻数据失败", e);
|
|
}
|
|
}
|
|
|
|
@Override
|
|
public void saveToFile() throws CrawlerException {
|
|
List<String> headers = List.of("新闻标题", "链接");
|
|
List<List<String>> data = new ArrayList<>();
|
|
|
|
for (News news : newsList) {
|
|
List<String> row = new ArrayList<>();
|
|
row.add(news.getTitle());
|
|
row.add(news.getLink());
|
|
data.add(row);
|
|
}
|
|
|
|
FileUtil.writeCsv("news.csv", headers, data);
|
|
}
|
|
|
|
@Override
|
|
public int getCount() {
|
|
return newsList.size();
|
|
}
|
|
}
|