You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

87 lines
2.5 KiB

package com.example.crawler;
import com.example.exception.CrawlerException;
import com.example.exception.ParseException;
import com.example.model.News;
import com.example.util.FileUtil;
import com.example.util.HttpUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Crawler that collects headline links from the Sina news front page.
 *
 * <p>The results of the most recent {@link #crawl()} call are held in memory and can be
 * exported with {@link #saveToFile()} or counted with {@link #getCount()}.
 */
public class NewsCrawler implements Crawler<News> {

    /** Page that is fetched; also used as the base URI when resolving link targets. */
    private static final String NEWS_URL = "https://news.sina.com.cn/";

    /** CSS selector for anchors whose {@code href} contains the Sina domain. */
    private static final String LINK_SELECTOR = "a[href*=sina.com.cn]";

    /** A title is kept only if its length is strictly greater than this value. */
    private static final int MIN_TITLE_LENGTH_EXCLUSIVE = 5;

    /** A title is kept only if its length is strictly less than this value. */
    private static final int MAX_TITLE_LENGTH_EXCLUSIVE = 100;

    /** Extraction stops once this many news items have been collected. */
    private static final int MAX_NEWS = 50;

    /** File name handed to {@code FileUtil.writeCsv}. */
    private static final String OUTPUT_FILE = "news.csv";

    /** Items gathered by the most recent crawl; cleared at the start of every crawl. */
    private final List<News> newsList = new ArrayList<>();

    /**
     * Fetches the front page and rebuilds the in-memory news list from it.
     *
     * <p>The list is cleared before fetching, so a failed crawl discards the results of
     * any earlier crawl.
     *
     * @throws CrawlerException if fetching or parsing fails; the original exception is
     *     attached as the cause
     */
    @Override
    public void crawl() throws CrawlerException {
        newsList.clear();
        System.out.println("正在爬取新浪新闻...");
        try {
            String html = HttpUtil.getHtmlContent(NEWS_URL);
            extractNews(html);
        } catch (Exception e) {
            throw new CrawlerException("爬取新闻失败", e);
        }
        System.out.println("新闻爬取完成,共获取 " + newsList.size() + " 条新闻");
    }

    /**
     * Parses {@code html} and appends one {@link News} per accepted anchor to the news list.
     *
     * <p>An anchor is accepted when its trimmed text passes {@link #isAcceptableTitle(String)}
     * and has not been seen earlier in the same document. Processing stops as soon as the
     * list holds {@value #MAX_NEWS} items.
     *
     * @param html raw HTML of the front page
     * @throws CrawlerException declared by the signature; any failure is reported as a
     *     {@code ParseException} carrying the original exception as its cause
     */
    private void extractNews(String html) throws CrawlerException {
        try {
            // Supplying the base URI lets relative and protocol-relative hrefs be resolved.
            Document doc = Jsoup.parse(html, NEWS_URL);
            Elements anchors = doc.select(LINK_SELECTOR);
            Set<String> seenTitles = new HashSet<>();
            for (Element anchor : anchors) {
                String title = anchor.text().trim();
                // Set.add returns false for duplicates; the length check short-circuits
                // first so rejected titles are never recorded as seen.
                if (isAcceptableTitle(title) && seenTitles.add(title)) {
                    News news = new News();
                    news.setTitle(title);
                    news.setLink(resolveLink(anchor));
                    newsList.add(news);
                }
                if (newsList.size() >= MAX_NEWS) {
                    break;
                }
            }
        } catch (Exception e) {
            throw new ParseException("解析新闻数据失败", e);
        }
    }

    /**
     * Tells whether a title's length lies strictly between the two configured bounds.
     *
     * @param title trimmed anchor text
     * @return {@code true} if the title should be kept
     */
    private static boolean isAcceptableTitle(String title) {
        int length = title.length();
        return length > MIN_TITLE_LENGTH_EXCLUSIVE && length < MAX_TITLE_LENGTH_EXCLUSIVE;
    }

    /**
     * Returns the anchor's link target as an absolute URL where possible.
     *
     * <p>Falls back to the raw {@code href} attribute when jsoup cannot build an absolute
     * URL (it then returns an empty string), so no link is silently dropped.
     *
     * @param anchor anchor element taken from the parsed document
     * @return absolute URL, or the unmodified {@code href} value if it cannot be resolved
     */
    private static String resolveLink(Element anchor) {
        String absolute = anchor.absUrl("href");
        return absolute.isEmpty() ? anchor.attr("href") : absolute;
    }

    /**
     * Passes the collected news to {@code FileUtil.writeCsv} as two-column rows
     * (title, link) together with a header row, using the file name {@code news.csv}.
     *
     * @throws CrawlerException declared by the signature; presumably raised by
     *     {@code FileUtil.writeCsv} on failure (confirm against that helper)
     */
    @Override
    public void saveToFile() throws CrawlerException {
        List<String> headers = List.of("新闻标题", "链接");
        List<List<String>> rows = new ArrayList<>(newsList.size());
        for (News news : newsList) {
            List<String> row = new ArrayList<>(2);
            row.add(news.getTitle());
            row.add(news.getLink());
            rows.add(row);
        }
        FileUtil.writeCsv(OUTPUT_FILE, headers, rows);
    }

    /**
     * Returns how many news items are currently held in memory.
     *
     * @return size of the in-memory news list
     */
    @Override
    public int getCount() {
        return newsList.size();
    }
}