You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

59 lines
2.3 KiB

package com.example.moviecli.strategy;
import com.example.moviecli.exception.ParseFailedException;
import com.example.moviecli.model.Movie;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawl strategy that extracts headline-like link texts from a Sina news page
 * and returns them as ranked {@link Movie} entries.
 *
 * <p>Extraction is heuristic: any element text that is long enough, is not made
 * up solely of digits/whitespace, and does not contain known non-headline words
 * is treated as a headline. Duplicates are dropped and first-seen order is kept.
 */
public class SinaNewsStrategy implements MovieCrawlStrategy {

    /** URL fragment that identifies pages this strategy handles. */
    private static final String SINA_NEWS_HOST = "news.sina.com.cn";

    /** A candidate must be strictly longer than this many characters. */
    private static final int MIN_TITLE_LENGTH_EXCLUSIVE = 8;

    /** Upper bound on the number of headlines collected from one document. */
    private static final int MAX_TITLES = 120;

    /** If the first pass yields fewer headlines than this, a second pass is run. */
    private static final int FALLBACK_THRESHOLD = 30;

    /** Selector for the second pass over specific page regions. */
    private static final String FALLBACK_SELECTOR = ".news-item, .blk, .main-content a";

    /**
     * Matches text consisting only of digits and whitespace. Compiled once
     * instead of on every link, as {@code String.matches} would do.
     */
    private static final Pattern DIGITS_AND_WHITESPACE = Pattern.compile("^[\\d\\s]+$");

    /**
     * Reports whether this strategy handles the given URL.
     *
     * @param url the page URL; may be {@code null}
     * @return {@code true} if {@code url} contains {@code news.sina.com.cn};
     *         {@code false} otherwise, including for a {@code null} URL
     */
    @Override
    public boolean supports(String url) {
        return url != null && url.contains(SINA_NEWS_HOST);
    }

    /**
     * Collects up to {@value #MAX_TITLES} distinct headline texts from the
     * document and wraps each one in a {@link Movie} with a 1-based rank that
     * follows discovery order.
     *
     * <p>The first pass scans every {@code <a>} element. If it finds fewer than
     * {@value #FALLBACK_THRESHOLD} headlines, a second pass scans the elements
     * matched by {@code .news-item, .blk, .main-content a}. Both passes apply
     * the same filter, so text rejected in the first pass cannot be re-admitted
     * by the second.
     *
     * @param doc the parsed page
     * @return the headlines found, possibly empty, in discovery order
     * @throws ParseFailedException if anything goes wrong while reading the
     *         document (including a {@code null} {@code doc}); the original
     *         exception is preserved as the cause
     */
    @Override
    public List<Movie> parse(Document doc) throws ParseFailedException {
        try {
            // LinkedHashSet: de-duplicates while keeping first-seen order for ranking.
            Set<String> titles = new LinkedHashSet<>();

            collectTitles(doc.select("a"), titles);
            if (titles.size() < FALLBACK_THRESHOLD) {
                collectTitles(doc.select(FALLBACK_SELECTOR), titles);
            }

            List<Movie> newsList = new ArrayList<>(titles.size());
            int rank = 1;
            for (String title : titles) {
                // Only rank and title vary; the remaining arguments are fixed
                // literals whose meaning is defined by Movie's constructor.
                newsList.add(new Movie(rank++, title, "", "新闻", "新浪", ""));
            }
            return newsList;
        } catch (Exception e) {
            // Boundary catch: any failure is surfaced as a parse failure with its cause.
            throw new ParseFailedException("新浪新闻解析失败", e);
        }
    }

    /**
     * Adds the trimmed text of each element to {@code titles} when it passes
     * {@link #looksLikeTitle(String)}, stopping once the cap is reached.
     */
    private static void collectTitles(Elements candidates, Set<String> titles) {
        for (Element candidate : candidates) {
            String text = candidate.text().trim();
            if (looksLikeTitle(text)) {
                titles.add(text); // Set.add ignores duplicates by itself
            }
            if (titles.size() >= MAX_TITLES) {
                break;
            }
        }
    }

    /**
     * Heuristic headline filter: longer than {@value #MIN_TITLE_LENGTH_EXCLUSIVE}
     * characters, not digits/whitespace only, and free of the words "评论"
     * (comment) and "举报" (report) that mark non-headline links.
     */
    private static boolean looksLikeTitle(String text) {
        return text.length() > MIN_TITLE_LENGTH_EXCLUSIVE
                && !DIGITS_AND_WHITESPACE.matcher(text).matches()
                && !text.contains("评论")
                && !text.contains("举报");
    }
}