You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
59 lines
2.3 KiB
59 lines
2.3 KiB
package com.example.moviecli.strategy;
|
|
|
|
import com.example.moviecli.model.Movie;
import com.example.moviecli.exception.ParseFailedException;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
|
|
|
|
public class SinaNewsStrategy implements MovieCrawlStrategy {
|
|
@Override
|
|
public boolean supports(String url) {
|
|
return url.contains("news.sina.com.cn");
|
|
}
|
|
|
|
@Override
|
|
public List<Movie> parse(Document doc) throws ParseFailedException {
|
|
try {
|
|
List<Movie> newsList = new ArrayList<>();
|
|
Set<String> titleSet = new LinkedHashSet<>();
|
|
|
|
// 抓取所有 a 标签,过滤出标题较长的(通常新闻标题长度 > 8)
|
|
Elements allLinks = doc.select("a");
|
|
for (Element link : allLinks) {
|
|
String title = link.text().trim();
|
|
// 过滤短文本、纯数字、纯符号、常见非标题文字
|
|
if (title.length() > 8 && !title.matches("^[\\d\\s]+$")
|
|
&& !title.contains("评论") && !title.contains("举报")) {
|
|
// 进一步过滤:通常新闻标题不会太短且不会包含过多标点
|
|
titleSet.add(title);
|
|
}
|
|
if (titleSet.size() >= 120) break; // 最多抓取120条
|
|
}
|
|
|
|
// 如果数量不够 30,再尝试抓取特定区域
|
|
if (titleSet.size() < 30) {
|
|
Elements newsItems = doc.select(".news-item, .blk, .main-content a");
|
|
for (Element item : newsItems) {
|
|
String title = item.text().trim();
|
|
if (title.length() > 8 && !titleSet.contains(title)) {
|
|
titleSet.add(title);
|
|
}
|
|
if (titleSet.size() >= 120) break;
|
|
}
|
|
}
|
|
|
|
int rank = 1;
|
|
for (String title : titleSet) {
|
|
newsList.add(new Movie(rank++, title, "", "新闻", "新浪", ""));
|
|
}
|
|
return newsList;
|
|
} catch (Exception e) {
|
|
throw new ParseFailedException("新浪新闻解析失败", e);
|
|
}
|
|
}
|
|
}
|