package com.example.datacollect.strategy;

import com.example.datacollect.model.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Crawl strategy that extracts articles from a page using generic
 * news/article CSS selectors.
 *
 * <p>A URL is supported when it contains a dot-delimited segment of
 * {@code news}, {@code press} or {@code article} (optionally followed by
 * {@code s}), e.g. {@code https://www.news.example.com/}.
 */
public class GenericNewsStrategy implements CrawlStrategy {

    /** URL pattern this strategy responds to; also exposed through {@link #getPattern()}. */
    private static final Pattern PATTERN = Pattern.compile(".*\\.(news|press|article)s?\\..*");

    /** Value reported by {@link #getPriority()}. */
    private static final int PRIORITY = 5;

    /** Selects the container elements that are treated as candidate articles. */
    private static final String ITEM_SELECTOR =
            "article, .news-item, .article-item, [class*='news'], [class*='article']";

    /** Selects the element holding an item's title. */
    private static final String TITLE_SELECTOR = "h1, h2, h3, .title, [class*='title']";

    /** Selects the element holding an item's link. */
    private static final String LINK_SELECTOR = "a[href]";

    /** Selects the element holding an item's text content. */
    private static final String CONTENT_SELECTOR = "p, .content, [class*='content']";

    /**
     * Tells whether the given URL matches this strategy's pattern.
     *
     * @param url the URL to test; may be {@code null}
     * @return {@code true} if {@code url} is non-null and the pattern is found in it
     */
    @Override
    public boolean supports(String url) {
        // A null URL previously caused a NullPointerException inside Pattern.matcher.
        return url != null && PATTERN.matcher(url).find();
    }

    /**
     * Extracts one {@link Article} per candidate element that has a non-empty title.
     *
     * <p>NOTE(review): the item selector is broad, so nested elements of the same
     * article may each match and produce more than one entry — confirm whether
     * callers de-duplicate.
     *
     * @param url the URL of the page; used as the article URL when an item has no usable link
     * @param doc the parsed page
     * @return the extracted articles, in document order of the matched elements; never {@code null}
     */
    @Override
    public List<Article> parse(String url, Document doc) {
        List<Article> articles = new ArrayList<>();
        Elements items = doc.select(ITEM_SELECTOR);

        for (Element item : items) {
            String title = firstText(item, TITLE_SELECTOR);
            if (title.isEmpty()) {
                // Items without a title are skipped, so there is no need to look up the rest.
                continue;
            }
            String articleUrl = resolveUrl(item, url);
            String content = firstText(item, CONTENT_SELECTOR);
            articles.add(new Article(title, articleUrl, content));
        }
        return articles;
    }

    /**
     * Returns this strategy's priority value.
     *
     * @return the constant priority of this strategy
     */
    @Override
    public int getPriority() {
        return PRIORITY;
    }

    /**
     * Returns the URL pattern used by {@link #supports(String)}.
     *
     * @return the compiled pattern
     */
    @Override
    public Pattern getPattern() {
        return PATTERN;
    }

    /** Returns the trimmed text of the first element matching {@code cssQuery}, or {@code ""} if none matches. */
    private static String firstText(Element item, String cssQuery) {
        Element match = item.selectFirst(cssQuery);
        return match != null ? match.text().trim() : "";
    }

    /** Returns the item's first link as an absolute URL, or {@code pageUrl} when there is no link or it cannot be resolved. */
    private static String resolveUrl(Element item, String pageUrl) {
        Element link = item.selectFirst(LINK_SELECTOR);
        if (link == null) {
            return pageUrl;
        }
        // "abs:href" yields an empty string when the href cannot be made absolute;
        // fall back to the page URL rather than storing an empty link.
        String absolute = link.attr("abs:href");
        return absolute.isEmpty() ? pageUrl : absolute;
    }
}