package com.example.datacollect.strategy;

import com.example.datacollect.exception.ParseException;
import com.example.datacollect.model.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.List;

/**
 * Crawl strategy for URLs containing {@code news.example.com}.
 *
 * <p>Builds one {@link Article} for every non-blank element matching the
 * {@code .headline} CSS selector in the supplied document.
 */
public class NewsStrategy implements CrawlStrategy {

    private static final Logger logger = LoggerFactory.getLogger(NewsStrategy.class);

    /** Substring a URL must contain for this strategy to claim it. */
    private static final String SUPPORTED_HOST = "news.example.com";

    /** CSS selector that identifies headline elements on a page. */
    private static final String HEADLINE_SELECTOR = ".headline";

    /**
     * Reports whether this strategy handles the given URL.
     *
     * @param url the URL to test; {@code null} is treated as unsupported
     * @return {@code true} if {@code url} contains {@code news.example.com}
     */
    @Override
    public boolean supports(String url) {
        // A null URL cannot match any host; answer false instead of throwing NPE.
        boolean supported = url != null && url.contains(SUPPORTED_HOST);
        logger.debug("NewsStrategy supports {}: {}", url, supported);
        return supported;
    }

    /**
     * Extracts articles from the {@code .headline} elements of a page.
     *
     * <p>Each article's title is the element's text. Its link is the element's
     * absolute {@code href} when one is present and resolvable, otherwise the
     * page URL. Headlines with blank text are skipped with a warning. The third
     * {@link Article} constructor argument is always passed as an empty string.
     *
     * @param url the URL the document was fetched from; used as the fallback link
     *            and in log/error messages
     * @param doc the parsed page
     * @return the extracted articles, in document order; never empty on success
     * @throws ParseException if the page contains no {@code .headline} elements, or if
     *                        a runtime failure occurs while processing the document
     *                        (for example a {@code null} {@code doc}); in the latter
     *                        case the original exception is attached as the cause
     */
    @Override
    public List<Article> parse(String url, Document doc) throws ParseException {
        List<Article> articles = new ArrayList<>();
        try {
            Elements headlines = doc.select(HEADLINE_SELECTOR);
            if (headlines.isEmpty()) {
                logger.warn("No .headline elements found for URL: {}", url);
                throw new ParseException("No .headline elements found on page: " + url);
            }

            for (Element headline : headlines) {
                String title = headline.text();
                if (title.isBlank()) {
                    logger.warn("Found empty headline at URL: {}", url);
                    continue;
                }
                articles.add(new Article(title, resolveLink(headline, url), ""));
                logger.debug("Parsed news article: {}", title);
            }
        } catch (ParseException e) {
            // Already specific and already logged above: propagate as-is rather than
            // burying it inside a generic "Failed to parse" wrapper.
            throw e;
        } catch (RuntimeException e) {
            logger.error("Parse error for URL {}: {}", url, e.getMessage(), e);
            throw new ParseException("Failed to parse news page: " + url, e);
        }
        return articles;
    }

    /**
     * Picks the link for a headline: its absolute {@code href} when available,
     * otherwise the page URL.
     */
    private static String resolveLink(Element headline, String pageUrl) {
        if (!headline.hasAttr("href")) {
            return pageUrl;
        }
        // "abs:href" yields an empty string when the href cannot be made absolute
        // (e.g. the document has no base URI); fall back to the page URL then too.
        String absolute = headline.attr("abs:href");
        return absolute.isBlank() ? pageUrl : absolute;
    }
}