package com.example.datacollect.strategy;

import com.example.datacollect.exception.ParseException;
import com.example.datacollect.model.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.List;

/**
 * Crawl strategy for pages whose URL contains {@code blog.example.com}.
 *
 * <p>Every element matching the CSS selector {@code .post-title} is turned into one
 * {@link Article} whose title is the element's text.
 */
public class BlogStrategy implements CrawlStrategy {

    private static final Logger logger = LoggerFactory.getLogger(BlogStrategy.class);

    /** Substring a URL must contain for this strategy to claim it. */
    private static final String SUPPORTED_HOST = "blog.example.com";

    /** CSS selector used to locate post titles in the parsed document. */
    private static final String TITLE_SELECTOR = ".post-title";

    /**
     * Reports whether this strategy can handle the given URL.
     *
     * @param url the URL to test; {@code null} is treated as unsupported
     * @return {@code true} if {@code url} contains {@code blog.example.com}
     */
    @Override
    public boolean supports(String url) {
        // Guard against null so a missing URL reads as "not supported" instead of an NPE.
        boolean supported = url != null && url.contains(SUPPORTED_HOST);
        logger.debug("BlogStrategy supports {}: {}", url, supported);
        return supported;
    }

    /**
     * Extracts one article per non-blank {@code .post-title} element in the document.
     *
     * @param url the URL the document was fetched from; stored on every resulting article
     * @param doc the parsed page
     * @return the articles found, in document order; blank titles are skipped
     * @throws ParseException if the page has no {@code .post-title} element, or if an
     *     unexpected runtime failure occurs while extracting titles (the failure is
     *     attached as the cause)
     */
    @Override
    public List<Article> parse(String url, Document doc) throws ParseException {
        List<Article> articles = new ArrayList<>();
        try {
            Elements titles = doc.select(TITLE_SELECTOR);
            if (titles.isEmpty()) {
                logger.warn("No .post-title elements found for URL: {}", url);
                throw new ParseException("No .post-title elements found on page: " + url);
            }
            for (Element titleElement : titles) {
                String title = titleElement.text();
                if (title == null || title.isBlank()) {
                    logger.warn("Found empty title at URL: {}", url);
                    continue;
                }
                // Third constructor argument is left empty here; presumably the article
                // body/content -- confirm against Article.
                articles.add(new Article(title, url, ""));
                logger.debug("Parsed article: {}", title);
            }
        } catch (ParseException e) {
            // Already specific and already logged above: propagate untouched rather than
            // re-wrapping it in a second, more generic ParseException.
            throw e;
        } catch (RuntimeException e) {
            logger.error("Parse error for URL {}: {}", url, e.getMessage(), e);
            throw new ParseException("Failed to parse blog page: " + url, e);
        }
        return articles;
    }
}