4 changed files with 167 additions and 0 deletions
@ -0,0 +1,49 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.exception.ParseException; |
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class BlogStrategy implements CrawlStrategy { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(BlogStrategy.class); |
||||
|
|
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
boolean supported = url.contains("blog.example.com"); |
||||
|
logger.debug("BlogStrategy supports {}: {}", url, supported); |
||||
|
return supported; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(String url, Document doc) throws ParseException { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
try { |
||||
|
Elements titles = doc.select(".post-title"); |
||||
|
if (titles.isEmpty()) { |
||||
|
logger.warn("No .post-title elements found for URL: {}", url); |
||||
|
throw new ParseException("No .post-title elements found on page: " + url); |
||||
|
} |
||||
|
|
||||
|
for (Element e : titles) { |
||||
|
String title = e.text(); |
||||
|
if (title == null || title.isBlank()) { |
||||
|
logger.warn("Found empty title at URL: {}", url); |
||||
|
continue; |
||||
|
} |
||||
|
articles.add(new Article(title, url, "")); |
||||
|
logger.debug("Parsed article: {}", title); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
logger.error("Parse error for URL {}: {}", url, e.getMessage(), e); |
||||
|
throw new ParseException("Failed to parse blog page: " + url, e); |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,25 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.exception.ParseException; |
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
public interface CrawlStrategy { |
||||
|
/** |
||||
|
* 解析文档并提取文章列表 |
||||
|
* @param url 原始URL |
||||
|
* @param doc Jsoup文档对象 |
||||
|
* @return 文章列表 |
||||
|
* @throws ParseException 解析失败时抛出 |
||||
|
*/ |
||||
|
List<Article> parse(String url, Document doc) throws ParseException; |
||||
|
|
||||
|
/** |
||||
|
* 判断该策略是否支持指定的URL |
||||
|
* @param url 目标URL |
||||
|
* @return 是否支持 |
||||
|
*/ |
||||
|
boolean supports(String url); |
||||
|
} |
||||
@ -0,0 +1,50 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.exception.ParseException; |
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class NewsStrategy implements CrawlStrategy { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(NewsStrategy.class); |
||||
|
|
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
boolean supported = url.contains("news.example.com"); |
||||
|
logger.debug("NewsStrategy supports {}: {}", url, supported); |
||||
|
return supported; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(String url, Document doc) throws ParseException { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
try { |
||||
|
Elements headlines = doc.select(".headline"); |
||||
|
if (headlines.isEmpty()) { |
||||
|
logger.warn("No .headline elements found for URL: {}", url); |
||||
|
throw new ParseException("No .headline elements found on page: " + url); |
||||
|
} |
||||
|
|
||||
|
for (Element e : headlines) { |
||||
|
String title = e.text(); |
||||
|
String link = e.hasAttr("href") ? e.attr("abs:href") : url; |
||||
|
if (title == null || title.isBlank()) { |
||||
|
logger.warn("Found empty headline at URL: {}", url); |
||||
|
continue; |
||||
|
} |
||||
|
articles.add(new Article(title, link, "")); |
||||
|
logger.debug("Parsed news article: {}", title); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
logger.error("Parse error for URL {}: {}", url, e.getMessage(), e); |
||||
|
throw new ParseException("Failed to parse news page: " + url, e); |
||||
|
} |
||||
|
return articles; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,43 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class StrategyFactory { |
||||
|
private static final Logger logger = LoggerFactory.getLogger(StrategyFactory.class); |
||||
|
private final List<CrawlStrategy> strategies = new ArrayList<>(); |
||||
|
|
||||
|
public StrategyFactory() { |
||||
|
// 注册所有策略
|
||||
|
strategies.add(new BlogStrategy()); |
||||
|
strategies.add(new NewsStrategy()); |
||||
|
logger.info("StrategyFactory initialized with {} strategies", strategies.size()); |
||||
|
} |
||||
|
|
||||
|
public CrawlStrategy getStrategy(String url) { |
||||
|
if (url == null || url.isBlank()) { |
||||
|
logger.warn("Null or blank URL provided to getStrategy"); |
||||
|
return null; |
||||
|
} |
||||
|
|
||||
|
for (CrawlStrategy strategy : strategies) { |
||||
|
if (strategy.supports(url)) { |
||||
|
logger.debug("Found strategy {} for URL: {}", strategy.getClass().getSimpleName(), url); |
||||
|
return strategy; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
logger.warn("No strategy found for URL: {}", url); |
||||
|
return null; |
||||
|
} |
||||
|
|
||||
|
public void registerStrategy(CrawlStrategy strategy) { |
||||
|
if (strategy != null) { |
||||
|
strategies.add(strategy); |
||||
|
logger.info("Registered new strategy: {}", strategy.getClass().getSimpleName()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue