package com.example.datacollect.strategy;

import com.example.datacollect.exception.ParseException;
import com.example.datacollect.model.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Catch-all {@link CrawlStrategy}: it reports support for every URL and extracts
 * one article for the page itself plus one article per usable hyperlink.
 *
 * <p>{@link #getPriority()} returns {@link Integer#MIN_VALUE}; presumably the
 * strategy registry treats that as "try last" (confirm against the code that
 * orders strategies).
 */
public class DefaultStrategy implements CrawlStrategy {

    /** Accepts absolute http/https links; compiled once instead of once per link. */
    private static final Pattern HTTP_URL_PATTERN = Pattern.compile("https?://.*");

    /** CSS selector for the element assumed to hold the page's main text. */
    private static final String CONTENT_SELECTOR = "article, .article, #content, .content, main";

    /** Maximum number of chars kept from the page text before the ellipsis is appended. */
    private static final int SUMMARY_MAX_LENGTH = 200;

    /** Marker appended to a summary that was cut short. */
    private static final String ELLIPSIS = "...";

    /**
     * Indicates whether this strategy can handle the given URL.
     *
     * @param url the URL to check (not inspected)
     * @return always {@code true}
     */
    @Override
    public boolean supports(String url) {
        return true;
    }

    /**
     * Extracts articles from a parsed page.
     *
     * <p>The first element of the result describes the page itself (document
     * title, the given {@code url}, and a summary of at most
     * {@value #SUMMARY_MAX_LENGTH} chars followed by {@code "..."} when
     * truncated). It is followed by one article per {@code <a href>} element
     * whose visible text is non-empty and whose absolute URL is http/https;
     * those articles carry an empty content string.
     *
     * @param url the URL the document was fetched from
     * @param doc the parsed document
     * @return a mutable list with the page article first, then the link articles
     * @throws ParseException declared by the strategy contract; this
     *         implementation does not throw it itself
     */
    @Override
    public List<Article> parse(String url, Document doc) throws ParseException {
        List<Article> articles = new ArrayList<>();
        articles.add(new Article(doc.title(), url, extractSummary(doc)));

        Elements links = doc.select("a[href]");
        for (Element link : links) {
            // "abs:href" resolves relative links against the document's base URI;
            // it yields an empty string when resolution is not possible.
            String href = link.attr("abs:href");
            String linkText = link.text().trim();
            if (!linkText.isEmpty() && HTTP_URL_PATTERN.matcher(href).matches()) {
                articles.add(new Article(linkText, href, ""));
            }
        }
        return articles;
    }

    /**
     * Returns the priority of this strategy.
     *
     * @return {@link Integer#MIN_VALUE}
     */
    @Override
    public int getPriority() {
        return Integer.MIN_VALUE;
    }

    /** Builds the truncated text summary from the main content element, or the body as fallback. */
    private static String extractSummary(Document doc) {
        Element source = doc.selectFirst(CONTENT_SELECTOR);
        if (source == null) {
            source = doc.body();
        }
        if (source == null) {
            // Defensive: a document without a body element has no text to summarize.
            return "";
        }

        String text = source.text().trim();
        if (text.length() <= SUMMARY_MAX_LENGTH) {
            return text;
        }

        int end = SUMMARY_MAX_LENGTH;
        // Do not cut between the two halves of a surrogate pair.
        if (Character.isHighSurrogate(text.charAt(end - 1))) {
            end--;
        }
        return text.substring(0, end) + ELLIPSIS;
    }
}