package com.crawler.strategy;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.crawler.model.Article;

/**
 * Crawl strategy for news pages: downloads a single URL over HTTP and turns the
 * response into one {@link Article} whose title comes from the HTML
 * {@code <title>} element and whose content is the page text with markup removed.
 *
 * <p>{@link #crawl(String)} never throws; on any failure it returns a single
 * placeholder article describing the error.
 */
public class NewsCrawlStrategy implements CrawlStrategy {

    private static final String STRATEGY_NAME = "news";
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";
    private static final int TIMEOUT_MILLIS = 10000;
    private static final String DEFAULT_TITLE = "Untitled News";

    /** Captures the text of the first {@code <title>} element (attributes allowed on the tag). */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title[^>]*>([^<]+)</title>", Pattern.CASE_INSENSITIVE);

    /** Matches a whole {@code <script>} element including its body. */
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script[^>]*>[\\s\\S]*?</script>", Pattern.CASE_INSENSITIVE);

    /** Matches a whole {@code <style>} element including its body. */
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style[^>]*>[\\s\\S]*?</style>", Pattern.CASE_INSENSITIVE);

    /** Matches any remaining single tag. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>");

    /** Matches a run of whitespace, used to collapse the text onto single spaces. */
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

    /**
     * Downloads {@code url} and converts the page into an article.
     *
     * @param url address of the page to fetch
     * @return a list holding exactly one article: the parsed page on success, or
     *         a placeholder whose title and content describe the failure
     */
    @Override
    public List<Article> crawl(String url) {
        List<Article> articles = new ArrayList<>();
        try {
            String html = fetch(url);

            Article article = new Article();
            article.setTitle("News: " + extractTitle(html));
            article.setUrl(url);
            article.setSource(STRATEGY_NAME);
            article.setContent(extractText(html));
            article.setAuthor("News Reporter");
            articles.add(article);
        } catch (Exception e) {
            // Boundary catch: this method reports failures as an article instead
            // of throwing, so malformed URLs, non-HTTP URLs and I/O errors all
            // end up here.
            String detail = e.getMessage() != null ? e.getMessage() : e.getClass().getName();

            Article errorArticle = new Article();
            errorArticle.setTitle("Error crawling news: " + url);
            errorArticle.setUrl(url);
            errorArticle.setContent("Error details: " + detail);
            errorArticle.setSource(STRATEGY_NAME);
            articles.add(errorArticle);
        }
        return articles;
    }

    /**
     * Performs the HTTP GET and returns the response body decoded as UTF-8, with
     * every line terminated by {@code '\n'}.
     *
     * @param url address to fetch
     * @return the response body
     * @throws IOException if the URL is malformed or the request fails
     */
    private String fetch(String url) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        try {
            connection.setRequestMethod("GET");
            connection.setRequestProperty("User-Agent", USER_AGENT);
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            StringBuilder content = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    content.append(line).append('\n');
                }
            }
            return content.toString();
        } finally {
            // Always disconnect, including when the request or the read fails.
            connection.disconnect();
        }
    }

    /**
     * Extracts the page title from the first {@code <title>} element.
     *
     * @param html raw page markup
     * @return the trimmed title text, or {@code "Untitled News"} when there is
     *         no title element or its text is blank
     */
    private String extractTitle(String html) {
        Matcher matcher = TITLE_PATTERN.matcher(html);
        if (matcher.find()) {
            String title = matcher.group(1).trim();
            if (!title.isEmpty()) {
                return title;
            }
        }
        return DEFAULT_TITLE;
    }

    /**
     * Reduces markup to plain text: script and style elements are dropped
     * together with their bodies, every other tag becomes a space, and
     * whitespace runs are collapsed to a single space.
     *
     * @param html raw page markup
     * @return the trimmed plain text
     */
    private String extractText(String html) {
        // Script/style bodies must go before the generic tag strip; otherwise
        // their contents would survive as "text".
        String text = SCRIPT_PATTERN.matcher(html).replaceAll("");
        text = STYLE_PATTERN.matcher(text).replaceAll("");
        text = TAG_PATTERN.matcher(text).replaceAll(" ");
        text = WHITESPACE_PATTERN.matcher(text).replaceAll(" ");
        return text.trim();
    }

    /**
     * @return the identifier of this strategy, {@code "news"}
     */
    @Override
    public String getStrategyName() {
        return STRATEGY_NAME;
    }
}