You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

76 lines
2.6 KiB

package com.crawler.strategy;
import com.crawler.model.Article;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Crawl strategy that downloads a single blog page over HTTP and converts it
 * into one {@link Article}.
 *
 * <p>The page title is taken from the first {@code <title>} element and the
 * article content is the page's HTML with script/style blocks and all tags
 * removed. Failures are not thrown to the caller; they are reported as a
 * placeholder article describing the error.
 */
public class BlogCrawlStrategy implements CrawlStrategy {

    /** Value used for both the article source and the strategy name. */
    private static final String SOURCE_NAME = "blog";

    /** Browser-like User-Agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

    /** Connect and read timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Title used when the page has no usable {@code <title>} element. */
    private static final String DEFAULT_TITLE = "Untitled Blog";

    // Patterns are compiled once; they are immutable and safe to share.
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title[^>]*>([^<]+)</title>", Pattern.CASE_INSENSITIVE);
    // Script/style matching is case-insensitive so that <SCRIPT>/<Style> blocks
    // are stripped as well (HTML tag names are not case-sensitive).
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script[^>]*>[\\s\\S]*?</script>", Pattern.CASE_INSENSITIVE);
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style[^>]*>[\\s\\S]*?</style>", Pattern.CASE_INSENSITIVE);
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>");
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

    /**
     * Fetches {@code url} and wraps the page in a single article.
     *
     * @param url address of the blog page to download
     * @return a list holding one article: the crawled page on success, or an
     *         error placeholder (title {@code "Error crawling blog: <url>"})
     *         when fetching or parsing throws
     */
    @Override
    public List<Article> crawl(String url) {
        List<Article> articles = new ArrayList<>();
        try {
            String html = fetch(url);
            Article article = new Article();
            article.setTitle("Blog: " + extractTitle(html));
            article.setUrl(url);
            article.setSource(SOURCE_NAME);
            article.setContent(extractText(html));
            article.setAuthor("Blog Author");
            articles.add(article);
        } catch (Exception e) {
            // Boundary catch: any failure is surfaced to the caller as an article.
            articles.add(buildErrorArticle(url, e));
        }
        return articles;
    }

    /**
     * Downloads the body of {@code url} as UTF-8 text, one {@code "\n"} after
     * every line read.
     *
     * @throws IOException if the URL is malformed or the request fails
     */
    private String fetch(String url) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        try {
            connection.setRequestMethod("GET");
            connection.setRequestProperty("User-Agent", USER_AGENT);
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            StringBuilder content = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    content.append(line).append("\n");
                }
            }
            return content.toString();
        } finally {
            // Release the connection on every path, including failed requests
            // whose response stream was never opened.
            connection.disconnect();
        }
    }

    /** Builds the placeholder article returned when crawling {@code url} fails. */
    private Article buildErrorArticle(String url, Exception e) {
        // Some exceptions carry no message; fall back to toString() so the
        // details never read "null".
        String details = e.getMessage() != null ? e.getMessage() : e.toString();
        Article errorArticle = new Article();
        errorArticle.setTitle("Error crawling blog: " + url);
        errorArticle.setUrl(url);
        errorArticle.setContent("Error details: " + details);
        errorArticle.setSource(SOURCE_NAME);
        return errorArticle;
    }

    /**
     * Returns the trimmed text of the first {@code <title>} element in
     * {@code html}, or {@code "Untitled Blog"} when none matches.
     */
    private String extractTitle(String html) {
        Matcher matcher = TITLE_PATTERN.matcher(html);
        if (matcher.find()) {
            return matcher.group(1).trim();
        }
        return DEFAULT_TITLE;
    }

    /**
     * Reduces {@code html} to plain text: drops script and style blocks,
     * replaces every remaining tag with a space, collapses whitespace runs to
     * a single space and trims the result.
     */
    private String extractText(String html) {
        String text = SCRIPT_PATTERN.matcher(html).replaceAll("");
        text = STYLE_PATTERN.matcher(text).replaceAll("");
        text = TAG_PATTERN.matcher(text).replaceAll(" ");
        text = WHITESPACE_PATTERN.matcher(text).replaceAll(" ");
        return text.trim();
    }

    /** Returns the identifier of this strategy, {@code "blog"}. */
    @Override
    public String getStrategyName() {
        return SOURCE_NAME;
    }
}