You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
75 lines
2.5 KiB
75 lines
2.5 KiB
package com.crawler.strategy;
|
|
|
|
import java.io.BufferedReader;
|
|
import java.io.InputStreamReader;
|
|
import java.net.HttpURLConnection;
|
|
import java.net.URL;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
|
|
import com.crawler.model.Article;
|
|
|
|
public class JsoupCrawlStrategy implements CrawlStrategy {
|
|
|
|
@Override
|
|
public List<Article> crawl(String url) {
|
|
List<Article> articles = new ArrayList<>();
|
|
try {
|
|
URL urlObj = new URL(url);
|
|
HttpURLConnection connection = (HttpURLConnection) urlObj.openConnection();
|
|
connection.setRequestMethod("GET");
|
|
connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
|
|
connection.setConnectTimeout(10000);
|
|
connection.setReadTimeout(10000);
|
|
|
|
StringBuilder content = new StringBuilder();
|
|
try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
|
|
String line;
|
|
while ((line = reader.readLine()) != null) {
|
|
content.append(line).append("\n");
|
|
}
|
|
}
|
|
|
|
Article article = new Article();
|
|
article.setTitle(extractTitle(content.toString()));
|
|
article.setUrl(url);
|
|
article.setSource(url);
|
|
article.setContent(extractText(content.toString()));
|
|
|
|
articles.add(article);
|
|
|
|
} catch (Exception e) {
|
|
Article errorArticle = new Article();
|
|
errorArticle.setTitle("Error crawling: " + url);
|
|
errorArticle.setUrl(url);
|
|
errorArticle.setContent("Error details: " + e.getMessage());
|
|
errorArticle.setSource(url);
|
|
articles.add(errorArticle);
|
|
}
|
|
return articles;
|
|
}
|
|
|
|
private String extractTitle(String html) {
|
|
Pattern pattern = Pattern.compile("<title[^>]*>([^<]+)</title>", Pattern.CASE_INSENSITIVE);
|
|
Matcher matcher = pattern.matcher(html);
|
|
if (matcher.find()) {
|
|
return matcher.group(1).trim();
|
|
}
|
|
return "Untitled Page";
|
|
}
|
|
|
|
private String extractText(String html) {
|
|
return html.replaceAll("<script[^>]*>[\\s\\S]*?</script>", "")
|
|
.replaceAll("<style[^>]*>[\\s\\S]*?</style>", "")
|
|
.replaceAll("<[^>]+>", " ")
|
|
.replaceAll("\\s+", " ")
|
|
.trim();
|
|
}
|
|
|
|
@Override
|
|
public String getStrategyName() {
|
|
return "jsoup";
|
|
}
|
|
}
|
|
|