6 changed files with 433 additions and 0 deletions
@ -0,0 +1,27 @@ |
|||
package com.crawler.factory; |
|||
|
|||
import com.crawler.strategy.*; |
|||
import java.util.HashMap; |
|||
import java.util.Map; |
|||
|
|||
public class StrategyFactory { |
|||
private static final Map<String, CrawlStrategy> strategies = new HashMap<>(); |
|||
|
|||
static { |
|||
strategies.put("blog", new BlogCrawlStrategy()); |
|||
strategies.put("news", new NewsCrawlStrategy()); |
|||
strategies.put("jsoup", new JsoupCrawlStrategy()); |
|||
} |
|||
|
|||
public static CrawlStrategy getStrategy(String strategyName) { |
|||
return strategies.getOrDefault(strategyName.toLowerCase(), new JsoupCrawlStrategy()); |
|||
} |
|||
|
|||
public static boolean hasStrategy(String strategyName) { |
|||
return strategies.containsKey(strategyName.toLowerCase()); |
|||
} |
|||
|
|||
public static String[] getAvailableStrategies() { |
|||
return strategies.keySet().toArray(new String[0]); |
|||
} |
|||
} |
|||
@ -0,0 +1,76 @@ |
|||
package com.crawler.strategy; |
|||
|
|||
import com.crawler.model.Article; |
|||
|
|||
import java.io.BufferedReader; |
|||
import java.io.InputStreamReader; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.URL; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class BlogCrawlStrategy implements CrawlStrategy { |
|||
|
|||
@Override |
|||
public List<Article> crawl(String url) { |
|||
List<Article> articles = new ArrayList<>(); |
|||
try { |
|||
URL urlObj = new URL(url); |
|||
HttpURLConnection connection = (HttpURLConnection) urlObj.openConnection(); |
|||
connection.setRequestMethod("GET"); |
|||
connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"); |
|||
connection.setConnectTimeout(10000); |
|||
connection.setReadTimeout(10000); |
|||
|
|||
StringBuilder content = new StringBuilder(); |
|||
try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"))) { |
|||
String line; |
|||
while ((line = reader.readLine()) != null) { |
|||
content.append(line).append("\n"); |
|||
} |
|||
} |
|||
|
|||
Article article = new Article(); |
|||
article.setTitle("Blog: " + extractTitle(content.toString())); |
|||
article.setUrl(url); |
|||
article.setSource("blog"); |
|||
article.setContent(extractText(content.toString())); |
|||
article.setAuthor("Blog Author"); |
|||
|
|||
articles.add(article); |
|||
|
|||
} catch (Exception e) { |
|||
Article errorArticle = new Article(); |
|||
errorArticle.setTitle("Error crawling blog: " + url); |
|||
errorArticle.setUrl(url); |
|||
errorArticle.setContent("Error details: " + e.getMessage()); |
|||
errorArticle.setSource("blog"); |
|||
articles.add(errorArticle); |
|||
} |
|||
return articles; |
|||
} |
|||
|
|||
private String extractTitle(String html) { |
|||
Pattern pattern = Pattern.compile("<title[^>]*>([^<]+)</title>", Pattern.CASE_INSENSITIVE); |
|||
Matcher matcher = pattern.matcher(html); |
|||
if (matcher.find()) { |
|||
return matcher.group(1).trim(); |
|||
} |
|||
return "Untitled Blog"; |
|||
} |
|||
|
|||
private String extractText(String html) { |
|||
return html.replaceAll("<script[^>]*>[\\s\\S]*?</script>", "") |
|||
.replaceAll("<style[^>]*>[\\s\\S]*?</style>", "") |
|||
.replaceAll("<[^>]+>", " ") |
|||
.replaceAll("\\s+", " ") |
|||
.trim(); |
|||
} |
|||
|
|||
@Override |
|||
public String getStrategyName() { |
|||
return "blog"; |
|||
} |
|||
} |
|||
@ -0,0 +1,9 @@ |
|||
package com.crawler.strategy; |
|||
|
|||
import com.crawler.model.Article; |
|||
import java.util.List; |
|||
|
|||
public interface CrawlStrategy { |
|||
List<Article> crawl(String url) throws Exception; |
|||
String getStrategyName(); |
|||
} |
|||
@ -0,0 +1,170 @@ |
|||
package com.crawler.strategy; |
|||
|
|||
import com.crawler.model.Article; |
|||
|
|||
import java.io.BufferedReader; |
|||
import java.io.InputStreamReader; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.URL; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class DoubanTop250Strategy implements CrawlStrategy { |
|||
|
|||
private static final int TOTAL_MOVIES = 250; |
|||
private static final int MOVIES_PER_PAGE = 25; |
|||
|
|||
@Override |
|||
public List<Article> crawl(String url) { |
|||
List<Article> allMovies = new ArrayList<>(); |
|||
try { |
|||
System.out.println("🎬 开始爬取豆瓣电影 Top 250..."); |
|||
System.out.println("⏳ 预计需要爬取 " + (TOTAL_MOVIES / MOVIES_PER_PAGE) + " 页"); |
|||
|
|||
for (int page = 0; page < TOTAL_MOVIES; page += MOVIES_PER_PAGE) { |
|||
String pageUrl = "https://movie.douban.com/top250?start=" + page + "&filter="; |
|||
System.out.println("📄 正在爬取第 " + (page / MOVIES_PER_PAGE + 1) + " 页..."); |
|||
|
|||
List<Article> pageMovies = crawlPage(pageUrl, page / MOVIES_PER_PAGE + 1); |
|||
allMovies.addAll(pageMovies); |
|||
|
|||
System.out.println("✅ 第 " + (page / MOVIES_PER_PAGE + 1) + " 页完成,已获取 " + allMovies.size() + " 部电影"); |
|||
|
|||
try { |
|||
Thread.sleep(1000); |
|||
} catch (InterruptedException e) { |
|||
Thread.currentThread().interrupt(); |
|||
break; |
|||
} |
|||
} |
|||
|
|||
System.out.println("🎉 完成!共爬取 " + allMovies.size() + " 部电影"); |
|||
} catch (Exception e) { |
|||
System.err.println("❌ 爬取失败: " + e.getMessage()); |
|||
Article errorArticle = new Article(); |
|||
errorArticle.setTitle("Error crawling Douban Top 250"); |
|||
errorArticle.setUrl(url); |
|||
errorArticle.setContent("Error details: " + e.getMessage()); |
|||
errorArticle.setSource("douban"); |
|||
allMovies.add(errorArticle); |
|||
} |
|||
return allMovies; |
|||
} |
|||
|
|||
private List<Article> crawlPage(String url, int pageNum) { |
|||
List<Article> movies = new ArrayList<>(); |
|||
try { |
|||
URL urlObj = new URL(url); |
|||
HttpURLConnection connection = (HttpURLConnection) urlObj.openConnection(); |
|||
connection.setRequestMethod("GET"); |
|||
connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"); |
|||
connection.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); |
|||
connection.setConnectTimeout(15000); |
|||
connection.setReadTimeout(15000); |
|||
|
|||
StringBuilder html = new StringBuilder(); |
|||
try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"))) { |
|||
String line; |
|||
while ((line = reader.readLine()) != null) { |
|||
html.append(line).append("\n"); |
|||
} |
|||
} |
|||
|
|||
movies = parseMovies(html.toString()); |
|||
} catch (Exception e) { |
|||
System.err.println("⚠️ 第 " + pageNum + " 页爬取失败: " + e.getMessage()); |
|||
} |
|||
return movies; |
|||
} |
|||
|
|||
private List<Article> parseMovies(String html) { |
|||
List<Article> movies = new ArrayList<>(); |
|||
|
|||
String moviePattern = "<div class=\"item\">[\\s\\S]*?</div>\\s*</div>\\s*</div>"; |
|||
Pattern pattern = Pattern.compile(moviePattern, Pattern.DOTALL); |
|||
Matcher matcher = pattern.matcher(html); |
|||
|
|||
while (matcher.find()) { |
|||
try { |
|||
Article movie = parseSingleMovie(matcher.group()); |
|||
if (movie != null) { |
|||
movies.add(movie); |
|||
} |
|||
} catch (Exception e) { |
|||
continue; |
|||
} |
|||
} |
|||
return movies; |
|||
} |
|||
|
|||
private Article parseSingleMovie(String movieHtml) { |
|||
Article movie = new Article(); |
|||
movie.setSource("douban"); |
|||
|
|||
try { |
|||
Pattern titlePattern = Pattern.compile("<span class=\"title\">(.*?)</span>"); |
|||
Matcher titleMatcher = titlePattern.matcher(movieHtml); |
|||
if (titleMatcher.find()) { |
|||
movie.setTitle(titleMatcher.group(1)); |
|||
} |
|||
|
|||
Pattern linkPattern = Pattern.compile("<a href=\"(.*?)\""); |
|||
Matcher linkMatcher = linkPattern.matcher(movieHtml); |
|||
if (linkMatcher.find()) { |
|||
movie.setUrl(linkMatcher.group(1)); |
|||
} |
|||
|
|||
Pattern ratingPattern = Pattern.compile("<span class=\"rating_num\">(.*?)</span>"); |
|||
Matcher ratingMatcher = ratingPattern.matcher(movieHtml); |
|||
String rating = ""; |
|||
if (ratingMatcher.find()) { |
|||
rating = ratingMatcher.group(1); |
|||
} |
|||
|
|||
Pattern yearPattern = Pattern.compile("(\\d{4})\\s*/"); |
|||
Matcher yearMatcher = yearPattern.matcher(movieHtml); |
|||
String year = ""; |
|||
if (yearMatcher.find()) { |
|||
year = yearMatcher.group(1); |
|||
} |
|||
|
|||
Pattern quotePattern = Pattern.compile("<span class=\"inq\">(.*?)</span>"); |
|||
Matcher quoteMatcher = quotePattern.matcher(movieHtml); |
|||
String quote = ""; |
|||
if (quoteMatcher.find()) { |
|||
quote = quoteMatcher.group(1); |
|||
} |
|||
|
|||
Pattern infoPattern = Pattern.compile("<p class=\"\">(.*?)</p>", Pattern.DOTALL); |
|||
Matcher infoMatcher = infoPattern.matcher(movieHtml); |
|||
String info = ""; |
|||
if (infoMatcher.find()) { |
|||
info = infoMatcher.group(1).replaceAll("<br\\s*/?>", "\n").replaceAll("<[^>]+>", "").trim(); |
|||
} |
|||
|
|||
StringBuilder content = new StringBuilder(); |
|||
content.append("🎬 电影名称: ").append(movie.getTitle()).append("\n"); |
|||
content.append("⭐ 评分: ").append(rating).append("\n"); |
|||
content.append("📅 年份: ").append(year).append("\n"); |
|||
if (!quote.isEmpty()) { |
|||
content.append("💬 简介: ").append(quote).append("\n"); |
|||
} |
|||
content.append("\n📝 详细信息:\n").append(info); |
|||
|
|||
movie.setContent(content.toString()); |
|||
movie.setAuthor("豆瓣电影"); |
|||
|
|||
} catch (Exception e) { |
|||
return null; |
|||
} |
|||
|
|||
return movie; |
|||
} |
|||
|
|||
@Override |
|||
public String getStrategyName() { |
|||
return "douban"; |
|||
} |
|||
} |
|||
@ -0,0 +1,75 @@ |
|||
package com.crawler.strategy; |
|||
|
|||
import java.io.BufferedReader; |
|||
import java.io.InputStreamReader; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.URL; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
import com.crawler.model.Article; |
|||
|
|||
public class JsoupCrawlStrategy implements CrawlStrategy { |
|||
|
|||
@Override |
|||
public List<Article> crawl(String url) { |
|||
List<Article> articles = new ArrayList<>(); |
|||
try { |
|||
URL urlObj = new URL(url); |
|||
HttpURLConnection connection = (HttpURLConnection) urlObj.openConnection(); |
|||
connection.setRequestMethod("GET"); |
|||
connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"); |
|||
connection.setConnectTimeout(10000); |
|||
connection.setReadTimeout(10000); |
|||
|
|||
StringBuilder content = new StringBuilder(); |
|||
try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"))) { |
|||
String line; |
|||
while ((line = reader.readLine()) != null) { |
|||
content.append(line).append("\n"); |
|||
} |
|||
} |
|||
|
|||
Article article = new Article(); |
|||
article.setTitle(extractTitle(content.toString())); |
|||
article.setUrl(url); |
|||
article.setSource(url); |
|||
article.setContent(extractText(content.toString())); |
|||
|
|||
articles.add(article); |
|||
|
|||
} catch (Exception e) { |
|||
Article errorArticle = new Article(); |
|||
errorArticle.setTitle("Error crawling: " + url); |
|||
errorArticle.setUrl(url); |
|||
errorArticle.setContent("Error details: " + e.getMessage()); |
|||
errorArticle.setSource(url); |
|||
articles.add(errorArticle); |
|||
} |
|||
return articles; |
|||
} |
|||
|
|||
private String extractTitle(String html) { |
|||
Pattern pattern = Pattern.compile("<title[^>]*>([^<]+)</title>", Pattern.CASE_INSENSITIVE); |
|||
Matcher matcher = pattern.matcher(html); |
|||
if (matcher.find()) { |
|||
return matcher.group(1).trim(); |
|||
} |
|||
return "Untitled Page"; |
|||
} |
|||
|
|||
private String extractText(String html) { |
|||
return html.replaceAll("<script[^>]*>[\\s\\S]*?</script>", "") |
|||
.replaceAll("<style[^>]*>[\\s\\S]*?</style>", "") |
|||
.replaceAll("<[^>]+>", " ") |
|||
.replaceAll("\\s+", " ") |
|||
.trim(); |
|||
} |
|||
|
|||
@Override |
|||
public String getStrategyName() { |
|||
return "jsoup"; |
|||
} |
|||
} |
|||
@ -0,0 +1,76 @@ |
|||
package com.crawler.strategy; |
|||
|
|||
import java.io.BufferedReader; |
|||
import java.io.InputStreamReader; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.URL; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
import com.crawler.model.Article; |
|||
|
|||
public class NewsCrawlStrategy implements CrawlStrategy { |
|||
|
|||
@Override |
|||
public List<Article> crawl(String url) { |
|||
List<Article> articles = new ArrayList<>(); |
|||
try { |
|||
URL urlObj = new URL(url); |
|||
HttpURLConnection connection = (HttpURLConnection) urlObj.openConnection(); |
|||
connection.setRequestMethod("GET"); |
|||
connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"); |
|||
connection.setConnectTimeout(10000); |
|||
connection.setReadTimeout(10000); |
|||
|
|||
StringBuilder content = new StringBuilder(); |
|||
try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"))) { |
|||
String line; |
|||
while ((line = reader.readLine()) != null) { |
|||
content.append(line).append("\n"); |
|||
} |
|||
} |
|||
|
|||
Article article = new Article(); |
|||
article.setTitle("News: " + extractTitle(content.toString())); |
|||
article.setUrl(url); |
|||
article.setSource("news"); |
|||
article.setContent(extractText(content.toString())); |
|||
article.setAuthor("News Reporter"); |
|||
|
|||
articles.add(article); |
|||
|
|||
} catch (Exception e) { |
|||
Article errorArticle = new Article(); |
|||
errorArticle.setTitle("Error crawling news: " + url); |
|||
errorArticle.setUrl(url); |
|||
errorArticle.setContent("Error details: " + e.getMessage()); |
|||
errorArticle.setSource("news"); |
|||
articles.add(errorArticle); |
|||
} |
|||
return articles; |
|||
} |
|||
|
|||
private String extractTitle(String html) { |
|||
Pattern pattern = Pattern.compile("<title[^>]*>([^<]+)</title>", Pattern.CASE_INSENSITIVE); |
|||
Matcher matcher = pattern.matcher(html); |
|||
if (matcher.find()) { |
|||
return matcher.group(1).trim(); |
|||
} |
|||
return "Untitled News"; |
|||
} |
|||
|
|||
private String extractText(String html) { |
|||
return html.replaceAll("<script[^>]*>[\\s\\S]*?</script>", "") |
|||
.replaceAll("<style[^>]*>[\\s\\S]*?</style>", "") |
|||
.replaceAll("<[^>]+>", " ") |
|||
.replaceAll("\\s+", " ") |
|||
.trim(); |
|||
} |
|||
|
|||
@Override |
|||
public String getStrategyName() { |
|||
return "news"; |
|||
} |
|||
} |
|||
Loading…
Reference in new issue