Browse Source

feat(W10):W10-孟鑫垚-202506010204

main
Mengxinyao 2 weeks ago
parent
commit
f68b929417
  1. 27
      w10/factory/StrategyFactory.java
  2. 76
      w10/strategy/BlogCrawlStrategy.java
  3. 9
      w10/strategy/CrawlStrategy.java
  4. 170
      w10/strategy/DoubanTop250Strategy.java
  5. 75
      w10/strategy/JsoupCrawlStrategy.java
  6. 76
      w10/strategy/NewsCrawlStrategy.java

27
w10/factory/StrategyFactory.java

@ -0,0 +1,27 @@
package com.crawler.factory;
import com.crawler.strategy.*;
import java.util.HashMap;
import java.util.Map;
/**
 * Registry of the {@link CrawlStrategy} implementations, keyed by lower-case name.
 *
 * <p>Lookups are case-insensitive and null-safe. Each strategy is instantiated once in the
 * static initializer and the same instance is handed out on every lookup; the strategies
 * registered here declare no instance fields, so sharing them is safe.
 */
public class StrategyFactory {
    /** Key of the strategy returned when the requested name is null or unknown. */
    private static final String DEFAULT_STRATEGY = "jsoup";

    private static final Map<String, CrawlStrategy> strategies = new HashMap<>();

    static {
        strategies.put("blog", new BlogCrawlStrategy());
        strategies.put("news", new NewsCrawlStrategy());
        // Registered under the name reported by DoubanTop250Strategy#getStrategyName().
        strategies.put("douban", new DoubanTop250Strategy());
        strategies.put(DEFAULT_STRATEGY, new JsoupCrawlStrategy());
    }

    /**
     * Looks up a strategy by name, ignoring case.
     *
     * @param strategyName the requested strategy name; may be {@code null}
     * @return the matching strategy, or the default ("jsoup") strategy when the name is
     *         {@code null} or not registered; never {@code null}
     */
    public static CrawlStrategy getStrategy(String strategyName) {
        CrawlStrategy fallback = strategies.get(DEFAULT_STRATEGY);
        if (strategyName == null) {
            return fallback;
        }
        return strategies.getOrDefault(normalize(strategyName), fallback);
    }

    /**
     * Tells whether a strategy is registered under the given name, ignoring case.
     *
     * @param strategyName the name to test; may be {@code null}
     * @return {@code true} if a strategy is registered under that name, {@code false}
     *         otherwise (including for {@code null})
     */
    public static boolean hasStrategy(String strategyName) {
        return strategyName != null && strategies.containsKey(normalize(strategyName));
    }

    /**
     * Lists the registered strategy names.
     *
     * @return a new array holding the registered keys; the order is that of the backing
     *         {@link HashMap} and is therefore unspecified
     */
    public static String[] getAvailableStrategies() {
        return strategies.keySet().toArray(new String[0]);
    }

    /** Lower-cases a name with a fixed locale so lookups do not depend on the JVM default locale. */
    private static String normalize(String name) {
        return name.toLowerCase(java.util.Locale.ROOT);
    }
}

76
w10/strategy/BlogCrawlStrategy.java

@ -0,0 +1,76 @@
package com.crawler.strategy;
import com.crawler.model.Article;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Crawl strategy that downloads one page over HTTP and returns it as a single
 * {@link Article} whose source is {@code "blog"}.
 *
 * <p>Failures never propagate to the caller: any exception raised while fetching or
 * parsing is converted into a placeholder article that carries the error message.
 */
public class BlogCrawlStrategy implements CrawlStrategy {
    private static final int TIMEOUT_MILLIS = 10000;
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

    // Compiled once; Pattern instances are immutable and safe to share.
    private static final Pattern TITLE =
            Pattern.compile("<title[^>]*>([^<]+)</title>", Pattern.CASE_INSENSITIVE);
    // Case-insensitive so that upper-case <SCRIPT>/<STYLE> blocks are removed as well.
    private static final Pattern SCRIPT_BLOCK =
            Pattern.compile("<script[^>]*>[\\s\\S]*?</script>", Pattern.CASE_INSENSITIVE);
    private static final Pattern STYLE_BLOCK =
            Pattern.compile("<style[^>]*>[\\s\\S]*?</style>", Pattern.CASE_INSENSITIVE);
    private static final Pattern ANY_TAG = Pattern.compile("<[^>]+>");
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

    /**
     * Fetches {@code url} and wraps the page in one article.
     *
     * @param url the address of the page to download
     * @return a list with exactly one element: the crawled article, or an error article
     *         if the download or parsing failed
     */
    @Override
    public List<Article> crawl(String url) {
        List<Article> articles = new ArrayList<>();
        try {
            String html = fetchHtml(url);
            Article article = new Article();
            article.setTitle("Blog: " + extractTitle(html));
            article.setUrl(url);
            article.setSource("blog");
            article.setContent(extractText(html));
            article.setAuthor("Blog Author");
            articles.add(article);
        } catch (Exception e) {
            // Deliberately broad: this is the strategy boundary and callers expect an
            // error article instead of an exception (bad URL, non-HTTP scheme, I/O failure).
            Article errorArticle = new Article();
            errorArticle.setTitle("Error crawling blog: " + url);
            errorArticle.setUrl(url);
            errorArticle.setContent("Error details: " + e.getMessage());
            errorArticle.setSource("blog");
            articles.add(errorArticle);
        }
        return articles;
    }

    /**
     * Downloads the response body of a GET request as UTF-8 text, one {@code '\n'} per line.
     * The connection is always disconnected afterwards, including when the request fails.
     */
    private String fetchHtml(String url) throws java.io.IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        try {
            connection.setRequestMethod("GET");
            connection.setRequestProperty("User-Agent", USER_AGENT);
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            StringBuilder content = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    content.append(line).append("\n");
                }
            }
            return content.toString();
        } finally {
            connection.disconnect();
        }
    }

    /** Returns the trimmed text of the first {@code <title>} element, or "Untitled Blog". */
    private String extractTitle(String html) {
        Matcher matcher = TITLE.matcher(html);
        if (matcher.find()) {
            return matcher.group(1).trim();
        }
        return "Untitled Blog";
    }

    /** Drops script and style blocks, replaces remaining tags with spaces and collapses whitespace. */
    private String extractText(String html) {
        String text = SCRIPT_BLOCK.matcher(html).replaceAll("");
        text = STYLE_BLOCK.matcher(text).replaceAll("");
        text = ANY_TAG.matcher(text).replaceAll(" ");
        text = WHITESPACE_RUN.matcher(text).replaceAll(" ");
        return text.trim();
    }

    /** @return the strategy name, {@code "blog"} */
    @Override
    public String getStrategyName() {
        return "blog";
    }
}

9
w10/strategy/CrawlStrategy.java

@ -0,0 +1,9 @@
package com.crawler.strategy;
import com.crawler.model.Article;
import java.util.List;
/**
 * A pluggable way of turning a URL into a list of {@link Article} objects.
 */
public interface CrawlStrategy {
/**
 * Crawls the given address.
 *
 * @param url the address to crawl
 * @return the articles produced by the crawl
 * @throws Exception if the implementation chooses to propagate a failure
 */
List<Article> crawl(String url) throws Exception;
/**
 * @return the name identifying this strategy
 */
String getStrategyName();
}

170
w10/strategy/DoubanTop250Strategy.java

@ -0,0 +1,170 @@
package com.crawler.strategy;
import com.crawler.model.Article;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Crawl strategy for the Douban movie Top 250 list.
 *
 * <p>The list is fetched page by page from a fixed address
 * ({@code https://movie.douban.com/top250?start=N&filter=}); the {@code url} argument of
 * {@link #crawl(String)} is only recorded on the error article. Every movie entry becomes
 * one {@link Article} whose source is {@code "douban"}. Progress is printed to standard
 * output and failures to standard error.
 */
public class DoubanTop250Strategy implements CrawlStrategy {
    private static final int TOTAL_MOVIES = 250;
    private static final int MOVIES_PER_PAGE = 25;
    private static final int TIMEOUT_MILLIS = 15000;
    /** Pause between two page requests, to avoid hammering the server. */
    private static final long PAGE_DELAY_MILLIS = 1000L;

    // Compiled once; Pattern instances are immutable and safe to share.
    private static final Pattern ITEM = Pattern.compile(
            "<div class=\"item\">[\\s\\S]*?</div>\\s*</div>\\s*</div>", Pattern.DOTALL);
    private static final Pattern TITLE = Pattern.compile("<span class=\"title\">(.*?)</span>");
    private static final Pattern LINK = Pattern.compile("<a href=\"(.*?)\"");
    private static final Pattern RATING =
            Pattern.compile("<span class=\"rating_num\">(.*?)</span>");
    private static final Pattern QUOTE = Pattern.compile("<span class=\"inq\">(.*?)</span>");
    private static final Pattern INFO = Pattern.compile("<p class=\"\">(.*?)</p>", Pattern.DOTALL);
    // A run of exactly four digits, not embedded in a longer number.
    private static final Pattern YEAR = Pattern.compile("(?<!\\d)(\\d{4})(?!\\d)");

    /**
     * Crawls every page of the Top 250 list.
     *
     * <p>A page that fails to download is reported and skipped; the crawl continues with
     * the next page. If the thread is interrupted while waiting between pages, the
     * interrupt flag is restored and the movies collected so far are returned.
     *
     * @param url ignored for fetching; only stored on the error article
     * @return the movies collected, plus an error article if an unexpected exception
     *         aborted the crawl
     */
    @Override
    public List<Article> crawl(String url) {
        List<Article> allMovies = new ArrayList<>();
        try {
            System.out.println("🎬 开始爬取豆瓣电影 Top 250...");
            System.out.println("⏳ 预计需要爬取 " + (TOTAL_MOVIES / MOVIES_PER_PAGE) + " 页");
            for (int start = 0; start < TOTAL_MOVIES; start += MOVIES_PER_PAGE) {
                int pageNum = start / MOVIES_PER_PAGE + 1;
                String pageUrl = "https://movie.douban.com/top250?start=" + start + "&filter=";
                System.out.println("📄 正在爬取第 " + pageNum + " 页...");
                allMovies.addAll(crawlPage(pageUrl, pageNum));
                System.out.println("✅ 第 " + pageNum + " 页完成,已获取 " + allMovies.size() + " 部电影");
                if (start + MOVIES_PER_PAGE >= TOTAL_MOVIES) {
                    // Last page: nothing left to throttle.
                    break;
                }
                try {
                    Thread.sleep(PAGE_DELAY_MILLIS);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    break;
                }
            }
            System.out.println("🎉 完成!共爬取 " + allMovies.size() + " 部电影");
        } catch (Exception e) {
            System.err.println("❌ 爬取失败: " + e.getMessage());
            Article errorArticle = new Article();
            errorArticle.setTitle("Error crawling Douban Top 250");
            errorArticle.setUrl(url);
            errorArticle.setContent("Error details: " + e.getMessage());
            errorArticle.setSource("douban");
            allMovies.add(errorArticle);
        }
        return allMovies;
    }

    /**
     * Downloads one list page and parses the movies on it.
     *
     * @param url     the page address
     * @param pageNum the 1-based page number, used only in the failure message
     * @return the movies found on the page; empty if the page could not be fetched
     */
    private List<Article> crawlPage(String url, int pageNum) {
        try {
            return parseMovies(fetchHtml(url));
        } catch (Exception e) {
            // Best effort: one broken page must not abort the whole crawl.
            System.err.println("⚠️ 第 " + pageNum + " 页爬取失败: " + e.getMessage());
            return new ArrayList<>();
        }
    }

    /**
     * Downloads the response body of a GET request as UTF-8 text, one {@code '\n'} per line.
     * The connection is always disconnected afterwards, including when the request fails.
     */
    private String fetchHtml(String url) throws java.io.IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        try {
            connection.setRequestMethod("GET");
            connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
            connection.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            StringBuilder html = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    html.append(line).append("\n");
                }
            }
            return html.toString();
        } finally {
            connection.disconnect();
        }
    }

    /** Splits a list page into {@code <div class="item">} fragments and parses each one. */
    private List<Article> parseMovies(String html) {
        List<Article> movies = new ArrayList<>();
        Matcher matcher = ITEM.matcher(html);
        while (matcher.find()) {
            try {
                movies.add(parseSingleMovie(matcher.group()));
            } catch (RuntimeException e) {
                // Skip the entry but leave a trace instead of dropping it silently.
                System.err.println("⚠️ 跳过无法解析的电影条目: " + e.getMessage());
            }
        }
        return movies;
    }

    /**
     * Builds an article from the HTML fragment of one movie entry.
     *
     * <p>Fields that cannot be found are left empty. The year is searched in the text of
     * the info paragraph only: searching the whole fragment would also hit digits inside
     * the link address extracted by {@code LINK}, where a number can be directly followed
     * by a slash.
     */
    private Article parseSingleMovie(String movieHtml) {
        Article movie = new Article();
        movie.setSource("douban");

        String title = firstGroup(TITLE, movieHtml);
        if (title != null) {
            movie.setTitle(title);
        }
        String link = firstGroup(LINK, movieHtml);
        if (link != null) {
            movie.setUrl(link);
        }
        String rating = orEmpty(firstGroup(RATING, movieHtml));
        String quote = orEmpty(firstGroup(QUOTE, movieHtml));

        String info = "";
        String rawInfo = firstGroup(INFO, movieHtml);
        if (rawInfo != null) {
            info = rawInfo.replaceAll("<br\\s*/?>", "\n")
                    .replaceAll("<[^>]+>", "")
                    // The content is plain text, so turn non-breaking-space entities into spaces.
                    .replace("&nbsp;", " ")
                    .trim();
        }
        String year = orEmpty(firstGroup(YEAR, info));

        StringBuilder content = new StringBuilder();
        content.append("🎬 电影名称: ").append(orEmpty(title)).append("\n");
        content.append("⭐ 评分: ").append(rating).append("\n");
        content.append("📅 年份: ").append(year).append("\n");
        if (!quote.isEmpty()) {
            content.append("💬 简介: ").append(quote).append("\n");
        }
        content.append("\n📝 详细信息:\n").append(info);
        movie.setContent(content.toString());
        movie.setAuthor("豆瓣电影");
        return movie;
    }

    /** Returns capture group 1 of the first match of {@code pattern} in {@code text}, or {@code null}. */
    private static String firstGroup(Pattern pattern, String text) {
        Matcher matcher = pattern.matcher(text);
        return matcher.find() ? matcher.group(1) : null;
    }

    /** Maps {@code null} to the empty string. */
    private static String orEmpty(String value) {
        return value == null ? "" : value;
    }

    /** @return the strategy name, {@code "douban"} */
    @Override
    public String getStrategyName() {
        return "douban";
    }
}

75
w10/strategy/JsoupCrawlStrategy.java

@ -0,0 +1,75 @@
package com.crawler.strategy;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.crawler.model.Article;
/**
 * General-purpose crawl strategy: downloads one page over HTTP and returns it as a single
 * {@link Article} whose source is the page address itself.
 *
 * <p>Despite its name, this class fetches with {@link HttpURLConnection} and extracts text
 * with regular expressions. Failures never propagate to the caller: any exception raised
 * while fetching or parsing is converted into a placeholder article that carries the
 * error message.
 */
public class JsoupCrawlStrategy implements CrawlStrategy {
    private static final int TIMEOUT_MILLIS = 10000;
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

    // Compiled once; Pattern instances are immutable and safe to share.
    private static final Pattern TITLE =
            Pattern.compile("<title[^>]*>([^<]+)</title>", Pattern.CASE_INSENSITIVE);
    // Case-insensitive so that upper-case <SCRIPT>/<STYLE> blocks are removed as well.
    private static final Pattern SCRIPT_BLOCK =
            Pattern.compile("<script[^>]*>[\\s\\S]*?</script>", Pattern.CASE_INSENSITIVE);
    private static final Pattern STYLE_BLOCK =
            Pattern.compile("<style[^>]*>[\\s\\S]*?</style>", Pattern.CASE_INSENSITIVE);
    private static final Pattern ANY_TAG = Pattern.compile("<[^>]+>");
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

    /**
     * Fetches {@code url} and wraps the page in one article.
     *
     * @param url the address of the page to download
     * @return a list with exactly one element: the crawled article, or an error article
     *         if the download or parsing failed
     */
    @Override
    public List<Article> crawl(String url) {
        List<Article> articles = new ArrayList<>();
        try {
            String html = fetchHtml(url);
            Article article = new Article();
            article.setTitle(extractTitle(html));
            article.setUrl(url);
            article.setSource(url);
            article.setContent(extractText(html));
            articles.add(article);
        } catch (Exception e) {
            // Deliberately broad: this is the strategy boundary and callers expect an
            // error article instead of an exception (bad URL, non-HTTP scheme, I/O failure).
            Article errorArticle = new Article();
            errorArticle.setTitle("Error crawling: " + url);
            errorArticle.setUrl(url);
            errorArticle.setContent("Error details: " + e.getMessage());
            errorArticle.setSource(url);
            articles.add(errorArticle);
        }
        return articles;
    }

    /**
     * Downloads the response body of a GET request as UTF-8 text, one {@code '\n'} per line.
     * The connection is always disconnected afterwards, including when the request fails.
     */
    private String fetchHtml(String url) throws java.io.IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        try {
            connection.setRequestMethod("GET");
            connection.setRequestProperty("User-Agent", USER_AGENT);
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            StringBuilder content = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    content.append(line).append("\n");
                }
            }
            return content.toString();
        } finally {
            connection.disconnect();
        }
    }

    /** Returns the trimmed text of the first {@code <title>} element, or "Untitled Page". */
    private String extractTitle(String html) {
        Matcher matcher = TITLE.matcher(html);
        if (matcher.find()) {
            return matcher.group(1).trim();
        }
        return "Untitled Page";
    }

    /** Drops script and style blocks, replaces remaining tags with spaces and collapses whitespace. */
    private String extractText(String html) {
        String text = SCRIPT_BLOCK.matcher(html).replaceAll("");
        text = STYLE_BLOCK.matcher(text).replaceAll("");
        text = ANY_TAG.matcher(text).replaceAll(" ");
        text = WHITESPACE_RUN.matcher(text).replaceAll(" ");
        return text.trim();
    }

    /** @return the strategy name, {@code "jsoup"} */
    @Override
    public String getStrategyName() {
        return "jsoup";
    }
}

76
w10/strategy/NewsCrawlStrategy.java

@ -0,0 +1,76 @@
package com.crawler.strategy;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.crawler.model.Article;
/**
 * Crawl strategy that downloads one page over HTTP and returns it as a single
 * {@link Article} whose source is {@code "news"}.
 *
 * <p>Failures never propagate to the caller: any exception raised while fetching or
 * parsing is converted into a placeholder article that carries the error message.
 */
public class NewsCrawlStrategy implements CrawlStrategy {
    private static final int TIMEOUT_MILLIS = 10000;
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";

    // Compiled once; Pattern instances are immutable and safe to share.
    private static final Pattern TITLE =
            Pattern.compile("<title[^>]*>([^<]+)</title>", Pattern.CASE_INSENSITIVE);
    // Case-insensitive so that upper-case <SCRIPT>/<STYLE> blocks are removed as well.
    private static final Pattern SCRIPT_BLOCK =
            Pattern.compile("<script[^>]*>[\\s\\S]*?</script>", Pattern.CASE_INSENSITIVE);
    private static final Pattern STYLE_BLOCK =
            Pattern.compile("<style[^>]*>[\\s\\S]*?</style>", Pattern.CASE_INSENSITIVE);
    private static final Pattern ANY_TAG = Pattern.compile("<[^>]+>");
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

    /**
     * Fetches {@code url} and wraps the page in one article.
     *
     * @param url the address of the page to download
     * @return a list with exactly one element: the crawled article, or an error article
     *         if the download or parsing failed
     */
    @Override
    public List<Article> crawl(String url) {
        List<Article> articles = new ArrayList<>();
        try {
            String html = fetchHtml(url);
            Article article = new Article();
            article.setTitle("News: " + extractTitle(html));
            article.setUrl(url);
            article.setSource("news");
            article.setContent(extractText(html));
            article.setAuthor("News Reporter");
            articles.add(article);
        } catch (Exception e) {
            // Deliberately broad: this is the strategy boundary and callers expect an
            // error article instead of an exception (bad URL, non-HTTP scheme, I/O failure).
            Article errorArticle = new Article();
            errorArticle.setTitle("Error crawling news: " + url);
            errorArticle.setUrl(url);
            errorArticle.setContent("Error details: " + e.getMessage());
            errorArticle.setSource("news");
            articles.add(errorArticle);
        }
        return articles;
    }

    /**
     * Downloads the response body of a GET request as UTF-8 text, one {@code '\n'} per line.
     * The connection is always disconnected afterwards, including when the request fails.
     */
    private String fetchHtml(String url) throws java.io.IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        try {
            connection.setRequestMethod("GET");
            connection.setRequestProperty("User-Agent", USER_AGENT);
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            StringBuilder content = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    content.append(line).append("\n");
                }
            }
            return content.toString();
        } finally {
            connection.disconnect();
        }
    }

    /** Returns the trimmed text of the first {@code <title>} element, or "Untitled News". */
    private String extractTitle(String html) {
        Matcher matcher = TITLE.matcher(html);
        if (matcher.find()) {
            return matcher.group(1).trim();
        }
        return "Untitled News";
    }

    /** Drops script and style blocks, replaces remaining tags with spaces and collapses whitespace. */
    private String extractText(String html) {
        String text = SCRIPT_BLOCK.matcher(html).replaceAll("");
        text = STYLE_BLOCK.matcher(text).replaceAll("");
        text = ANY_TAG.matcher(text).replaceAll(" ");
        text = WHITESPACE_RUN.matcher(text).replaceAll(" ");
        return text.trim();
    }

    /** @return the strategy name, {@code "news"} */
    @Override
    public String getStrategyName() {
        return "news";
    }
}
Loading…
Cancel
Save