From f68b9294171049c04ba103a26002caabca30485d Mon Sep 17 00:00:00 2001
From: Mengxinyao
Date: Thu, 4 Jun 2026 22:04:07 +0800
Subject: [PATCH] =?UTF-8?q?feat(W10):W10-=E5=AD=9F=E9=91=AB=E5=9E=9A-20250?=
 =?UTF-8?q?6010204?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 w10/factory/StrategyFactory.java | 27 ++++
 w10/strategy/BlogCrawlStrategy.java | 76 +++++++++++
 w10/strategy/CrawlStrategy.java | 9 ++
 w10/strategy/DoubanTop250Strategy.java | 170 +++++++++++++++++++++++++
 w10/strategy/JsoupCrawlStrategy.java | 75 +++++++++++
 w10/strategy/NewsCrawlStrategy.java | 76 +++++++++++
 6 files changed, 433 insertions(+)
 create mode 100644 w10/factory/StrategyFactory.java
 create mode 100644 w10/strategy/BlogCrawlStrategy.java
 create mode 100644 w10/strategy/CrawlStrategy.java
 create mode 100644 w10/strategy/DoubanTop250Strategy.java
 create mode 100644 w10/strategy/JsoupCrawlStrategy.java
 create mode 100644 w10/strategy/NewsCrawlStrategy.java

diff --git a/w10/factory/StrategyFactory.java b/w10/factory/StrategyFactory.java
new file mode 100644
index 0000000..b8a0310
--- /dev/null
+++ b/w10/factory/StrategyFactory.java
@@ -0,0 +1,61 @@
+package com.crawler.factory;
+
+import com.crawler.strategy.*;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+
+/**
+ * Registry of the available {@link CrawlStrategy} implementations, looked up by name.
+ *
+ * Names are matched case-insensitively; a {@code null} or unregistered name falls back
+ * to the shared "jsoup" strategy.
+ */
+public class StrategyFactory {
+    /** Fallback handed out by {@link #getStrategy(String)} when a name is not registered. */
+    private static final CrawlStrategy DEFAULT_STRATEGY = new JsoupCrawlStrategy();
+
+    /** Strategy instances keyed by lower-case name; filled once in the static initializer. */
+    private static final Map<String, CrawlStrategy> strategies = new HashMap<>();
+
+    static {
+        strategies.put("blog", new BlogCrawlStrategy());
+        strategies.put("news", new NewsCrawlStrategy());
+        strategies.put("jsoup", DEFAULT_STRATEGY);
+        // NOTE(review): DoubanTop250Strategy ("douban") is not registered here - confirm that is intended.
+    }
+
+    /**
+     * Returns the strategy registered under {@code strategyName}.
+     *
+     * @param strategyName strategy name in any letter case; may be {@code null}
+     * @return the matching strategy, or the default "jsoup" strategy when there is no match
+     */
+    public static CrawlStrategy getStrategy(String strategyName) {
+        return strategies.getOrDefault(normalize(strategyName), DEFAULT_STRATEGY);
+    }
+
+    /**
+     * Reports whether a strategy is registered under {@code strategyName}.
+     *
+     * @param strategyName strategy name in any letter case; may be {@code null}
+     * @return {@code true} if the name is registered, {@code false} otherwise (including {@code null})
+     */
+    public static boolean hasStrategy(String strategyName) {
+        return strategies.containsKey(normalize(strategyName));
+    }
+
+    /**
+     * Lists the registered strategy names.
+     *
+     * @return a new array of the registered names, in no particular order
+     */
+    public static String[] getAvailableStrategies() {
+        return strategies.keySet().toArray(new String[0]);
+    }
+
+    /** Lower-cases a lookup key independently of the default locale; {@code null} stays {@code null}. */
+    private static String normalize(String strategyName) {
+        return strategyName == null ? null : strategyName.toLowerCase(Locale.ROOT);
+    }
+}
diff --git
a/w10/strategy/BlogCrawlStrategy.java b/w10/strategy/BlogCrawlStrategy.java
new file mode 100644
index 0000000..f85782a
--- /dev/null
+++ b/w10/strategy/BlogCrawlStrategy.java
@@ -0,0 +1,103 @@
+package com.crawler.strategy;
+
+import com.crawler.model.Article;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Crawl strategy that downloads one page and wraps it in a single {@link Article} tagged "blog".
+ *
+ * A failed download is reported as a placeholder article describing the error, never as an exception.
+ */
+public class BlogCrawlStrategy implements CrawlStrategy {
+
+    /** Captures the text of the first {@code <title>} element. */
+    private static final Pattern TITLE_PATTERN =
+            Pattern.compile("<title[^>]*>([^<]+)</title>", Pattern.CASE_INSENSITIVE);
+    /** Matches a complete {@code <script>} element, body included. */
+    private static final Pattern SCRIPT_PATTERN =
+            Pattern.compile("<script[^>]*>[\\s\\S]*?</script>", Pattern.CASE_INSENSITIVE);
+    /** Matches a complete {@code <style>} element, body included. */
+    private static final Pattern STYLE_PATTERN =
+            Pattern.compile("<style[^>]*>[\\s\\S]*?</style>", Pattern.CASE_INSENSITIVE);
+    /** Matches any remaining markup tag. */
+    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>");
+    /** Matches a run of whitespace. */
+    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
+
+    /**
+     * Downloads {@code url} and converts the page into one article.
+     *
+     * @param url address of the page to fetch
+     * @return a one-element list: the crawled article, or an error placeholder if the fetch failed
+     */
+    @Override
+    public List<Article> crawl(String url) {
+        List<Article> articles = new ArrayList<>();
+        HttpURLConnection connection = null;
+        try {
+            connection = (HttpURLConnection) new URL(url).openConnection();
+            connection.setRequestMethod("GET");
+            connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
+            connection.setConnectTimeout(10000);
+            connection.setReadTimeout(10000);
+
+            StringBuilder content = new StringBuilder();
+            try (BufferedReader reader = new BufferedReader(
+                    new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
+                String line;
+                while ((line = reader.readLine()) != null) {
+                    content.append(line).append("\n");
+                }
+            }
+            String html = content.toString();
+
+            Article article = new Article();
+            article.setTitle("Blog: " + extractTitle(html));
+            article.setUrl(url);
+            article.setSource("blog");
+            article.setContent(extractText(html));
+            article.setAuthor("Blog Author");
+            articles.add(article);
+        } catch (Exception e) {
+            // Deliberate best effort: surface the failure as an article instead of throwing.
+            Article errorArticle = new Article();
+            errorArticle.setTitle("Error crawling blog: " + url);
+            errorArticle.setUrl(url);
+            errorArticle.setContent("Error details: " + e.getMessage());
+            errorArticle.setSource("blog");
+            articles.add(errorArticle);
+        } finally {
+            if (connection != null) {
+                connection.disconnect(); // release the socket even when reading failed midway
+            }
+        }
+        return articles;
+    }
+
+    /** Returns the trimmed page title, or "Untitled Blog" when no non-empty title element is found. */
+    private String extractTitle(String html) {
+        Matcher matcher = TITLE_PATTERN.matcher(html);
+        return matcher.find() ? matcher.group(1).trim() : "Untitled Blog";
+    }
+
+    /** Removes script/style elements and all tags, then collapses whitespace to single spaces. */
+    private String extractText(String html) {
+        String text = SCRIPT_PATTERN.matcher(html).replaceAll("");
+        text = STYLE_PATTERN.matcher(text).replaceAll("");
+        text = TAG_PATTERN.matcher(text).replaceAll(" ");
+        return WHITESPACE_PATTERN.matcher(text).replaceAll(" ").trim();
+    }
+
+    @Override
+    public String getStrategyName() {
+        return "blog";
+    }
+}
diff --git a/w10/strategy/CrawlStrategy.java b/w10/strategy/CrawlStrategy.java
new file mode 100644
index 0000000..1aa1e59
--- /dev/null
+++ b/w10/strategy/CrawlStrategy.java
@@ -0,0 +1,25 @@
+package com.crawler.strategy;
+
+import com.crawler.model.Article;
+import java.util.List;
+
+/**
+ * A way of turning a URL into a list of {@link Article}s.
+ */
+public interface CrawlStrategy {
+    /**
+     * Crawls the given address.
+     *
+     * @param url address to crawl
+     * @return the articles produced for {@code url}
+     * @throws Exception if the implementation chooses to propagate a crawl failure
+     */
+    List<Article> crawl(String url) throws Exception;
+
+    /**
+     * Returns the short name identifying this strategy (for example "blog").
+     *
+     * @return the strategy name
+     */
+    String getStrategyName();
+}
diff --git a/w10/strategy/DoubanTop250Strategy.java b/w10/strategy/DoubanTop250Strategy.java
new file mode 100644
index 0000000..8446a9c
--- /dev/null
+++ b/w10/strategy/DoubanTop250Strategy.java
@@ -0,0 +1,170 @@
+package com.crawler.strategy;
+
+import com.crawler.model.Article;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class DoubanTop250Strategy implements CrawlStrategy {
+
+    private static final int TOTAL_MOVIES = 250;
+    private static final int MOVIES_PER_PAGE = 25;
+
+    @Override
+    public List
crawl(String url) { + List
allMovies = new ArrayList<>(); + try { + System.out.println("🎬 开始爬取豆瓣电影 Top 250..."); + System.out.println("⏳ 预计需要爬取 " + (TOTAL_MOVIES / MOVIES_PER_PAGE) + " 页"); + + for (int page = 0; page < TOTAL_MOVIES; page += MOVIES_PER_PAGE) { + String pageUrl = "https://movie.douban.com/top250?start=" + page + "&filter="; + System.out.println("📄 正在爬取第 " + (page / MOVIES_PER_PAGE + 1) + " 页..."); + + List
pageMovies = crawlPage(pageUrl, page / MOVIES_PER_PAGE + 1); + allMovies.addAll(pageMovies); + + System.out.println("✅ 第 " + (page / MOVIES_PER_PAGE + 1) + " 页完成,已获取 " + allMovies.size() + " 部电影"); + + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + break; + } + } + + System.out.println("🎉 完成!共爬取 " + allMovies.size() + " 部电影"); + } catch (Exception e) { + System.err.println("❌ 爬取失败: " + e.getMessage()); + Article errorArticle = new Article(); + errorArticle.setTitle("Error crawling Douban Top 250"); + errorArticle.setUrl(url); + errorArticle.setContent("Error details: " + e.getMessage()); + errorArticle.setSource("douban"); + allMovies.add(errorArticle); + } + return allMovies; + } + + private List
crawlPage(String url, int pageNum) { + List
movies = new ArrayList<>(); + try { + URL urlObj = new URL(url); + HttpURLConnection connection = (HttpURLConnection) urlObj.openConnection(); + connection.setRequestMethod("GET"); + connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"); + connection.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); + connection.setConnectTimeout(15000); + connection.setReadTimeout(15000); + + StringBuilder html = new StringBuilder(); + try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"))) { + String line; + while ((line = reader.readLine()) != null) { + html.append(line).append("\n"); + } + } + + movies = parseMovies(html.toString()); + } catch (Exception e) { + System.err.println("⚠️ 第 " + pageNum + " 页爬取失败: " + e.getMessage()); + } + return movies; + } + + private List
parseMovies(String html) { + List
movies = new ArrayList<>(); + + String moviePattern = "
[\\s\\S]*?
\\s*\\s*"; + Pattern pattern = Pattern.compile(moviePattern, Pattern.DOTALL); + Matcher matcher = pattern.matcher(html); + + while (matcher.find()) { + try { + Article movie = parseSingleMovie(matcher.group()); + if (movie != null) { + movies.add(movie); + } + } catch (Exception e) { + continue; + } + } + return movies; + } + + private Article parseSingleMovie(String movieHtml) { + Article movie = new Article(); + movie.setSource("douban"); + + try { + Pattern titlePattern = Pattern.compile("(.*?)"); + Matcher titleMatcher = titlePattern.matcher(movieHtml); + if (titleMatcher.find()) { + movie.setTitle(titleMatcher.group(1)); + } + + Pattern linkPattern = Pattern.compile("(.*?)"); + Matcher ratingMatcher = ratingPattern.matcher(movieHtml); + String rating = ""; + if (ratingMatcher.find()) { + rating = ratingMatcher.group(1); + } + + Pattern yearPattern = Pattern.compile("(\\d{4})\\s*/"); + Matcher yearMatcher = yearPattern.matcher(movieHtml); + String year = ""; + if (yearMatcher.find()) { + year = yearMatcher.group(1); + } + + Pattern quotePattern = Pattern.compile("(.*?)"); + Matcher quoteMatcher = quotePattern.matcher(movieHtml); + String quote = ""; + if (quoteMatcher.find()) { + quote = quoteMatcher.group(1); + } + + Pattern infoPattern = Pattern.compile("

(.*?)

", Pattern.DOTALL); + Matcher infoMatcher = infoPattern.matcher(movieHtml); + String info = ""; + if (infoMatcher.find()) { + info = infoMatcher.group(1).replaceAll("", "\n").replaceAll("<[^>]+>", "").trim(); + } + + StringBuilder content = new StringBuilder(); + content.append("🎬 电影名称: ").append(movie.getTitle()).append("\n"); + content.append("⭐ 评分: ").append(rating).append("\n"); + content.append("📅 年份: ").append(year).append("\n"); + if (!quote.isEmpty()) { + content.append("💬 简介: ").append(quote).append("\n"); + } + content.append("\n📝 详细信息:\n").append(info); + + movie.setContent(content.toString()); + movie.setAuthor("豆瓣电影"); + + } catch (Exception e) { + return null; + } + + return movie; + } + + @Override + public String getStrategyName() { + return "douban"; + } +} \ No newline at end of file diff --git a/w10/strategy/JsoupCrawlStrategy.java b/w10/strategy/JsoupCrawlStrategy.java new file mode 100644 index 0000000..e02fe25 --- /dev/null +++ b/w10/strategy/JsoupCrawlStrategy.java @@ -0,0 +1,75 @@ +package com.crawler.strategy; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.net.HttpURLConnection; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import com.crawler.model.Article; + +public class JsoupCrawlStrategy implements CrawlStrategy { + + @Override + public List
crawl(String url) {
+        // Fetches the page and returns it as a one-element list; a failed fetch yields a
+        // placeholder article describing the error instead of an exception.
+        List<Article> articles = new ArrayList<>();
+        HttpURLConnection connection = null;
+        try {
+            connection = (HttpURLConnection) new URL(url).openConnection();
+            connection.setRequestMethod("GET");
+            connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
+            connection.setConnectTimeout(10000);
+            connection.setReadTimeout(10000);
+
+            StringBuilder content = new StringBuilder();
+            try (BufferedReader reader = new BufferedReader(
+                    new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
+                String line;
+                while ((line = reader.readLine()) != null) {
+                    content.append(line).append("\n");
+                }
+            }
+            String html = content.toString();
+
+            Article article = new Article();
+            article.setTitle(extractTitle(html));
+            article.setUrl(url);
+            article.setSource(url);
+            article.setContent(extractText(html));
+            articles.add(article);
+        } catch (Exception e) {
+            // Deliberate best effort: surface the failure as an article instead of throwing.
+            Article errorArticle = new Article();
+            errorArticle.setTitle("Error crawling: " + url);
+            errorArticle.setUrl(url);
+            errorArticle.setContent("Error details: " + e.getMessage());
+            errorArticle.setSource(url);
+            articles.add(errorArticle);
+        } finally {
+            if (connection != null) {
+                connection.disconnect(); // release the socket even when reading failed midway
+            }
+        }
+        return articles;
+    }
+
+    /** Captures the text of the first {@code <title>} element. */
+    private static final Pattern TITLE_PATTERN =
+            Pattern.compile("<title[^>]*>([^<]+)</title>", Pattern.CASE_INSENSITIVE);
+    /** Matches a complete {@code <script>} element, body included. */
+    private static final Pattern SCRIPT_PATTERN =
+            Pattern.compile("<script[^>]*>[\\s\\S]*?</script>", Pattern.CASE_INSENSITIVE);
+    /** Matches a complete {@code <style>} element, body included. */
+    private static final Pattern STYLE_PATTERN =
+            Pattern.compile("<style[^>]*>[\\s\\S]*?</style>", Pattern.CASE_INSENSITIVE);
+    /** Matches any remaining markup tag. */
+    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>");
+    /** Matches a run of whitespace. */
+    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
+
+    /** Returns the trimmed page title, or "Untitled Page" when no non-empty title element is found. */
+    private String extractTitle(String html) {
+        Matcher matcher = TITLE_PATTERN.matcher(html);
+        return matcher.find() ? matcher.group(1).trim() : "Untitled Page";
+    }
+
+    /** Removes script/style elements and all tags, then collapses whitespace to single spaces. */
+    private String extractText(String html) {
+        String text = SCRIPT_PATTERN.matcher(html).replaceAll("");
+        text = STYLE_PATTERN.matcher(text).replaceAll("");
+        text = TAG_PATTERN.matcher(text).replaceAll(" ");
+        return WHITESPACE_PATTERN.matcher(text).replaceAll(" ").trim();
+    }
+
+    @Override
+    public String getStrategyName() {
+        return "jsoup";
+    }
+}
diff --git a/w10/strategy/NewsCrawlStrategy.java b/w10/strategy/NewsCrawlStrategy.java
new file mode 100644
index 0000000..82a5450
--- /dev/null
+++ b/w10/strategy/NewsCrawlStrategy.java
@@ -0,0 +1,103 @@
+package com.crawler.strategy;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.crawler.model.Article;
+
+/**
+ * Crawl strategy that downloads one page and wraps it in a single {@link Article} tagged "news".
+ *
+ * A failed download is reported as a placeholder article describing the error, never as an exception.
+ */
+public class NewsCrawlStrategy implements CrawlStrategy {
+
+    /** Captures the text of the first {@code <title>} element. */
+    private static final Pattern TITLE_PATTERN =
+            Pattern.compile("<title[^>]*>([^<]+)</title>", Pattern.CASE_INSENSITIVE);
+    /** Matches a complete {@code <script>} element, body included. */
+    private static final Pattern SCRIPT_PATTERN =
+            Pattern.compile("<script[^>]*>[\\s\\S]*?</script>", Pattern.CASE_INSENSITIVE);
+    /** Matches a complete {@code <style>} element, body included. */
+    private static final Pattern STYLE_PATTERN =
+            Pattern.compile("<style[^>]*>[\\s\\S]*?</style>", Pattern.CASE_INSENSITIVE);
+    /** Matches any remaining markup tag. */
+    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>");
+    /** Matches a run of whitespace. */
+    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
+
+    /**
+     * Downloads {@code url} and converts the page into one article.
+     *
+     * @param url address of the page to fetch
+     * @return a one-element list: the crawled article, or an error placeholder if the fetch failed
+     */
+    @Override
+    public List<Article> crawl(String url) {
+        List<Article> articles = new ArrayList<>();
+        HttpURLConnection connection = null;
+        try {
+            connection = (HttpURLConnection) new URL(url).openConnection();
+            connection.setRequestMethod("GET");
+            connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
+            connection.setConnectTimeout(10000);
+            connection.setReadTimeout(10000);
+
+            StringBuilder content = new StringBuilder();
+            try (BufferedReader reader = new BufferedReader(
+                    new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
+                String line;
+                while ((line = reader.readLine()) != null) {
+                    content.append(line).append("\n");
+                }
+            }
+            String html = content.toString();
+
+            Article article = new Article();
+            article.setTitle("News: " + extractTitle(html));
+            article.setUrl(url);
+            article.setSource("news");
+            article.setContent(extractText(html));
+            article.setAuthor("News Reporter");
+            articles.add(article);
+        } catch (Exception e) {
+            // Deliberate best effort: surface the failure as an article instead of throwing.
+            Article errorArticle = new Article();
+            errorArticle.setTitle("Error crawling news: " + url);
+            errorArticle.setUrl(url);
+            errorArticle.setContent("Error details: " + e.getMessage());
+            errorArticle.setSource("news");
+            articles.add(errorArticle);
+        } finally {
+            if (connection != null) {
+                connection.disconnect(); // release the socket even when reading failed midway
+            }
+        }
+        return articles;
+    }
+
+    /** Returns the trimmed page title, or "Untitled News" when no non-empty title element is found. */
+    private String extractTitle(String html) {
+        Matcher matcher = TITLE_PATTERN.matcher(html);
+        return matcher.find() ? matcher.group(1).trim() : "Untitled News";
+    }
+
+    /** Removes script/style elements and all tags, then collapses whitespace to single spaces. */
+    private String extractText(String html) {
+        String text = SCRIPT_PATTERN.matcher(html).replaceAll("");
+        text = STYLE_PATTERN.matcher(text).replaceAll("");
+        text = TAG_PATTERN.matcher(text).replaceAll(" ");
+        return WHITESPACE_PATTERN.matcher(text).replaceAll(" ").trim();
+    }
+
+    @Override
+    public String getStrategyName() {
+        return "news";
+    }
+}