5 changed files with 187 additions and 0 deletions
@ -0,0 +1,33 @@ |
|||
package com.example.moviecli.strategy; |
|||
|
|||
import com.example.moviecli.model.Movie; |
|||
import com.example.moviecli.exception.ParseFailedException; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class DoubanBookStrategy implements MovieCrawlStrategy { |
|||
@Override |
|||
public boolean supports(String url) { |
|||
return url.contains("book.douban.com/top250"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Movie> parse(Document doc) throws ParseFailedException { |
|||
try { |
|||
List<Movie> books = new ArrayList<>(); |
|||
Elements items = doc.select(".item"); |
|||
int rank = 1; |
|||
for (Element item : items) { |
|||
String title = item.select(".pl2 a").text().trim(); |
|||
String score = item.select(".rating_nums").text(); |
|||
books.add(new Movie(rank++, title, "", score, "图书", "豆瓣图书")); |
|||
} |
|||
return books; |
|||
} catch (Exception e) { |
|||
throw new ParseFailedException("豆瓣图书解析失败", e); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,59 @@ |
|||
package com.example.moviecli.strategy; |
|||
|
|||
import com.example.moviecli.model.Movie; |
|||
import com.example.moviecli.exception.ParseFailedException; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class DoubanTop250Strategy implements MovieCrawlStrategy { |
|||
@Override |
|||
public boolean supports(String url) { |
|||
return url.contains("movie.douban.com/top250"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Movie> parse(Document doc) throws ParseFailedException { |
|||
try { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
Elements items = doc.select(".item"); |
|||
for (Element item : items) { |
|||
String rankText = item.select(".pic em").text(); |
|||
int rank = Integer.parseInt(rankText); |
|||
String title = item.select(".title").first().text(); |
|||
String originalTitle = ""; |
|||
Elements titles = item.select(".title"); |
|||
if (titles.size() > 1) { |
|||
originalTitle = titles.get(1).text().replace("/", "").trim(); |
|||
} |
|||
String score = item.select(".rating_num").text(); |
|||
String info = item.select(".bd p").first().text(); |
|||
String year = extractYear(info); |
|||
String director = extractDirector(info); |
|||
movies.add(new Movie(rank, title, originalTitle, score, year, director)); |
|||
} |
|||
return movies; |
|||
} catch (Exception e) { |
|||
throw new ParseFailedException("豆瓣电影解析失败", e); |
|||
} |
|||
} |
|||
|
|||
private String extractYear(String info) { |
|||
for (String part : info.split(" ")) { |
|||
if (part.matches("\\d{4}")) return part; |
|||
} |
|||
return "未知"; |
|||
} |
|||
|
|||
private String extractDirector(String info) { |
|||
if (info.contains("导演:")) { |
|||
int start = info.indexOf("导演:") + 3; |
|||
int end = info.indexOf(" ", start); |
|||
if (end == -1) end = info.length(); |
|||
return info.substring(start, end).trim(); |
|||
} |
|||
return "未知"; |
|||
} |
|||
} |
|||
@ -0,0 +1,11 @@ |
|||
package com.example.moviecli.strategy; |
|||
|
|||
import com.example.moviecli.model.Movie; |
|||
import com.example.moviecli.exception.ParseFailedException; |
|||
import org.jsoup.nodes.Document; |
|||
import java.util.List; |
|||
|
|||
public interface MovieCrawlStrategy { |
|||
boolean supports(String url); |
|||
List<Movie> parse(Document doc) throws ParseFailedException; |
|||
} |
|||
@ -0,0 +1,25 @@ |
|||
package com.example.moviecli.strategy; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class MovieStrategyFactory { |
|||
private final List<MovieCrawlStrategy> strategies = new ArrayList<>(); |
|||
|
|||
public MovieStrategyFactory() { |
|||
strategies.add(new DoubanTop250Strategy()); |
|||
strategies.add(new SinaNewsStrategy()); // 新增
|
|||
strategies.add(new DoubanBookStrategy()); |
|||
} |
|||
|
|||
public MovieCrawlStrategy getStrategy(String url) { |
|||
for (MovieCrawlStrategy s : strategies) { |
|||
if (s.supports(url)) return s; |
|||
} |
|||
return null; |
|||
} |
|||
|
|||
public void register(MovieCrawlStrategy strategy) { |
|||
strategies.add(strategy); |
|||
} |
|||
} |
|||
@ -0,0 +1,59 @@ |
|||
package com.example.moviecli.strategy; |
|||
|
|||
import com.example.moviecli.model.Movie; |
|||
import com.example.moviecli.exception.ParseFailedException; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import java.util.ArrayList; |
|||
import java.util.LinkedHashSet; |
|||
import java.util.List; |
|||
import java.util.Set; |
|||
|
|||
public class SinaNewsStrategy implements MovieCrawlStrategy { |
|||
@Override |
|||
public boolean supports(String url) { |
|||
return url.contains("news.sina.com.cn"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Movie> parse(Document doc) throws ParseFailedException { |
|||
try { |
|||
List<Movie> newsList = new ArrayList<>(); |
|||
Set<String> titleSet = new LinkedHashSet<>(); |
|||
|
|||
// 抓取所有 a 标签,过滤出标题较长的(通常新闻标题长度 > 8)
|
|||
Elements allLinks = doc.select("a"); |
|||
for (Element link : allLinks) { |
|||
String title = link.text().trim(); |
|||
// 过滤短文本、纯数字、纯符号、常见非标题文字
|
|||
if (title.length() > 8 && !title.matches("^[\\d\\s]+$") |
|||
&& !title.contains("评论") && !title.contains("举报")) { |
|||
// 进一步过滤:通常新闻标题不会太短且不会包含过多标点
|
|||
titleSet.add(title); |
|||
} |
|||
if (titleSet.size() >= 120) break; // 最多抓取120条
|
|||
} |
|||
|
|||
// 如果数量不够 30,再尝试抓取特定区域
|
|||
if (titleSet.size() < 30) { |
|||
Elements newsItems = doc.select(".news-item, .blk, .main-content a"); |
|||
for (Element item : newsItems) { |
|||
String title = item.text().trim(); |
|||
if (title.length() > 8 && !titleSet.contains(title)) { |
|||
titleSet.add(title); |
|||
} |
|||
if (titleSet.size() >= 120) break; |
|||
} |
|||
} |
|||
|
|||
int rank = 1; |
|||
for (String title : titleSet) { |
|||
newsList.add(new Movie(rank++, title, "", "新闻", "新浪", "")); |
|||
} |
|||
return newsList; |
|||
} catch (Exception e) { |
|||
throw new ParseFailedException("新浪新闻解析失败", e); |
|||
} |
|||
} |
|||
} |
|||
Loading…
Reference in new issue