2 changed files with 104 additions and 0 deletions
@ -0,0 +1,76 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import com.example.datacollect.model.Article; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class HnuNoticeStrategy implements CrawlStrategy { |
|||
|
|||
@Override |
|||
public boolean supports(String url) { |
|||
return url.contains("hnu.edu.cn/tzgg") || url.contains("www.hnu.edu.cn"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Article> parse(String url, Document doc) { |
|||
List<Article> articles = new ArrayList<>(); |
|||
|
|||
Elements listItems = doc.select("div.list-content li, ul.news_list li, div.news-list li, table.list tr"); |
|||
|
|||
for (Element item : listItems) { |
|||
Element link = item.selectFirst("a"); |
|||
if (link == null) continue; |
|||
|
|||
String articleUrl = link.attr("href"); |
|||
if (!articleUrl.startsWith("http")) { |
|||
if (articleUrl.startsWith("/")) { |
|||
articleUrl = "https://www.hnu.edu.cn" + articleUrl; |
|||
} else if (articleUrl.startsWith("../")) { |
|||
articleUrl = "https://www.hnu.edu.cn/" + articleUrl.replace("../", ""); |
|||
} else { |
|||
articleUrl = "https://www.hnu.edu.cn/" + articleUrl; |
|||
} |
|||
} |
|||
|
|||
String title = link.text().trim(); |
|||
|
|||
String date = ""; |
|||
Element dateEl = item.selectFirst("span.date, span.time, font.time, .news-date"); |
|||
if (dateEl != null) { |
|||
date = dateEl.text().trim(); |
|||
} |
|||
|
|||
String summary = ""; |
|||
Element summaryEl = item.selectFirst("p.summary, span.summary, .news-summary"); |
|||
if (summaryEl != null) { |
|||
summary = summaryEl.text().trim(); |
|||
} |
|||
|
|||
String content = "Date: " + date + "\nSummary: " + summary; |
|||
|
|||
if (!title.isEmpty()) { |
|||
articles.add(new Article(title, articleUrl, content)); |
|||
} |
|||
} |
|||
|
|||
if (articles.isEmpty()) { |
|||
Elements items = doc.select("a[href*='info']"); |
|||
for (Element link : items) { |
|||
String articleUrl = link.attr("href"); |
|||
if (!articleUrl.startsWith("http")) { |
|||
articleUrl = "https://www.hnu.edu.cn" + (articleUrl.startsWith("/") ? "" : "/") + articleUrl; |
|||
} |
|||
String title = link.text().trim(); |
|||
if (!title.isEmpty() && articleUrl.contains("/info/")) { |
|||
articles.add(new Article(title, articleUrl, "")); |
|||
} |
|||
} |
|||
} |
|||
|
|||
return articles; |
|||
} |
|||
} |
|||
@ -0,0 +1,28 @@ |
|||
package com.example.datacollect.strategy; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class StrategyFactory { |
|||
private final List<CrawlStrategy> strategies = new ArrayList<>(); |
|||
|
|||
public StrategyFactory() { |
|||
strategies.add(new HnuNewsStrategy()); |
|||
strategies.add(new BlogStrategy()); |
|||
strategies.add(new NewsStrategy()); |
|||
strategies.add(new HnuNoticeStrategy()); |
|||
} |
|||
|
|||
public CrawlStrategy getStrategy(String url) { |
|||
for (CrawlStrategy s : strategies) { |
|||
if (s.supports(url)) { |
|||
return s; |
|||
} |
|||
} |
|||
return null; |
|||
} |
|||
|
|||
public void register(CrawlStrategy strategy) { |
|||
strategies.add(strategy); |
|||
} |
|||
} |
|||
Loading…
Reference in new issue