2 changed files with 104 additions and 0 deletions
@ -0,0 +1,76 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import com.example.datacollect.model.Article; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class HnuNoticeStrategy implements CrawlStrategy { |
||||
|
|
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
return url.contains("hnu.edu.cn/tzgg") || url.contains("www.hnu.edu.cn"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Article> parse(String url, Document doc) { |
||||
|
List<Article> articles = new ArrayList<>(); |
||||
|
|
||||
|
Elements listItems = doc.select("div.list-content li, ul.news_list li, div.news-list li, table.list tr"); |
||||
|
|
||||
|
for (Element item : listItems) { |
||||
|
Element link = item.selectFirst("a"); |
||||
|
if (link == null) continue; |
||||
|
|
||||
|
String articleUrl = link.attr("href"); |
||||
|
if (!articleUrl.startsWith("http")) { |
||||
|
if (articleUrl.startsWith("/")) { |
||||
|
articleUrl = "https://www.hnu.edu.cn" + articleUrl; |
||||
|
} else if (articleUrl.startsWith("../")) { |
||||
|
articleUrl = "https://www.hnu.edu.cn/" + articleUrl.replace("../", ""); |
||||
|
} else { |
||||
|
articleUrl = "https://www.hnu.edu.cn/" + articleUrl; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
String title = link.text().trim(); |
||||
|
|
||||
|
String date = ""; |
||||
|
Element dateEl = item.selectFirst("span.date, span.time, font.time, .news-date"); |
||||
|
if (dateEl != null) { |
||||
|
date = dateEl.text().trim(); |
||||
|
} |
||||
|
|
||||
|
String summary = ""; |
||||
|
Element summaryEl = item.selectFirst("p.summary, span.summary, .news-summary"); |
||||
|
if (summaryEl != null) { |
||||
|
summary = summaryEl.text().trim(); |
||||
|
} |
||||
|
|
||||
|
String content = "Date: " + date + "\nSummary: " + summary; |
||||
|
|
||||
|
if (!title.isEmpty()) { |
||||
|
articles.add(new Article(title, articleUrl, content)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (articles.isEmpty()) { |
||||
|
Elements items = doc.select("a[href*='info']"); |
||||
|
for (Element link : items) { |
||||
|
String articleUrl = link.attr("href"); |
||||
|
if (!articleUrl.startsWith("http")) { |
||||
|
articleUrl = "https://www.hnu.edu.cn" + (articleUrl.startsWith("/") ? "" : "/") + articleUrl; |
||||
|
} |
||||
|
String title = link.text().trim(); |
||||
|
if (!title.isEmpty() && articleUrl.contains("/info/")) { |
||||
|
articles.add(new Article(title, articleUrl, "")); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return articles; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,28 @@ |
|||||
|
package com.example.datacollect.strategy; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class StrategyFactory { |
||||
|
private final List<CrawlStrategy> strategies = new ArrayList<>(); |
||||
|
|
||||
|
public StrategyFactory() { |
||||
|
strategies.add(new HnuNewsStrategy()); |
||||
|
strategies.add(new BlogStrategy()); |
||||
|
strategies.add(new NewsStrategy()); |
||||
|
strategies.add(new HnuNoticeStrategy()); |
||||
|
} |
||||
|
|
||||
|
public CrawlStrategy getStrategy(String url) { |
||||
|
for (CrawlStrategy s : strategies) { |
||||
|
if (s.supports(url)) { |
||||
|
return s; |
||||
|
} |
||||
|
} |
||||
|
return null; |
||||
|
} |
||||
|
|
||||
|
public void register(CrawlStrategy strategy) { |
||||
|
strategies.add(strategy); |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue