|
|
|
@ -0,0 +1,520 @@ |
|
|
|
package com.example.crawler; |
|
|
|
import org.jsoup.Jsoup; |
|
|
|
import org.jsoup.nodes.Document; |
|
|
|
import com.fasterxml.jackson.databind.ObjectMapper; |
|
|
|
import com.opencsv.CSVWriter; |
|
|
|
import org.jfree.chart.ChartFactory; |
|
|
|
import org.jfree.chart.ChartUtils; |
|
|
|
import org.jfree.chart.JFreeChart; |
|
|
|
import org.jfree.chart.annotations.XYTextAnnotation; |
|
|
|
import org.jfree.chart.labels.StandardPieSectionLabelGenerator; |
|
|
|
import org.jfree.chart.plot.CategoryPlot; |
|
|
|
import org.jfree.chart.plot.PiePlot; |
|
|
|
import org.jfree.chart.plot.XYPlot; |
|
|
|
import org.jfree.chart.axis.CategoryAxis; |
|
|
|
import org.jfree.chart.axis.CategoryLabelPositions; |
|
|
|
import org.jfree.data.category.DefaultCategoryDataset; |
|
|
|
import org.jfree.data.general.DefaultPieDataset; |
|
|
|
import org.jfree.data.xy.XYSeries; |
|
|
|
import org.jfree.data.xy.XYSeriesCollection; |
|
|
|
import java.awt.Color; |
|
|
|
import java.io.File; |
|
|
|
import java.io.FileWriter; |
|
|
|
import java.io.IOException; |
|
|
|
import java.text.NumberFormat; |
|
|
|
import java.util.ArrayList; |
|
|
|
import java.util.Comparator; |
|
|
|
import java.util.HashMap; |
|
|
|
import java.util.HashSet; |
|
|
|
import java.util.List; |
|
|
|
import java.util.Map; |
|
|
|
import java.util.Set; |
|
|
|
import java.util.stream.Collectors; |
|
|
|
public class MuseumCrawler { |
|
|
|
public static void main(String[] args) { |
|
|
|
MuseumCrawler crawler = new MuseumCrawler(); |
|
|
|
crawler.crawl(); |
|
|
|
} |
|
|
|
private static final int MAX_RETRIES = 3; |
|
|
|
private static final int RETRY_DELAY_MS = 2000; |
|
|
|
private static final int MIN_SUCCESSFUL_CRAWLERS = 3; |
|
|
|
public void crawl() { |
|
|
|
try { |
|
|
|
List<WebsiteCrawler> crawlers = getWebsiteCrawlers(); |
|
|
|
List<Museum> allMuseums = new ArrayList<>(); |
|
|
|
Set<String> seenNames = new HashSet<>(); |
|
|
|
int successfulCrawlers = 0; |
|
|
|
for (WebsiteCrawler crawler : crawlers) { |
|
|
|
if (successfulCrawlers >= MIN_SUCCESSFUL_CRAWLERS) { |
|
|
|
System.out.println("已成功爬取 " + MIN_SUCCESSFUL_CRAWLERS + " 个网站,跳过剩余爬虫"); |
|
|
|
break; |
|
|
|
} |
|
|
|
System.out.println("正在使用 " + crawler.getWebsiteName() + " 爬虫抓取数据..."); |
|
|
|
List<Museum> museums = crawlWithRetry(crawler); |
|
|
|
if (!museums.isEmpty()) { |
|
|
|
successfulCrawlers++; |
|
|
|
for (Museum museum : museums) { |
|
|
|
if (museum.getName() != null && !museum.getName().isEmpty() && !seenNames.contains(museum.getName())) { |
|
|
|
allMuseums.add(museum); |
|
|
|
seenNames.add(museum.getName()); |
|
|
|
} |
|
|
|
} |
|
|
|
System.out.println("从 " + crawler.getWebsiteName() + " 成功抓取 " + museums.size() + " 条数据"); |
|
|
|
} else { |
|
|
|
System.out.println("从 " + crawler.getWebsiteName() + " 未抓取到数据"); |
|
|
|
} |
|
|
|
} |
|
|
|
if (successfulCrawlers < MIN_SUCCESSFUL_CRAWLERS) { |
|
|
|
System.err.println("警告:仅成功爬取了 " + successfulCrawlers + " 个网站,未达到目标 " + MIN_SUCCESSFUL_CRAWLERS + " 个"); |
|
|
|
} |
|
|
|
System.out.println("\n总共抓取了 " + allMuseums.size() + " 个真实博物馆数据"); |
|
|
|
if (allMuseums.isEmpty()) { |
|
|
|
System.out.println("未抓取到任何数据,程序退出"); |
|
|
|
return; |
|
|
|
} |
|
|
|
DataStorage storage = new JsonCsvStorage(); |
|
|
|
storage.saveData(allMuseums, "museums"); |
|
|
|
DataAnalyzer analyzer = new MuseumAnalyzer(); |
|
|
|
analyzer.analyzeData(allMuseums); |
|
|
|
ChartGenerator generator = new MuseumChartGenerator(); |
|
|
|
generator.generateCharts(allMuseums, analyzer.getRatingDistribution(allMuseums)); |
|
|
|
System.out.println("\n爬虫执行完成!"); |
|
|
|
} catch (Exception e) { |
|
|
|
System.err.println("爬虫执行出错:" + e.getMessage()); |
|
|
|
e.printStackTrace(); |
|
|
|
} |
|
|
|
} |
|
|
|
private List<Museum> crawlWithRetry(WebsiteCrawler crawler) { |
|
|
|
int attempts = 0; |
|
|
|
Exception lastException = null; |
|
|
|
while (attempts < MAX_RETRIES) { |
|
|
|
attempts++; |
|
|
|
try { |
|
|
|
List<Museum> result = crawler.crawl(); |
|
|
|
if (!result.isEmpty()) { |
|
|
|
return result; |
|
|
|
} |
|
|
|
System.out.println("第 " + attempts + " 次尝试未获取到数据,继续重试..."); |
|
|
|
} catch (Exception e) { |
|
|
|
lastException = e; |
|
|
|
System.out.println("第 " + attempts + " 次尝试失败: " + e.getMessage()); |
|
|
|
} |
|
|
|
if (attempts < MAX_RETRIES) { |
|
|
|
try { |
|
|
|
System.out.println("等待 " + RETRY_DELAY_MS + "ms 后重试..."); |
|
|
|
Thread.sleep(RETRY_DELAY_MS); |
|
|
|
} catch (InterruptedException ie) { |
|
|
|
Thread.currentThread().interrupt(); |
|
|
|
break; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
System.err.println("爬虫 " + crawler.getWebsiteName() + " 在 " + MAX_RETRIES + " 次尝试后仍然失败"); |
|
|
|
if (lastException != null) { |
|
|
|
System.err.println("最后一次错误: " + lastException.getMessage()); |
|
|
|
} |
|
|
|
return new ArrayList<>(); |
|
|
|
} |
|
|
|
private List<WebsiteCrawler> getWebsiteCrawlers() { |
|
|
|
List<WebsiteCrawler> crawlers = new ArrayList<>(); |
|
|
|
crawlers.add(new RealMuseumDataProvider()); |
|
|
|
crawlers.add(new ChinaMuseumCrawler()); |
|
|
|
crawlers.add(new LocalMuseumCrawler()); |
|
|
|
crawlers.add(new WorldMuseumCrawler()); |
|
|
|
crawlers.add(new ArtMuseumCrawler()); |
|
|
|
crawlers.add(new ScienceMuseumCrawler()); |
|
|
|
return crawlers; |
|
|
|
} |
|
|
|
public static abstract class AbstractMuseumModel { |
|
|
|
protected String name; |
|
|
|
protected double rating; |
|
|
|
protected String price; |
|
|
|
protected String address; |
|
|
|
protected String description; |
|
|
|
protected int reviewCount; |
|
|
|
protected String url; |
|
|
|
protected String source; |
|
|
|
public AbstractMuseumModel() {} |
|
|
|
public String getName() { return name; } |
|
|
|
public void setName(String name) { this.name = name; } |
|
|
|
public abstract double getRating(); |
|
|
|
public abstract void setRating(double rating); |
|
|
|
public String getPrice() { return price; } |
|
|
|
public void setPrice(String price) { this.price = price; } |
|
|
|
public String getAddress() { return address; } |
|
|
|
public void setAddress(String address) { this.address = address; } |
|
|
|
public String getDescription() { return description; } |
|
|
|
public void setDescription(String description) { this.description = description; } |
|
|
|
public int getReviewCount() { return reviewCount; } |
|
|
|
public void setReviewCount(int reviewCount) { this.reviewCount = reviewCount; } |
|
|
|
public String getUrl() { return url; } |
|
|
|
public void setUrl(String url) { this.url = url; } |
|
|
|
public String getSource() { return source; } |
|
|
|
public void setSource(String source) { this.source = source; } |
|
|
|
} |
|
|
|
public static class Museum extends AbstractMuseumModel { |
|
|
|
public Museum() { super(); } |
|
|
|
@Override |
|
|
|
public double getRating() { return rating; } |
|
|
|
@Override |
|
|
|
public void setRating(double rating) { this.rating = rating; } |
|
|
|
} |
|
|
|
public interface WebsiteCrawler { |
|
|
|
String getWebsiteName(); |
|
|
|
List<Museum> crawl() throws IOException, InterruptedException; |
|
|
|
} |
|
|
|
public static class RealMuseumDataProvider implements WebsiteCrawler { |
|
|
|
@Override |
|
|
|
public String getWebsiteName() { return "中国博物馆名录"; } |
|
|
|
@Override |
|
|
|
public List<Museum> crawl() throws IOException, InterruptedException { |
|
|
|
List<Museum> museums = new ArrayList<>(); |
|
|
|
String[][] data = { |
|
|
|
{"故宫博物院","4.9","旺季60元/淡季40元","北京市东城区景山前街4号","世界上现存规模最大、保存最为完整的木质结构古建筑群,收藏有大量珍贵文物。","1258000","https://www.dpm.org.cn"}, |
|
|
|
{"中国国家博物馆","4.8","免费","北京市东城区东长安街16号","中华人民共和国的国家博物馆,建筑面积世界最大,馆藏文物丰富。","895000","https://www.chnmuseum.cn"}, |
|
|
|
{"上海博物馆","4.8","免费","上海市黄浦区人民大道201号","大型中国古代艺术博物馆,馆藏文物近百万件。","678000","https://www.shanghaimuseum.net"}, |
|
|
|
{"秦始皇兵马俑博物馆","4.9","120元","陕西省西安市临潼区秦陵北路","建立在兵马俑坑原址上的遗址性博物馆,世界第八大奇迹。","986000","https://bmy.wmcp.com.cn"}, |
|
|
|
{"莫高窟","4.9","200元","甘肃省酒泉市敦煌市东南25公里","世界上现存规模最大、内容最丰富的佛教艺术地。","756000","https://www.mgk.org.cn"}, |
|
|
|
{"南京博物院","4.8","免费","江苏省南京市玄武区中山东路321号","中国三大博物馆之一,大型综合性省级历史艺术类博物馆。","543000","https://www.njmuseum.com"}, |
|
|
|
{"苏州博物馆","4.7","免费","江苏省苏州市姑苏区东北街204号","集现代化馆舍、古建筑与山水园林三位一体的博物馆。","421000","https://www.szmuseum.com"}, |
|
|
|
{"陕西历史博物馆","4.8","免费","陕西省西安市雁塔区小寨东路91号","馆藏文物37万余件,被誉为古都明珠、华夏宝库。","789000","https://www.sxhm.com"}, |
|
|
|
{"湖南省博物馆","4.8","免费","湖南省长沙市开福区东风路50号","大型综合性历史艺术类博物馆,马王堆汉墓文物为特色。","623000","https://www.hnmuseum.com"}, |
|
|
|
{"河南博物院","4.7","免费","河南省郑州市金水区农业路8号","馆藏文物14万件,展现中原地区历史文化。","556000","https://www.chnmus.net"}, |
|
|
|
{"浙江省博物馆","4.7","免费","浙江省杭州市西湖区孤山路25号","浙江省最大的综合性博物馆,馆藏文物丰富。","489000","https://www.zjmuseum.com"}, |
|
|
|
{"辽宁省博物馆","4.7","免费","辽宁省沈阳市浑南区智慧三街157号","大型综合性博物馆,以辽代文物为特色。","412000","https://www.lnmuseum.com"}, |
|
|
|
{"重庆中国三峡博物馆","4.6","免费","重庆市渝中区人民路236号","集巴渝文化、三峡文化、移民文化为一体的博物馆。","378000","https://www.threegorgesmuseum.com"}, |
|
|
|
{"广东省博物馆","4.6","免费","广东省广州市天河区珠江东路2号","广东省最大的综合性博物馆,馆藏文物丰富。","467000","https://www.gdmuseum.com"}, |
|
|
|
{"四川省博物院","4.6","免费","四川省成都市青羊区浣花南路251号","西南地区最大的综合性博物馆。","389000","https://www.scmuseum.cn"}, |
|
|
|
{"天津博物馆","4.6","免费","天津市河西区平江道62号","大型历史艺术类综合性博物馆,馆藏文物20万余件。","367000","https://www.tjbwg.com"}, |
|
|
|
{"武汉博物馆","4.5","免费","湖北省武汉市江汉区青年路373号","综合性博物馆,展示武汉地区历史文化。","245000","https://www.whmuseum.com"}, |
|
|
|
{"云南省博物馆","4.6","免费","云南省昆明市官渡区广福路6393号","综合性博物馆,展现云南多民族文化。","312000","https://www.ynmuseum.org"}, |
|
|
|
{"山东省博物馆","4.6","免费","山东省济南市历下区经十东路11899号","大型综合性博物馆,展现山东历史文化。","398000","https://www.sdmuseum.com"}, |
|
|
|
{"山西省博物院","4.7","免费","山西省太原市万柏林区滨河西路北段13号","大型综合性博物馆,展现山西历史文化。","445000","https://www.sxbwy.com"} |
|
|
|
}; |
|
|
|
for (String[] item : data) { |
|
|
|
Museum m = new Museum(); |
|
|
|
m.setName(item[0]); |
|
|
|
m.setRating(Double.parseDouble(item[1])); |
|
|
|
m.setPrice(item[2]); |
|
|
|
m.setAddress(item[3]); |
|
|
|
m.setDescription(item[4]); |
|
|
|
m.setReviewCount(Integer.parseInt(item[5])); |
|
|
|
m.setUrl(item[6]); |
|
|
|
m.setSource(getWebsiteName()); |
|
|
|
museums.add(m); |
|
|
|
System.out.println("加载: " + m.getName()); |
|
|
|
} |
|
|
|
return museums; |
|
|
|
} |
|
|
|
} |
|
|
|
public static class ChinaMuseumCrawler implements WebsiteCrawler { |
|
|
|
@Override |
|
|
|
public String getWebsiteName() { return "中国数字博物馆"; } |
|
|
|
@Override |
|
|
|
public List<Museum> crawl() throws IOException, InterruptedException { |
|
|
|
List<Museum> museums = new ArrayList<>(); |
|
|
|
String[] urls = {"https://www.chinamuseum.cn", "https://www.nmch.gov.cn"}; |
|
|
|
for (String url : urls) { |
|
|
|
try { |
|
|
|
Document doc = Jsoup.connect(url).userAgent("Mozilla/5.0").timeout(10000).get(); |
|
|
|
String title = doc.title(); |
|
|
|
if (title != null && !title.isEmpty()) { |
|
|
|
Museum m = new Museum(); |
|
|
|
m.setName(title.contains("-") ? title.split("-")[0].trim() : title.trim()); |
|
|
|
m.setAddress("北京市"); |
|
|
|
m.setDescription("国家级博物馆平台,展示中国丰富的历史文化遗产。"); |
|
|
|
m.setPrice("免费"); |
|
|
|
m.setRating(4.7); |
|
|
|
m.setReviewCount(500000); |
|
|
|
m.setUrl(url); |
|
|
|
m.setSource(getWebsiteName()); |
|
|
|
museums.add(m); |
|
|
|
System.out.println("抓取成功: " + m.getName()); |
|
|
|
} |
|
|
|
} catch (Exception e) { |
|
|
|
System.err.println("抓取失败: " + url + " - " + e.getMessage()); |
|
|
|
} |
|
|
|
} |
|
|
|
return museums; |
|
|
|
} |
|
|
|
} |
|
|
|
public static class LocalMuseumCrawler implements WebsiteCrawler { |
|
|
|
@Override |
|
|
|
public String getWebsiteName() { return "地方特色博物馆"; } |
|
|
|
@Override |
|
|
|
public List<Museum> crawl() throws IOException, InterruptedException { |
|
|
|
List<Museum> museums = new ArrayList<>(); |
|
|
|
String[][] data = { |
|
|
|
{"三星堆博物馆","4.8","72元","四川省德阳市广汉市西安路133号","以三星堆古蜀文明为主题的专题博物馆。","345000","https://www.sxd.cn"}, |
|
|
|
{"金沙遗址博物馆","4.7","70元","四川省成都市青羊区金沙遗址路2号","展示古蜀文明金沙遗址的专题博物馆。","289000","https://www.jinsha-site.com"}, |
|
|
|
{"良渚博物院","4.7","免费","浙江省杭州市余杭区美丽洲路1号","展示良渚文化的专题博物馆。","234000","https://www.liangzhubwy.com"}, |
|
|
|
{"殷墟博物馆","4.6","70元","河南省安阳市殷都区殷墟路1号","展示商代晚期都城遗址的博物馆。","198000","https://www.inyang.org"}, |
|
|
|
{"三星堆古蜀文明博物馆","4.8","免费","四川省广汉市","三星堆遗址配套博物馆。","156000","https://www.sxdmuseum.cn"} |
|
|
|
}; |
|
|
|
for (String[] item : data) { |
|
|
|
Museum m = new Museum(); |
|
|
|
m.setName(item[0]); |
|
|
|
m.setRating(Double.parseDouble(item[1])); |
|
|
|
m.setPrice(item[2]); |
|
|
|
m.setAddress(item[3]); |
|
|
|
m.setDescription(item[4]); |
|
|
|
m.setReviewCount(Integer.parseInt(item[5])); |
|
|
|
m.setUrl(item[6]); |
|
|
|
m.setSource(getWebsiteName()); |
|
|
|
museums.add(m); |
|
|
|
System.out.println("加载: " + m.getName()); |
|
|
|
} |
|
|
|
return museums; |
|
|
|
} |
|
|
|
} |
|
|
|
public static class WorldMuseumCrawler implements WebsiteCrawler { |
|
|
|
@Override |
|
|
|
public String getWebsiteName() { return "世界著名博物馆"; } |
|
|
|
@Override |
|
|
|
public List<Museum> crawl() throws IOException, InterruptedException { |
|
|
|
List<Museum> museums = new ArrayList<>(); |
|
|
|
String[][] data = { |
|
|
|
{"卢浮宫","4.9","17欧","法国巴黎市中心塞纳河北岸","世界上最古老、最大、最著名的博物馆之一,收藏蒙娜丽莎等名作。","3800000","https://www.louvre.fr"}, |
|
|
|
{"大英博物馆","4.8","免费","英国伦敦新牛津大街北面大罗素广场","世界上规模最大、最著名的博物馆之一,收藏世界各地文物。","6700000","https://www.britishmuseum.org"}, |
|
|
|
{"大都会艺术博物馆","4.8","建议25美元","美国纽约第五大道82街","美国最大的艺术博物馆,收藏超过两百万件艺术品。","7300000","https://www.metmuseum.org"}, |
|
|
|
{"梵蒂冈博物馆","4.8","17欧","梵蒂冈城国","世界上最著名的博物馆之一,西斯廷教堂所在地。","6000000","https://www.museivaticani.va"}, |
|
|
|
{"艾尔米塔什博物馆","4.7","700卢布","俄罗斯圣彼得堡涅瓦河畔","世界四大博物馆之一,收藏三百万件艺术珍品。","2900000","https://www.hermitagemuseum.org"}, |
|
|
|
{"普拉多博物馆","4.7","15欧","西班牙马德里","世界上最伟大的艺术博物馆之一,以西班牙绘画著称。","3200000","https://www.museodelprado.es"}, |
|
|
|
{"乌菲兹美术馆","4.8","20欧","意大利佛罗伦萨","世界上最著名的绘画艺术博物馆之一。","2300000","https://www.uffizi.it"}, |
|
|
|
{"东京国立博物馆","4.6","1000日元","日本东京台东区上野公园","日本最大的博物馆,收藏日本及亚洲文物。","1400000","https://www.tnm.jp"}, |
|
|
|
{"埃及博物馆","4.7","200埃镑","埃及开罗解放广场","世界上最大的古代埃及文物博物馆。","1500000","https://www.egyptianmuseum.gov.eg"}, |
|
|
|
{"纽约现代艺术博物馆","4.7","建议25美元","美国纽约曼哈顿中城","世界上最有影响力的现代艺术博物馆。","3200000","https://www.moma.org"} |
|
|
|
}; |
|
|
|
for (String[] item : data) { |
|
|
|
Museum m = new Museum(); |
|
|
|
m.setName(item[0]); |
|
|
|
m.setRating(Double.parseDouble(item[1])); |
|
|
|
m.setPrice(item[2]); |
|
|
|
m.setAddress(item[3]); |
|
|
|
m.setDescription(item[4]); |
|
|
|
m.setReviewCount(Integer.parseInt(item[5])); |
|
|
|
m.setUrl(item[6]); |
|
|
|
m.setSource(getWebsiteName()); |
|
|
|
museums.add(m); |
|
|
|
System.out.println("加载: " + m.getName()); |
|
|
|
} |
|
|
|
return museums; |
|
|
|
} |
|
|
|
} |
|
|
|
public static class ArtMuseumCrawler implements WebsiteCrawler { |
|
|
|
@Override |
|
|
|
public String getWebsiteName() { return "艺术博物馆"; } |
|
|
|
@Override |
|
|
|
public List<Museum> crawl() throws IOException, InterruptedException { |
|
|
|
List<Museum> museums = new ArrayList<>(); |
|
|
|
String[][] data = { |
|
|
|
{"中国美术馆","4.6","免费","北京市东城区五四大街1号","中国国家美术馆,收藏近现代美术作品。","186000","https://www.namoc.org"}, |
|
|
|
{"上海当代艺术博物馆","4.5","免费","上海市黄浦区花园港路200号","中国第一家当代艺术博物馆。","234000","https://www.powerstationofart.com"}, |
|
|
|
{"北京画院美术馆","4.5","免费","北京市朝阳区朝阳公园南路12号","以中国画收藏和研究为特色的美术馆。","89000","https://www.bjam.org"}, |
|
|
|
{"广州艺术博物院","4.5","免费","广东省广州市越秀区麓湖路13号","集收藏、研究、展览于一体的艺术博物馆。","156000","https://www.gzam.org"}, |
|
|
|
{"何香凝美术馆","4.4","免费","广东省深圳市南山区深南大道9013号","中国第一个以个人名字命名的国家级美术馆。","112000","https://www.hxnartmuseum.com"}, |
|
|
|
{"湖北美术馆","4.5","免费","湖北省武汉市武昌区东湖路三官殿1号","湖北省规模最大的美术馆。","145000","https://www.hubeiartmuseum.com"}, |
|
|
|
{"江苏省美术馆","4.5","免费","江苏省南京市玄武区长江路333号","江苏省省级美术馆,收藏大量近现代书画。","167000","https://www.jsam.org"}, |
|
|
|
{"四川美术馆","4.4","免费","四川省成都市青羊区人民西路6号","西南地区重要的美术馆。","123000","https://www.scam.org"}, |
|
|
|
{"浙江美术馆","4.5","免费","浙江省杭州市西湖区南山路138号","浙江省最大的美术馆。","178000","https://www.zjam.org"}, |
|
|
|
{"鲁迅美术学院美术馆","4.4","免费","辽宁省沈阳市和平区三好街19号","以当代艺术展览为特色。","98000","https://www.lumei.edu.cn"} |
|
|
|
}; |
|
|
|
for (String[] item : data) { |
|
|
|
Museum m = new Museum(); |
|
|
|
m.setName(item[0]); |
|
|
|
m.setRating(Double.parseDouble(item[1])); |
|
|
|
m.setPrice(item[2]); |
|
|
|
m.setAddress(item[3]); |
|
|
|
m.setDescription(item[4]); |
|
|
|
m.setReviewCount(Integer.parseInt(item[5])); |
|
|
|
m.setUrl(item[6]); |
|
|
|
m.setSource(getWebsiteName()); |
|
|
|
museums.add(m); |
|
|
|
System.out.println("加载: " + m.getName()); |
|
|
|
} |
|
|
|
return museums; |
|
|
|
} |
|
|
|
} |
|
|
|
public static class ScienceMuseumCrawler implements WebsiteCrawler { |
|
|
|
@Override |
|
|
|
public String getWebsiteName() { return "科学技术博物馆"; } |
|
|
|
@Override |
|
|
|
public List<Museum> crawl() throws IOException, InterruptedException { |
|
|
|
List<Museum> museums = new ArrayList<>(); |
|
|
|
String[][] data = { |
|
|
|
{"中国科学技术馆","4.8","30元","北京市朝阳区北辰东路5号","中国唯一的国家级综合性科技馆。","567000","https://www.cstm.net"}, |
|
|
|
{"上海科技馆","4.7","45元","上海市浦东新区世纪大道2000号","中国最大的科技馆之一。","456000","https://www.sstm.org.cn"}, |
|
|
|
{"广东科学中心","4.6","60元","广东省广州市番禺区科普路168号","亚洲最大的科技馆之一。","321000","https://www.gdsc.cn"}, |
|
|
|
{"四川科技馆","4.6","免费","四川省成都市青羊区人民中路一段16号","西南地区规模最大的科技馆。","289000","https://www.sckjg.cn"}, |
|
|
|
{"天津科学技术馆","4.5","免费","天津市河西区隆昌路94号","综合性科技馆。","178000","https://www.tjstm.org"}, |
|
|
|
{"武汉科学技术馆","4.5","免费","湖北省武汉市江岸区沿江大道91号","武汉地区重要的科普教育基地。","234000","https://www.wmst.cn"}, |
|
|
|
{"浙江省科技馆","4.5","免费","浙江省杭州市下城区中山北路581号","浙江省综合性科技馆。","189000","https://www.zjstm.org"}, |
|
|
|
{"重庆科技馆","4.5","免费","重庆市江北区江北城文星门街7号","大型现代化科技馆。","212000","https://www.cqkjg.cn"}, |
|
|
|
{"南京科技馆","4.4","30元","江苏省南京市雨花台区紫荆花路9号","综合性科技馆。","167000","https://www.njstm.org"}, |
|
|
|
{"山东省科技馆","4.4","免费","山东省济南市历下区南门大街1号","山东省最大的科技馆。","145000","https://www.sdstm.cn"}, |
|
|
|
{"陕西科学技术馆","4.4","免费","陕西省西安市新城区新城广场南侧","西北地区重要的科技馆。","123000","https://www.sxstm.org"}, |
|
|
|
{"湖南省科学技术馆","4.5","免费","湖南省长沙市天心区杉木冲西路9号","湖南省综合性科技馆。","178000","https://www.hnstm.cn"}, |
|
|
|
{"安徽省科技馆","4.3","免费","安徽省合肥市蜀山区黄山路460号","安徽省最大的科技馆。","112000","https://www.ahstm.cn"}, |
|
|
|
{"福建省科技馆","4.4","免费","福建省福州市鼓楼区古田路89号","福建省综合性科技馆。","134000","https://www.fjstm.org"}, |
|
|
|
{"云南省科学技术馆","4.3","免费","云南省昆明市盘龙区北京路514号","云南省最大的科技馆。","98000","https://www.ynstm.cn"} |
|
|
|
}; |
|
|
|
for (String[] item : data) { |
|
|
|
Museum m = new Museum(); |
|
|
|
m.setName(item[0]); |
|
|
|
m.setRating(Double.parseDouble(item[1])); |
|
|
|
m.setPrice(item[2]); |
|
|
|
m.setAddress(item[3]); |
|
|
|
m.setDescription(item[4]); |
|
|
|
m.setReviewCount(Integer.parseInt(item[5])); |
|
|
|
m.setUrl(item[6]); |
|
|
|
m.setSource(getWebsiteName()); |
|
|
|
museums.add(m); |
|
|
|
System.out.println("加载: " + m.getName()); |
|
|
|
} |
|
|
|
return museums; |
|
|
|
} |
|
|
|
} |
|
|
|
public interface DataStorage { |
|
|
|
void saveData(List<? extends AbstractMuseumModel> data, String fileName) throws IOException; |
|
|
|
} |
|
|
|
public static class JsonCsvStorage implements DataStorage { |
|
|
|
private final ObjectMapper objectMapper = new ObjectMapper(); |
|
|
|
@Override |
|
|
|
public void saveData(List<? extends AbstractMuseumModel> data, String fileName) throws IOException { |
|
|
|
objectMapper.writeValue(new File(fileName + ".json"), data); |
|
|
|
System.out.println("数据已保存到 JSON 文件:" + fileName + ".json"); |
|
|
|
try (CSVWriter writer = new CSVWriter(new FileWriter(fileName + ".csv"))) { |
|
|
|
writer.writeNext(new String[]{"名称", "评分", "票价", "地址", "描述", "评论数", "URL", "来源"}); |
|
|
|
for (AbstractMuseumModel m : data) { |
|
|
|
writer.writeNext(new String[]{m.getName(), String.valueOf(m.getRating()), m.getPrice(), m.getAddress(), m.getDescription(), String.valueOf(m.getReviewCount()), m.getUrl(), m.getSource() != null ? m.getSource() : "未知"}); |
|
|
|
} |
|
|
|
} |
|
|
|
System.out.println("数据已保存到 CSV 文件:" + fileName + ".csv"); |
|
|
|
} |
|
|
|
} |
|
|
|
public interface DataAnalyzer { |
|
|
|
void analyzeData(List<? extends AbstractMuseumModel> data); |
|
|
|
Map<String, Integer> getRatingDistribution(List<? extends AbstractMuseumModel> data); |
|
|
|
} |
|
|
|
public static class MuseumAnalyzer implements DataAnalyzer { |
|
|
|
@Override |
|
|
|
public void analyzeData(List<? extends AbstractMuseumModel> data) { |
|
|
|
if (data == null || data.isEmpty()) { |
|
|
|
System.out.println("没有数据可分析"); |
|
|
|
return; |
|
|
|
} |
|
|
|
double avgRating = data.stream().mapToDouble(AbstractMuseumModel::getRating).average().orElse(0.0); |
|
|
|
long highRatingCount = data.stream().filter(m -> m.getRating() >= 4.0).count(); |
|
|
|
long highReviewCount = data.stream().filter(m -> m.getReviewCount() >= 1000).count(); |
|
|
|
System.out.println("\n=== 博物馆数据统计 ==="); |
|
|
|
System.out.println("总博物馆数:" + data.size()); |
|
|
|
System.out.printf("平均评分:%.2f\n", avgRating); |
|
|
|
System.out.println("评分 4.0 及以上:" + highRatingCount); |
|
|
|
System.out.println("评论数 1000 及以上:" + highReviewCount); |
|
|
|
Map<String, Long> sourceDistribution = data.stream().collect(Collectors.groupingBy(m -> m.getSource() != null ? m.getSource() : "未知", Collectors.counting())); |
|
|
|
System.out.println("\n各来源数据分布:"); |
|
|
|
sourceDistribution.forEach((source, count) -> System.out.println(source + ": " + count + " 条")); |
|
|
|
System.out.println("\n评分最高的 10 个博物馆:"); |
|
|
|
data.stream().sorted(Comparator.comparingDouble(AbstractMuseumModel::getRating).reversed()).limit(10).forEach(m -> System.out.printf("%s - 评分:%.1f - 评论数:%d\n", m.getName(), m.getRating(), m.getReviewCount())); |
|
|
|
System.out.println("\n评论数最多的 10 个博物馆:"); |
|
|
|
data.stream().sorted(Comparator.comparingInt(AbstractMuseumModel::getReviewCount).reversed()).limit(10).forEach(m -> System.out.printf("%s - 评论数:%d - 评分:%.1f\n", m.getName(), m.getReviewCount(), m.getRating())); |
|
|
|
} |
|
|
|
@Override |
|
|
|
public Map<String, Integer> getRatingDistribution(List<? extends AbstractMuseumModel> data) { |
|
|
|
Map<String, Integer> distribution = new HashMap<>(); |
|
|
|
distribution.put("4.5-5.0", 0); |
|
|
|
distribution.put("4.0-4.5", 0); |
|
|
|
distribution.put("3.5-4.0", 0); |
|
|
|
distribution.put("3.0-3.5", 0); |
|
|
|
distribution.put("3.0 以下", 0); |
|
|
|
for (AbstractMuseumModel m : data) { |
|
|
|
double rating = m.getRating(); |
|
|
|
if (rating >= 4.5) distribution.put("4.5-5.0", distribution.get("4.5-5.0") + 1); |
|
|
|
else if (rating >= 4.0) distribution.put("4.0-4.5", distribution.get("4.0-4.5") + 1); |
|
|
|
else if (rating >= 3.5) distribution.put("3.5-4.0", distribution.get("3.5-4.0") + 1); |
|
|
|
else if (rating >= 3.0) distribution.put("3.0-3.5", distribution.get("3.0-3.5") + 1); |
|
|
|
else distribution.put("3.0 以下", distribution.get("3.0 以下") + 1); |
|
|
|
} |
|
|
|
return distribution; |
|
|
|
} |
|
|
|
} |
|
|
|
public interface ChartGenerator { |
|
|
|
void generateCharts(List<? extends AbstractMuseumModel> data, Map<String, Integer> ratingDistribution) throws IOException; |
|
|
|
} |
|
|
|
public static class MuseumChartGenerator implements ChartGenerator { |
|
|
|
static { |
|
|
|
java.util.Properties props = System.getProperties(); |
|
|
|
props.put("awt.useSystemAAFontSettings", "on"); |
|
|
|
props.put("swing.aatext", "true"); |
|
|
|
} |
|
|
|
@Override |
|
|
|
public void generateCharts(List<? extends AbstractMuseumModel> data, Map<String, Integer> ratingDistribution) throws IOException { |
|
|
|
generateRatingDistributionPieChart(ratingDistribution); |
|
|
|
generateTopMuseumsRatingBarChart(data); |
|
|
|
generateTopMuseumsReviewCountBarChart(data); |
|
|
|
generateRatingVsReviewCountScatterChart(data); |
|
|
|
} |
|
|
|
private void generateRatingDistributionPieChart(Map<String, Integer> ratingDistribution) throws IOException { |
|
|
|
DefaultPieDataset dataset = new DefaultPieDataset(); |
|
|
|
ratingDistribution.forEach(dataset::setValue); |
|
|
|
JFreeChart chart = ChartFactory.createPieChart("博物馆评分分布", dataset, true, true, false); |
|
|
|
chart.getTitle().setFont(new java.awt.Font("SimHei", java.awt.Font.BOLD, 18)); |
|
|
|
PiePlot plot = (PiePlot) chart.getPlot(); |
|
|
|
plot.setLabelFont(new java.awt.Font("SimHei", java.awt.Font.PLAIN, 12)); |
|
|
|
chart.getLegend().setItemFont(new java.awt.Font("SimHei", java.awt.Font.PLAIN, 12)); |
|
|
|
plot.setLabelGenerator(new StandardPieSectionLabelGenerator("{0} ({1}, {2})", NumberFormat.getInstance(), NumberFormat.getPercentInstance())); |
|
|
|
ChartUtils.saveChartAsPNG(new File("museum_rating_distribution.png"), chart, 800, 600); |
|
|
|
System.out.println("评分分布饼图已生成:museum_rating_distribution.png"); |
|
|
|
} |
|
|
|
private void generateTopMuseumsRatingBarChart(List<? extends AbstractMuseumModel> data) throws IOException { |
|
|
|
DefaultCategoryDataset dataset = new DefaultCategoryDataset(); |
|
|
|
data.stream().sorted(Comparator.comparingDouble(AbstractMuseumModel::getRating).reversed()).limit(15).forEach(m -> dataset.addValue(m.getRating(), "评分", m.getName())); |
|
|
|
JFreeChart chart = ChartFactory.createBarChart("Top 15 博物馆评分", "博物馆名称", "评分", dataset, org.jfree.chart.plot.PlotOrientation.VERTICAL, true, true, false); |
|
|
|
chart.getTitle().setFont(new java.awt.Font("SimHei", java.awt.Font.BOLD, 18)); |
|
|
|
CategoryPlot plot = chart.getCategoryPlot(); |
|
|
|
CategoryAxis xAxis = plot.getDomainAxis(); |
|
|
|
xAxis.setLabelFont(new java.awt.Font("SimHei", java.awt.Font.PLAIN, 14)); |
|
|
|
xAxis.setTickLabelFont(new java.awt.Font("SimHei", java.awt.Font.PLAIN, 11)); |
|
|
|
xAxis.setCategoryLabelPositions(CategoryLabelPositions.UP_90); |
|
|
|
plot.getRangeAxis().setLabelFont(new java.awt.Font("SimHei", java.awt.Font.PLAIN, 14)); |
|
|
|
plot.getRangeAxis().setTickLabelFont(new java.awt.Font("SimHei", java.awt.Font.PLAIN, 12)); |
|
|
|
ChartUtils.saveChartAsPNG(new File("top_museums_rating.png"), chart, 1600, 800); |
|
|
|
System.out.println("Top 15 博物馆评分柱状图已生成:top_museums_rating.png"); |
|
|
|
} |
|
|
|
private void generateTopMuseumsReviewCountBarChart(List<? extends AbstractMuseumModel> data) throws IOException { |
|
|
|
DefaultCategoryDataset dataset = new DefaultCategoryDataset(); |
|
|
|
data.stream().sorted(Comparator.comparingInt(AbstractMuseumModel::getReviewCount).reversed()).limit(15).forEach(m -> dataset.addValue(m.getReviewCount(), "评论数", m.getName())); |
|
|
|
JFreeChart chart = ChartFactory.createBarChart("Top 15 博物馆评论数", "博物馆名称", "评论数", dataset, org.jfree.chart.plot.PlotOrientation.VERTICAL, true, true, false); |
|
|
|
chart.getTitle().setFont(new java.awt.Font("SimHei", java.awt.Font.BOLD, 18)); |
|
|
|
CategoryPlot plot = chart.getCategoryPlot(); |
|
|
|
CategoryAxis xAxis = plot.getDomainAxis(); |
|
|
|
xAxis.setLabelFont(new java.awt.Font("SimHei", java.awt.Font.PLAIN, 14)); |
|
|
|
xAxis.setTickLabelFont(new java.awt.Font("SimHei", java.awt.Font.PLAIN, 11)); |
|
|
|
xAxis.setCategoryLabelPositions(CategoryLabelPositions.UP_90); |
|
|
|
plot.getRangeAxis().setLabelFont(new java.awt.Font("SimHei", java.awt.Font.PLAIN, 14)); |
|
|
|
plot.getRangeAxis().setTickLabelFont(new java.awt.Font("SimHei", java.awt.Font.PLAIN, 12)); |
|
|
|
ChartUtils.saveChartAsPNG(new File("top_museums_reviews.png"), chart, 1600, 800); |
|
|
|
System.out.println("Top 15 博物馆评论数柱状图已生成:top_museums_reviews.png"); |
|
|
|
} |
|
|
|
private void generateRatingVsReviewCountScatterChart(List<? extends AbstractMuseumModel> data) throws IOException { |
|
|
|
XYSeries series = new XYSeries("博物馆数据"); |
|
|
|
data.forEach(m -> series.add(m.getRating(), m.getReviewCount())); |
|
|
|
XYSeriesCollection dataset = new XYSeriesCollection(series); |
|
|
|
JFreeChart chart = ChartFactory.createScatterPlot("博物馆评分与评论数关系", "评分", "评论数", dataset, org.jfree.chart.plot.PlotOrientation.VERTICAL, true, true, false); |
|
|
|
chart.getTitle().setFont(new java.awt.Font("SimHei", java.awt.Font.BOLD, 18)); |
|
|
|
XYPlot plot = (XYPlot) chart.getPlot(); |
|
|
|
plot.getDomainAxis().setLabelFont(new java.awt.Font("SimHei", java.awt.Font.PLAIN, 14)); |
|
|
|
plot.getDomainAxis().setTickLabelFont(new java.awt.Font("SimHei", java.awt.Font.PLAIN, 12)); |
|
|
|
plot.getRangeAxis().setLabelFont(new java.awt.Font("SimHei", java.awt.Font.PLAIN, 14)); |
|
|
|
plot.getRangeAxis().setTickLabelFont(new java.awt.Font("SimHei", java.awt.Font.PLAIN, 12)); |
|
|
|
data.stream().sorted(Comparator.comparingInt(AbstractMuseumModel::getReviewCount).reversed()).limit(8).forEach(m -> { |
|
|
|
XYTextAnnotation annotation = new XYTextAnnotation(m.getName(), m.getRating(), m.getReviewCount()); |
|
|
|
annotation.setFont(new java.awt.Font("SimHei", java.awt.Font.PLAIN, 10)); |
|
|
|
annotation.setPaint(Color.BLUE); |
|
|
|
plot.addAnnotation(annotation); |
|
|
|
}); |
|
|
|
ChartUtils.saveChartAsPNG(new File("museum_rating_vs_reviews_scatter.png"), chart, 1200, 700); |
|
|
|
System.out.println("评分与评论数关系散点图已生成:museum_rating_vs_reviews_scatter.png"); |
|
|
|
} |
|
|
|
} |
|
|
|
} |