diff --git a/project/movies.csv b/project/movies.csv deleted file mode 100644 index ff07a2f..0000000 --- a/project/movies.csv +++ /dev/null @@ -1,251 +0,0 @@ -Title,Rating,Year,Director -肖申克的救赎,9.7,1994,弗兰克·德拉邦特 -霸王别姬,9.6,1993,陈凯歌 -泰坦尼克号,9.5,1997,詹姆斯·卡梅隆 -阿甘正传,9.5,1994,罗伯特·泽米吉斯 -千与千寻,9.4,2001,宫崎骏 -美丽人生,9.5,1997,罗伯托·贝尼尼 -星际穿越,9.4,2014,克里斯托弗·诺兰 -这个杀手不太冷,9.4,1994,吕克·贝松 -盗梦空间,9.4,2010,克里斯托弗·诺兰 -楚门的世界,9.4,1998,彼得·威尔 -辛德勒的名单,9.5,1993,史蒂文·斯皮尔伯格 -忠犬八公的故事,9.4,2009,莱塞·霍尔斯道姆 -海上钢琴师,9.3,1998,朱塞佩·托纳多雷 -疯狂动物城,9.3,2016,拜伦·霍华德 -三傻大闹宝莱坞,9.2,2009,拉库马·希拉尼 -机器人总动员,9.3,2008,安德鲁·斯坦顿 -放牛班的春天,9.3,2004,克里斯托夫·巴拉蒂 -无间道,9.3,2002,刘伟强 -控方证人,9.6,1957,比利·怀尔德 -寻梦环游记,9.1,2017,李·昂克里奇 -大话西游之大圣娶亲,9.2,1995,刘镇伟 -熔炉,9.3,2011,黄东赫 -触不可及,9.3,2011,奥利维·那卡什 -教父,9.3,1972,弗朗西斯·福特·科波拉 -末代皇帝,9.3,1987,贝纳尔多·贝托鲁奇 -哈利·波特与魔法石,9.2,2001,Chris -当幸福来敲门,9.1,2006,加布里尔·穆奇诺 -龙猫,9.2,1988,宫崎骏 -活着,9.3,1994,张艺谋 -怦然心动,9.1,2010,罗伯·莱纳 -蝙蝠侠:黑暗骑士,9.2,2008,克里斯托弗·诺兰 -指环王3:王者无敌,9.3,2003,彼得·杰克逊 -我不是药神,9.0,2018,文牧野 -乱世佳人,9.3,1939,维克多·弗莱明 -飞屋环游记,9.1,2009,彼特·道格特 -让子弹飞,9.0,2010,姜文 -哈尔的移动城堡,9.1,2004,宫崎骏 -十二怒汉,9.4,1957,西德尼·吕美特 -海蒂和爷爷,9.3,2015,阿兰·葛斯彭纳 -素媛,9.3,2013,李濬益 -猫鼠游戏,9.1,2002,史蒂文·斯皮尔伯格 -天空之城,9.2,1986,宫崎骏 -鬼子来了,9.3,2000,姜文 -摔跤吧!爸爸,9.0,2016,涅提·蒂瓦里 -少年派的奇幻漂流,9.1,2012,李安 -钢琴家,9.3,2002,罗曼·波兰斯基 -指环王2:双塔奇兵,9.2,2002,彼得·杰克逊 -死亡诗社,9.2,1989,彼得·威尔 -大话西游之月光宝盒,9.0,1995,刘镇伟 -绿皮书,8.9,2018,彼得·法雷里 -何以为家,9.1,2018,娜丁·拉巴基 -闻香识女人,9.1,1992,马丁·布莱斯 -大闹天宫,9.4,1961,万籁鸣 -黑客帝国,9.1,1999,安迪·沃卓斯基 -指环王1:护戒使者,9.1,2001,彼得·杰克逊 -罗马假日,9.1,1953,威廉·惠勒 -教父2,9.3,1974,弗朗西斯·福特·科波拉 -狮子王,9.1,1994,Roger -天堂电影院,9.2,1988,朱塞佩·托纳多雷 -饮食男女,9.2,1994,李安 -辩护人,9.2,2013,杨宇硕 -本杰明·巴顿奇事,9.0,2008,大卫·芬奇 -搏击俱乐部,9.0,1999,大卫·芬奇 -美丽心灵,9.1,2001,朗·霍华德 -穿条纹睡衣的男孩,9.2,2008,马克·赫尔曼 -情书,8.9,1995,岩井俊二 -哈利·波特与死亡圣器(下),9.0,2011,大卫·叶茨 -两杆大烟枪,9.1,1998,盖·里奇 -窃听风暴,9.2,2006,弗洛里安·亨克尔·冯·多纳斯马尔克 -音乐之声,9.1,1965,罗伯特·怀斯 -功夫,8.9,2004,周星驰 -哈利·波特与阿兹卡班的囚徒,9.0,2004,阿方索·卡隆 -西西里的美丽传说,8.9,2000,朱塞佩·托纳多雷 -阿凡达,8.8,2009,詹姆斯·卡梅隆 -看不见的客人,8.8,2016,奥里奥尔·保罗 -拯救大兵瑞恩,9.1,1998,史蒂文·斯皮尔伯格 -沉默的羔羊,8.9,1991,乔纳森·戴米 -小鞋子,9.2,1997,马基德·马基迪 -布达佩斯大饭店,8.9,2014,韦斯·安德森 -蝴蝶效应,8.9,2004,埃里克·布雷斯 -飞越疯人院,9.1,1975,米洛斯·福尔曼 -禁闭岛,8.9,2010,Martin -还有明天,9.3,2023,宝拉·柯特莱西 -心灵捕手,9.0,1997,格斯·范·桑特 -致命魔术,8.9,2006,克里斯托弗·诺兰 -低俗小说,8.9,1994,昆汀·塔伦蒂诺 -超脱,9.0,2011,托尼·凯耶 -哈利·波特与密室,8.9,2002,Chris -一一,9.1,2000,杨德昌 -喜剧之王,8.8,1999,周星驰 -摩登时代,9.3,1936,查理·卓别林 -杀人回忆,8.9,2003,奉俊昊 -致命ID,8.9,2003,詹姆斯·曼高德 -春光乍泄,9.0,1997,王家卫 -加勒比海盗,8.8,2003,戈尔·维宾斯基 -海豚湾,9.3,2009,路易·西霍尤斯 -美国往事,9.1,1984,赛尔乔·莱翁内 -红辣椒,9.0,2006,今敏 -七宗罪,8.8,1995,大卫·芬奇 -唐伯虎点秋香,8.8,1993,李力持 -狩猎,9.1,2012,托马斯·温特伯格 -幽灵公主,8.9,1997,宫崎骏 -甜蜜蜜,8.9,1996,陈可辛 -寄生虫,8.8,2019,奉俊昊 -天书奇谭,9.2,1983,王树忱 -蝙蝠侠:黑暗骑士崛起,8.9,2012,克里斯托弗·诺兰 -超能陆战队,8.8,2014,唐·霍尔 -7号房的礼物,8.9,2013,李焕庆 -第六感,8.9,1999,M·奈特·沙马兰 -茶馆,9.5,1982,谢添 -爱在黎明破晓前,8.8,1995,理查德·林克莱特 -爱在日落黄昏时,8.9,2004,理查德·林克莱特 -被嫌弃的松子的一生,8.8,2006,中岛哲也 -哈利·波特与火焰杯,8.8,2005,迈克·内威尔 -头脑特工队,8.8,2015,彼特·道格特 -未麻的部屋,9.1,1997,今敏 -重庆森林,8.8,1994,王家卫 -借东西的小人阿莉埃蒂,8.9,2010,米林宏昌 -菊次郎的夏天,8.9,1999,北野武 -入殓师,8.9,2008,泷田洋二郎 -断背山,8.8,2005,李安 -剪刀手爱德华,8.7,1990,蒂姆·波顿 -勇敢的心,8.9,1995,梅尔·吉布森 -时空恋旅人,8.8,2013,理查德·柯蒂斯 -驯龙高手,8.8,2010,迪恩·德布洛斯 -消失的爱人,8.7,2014,大卫·芬奇 -无人知晓,9.1,2004,是枝裕和 -倩女幽魂,8.8,1987,程小东 -傲慢与偏见,8.7,2005,乔·怀特 -新世界,8.9,2013,朴勋政 -花样年华,8.8,2000,王家卫 -玩具总动员3,8.9,2010,李·昂克里奇 -一个叫欧维的男人决定去死,8.9,2015,汉内斯·赫尔姆 -完美的世界,9.1,1993,克林特·伊斯特伍德 -色,戒,8.7,2007,李安 -阳光灿烂的日子,8.8,1994,姜文 -怪兽电力公司,8.8,2001,彼特·道格特 -小森林 夏秋篇,9.0,2014,森淳一 -天使爱美丽,8.7,2001,让-皮埃尔·热内 -教父3,9.0,1990,弗朗西斯·福特·科波拉 -侧耳倾听,8.9,1995,近藤喜文 -哪吒闹海,9.2,1979,王树忱 -九品芝麻官,8.8,1994,王晶 -被解救的姜戈,8.8,2012,昆汀·塔伦蒂诺 -请以你的名字呼唤我,8.8,2017,卢卡·瓜达尼诺 -幸福终点站,8.8,2004,史蒂文·斯皮尔伯格 -釜山行,8.6,2016,延尚昊 -神偷奶爸,8.7,2010,皮艾尔·柯芬 -小森林 冬春篇,9.0,2015,森淳一 -喜宴,9.0,1993,李安 -萤火之森,8.8,2011,大森贵弘 -告白,8.8,2010,中岛哲也 -玛丽和麦克斯,9.0,2009,亚当·艾略特 -七武士,9.3,1954,黑泽明 -头号玩家,8.6,2018,史蒂文·斯皮尔伯格 -模仿游戏,8.8,2014,莫滕·泰杜姆 -惊魂记,9.0,1960,阿尔弗雷德·希区柯克 -大鱼,8.8,2003,蒂姆·波顿 -心灵奇旅,8.7,2020,彼特·道格特 -射雕英雄传之东成西就,8.7,1993,刘镇伟 -血战钢锯岭,8.7,2016,梅尔·吉布森 -背靠背,脸对脸,9.5,1994,黄建新 -机器人之梦,9.1,2023,巴勃罗·贝格尔 -你的名字。,8.5,2016,新海诚 -我是山姆,9.0,2001,杰茜·尼尔森 -阳光姐妹淘,8.8,2011,姜炯哲 -恐怖直播,8.7,2013,金秉祐 -黑客帝国3:矩阵革命,8.8,2003,拉娜·沃卓斯基 -末路狂花,9.0,1991,雷德利·斯科特 -小丑,8.7,2019,托德·菲利普斯 -三块广告牌,8.7,2017,马丁·麦克唐纳 -谍影重重3,8.9,2007,保罗·格林格拉斯 -电锯惊魂,8.7,2004,詹姆斯·温 -高山下的花环,9.5,1984,谢晋 -无间道2,8.8,2003,刘伟强 -达拉斯买家俱乐部,8.8,2013,让-马克·瓦雷 -疯狂原始人,8.7,2013,科克·德·米科 -绿里奇迹,8.9,1999,弗兰克·德拉邦特 -爱在午夜降临前,8.9,2013,理查德·林克莱特 -疯狂的石头,8.6,2006,宁浩 -雨中曲,9.1,1952,斯坦利·多南 -2001太空漫游,8.9,1968,斯坦利·库布里克 -海街日记,8.8,2015,是枝裕和 -风之谷,8.9,1984,宫崎骏 -上帝之城,9.0,2002,费尔南多·梅里尔斯 -心迷宫,8.7,2014,忻钰坤 -英雄本色,8.6,1986,吴宇森 -记忆碎片,8.7,2000,克里斯托弗·诺兰 -纵横四海,8.8,1991,吴宇森 -无敌破坏王,8.7,2012,瑞奇·莫尔 -卢旺达饭店,8.9,2004,特瑞·乔治 -东京教父,9.0,2003,今敏 -小偷家族,8.7,2018,是枝裕和 -恐怖游轮,8.5,2009,克里斯托弗·史密斯 -牯岭街少年杀人事件,8.9,1991,杨德昌 -冰川时代,8.7,2002,卡洛斯·沙尔丹哈 -魔女宅急便,8.7,1989,宫崎骏 -芙蓉镇,9.3,1987,谢晋 -忠犬八公物语,9.2,1987,神山征二郎 -岁月神偷,8.7,2010,罗启锐 -遗愿清单,8.7,2007,罗伯·莱纳 -荒蛮故事,8.7,2014,达米安·斯兹弗隆 -大佛普拉斯,8.7,2017,黄信尧 -源代码,8.6,2011,邓肯·琼斯 -花束般的恋爱,8.6,2021,土井裕泰 -白日梦想家,8.6,2013,本·斯蒂勒 -疯狂的麦克斯4:狂暴之路,8.7,2015,乔治·米勒 -可可西里,8.9,2004,陆川 -你看起来好像很好吃,8.9,2010,藤森雅也 -爱乐之城,8.4,2016,达米恩·查泽雷 -贫民窟的百万富翁,8.6,2008,丹尼·鲍尔 -波西米亚狂想曲,8.6,2018,布莱恩·辛格 -城市之光,9.3,1931,查理·卓别林 -爆裂鼓手,8.6,2014,达米恩·查泽雷 -青蛇,8.6,1993,徐克 -东邪西毒,8.6,1994,王家卫 -哈利·波特与死亡圣器(上),8.6,2010,大卫·叶茨 -无耻混蛋,8.7,2009,昆汀·塔伦蒂诺 -终结者2:审判日,8.8,1991,詹姆斯·卡梅隆 -大红灯笼高高挂,8.8,1991,张艺谋 -黑天鹅,8.6,2010,达伦·阿罗诺夫斯基 -新龙门客栈,8.7,1992,李惠民 -初恋这件小事,8.5,2010,普特鹏·普罗萨卡·那·萨克那卡林 -人工智能,8.7,2001,史蒂文·斯皮尔伯格 -千钧一发,8.8,1997,安德鲁·尼科尔 -崖上的波妞,8.6,2008,宫崎骏 -雨人,8.7,1988,巴瑞·莱文森 -虎口脱险,8.9,1966,杰拉尔·乌里 -哈利·波特与凤凰社,8.6,2007,大卫·叶茨 -彗星来的那一夜,8.6,2013,詹姆斯·沃德·布柯特 -罗生门,8.8,1950,黑泽明 -海边的曼彻斯特,8.6,2016,肯尼斯·罗纳根 -恋恋笔记本,8.5,2004,尼克·卡索维茨 -真爱至上,8.5,2003,理查德·柯蒂斯 -火星救援,8.5,2015,雷德利·斯科特 -黑客帝国2:重装上阵,8.7,2003,拉娜·沃卓斯基 -步履不停,8.8,2008,是枝裕和 -冰雪奇缘,8.5,2013,克里斯·巴克 -奇迹男孩,8.6,2017,斯蒂芬·卓博斯基 -千年女优,8.8,2001,今敏 -战争之王,8.7,2005,安德鲁·尼科尔 -谍影重重2,8.7,2004,保罗·格林格拉斯 -蜘蛛侠:平行宇宙,8.6,2018,鲍勃·佩尔西凯蒂 -攻壳机动队,9.0,1995,押井守 -血钻,8.7,2006,爱德华·兹威克 -小姐,8.5,2016,朴赞郁 -隐藏人物,8.9,2016,特奥多尔·梅尔菲 -魂断蓝桥,8.8,1940,茂文·勒鲁瓦 -血观音,8.6,2017,杨雅喆 -房间,8.7,2015,伦尼·阿伯拉罕森 diff --git a/project/src/Main.java b/project/src/Main.java deleted file mode 100644 index 66a55e5..0000000 --- a/project/src/Main.java +++ /dev/null @@ -1,30 +0,0 @@ -import project.bean.Movie; -import project.crawler.MovieCrawler; -import project.utils.DataStorage; -import project.display.ResultDisplay; - -import java.util.List; - -public class Main { - public static void main(String[] args) { - try { - System.out.println("Starting to crawl movie data..."); - List movies = MovieCrawler.crawlMovies(10); // Crawl 10 pages of data - System.out.println("Crawling completed, obtained " + movies.size() + " movies data"); - - System.out.println("Saving data to CSV file..."); - DataStorage.saveToCsv(movies, "project/movies.csv"); - System.out.println("Data saved successfully"); - - System.out.println("Analyzing data..."); - ResultDisplay.displayResults(movies); - - System.out.println("Generating charts..."); - ResultDisplay.generateCharts(movies); - System.out.println("Chart generation completed, saved to project directory"); - - } catch (Exception e) { - e.printStackTrace(); - } - } -} \ No newline at end of file diff --git a/project/src/main/java/com/crawler/spider/JobSpider.java b/project/src/main/java/com/crawler/spider/JobSpider.java deleted file mode 100644 index 9083385..0000000 --- a/project/src/main/java/com/crawler/spider/JobSpider.java +++ /dev/null @@ -1,101 +0,0 @@ -import org.jsoup.Jsoup; -import org.jsoup.Connection; -import org.json.JSONArray; -import org.json.JSONObject; -import org.apache.commons.csv.*; - -import java.io.FileWriter; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -public class JobSpider { - - // ⚠️ 注意:这个 URL 可能会随时间变化,请务必按上面的步骤在 F12 中确认最新的 URL - // 这里的参数 keyword=Java, page=1 是示例,实际需要根据网站调整 - private static final String API_URL = "https://www.iguopin.com/api/job/search?keyword=&page=1&pageSize=20"; - - public static void main(String[] args) { - List jobList = new ArrayList<>(); - - try { - System.out.println("🚀 开始连接国聘网数据接口..."); - - // 1. 构造请求,必须伪装 Header,否则会被拒绝 - String jsonResponse = Jsoup.connect(API_URL) - .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36") - .header("Accept", "application/json, text/plain, */*") - .header("Referer", "https://www.iguopin.com/") // 假装是从首页跳过来的 - .timeout(5000) - .ignoreContentType(true) // 重要!允许接收非 HTML 内容 (即 JSON) - .execute() - .body(); - - // 2. 解析 JSON 数据 - JSONObject root = new JSONObject(jsonResponse); - - // ⚠️ 关键:你需要根据 F12 看到的实际 JSON 结构调整这里的键名 (key) - // 假设数据结构是 { "data": { "list": [...] } } 或者 { "result": [...] } - // 下面是一个通用的猜测逻辑,请根据实际打印结果修改! - - JSONArray jobsArray = null; - - // 尝试几种常见的结构 (你需要打印 root.toString() 来确认到底是哪一层) - if (root.has("data")) { - JSONObject dataObj = root.getJSONObject("data"); - if (dataObj.has("list")) jobsArray = dataObj.getJSONArray("list"); - else if (dataObj.has("jobs")) jobsArray = dataObj.getJSONArray("jobs"); - } else if (root.has("result")) { - jobsArray = root.getJSONArray("result"); - } else if (root.has("jobs")) { - jobsArray = root.getJSONArray("jobs"); - } - - if (jobsArray == null) { - System.err.println("❌ 未找到职位列表数据。JSON 结构可能已变更,请打印查看:\n" + jsonResponse); - return; - } - - System.out.println("✅ 解析成功,共发现 " + jobsArray.length() + " 个职位。"); - - // 3. 提取具体字段 - for (int i = 0; i < jobsArray.length(); i++) { - JSONObject job = jobsArray.getJSONObject(i); - - // ⚠️ 再次强调:这里的 "jobName", "companyName" 必须和你 F12 里看到的一模一样! - String title = job.optString("jobName", "未知职位"); - String company = job.optString("companyName", "未知公司"); - String salary = job.optString("salary", "面议"); - String location = job.optString("workLocation", "未知地点"); - String link = "https://www.iguopin.com/job/detail/" + job.optString("id"); // 拼接详情页链接 - - jobList.add(new String[]{title, company, salary, location, link}); - System.out.println("[" + (i+1) + "] " + title + " | " + company); - } - - // 4. 保存到 CSV - saveToCsv(jobList, "guopin_jobs.csv"); - System.out.println("💾 数据已保存至 guopin_jobs.csv"); - - } catch (IOException e) { - e.printStackTrace(); - System.err.println("❌ 网络请求失败:可能是接口地址变了,或者被反爬拦截。"); - } catch (Exception e) { - e.printStackTrace(); - System.err.println("❌ JSON 解析失败:请检查代码中的 key 名称是否与网页返回的一致。"); - } - } - - private static void saveToCsv(List data, String fileName) throws IOException { - FileWriter out = new FileWriter(fileName); - // 定义表头 - CSVFormat format = CSVFormat.DEFAULT.withHeader("职位名称", "公司名称", "薪资", "地点", "链接"); - CSVPrinter printer = new CSVPrinter(out, format); - - for (String[] row : data) { - printer.printRecord(row); - } - printer.close(); - out.close(); - } -} \ No newline at end of file diff --git a/project/src/project/analysis/MovieAnalyzer.java b/project/src/project/analysis/MovieAnalyzer.java deleted file mode 100644 index 48f5722..0000000 --- a/project/src/project/analysis/MovieAnalyzer.java +++ /dev/null @@ -1,42 +0,0 @@ -package project.analysis; - -import project.bean.Movie; - -import java.util.*; -import java.util.stream.Collectors; - -public class MovieAnalyzer { - public static Map getRatingDistribution(List movies) { - return movies.stream() - .collect(Collectors.groupingBy(Movie::getRating, Collectors.counting())); - } - - public static Map getYearRatingCorrelation(List movies) { - return movies.stream() - .collect(Collectors.groupingBy(Movie::getYear, - Collectors.averagingDouble(Movie::getRating))); - } - - public static Map getDirectorMovieCount(List movies) { - return movies.stream() - .collect(Collectors.groupingBy(Movie::getDirector, Collectors.counting())) - .entrySet().stream() - .filter(entry -> entry.getValue() > 1) - .sorted(Map.Entry.comparingByValue().reversed()) - .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e1, LinkedHashMap::new)); - } - - public static double getAverageRating(List movies) { - return movies.stream() - .mapToDouble(Movie::getRating) - .average() - .orElse(0.0); - } - - public static List getTopRatedMovies(List movies, int count) { - return movies.stream() - .sorted(Comparator.comparingDouble(Movie::getRating).reversed()) - .limit(count) - .collect(Collectors.toList()); - } -} \ No newline at end of file diff --git a/project/src/project/bean/Movie.java b/project/src/project/bean/Movie.java deleted file mode 100644 index 4391ab7..0000000 --- a/project/src/project/bean/Movie.java +++ /dev/null @@ -1,60 +0,0 @@ -package project.bean; - -public class Movie { - private String title; - private double rating; - private int year; - private String director; - - public Movie() { - } - - public Movie(String title, double rating, int year, String director) { - this.title = title; - this.rating = rating; - this.year = year; - this.director = director; - } - - public String getTitle() { - return title; - } - - public void setTitle(String title) { - this.title = title; - } - - public double getRating() { - return rating; - } - - public void setRating(double rating) { - this.rating = rating; - } - - public int getYear() { - return year; - } - - public void setYear(int year) { - this.year = year; - } - - public String getDirector() { - return director; - } - - public void setDirector(String director) { - this.director = director; - } - - @Override - public String toString() { - return "Movie{" + - "title='" + title + '\'' + - ", rating=" + rating + - ", year=" + year + - ", director='" + director + '\'' + - '}'; - } -} \ No newline at end of file diff --git a/project/src/project/crawler/MovieCrawler.java b/project/src/project/crawler/MovieCrawler.java deleted file mode 100644 index 9dda973..0000000 --- a/project/src/project/crawler/MovieCrawler.java +++ /dev/null @@ -1,194 +0,0 @@ -package project.crawler; - -import project.bean.Movie; -import project.utils.DataCleaner; -import project.utils.HttpUtils; - -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -public class MovieCrawler { - public static List crawlMovies(int pageCount) throws Exception { - List movies = new ArrayList<>(); - - for (int page = 1; page <= pageCount; page++) { - String url = "https://movie.douban.com/top250?start=" + (page - 1) * 25; - System.out.println("Crawling page " + page + " from " + url); - try { - String html = HttpUtils.getHtml(url); - System.out.println("Got HTML content, length: " + html.length()); - - // 打印 HTML 内容的前 500 个字符,了解实际结构 - if (html.length() > 500) { - System.out.println("HTML preview: " + html.substring(0, 500) + "..."); - } - - List pageMovies = parseMovies(html); - System.out.println("Parsed " + pageMovies.size() + " movies from page " + page); - movies.addAll(pageMovies); - } catch (Exception e) { - System.out.println("Error crawling page " + page + ": " + e.getMessage()); - } - Thread.sleep(1000); // 控制请求频率 - } - - System.out.println("Total movies crawled: " + movies.size()); - return movies; - } - - private static List parseMovies(String html) { - List movies = new ArrayList<>(); - - // Find all movie items by looking for
and matching until
at the same nesting level - int startIndex = 0; - int count = 0; - - while (true) { - int itemStart = html.indexOf("
", startIndex); - if (itemStart < 0) break; - - // Find the matching
by counting nested divs - int pos = itemStart + "
".length(); - int depth = 1; - int itemEnd = -1; - - while (pos < html.length() && depth > 0) { - int nextOpen = html.indexOf("", pos); - - if (nextClose < 0) break; // No closing tag found - - if (nextOpen >= 0 && nextOpen < nextClose) { - // Found an opening div before closing - depth++; - pos = nextOpen + 4; - } else { - // Found a closing div - depth--; - if (depth == 0) { - itemEnd = nextClose + 6; - } - pos = nextClose + 6; - } - } - - if (itemEnd > itemStart) { - count++; - String movieHtml = html.substring(itemStart, itemEnd); - // Don't print movie HTML to avoid excessive output - Movie movie = parseMovie(movieHtml); - if (movie != null) { - movies.add(movie); - } - startIndex = itemEnd; - } else { - break; - } - } - - System.out.println("Found " + count + " movie items, parsed " + movies.size() + " valid movies"); - return movies; - } - - private static Movie parseMovie(String movieHtml) { - try { - // Extract title from img alt attribute - String title = ""; - int altIndex = movieHtml.indexOf("alt="); - if (altIndex > 0) { - int start = movieHtml.indexOf('"', altIndex); - int end = movieHtml.indexOf('"', start + 1); - if (start > 0 && end > 0) { - title = movieHtml.substring(start + 1, end).trim(); - } - } - - // Extract rating - double rating = 0.0; - int ratingIndex = movieHtml.indexOf("rating_num"); - if (ratingIndex > 0) { - int start = movieHtml.indexOf('>', ratingIndex); - int end = movieHtml.indexOf("", start); - if (start > 0 && end > 0) { - String ratingStr = movieHtml.substring(start + 1, end).trim(); - try { - rating = Double.parseDouble(ratingStr); - } catch (NumberFormatException e) { - rating = 0.0; - } - } - } - - // Extract year and director from movie info - int year = 0; - String director = "Unknown"; - - // Find the info section which contains year and director - // Look for

tag without class or with specific class - int infoStart = -1; - int pStart = movieHtml.indexOf("

"); - int pClassStart = movieHtml.indexOf("

"); - - if (pStart >= 0) { - infoStart = pStart; - } - if (pClassStart >= 0 && (pStart < 0 || pClassStart < pStart)) { - infoStart = pClassStart; - } - - if (infoStart > 0) { - int infoEnd = movieHtml.indexOf("

", infoStart); - if (infoEnd > infoStart) { - String infoSection = movieHtml.substring(infoStart, infoEnd); - - // Extract year - look for 4-digit year after
tag - int brIndex = infoSection.indexOf("
"); - if (brIndex > 0) { - String afterBr = infoSection.substring(brIndex + 4).trim(); - // Find first 4-digit number - for (int i = 0; i <= afterBr.length() - 4; i++) { - String possibleYear = afterBr.substring(i, i + 4); - if (possibleYear.matches("\\d{4}")) { - try { - year = Integer.parseInt(possibleYear); - break; - } catch (NumberFormatException e) { - // Continue - } - } - } - } - - // Extract director - director info is between "导演:" and " " - // Look for the pattern: 导演: [director name]  - int directorLabelIdx = infoSection.indexOf("\u5bfc\u6f14:"); // Unicode for "导演:" - if (directorLabelIdx >= 0) { - int directorStart = directorLabelIdx + 3; // Skip "导演:" - int directorEnd = infoSection.indexOf(" ", directorStart); - if (directorEnd > directorStart) { - director = infoSection.substring(directorStart, directorEnd).trim(); - // Clean up any remaining HTML - director = director.replaceAll("<[^>]*>", "").trim(); - // Extract only Chinese name (before space) - int spaceIdx = director.indexOf(" "); - if (spaceIdx > 0) { - director = director.substring(0, spaceIdx).trim(); - } - if (director.isEmpty()) director = "Unknown"; - } - } - } - } - - // If title and rating are valid, create movie object - if (!title.isEmpty() && rating > 0) { - return new Movie(title, rating, year, director); - } - } catch (Exception e) { - // Silently handle exceptions - } - return null; - } -} \ No newline at end of file diff --git a/project/src/project/display/ResultDisplay.java b/project/src/project/display/ResultDisplay.java deleted file mode 100644 index c766f81..0000000 --- a/project/src/project/display/ResultDisplay.java +++ /dev/null @@ -1,47 +0,0 @@ -package project.display; - -import project.bean.Movie; -import project.analysis.MovieAnalyzer; - -import java.util.List; -import java.util.Map; - -public class ResultDisplay { - public static void displayResults(List movies) { - System.out.println("===== Movie Data Analysis Results ====="); - System.out.println("Total movies: " + movies.size()); - System.out.printf("Average rating: %.2f\n\n", MovieAnalyzer.getAverageRating(movies)); - - System.out.println("===== Rating Distribution ====="); - Map ratingDistribution = MovieAnalyzer.getRatingDistribution(movies); - ratingDistribution.entrySet().stream() - .sorted(Map.Entry.comparingByKey()) - .forEach(entry -> System.out.printf("Rating %.1f: %d movies\n", entry.getKey(), entry.getValue())); - - System.out.println("\n===== Year-Rating Correlation ====="); - Map yearRating = MovieAnalyzer.getYearRatingCorrelation(movies); - yearRating.entrySet().stream() - .sorted(Map.Entry.comparingByKey()) - .forEach(entry -> System.out.printf("%d: %.2f\n", entry.getKey(), entry.getValue())); - - System.out.println("\n===== Director Movie Count Ranking ====="); - Map directorCount = MovieAnalyzer.getDirectorMovieCount(movies); - directorCount.entrySet().stream() - .limit(10) - .forEach(entry -> System.out.printf("%s: %d movies\n", entry.getKey(), entry.getValue())); - - System.out.println("\n===== Top 10 Highest Rated Movies ====="); - List topRated = MovieAnalyzer.getTopRatedMovies(movies, 10); - for (int i = 0; i < topRated.size(); i++) { - Movie movie = topRated.get(i); - System.out.printf("%d. %s (%.1f) - %d - Director: %s\n", - i + 1, movie.getTitle(), movie.getRating(), movie.getYear(), movie.getDirector()); - } - } - - public static void generateCharts(List movies) throws Exception { - System.out.println("\n===== Chart Generation ====="); - System.out.println("Due to environment limitations, chart generation is not implemented"); - System.out.println("Suggest using JFreeChart or other chart libraries for visualization"); - } -} \ No newline at end of file diff --git a/project/src/project/utils/DataCleaner.java b/project/src/project/utils/DataCleaner.java deleted file mode 100644 index 992d1f9..0000000 --- a/project/src/project/utils/DataCleaner.java +++ /dev/null @@ -1,29 +0,0 @@ -package project.utils; - -public class DataCleaner { - public static String cleanText(String text) { - if (text == null) return ""; - return text.trim() - .replaceAll("<[^>]*>", "") - .replaceAll("\\s+", " ") - .replaceAll("[\\r\\n]", ""); - } - - public static double parseRating(String ratingStr) { - if (ratingStr == null || ratingStr.isEmpty()) return 0.0; - try { - return Double.parseDouble(ratingStr.trim()); - } catch (NumberFormatException e) { - return 0.0; - } - } - - public static int parseYear(String yearStr) { - if (yearStr == null || yearStr.isEmpty()) return 0; - try { - return Integer.parseInt(yearStr.replaceAll("[^0-9]", "")); - } catch (NumberFormatException e) { - return 0; - } - } -} \ No newline at end of file diff --git a/project/src/project/utils/DataStorage.java b/project/src/project/utils/DataStorage.java deleted file mode 100644 index 61fbce8..0000000 --- a/project/src/project/utils/DataStorage.java +++ /dev/null @@ -1,26 +0,0 @@ -package project.utils; - -import project.bean.Movie; - -import java.io.OutputStreamWriter; -import java.io.FileOutputStream; -import java.io.IOException; -import java.util.List; - -public class DataStorage { - public static void saveToCsv(List movies, String filePath) throws IOException { - // Use UTF-8 encoding to properly handle Chinese characters - OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(filePath), "UTF-8"); - writer.write("Title,Rating,Year,Director\n"); - - for (Movie movie : movies) { - writer.write(String.format("%s,%.1f,%d,%s\n", - movie.getTitle(), - movie.getRating(), - movie.getYear(), - movie.getDirector())); - } - - writer.close(); - } -} diff --git a/project/src/project/utils/HttpUtils.java b/project/src/project/utils/HttpUtils.java deleted file mode 100644 index babcc52..0000000 --- a/project/src/project/utils/HttpUtils.java +++ /dev/null @@ -1,30 +0,0 @@ -package project.utils; - -import java.io.BufferedReader; -import java.io.InputStreamReader; -import java.net.HttpURLConnection; -import java.net.URL; - -public class HttpUtils { - public static String getHtml(String url) throws Exception { - URL obj = new URL(url); - HttpURLConnection con = (HttpURLConnection) obj.openConnection(); - con.setRequestMethod("GET"); - con.setRequestProperty("User-Agent", "Mozilla/5.0"); - - int responseCode = con.getResponseCode(); - if (responseCode != HttpURLConnection.HTTP_OK) { - throw new Exception("HTTP error code: " + responseCode); - } - - BufferedReader in = new BufferedReader(new InputStreamReader(con.getInputStream(), "UTF-8")); - String inputLine; - StringBuilder html = new StringBuilder(); - - while ((inputLine = in.readLine()) != null) { - html.append(inputLine); - } - in.close(); - return html.toString(); - } -} \ No newline at end of file