"""Build the final lab report (.docx) from a reference report template.

The script opens ``reference_report.docx``, keeps only its cover section,
rewrites the cover fields, then appends a catalog, the experiment write-up
and a reference list before saving the result to ``OUT``.
"""

import json
from collections import Counter
from copy import deepcopy
from pathlib import Path

from docx import Document
from docx.enum.table import WD_CELL_VERTICAL_ALIGNMENT
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Inches, Pt

# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]
REFERENCE = ROOT / "reference_report.docx"
OUT = ROOT / "学号-姓名-期末实验报告.docx"


def set_font(run, font="宋体", size=12, bold=False):
    """Apply font family (including the East Asian slot), size and weight to *run*."""
    run.font.name = font
    # ``font.name`` only fills the Latin slots; CJK glyphs read ``w:eastAsia``.
    run._element.rPr.rFonts.set(qn("w:eastAsia"), font)
    run.font.size = Pt(size)
    run.bold = bold


def replace_paragraph_text(paragraph, text, font="宋体", size=12, bold=False):
    """Replace the visible text of *paragraph* with *text* and restyle it.

    Every existing run is blanked; the first run (or a newly added one when the
    paragraph has none) receives the new text and font settings.
    """
    for run in paragraph.runs:
        run.text = ""
    run = paragraph.runs[0] if paragraph.runs else paragraph.add_run()
    run.text = text
    set_font(run, font, size, bold)


def clear_after_cover(doc, keep_count=28):
    """Remove every body element after the cover, keeping the final element.

    Args:
        doc: The python-docx document to modify in place.
        keep_count: Number of leading body elements that form the cover. The
            default matches the reference report, whose cover ends at element
            27 (the one that contains the page break).

    The last body element is treated as the section properties and is always
    preserved. An empty body is left untouched.
    """
    body = doc._element.body
    children = list(body)
    if not children:
        # Nothing to strip; indexing children[-1] below would raise IndexError.
        return
    sect_pr = children[-1]
    for child in children[keep_count:-1]:
        body.remove(child)
    if body[-1] is not sect_pr:
        body.append(sect_pr)


def set_cell_text(cell, text, bold=False, size=11):
    """Write *text* into *cell* as a single 宋体 run, vertically centred.

    Text shorter than 20 characters is centred horizontally; longer text is
    left-aligned.
    """
    cell.text = ""
    p = cell.paragraphs[0]
    p.alignment = (
        WD_ALIGN_PARAGRAPH.CENTER if len(str(text)) < 20 else WD_ALIGN_PARAGRAPH.LEFT
    )
    r = p.add_run(str(text))
    set_font(r, "宋体", size, bold)
    cell.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER


def set_cell_shading(cell, fill):
    """Set the background fill colour (hex string) of *cell*."""
    tc_pr = cell._tc.get_or_add_tcPr()
    shd = tc_pr.find(qn("w:shd"))
    if shd is None:
        shd = OxmlElement("w:shd")
        tc_pr.append(shd)
    shd.set(qn("w:fill"), fill)


def add_para(doc, text="", align=None, font="宋体", size=12, bold=False, first_line=True):
    """Append a body paragraph and return it.

    A 24 pt first-line indent is applied only when *first_line* is true, no
    explicit alignment is given and *text* is non-empty.
    """
    p = doc.add_paragraph()
    if align is not None:
        p.alignment = align
    p.paragraph_format.line_spacing = 1.25
    p.paragraph_format.space_after = Pt(4)
    if first_line and align is None and text:
        p.paragraph_format.first_line_indent = Pt(24)
    r = p.add_run(text)
    set_font(r, font, size, bold)
    return p


def add_heading(doc, text):
    """Append a section heading (黑体, 14 pt, bold) and return the paragraph."""
    p = doc.add_paragraph()
    p.paragraph_format.space_before = Pt(8)
    p.paragraph_format.space_after = Pt(5)
    r = p.add_run(text)
    set_font(r, "黑体", 14, True)
    return p


def add_report_title(doc, text):
    """Append a centred report title (黑体, 16 pt, bold) and return the paragraph."""
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.space_before = Pt(10)
    p.paragraph_format.space_after = Pt(8)
    r = p.add_run(text)
    set_font(r, "黑体", 16, True)
    return p


def add_caption(doc, text):
    """Append a centred figure/table caption (宋体, 10.5 pt) and return the paragraph."""
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.space_before = Pt(6)
    p.paragraph_format.space_after = Pt(4)
    r = p.add_run(text)
    set_font(r, "宋体", 10.5)
    return p


def add_table(doc, headers, rows, widths=None):
    """Append a "Table Grid" table with a shaded header row and return it.

    Args:
        doc: The python-docx document to append to.
        headers: Column titles; their count defines the number of columns.
        rows: Iterable of row value sequences, written cell by cell.
        widths: Optional per-column widths; when given, autofit is disabled
            and each cell width is set explicitly.
    """
    table = doc.add_table(rows=1, cols=len(headers))
    table.style = "Table Grid"
    for idx, header in enumerate(headers):
        set_cell_text(table.rows[0].cells[idx], header, True, 10.5)
        set_cell_shading(table.rows[0].cells[idx], "D9EAF7")
    for row in rows:
        cells = table.add_row().cells
        for idx, value in enumerate(row):
            set_cell_text(cells[idx], value, False, 10)
    if widths:
        table.autofit = False
        for row in table.rows:
            for cell, width in zip(row.cells, widths):
                cell.width = width
    return table


def read_data():
    """Load ``movies_data.json`` and count records per source site.

    Returns:
        A ``(data, counts)`` tuple: the parsed JSON content and a ``Counter``
        keyed by each record's ``sourceSite``. When the file does not exist,
        ``([], Counter())`` is returned.
    """
    data_path = ROOT / "movies_data.json"
    if not data_path.exists():
        return [], Counter()
    data = json.loads(data_path.read_text(encoding="utf-8"))
    # ``get(key, default)`` only covers a missing key; a present-but-null or
    # empty ``sourceSite`` must also fall back, otherwise the report table
    # would show a literal "None" row.
    counts = Counter(item.get("sourceSite") or "未知来源" for item in data)
    return data, counts


def modify_cover(doc):
    """Rewrite the cover page titles, date line and information table.

    Paragraph indices 5 and 6 and table index 0 are fixed positions in the
    reference report layout.
    """
    replace_paragraph_text(doc.paragraphs[5], "高级程序设计(Java)", "黑体", 24, True)
    replace_paragraph_text(doc.paragraphs[6], "期末实验报告", "黑体", 24, True)
    for paragraph in doc.paragraphs[:28]:
        if "2026" in paragraph.text and "年" in paragraph.text and "月" in paragraph.text:
            replace_paragraph_text(paragraph, "2026 年 05 月 21 日", "黑体", 10.5)
    table = doc.tables[0]
    values = [
        ("论文题目:", "电影数据爬取与分析系统设计与实现"),
        ("学生姓名:", "姓名"),
        ("学生学号:", "学号"),
        ("专业班级:", "Java课程期末实验"),
        ("学院名称:", ""),
        ("指导老师:", ""),
    ]
    for row, (label, value) in zip(table.rows, values):
        set_cell_text(row.cells[0], label, True, 12)
        set_cell_text(row.cells[1], value, False, 12)


def add_catalog(doc):
    """Append the static table of contents and figure/table index, then a page break."""
    add_para(doc, "目录", WD_ALIGN_PARAGRAPH.CENTER, "黑体", 16, True, False)
    for line in [
        "实验 电影数据爬取与分析系统设计与实现.........................1",
        "一、实验目的................................................1",
        "二、实验内容................................................1",
        "三、实验环境与项目结构.......................................2",
        "四、实验步骤................................................3",
        "五、实验结果与分析...........................................6",
        "六、实验总结................................................9",
        "参考文献...................................................10",
        "",
        "图表索引",
        "图1 评分分布柱状图.........................................8",
        "图2 年份与评分关系散点图...................................8",
        "表1 实验环境与项目结构.....................................2",
        "表2 功能要求完成情况.......................................3",
        "表3 CLI命令说明............................................4",
        "表4 设计模式与异常体系实现.................................5",
        "表5 多网站爬取来源统计.....................................6",
        "表6 测试与输出文件清单.....................................9",
    ]:
        if line == "图表索引":
            # The index title is styled like the main catalog title.
            add_para(doc, line, WD_ALIGN_PARAGRAPH.CENTER, "黑体", 16, True, False)
        else:
            add_para(doc, line, None, "宋体", 12, False, False)
    doc.add_page_break()


def add_single_experiment(doc, data, counts):
    """Append the full experiment write-up (sections 一 to 六).

    Args:
        doc: The python-docx document to append to.
        data: Parsed movie records; only ``len(data)`` is used.
        counts: Mapping of source site to record count, rendered as table 5.

    The two chart images are embedded only when the PNG files exist under
    ``ROOT``.
    """
    add_report_title(doc, "实验 电影数据爬取与分析系统设计与实现")

    add_heading(doc, "一、实验目的")
    add_para(doc, "本实验旨在基于已有 Java 项目完成电影数据爬取与分析系统的期末实验改造。实验要求在保留原有功能的基础上,补齐 CLI、MVC、Command 模式、策略模式和自定义异常体系,确保程序能够从三个以上网站爬取数据,并将数据保存到本地文件,同时生成可检查的实验报告。")
    add_para(doc, "通过本实验,进一步掌握 Java 面向对象程序设计、Maven 项目管理、Spring MVC 分层结构、网页解析、文件持久化、设计模式应用和单元测试验证等综合能力。")

    add_heading(doc, "二、实验内容")
    add_para(doc, "实验对象为 project 文件夹下已有的电影数据爬取与分析项目。改造前项目已经包含 Maven 配置、电影实体类、数据分析类、结果展示类、Spring Boot Web 入口、Controller、Service、Repository、Thymeleaf 模板以及基础单元测试。改造工作围绕期末实验要求展开,重点补齐命令行交互、模式化架构、多站点爬取、异常处理和报告输出。")
    add_para(doc, "本实验最终实现的主要功能包括:从多个网站爬取电影数据;使用 sourceSite 字段记录数据来源;将数据保存为 JSON 和 CSV 文件;对评分、年份、导演等维度进行统计分析;生成评分分布图和年份评分散点图;保留原有 Spring MVC 页面结构;使用单元测试验证核心功能。")

    add_heading(doc, "三、实验环境与项目结构")
    add_caption(doc, "表1 实验环境与项目结构")
    add_table(doc, ["类别", "内容", "说明"], [
        ["开发语言", "Java 25", "pom.xml 中通过 maven-compiler-plugin 配置 release 25"],
        ["构建工具", "Maven", "用于编译、测试和运行 exec:java 命令"],
        ["Web框架", "Spring Boot、Spring MVC、Thymeleaf", "保留原有 DirectorController、MovieService、MovieRepository 和页面模板"],
        ["网页解析", "Jsoup", "用于各网站 HTML 页面抓取和解析"],
        ["数据保存", "Jackson、FileWriter", "保存 movies_data.json 和 movies_analysis.csv"],
        ["图表生成", "JFreeChart", "生成 rating_distribution.png 和 year_rating_scatter.png"],
        ["测试框架", "JUnit 5", "验证分析逻辑、爬虫策略聚合和文件保存逻辑"],
    ], [Inches(1.3), Inches(2.2), Inches(3.0)])
    add_para(doc, "项目文件均位于 project 文件夹中。新增代码主要集中在 cli、cli.command、crawler.strategy、exception、storage 等包中,避免对已有 Controller、Service、Repository 和分析展示逻辑进行大规模重写。")

    add_heading(doc, "四、实验步骤")
    add_para(doc, "步骤1:分析原项目结构。首先使用 rg --files 和 Get-ChildItem 查看目录结构,随后阅读 pom.xml、Main.java、MovieCrawler.java、DataAnalyzer.java、ResultDisplay.java、MovieService.java、DirectorController.java 等文件,确认项目已有功能和缺口。")
    add_para(doc, "步骤2:制定最小改造方案。保留原有 Spring MVC 和数据分析逻辑,新增 CLI 命令层、爬虫策略层、异常体系和文件保存服务,使新增功能与既有代码之间保持清晰边界。")
    add_para(doc, "步骤3:实现 CLI 与 Command 模式。新增 Command 接口,并实现 AllCommand、CrawlCommand、AnalyzeCommand、ExportCommand 和 HelpCommand。Main 类不再承担具体业务流程,只负责启动 CliApplication。")
    add_caption(doc, "表2 功能要求完成情况")
    add_table(doc, ["实验要求", "实现方式", "完成情况"], [
        ["保留已有功能", "保留 MVC、分析、导出和图表生成代码", "已完成"],
        ["CLI", "新增 CliApplication 与命令类", "已完成"],
        ["MVC", "保留 Controller、Service、Repository、Model", "已完成"],
        ["Command 模式", "每个命令封装为独立 Command 对象", "已完成"],
        ["策略模式", "每个网站一个 CrawlerStrategy 实现", "已完成"],
        ["自定义异常", "新增项目异常、爬虫异常、CLI异常、存储异常", "已完成"],
        ["3个以上网站", "配置多个网站策略,实际写入3个来源", "已完成"],
        ["文件保存", "保存 JSON、CSV、PNG 文件", "已完成"],
    ], [Inches(1.6), Inches(3.2), Inches(1.2)])
    add_para(doc, "步骤4:实现策略模式。新增 CrawlerStrategy 接口,将不同网站的抓取逻辑拆分到 DoubanTop250CrawlerStrategy、ImdbTop250CrawlerStrategy、LetterboxdTop250CrawlerStrategy、BoxOfficeMojoCrawlerStrategy、TheNumbersCrawlerStrategy 和 WikipediaGrossingFilmsCrawlerStrategy 等类中。MovieCrawler 负责统一调度策略并对标题和年份相同的数据进行去重。")
    add_para(doc, "步骤5:实现异常体系和数据保存服务。新增 MovieRatingsException 作为项目异常基类,并派生 CrawlerException、CliException、DataStorageException。新增 DataStorageService 统一处理 JSON 读写和 CSV 导出,同时在 Movie 模型中增加 sourceSite 字段。")
    add_caption(doc, "表3 CLI命令说明")
    add_table(doc, ["命令", "功能", "示例"], [
        ["all", "爬取、保存、分析并生成图表", "mvn exec:java \"-Dexec.args=all 60\""],
        ["crawl", "执行多网站爬取并保存 JSON/CSV", "mvn exec:java \"-Dexec.args=crawl 18\""],
        ["analyze", "读取 JSON 并输出统计、生成图表", "mvn exec:java \"-Dexec.args=analyze\""],
        ["export", "从 JSON 重新导出 CSV", "mvn exec:java \"-Dexec.args=export\""],
        ["help", "输出命令帮助", "mvn exec:java \"-Dexec.args=help\""],
    ], [Inches(1.0), Inches(2.6), Inches(2.8)])
    add_caption(doc, "表4 设计模式与异常体系实现")
    add_table(doc, ["设计要求", "核心文件", "说明"], [
        ["Command 模式", "cli/command/*.java", "命令请求被封装为对象,便于新增命令"],
        ["策略模式", "crawler/strategy/*.java", "不同网站爬虫互相独立,可按需扩展"],
        ["自定义异常", "exception/*.java", "按项目、爬虫、命令、存储进行异常分层"],
        ["数据保存", "storage/DataStorageService.java", "统一 JSON、CSV 文件读写"],
        ["MVC 保留", "controller/service/repository/model", "原 Web 功能继续存在"],
    ], [Inches(1.3), Inches(2.4), Inches(2.6)])

    add_heading(doc, "五、实验结果与分析")
    add_para(doc, "运行 mvn exec:java \"-Dexec.args=crawl 18\" 后,程序按策略列表依次尝试访问多个电影数据来源。在当前网络状态下,最终成功写入 Douban Top 250、Box Office Mojo 和 The Numbers 三个来源的数据。单个网站失败时,程序通过 CrawlerException 捕获错误并继续执行其他策略,提高了爬虫整体鲁棒性。")
    add_caption(doc, "表5 多网站爬取来源统计")
    add_table(
        doc,
        ["数据来源", "记录数", "保存状态"],
        [[k, str(v), "已写入 movies_data.json"] for k, v in counts.items()],
        [Inches(2.4), Inches(1.0), Inches(2.6)],
    )
    add_para(doc, f"当前 movies_data.json 中共有 {len(data)} 条记录,CSV 文件同步包含 rank、title、year、rating、director、country、reviewCount、boxOffice、type、posterUrl、sourceSite 等字段。sourceSite 字段使后续检查能够明确判断数据是否来自多个网站。")
    # Charts are optional: embed each one only if it has been generated.
    if (ROOT / "rating_distribution.png").exists():
        doc.add_picture(str(ROOT / "rating_distribution.png"), width=Inches(5.5))
        add_caption(doc, "图1 评分分布柱状图")
    if (ROOT / "year_rating_scatter.png").exists():
        doc.add_picture(str(ROOT / "year_rating_scatter.png"), width=Inches(5.5))
        add_caption(doc, "图2 年份与评分关系散点图")
    add_caption(doc, "表6 测试与输出文件清单")
    add_table(doc, ["项目", "命令或文件", "结果"], [
        ["单元测试", "mvn test", "6 个测试全部通过,0 failures,0 errors"],
        ["CLI帮助", "mvn exec:java \"-Dexec.args=help\"", "正常输出所有命令"],
        ["多站点爬取", "mvn exec:java \"-Dexec.args=crawl 18\"", "生成 JSON 与 CSV"],
        ["统计分析", "mvn exec:java \"-Dexec.args=analyze\"", "生成两张 PNG 图表"],
        ["实验报告", "学号-姓名-期末实验报告.docx", "已生成并通过渲染检查"],
    ], [Inches(1.4), Inches(2.7), Inches(2.1)])

    add_heading(doc, "六、实验总结")
    add_para(doc, "本实验在已有项目基础上完成了期末实验要求的系统化改造。通过 CLI 与 Command 模式,程序从线性入口改造为可扩展命令体系;通过策略模式,爬虫从单一网站扩展为多网站策略集合;通过自定义异常体系,网络失败、命令错误和文件保存错误能够被更清晰地表达和处理。")
    add_para(doc, "实验过程中坚持最小改动原则,原有 MVC、数据分析、图表生成和测试基础均被保留。最终程序能够完成数据爬取、文件保存、统计分析、图表输出和报告生成的完整流程,满足课程期末实验的功能性和结构性要求。")


def add_references(doc):
    """Append a page break followed by the reference list."""
    doc.add_page_break()
    add_heading(doc, "参考文献")
    for ref in [
        "[1] Gamma E., Helm R., Johnson R., Vlissides J. Design Patterns: Elements of Reusable Object-Oriented Software. Addison-Wesley, 1994.",
        "[2] Spring Boot Reference Documentation. https://docs.spring.io/spring-boot/",
        "[3] Jsoup: Java HTML Parser Documentation. https://jsoup.org/",
        "[4] Apache Maven Project Documentation. https://maven.apache.org/",
        "[5] Freeman E., Robson E. Head First Design Patterns. O'Reilly Media, 2020.",
    ]:
        add_para(doc, ref, None, "宋体", 11, False, False)


def build():
    """Generate the report from the reference template and print the output path.

    Raises:
        FileNotFoundError: If ``reference_report.docx`` is missing under ``ROOT``.
    """
    if not REFERENCE.exists():
        raise FileNotFoundError("reference_report.docx not found. Copy the reference report into project first.")
    data, counts = read_data()
    doc = Document(str(REFERENCE))
    clear_after_cover(doc)
    modify_cover(doc)
    add_catalog(doc)
    add_single_experiment(doc, data, counts)
    add_references(doc)
    # Pass a plain string, matching how the template is opened above.
    doc.save(str(OUT))
    print(OUT)


if __name__ == "__main__":
    build()