You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

292 lines
16 KiB

from collections import Counter
from copy import deepcopy
from pathlib import Path
import json
from docx import Document
from docx.enum.table import WD_CELL_VERTICAL_ALIGNMENT
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Inches, Pt
# Project root: the parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Template document loaded by build(); its leading cover elements are kept.
REFERENCE = ROOT / "reference_report.docx"
# Path the generated report is saved to by build().
OUT = ROOT / "学号-姓名-期末实验报告.docx"
def set_font(run, font="宋体", size=12, bold=False):
    """Apply typeface, point size and bold state to a single run.

    Args:
        run: The run object to format.
        font: Typeface name, written both as the regular font name and as
            the ``w:eastAsia`` font attribute.
        size: Font size in points.
        bold: Whether the run is bold.
    """
    run_font = run.font
    run_font.name = font
    # The East Asian typeface is a separate attribute on w:rFonts; this
    # presumably relies on the name assignment above having created that
    # element -- confirm against python-docx if this ever raises.
    east_asia_attr = qn("w:eastAsia")
    run._element.rPr.rFonts.set(east_asia_attr, font)
    run_font.size = Pt(size)
    run.bold = bold
def replace_paragraph_text(paragraph, text, font="宋体", size=12, bold=False):
    """Replace the visible text of a paragraph while keeping its runs.

    Every existing run is blanked, then the first run (or a newly added run
    when the paragraph has none) receives ``text`` and the requested font
    settings.

    Args:
        paragraph: Paragraph whose text is replaced.
        text: New text for the paragraph.
        font: Typeface name passed to ``set_font``.
        size: Font size in points passed to ``set_font``.
        bold: Bold flag passed to ``set_font``.
    """
    existing_runs = paragraph.runs
    for existing in existing_runs:
        existing.text = ""
    if existing_runs:
        target = existing_runs[0]
    else:
        target = paragraph.add_run()
    target.text = text
    set_font(target, font, size, bold)
def clear_after_cover(doc, keep_count=28):
    """Remove everything after the cover from the document body.

    The first ``keep_count`` body children are kept, as is the final child
    (treated as the section properties element); every child in between is
    removed.

    Args:
        doc: Document whose ``_element.body`` is trimmed in place.
        keep_count: Number of leading body children to keep. The default of
            28 matches the reference cover, which ends at element 27 (the
            one that contains the page break).
    """
    body = doc._element.body
    children = list(body)
    if not children:
        # Nothing to trim; avoids an IndexError on an empty body.
        return
    sect_pr = children[-1]
    for child in children[keep_count:-1]:
        body.remove(child)
    # Make sure the trailing element is still last after the removals.
    if body[-1] is not sect_pr:
        body.append(sect_pr)
def set_cell_text(cell, text, bold=False, size=11):
    """Fill a table cell with a single formatted run.

    The cell is emptied first. Text shorter than 20 characters is centred
    horizontally, longer text is left-aligned; the cell content is always
    centred vertically.

    Args:
        cell: Table cell to fill.
        text: Value to display; converted with ``str``.
        bold: Whether the text is bold.
        size: Font size in points.
    """
    cell.text = ""
    content = str(text)
    paragraph = cell.paragraphs[0]
    if len(content) < 20:
        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
    else:
        paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
    new_run = paragraph.add_run(content)
    set_font(new_run, "宋体", size, bold)
    cell.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
def set_cell_shading(cell, fill):
    """Set the background fill colour of a table cell.

    An existing ``w:shd`` element is reused. Otherwise a new one is created
    with the required ``w:val`` attribute and inserted at its schema
    position inside ``w:tcPr``.

    Args:
        cell: Table cell to shade.
        fill: Fill colour as a hex RGB string such as ``"D9EAF7"``.
    """
    tc_pr = cell._tc.get_or_add_tcPr()
    shd = tc_pr.find(qn("w:shd"))
    if shd is None:
        shd = OxmlElement("w:shd")
        # w:val is a required attribute of w:shd; "clear" means no pattern,
        # so only the fill colour shows.
        shd.set(qn("w:val"), "clear")
        shd.set(qn("w:color"), "auto")
        # w:tcPr children are an ordered sequence. w:shd must precede the
        # elements below (for example the w:vAlign that is added when a
        # cell's vertical alignment is set), so insert before the first one
        # present instead of always appending.
        successors = (
            "w:noWrap",
            "w:tcMar",
            "w:textDirection",
            "w:tcFitText",
            "w:vAlign",
            "w:hideMark",
            "w:headers",
            "w:cellIns",
            "w:cellDel",
            "w:cellMerge",
            "w:tcPrChange",
        )
        for tag in successors:
            successor = tc_pr.find(qn(tag))
            if successor is not None:
                successor.addprevious(shd)
                break
        else:
            tc_pr.append(shd)
    shd.set(qn("w:fill"), fill)
def add_para(doc, text="", align=None, font="宋体", size=12, bold=False, first_line=True):
    """Append a body paragraph with the report's standard spacing.

    Line spacing is 1.25 with 4pt after the paragraph. A 24pt first-line
    indent is applied only when ``first_line`` is true, no explicit
    alignment was given, and ``text`` is non-empty.

    Args:
        doc: Document to append to.
        text: Paragraph text.
        align: Optional paragraph alignment; ``None`` leaves it unset.
        font: Typeface name.
        size: Font size in points.
        bold: Whether the text is bold.
        first_line: Whether a first-line indent may be applied.

    Returns:
        The newly added paragraph.
    """
    paragraph = doc.add_paragraph()
    if align is not None:
        paragraph.alignment = align
    fmt = paragraph.paragraph_format
    fmt.line_spacing = 1.25
    fmt.space_after = Pt(4)
    wants_indent = first_line and align is None and text
    if wants_indent:
        fmt.first_line_indent = Pt(24)
    set_font(paragraph.add_run(text), font, size, bold)
    return paragraph
def add_heading(doc, text):
    """Append a section heading (黑体, 14pt, bold) and return its paragraph.

    Args:
        doc: Document to append to.
        text: Heading text.
    """
    heading = doc.add_paragraph()
    spacing = heading.paragraph_format
    spacing.space_before = Pt(8)
    spacing.space_after = Pt(5)
    set_font(heading.add_run(text), "黑体", 14, True)
    return heading
def add_report_title(doc, text):
    """Append the centred report title (黑体, 16pt, bold) and return it.

    Args:
        doc: Document to append to.
        text: Title text.
    """
    title = doc.add_paragraph()
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER
    spacing = title.paragraph_format
    spacing.space_before = Pt(10)
    spacing.space_after = Pt(8)
    set_font(title.add_run(text), "黑体", 16, True)
    return title
def add_caption(doc, text):
    """Append a centred figure/table caption (宋体, 10.5pt) and return it.

    Args:
        doc: Document to append to.
        text: Caption text.
    """
    caption = doc.add_paragraph()
    caption.alignment = WD_ALIGN_PARAGRAPH.CENTER
    spacing = caption.paragraph_format
    spacing.space_before = Pt(6)
    spacing.space_after = Pt(4)
    set_font(caption.add_run(text), "宋体", 10.5)
    return caption
def add_table(doc, headers, rows, widths=None):
    """Append a "Table Grid" table with a shaded bold header row.

    Args:
        doc: Document to append to.
        headers: Column titles; their count determines the column count.
        rows: Iterable of row value sequences, one entry per column.
        widths: Optional column widths. When given, autofit is switched off
            and each cell in every row is assigned its column width.

    Returns:
        The newly created table.
    """
    table = doc.add_table(rows=1, cols=len(headers))
    table.style = "Table Grid"

    for col, title in enumerate(headers):
        header_cell = table.rows[0].cells[col]
        set_cell_text(header_cell, title, True, 10.5)
        set_cell_shading(header_cell, "D9EAF7")

    for values in rows:
        new_cells = table.add_row().cells
        for col, value in enumerate(values):
            set_cell_text(new_cells[col], value, False, 10)

    if not widths:
        return table

    table.autofit = False
    for table_row in table.rows:
        for cell, width in zip(table_row.cells, widths):
            cell.width = width
    return table
def read_data():
    """Load the crawled movie records and count them per source site.

    Reads ``movies_data.json`` under ``ROOT`` as UTF-8 JSON.

    Returns:
        A ``(data, counts)`` tuple. ``data`` is the parsed JSON content and
        ``counts`` is a ``Counter`` keyed by each item's ``sourceSite``
        value ("未知来源" when the key is absent). When the file does not
        exist, an empty list and an empty ``Counter`` are returned.
    """
    data_path = ROOT / "movies_data.json"
    if data_path.exists():
        records = json.loads(data_path.read_text(encoding="utf-8"))
        site_counts = Counter()
        for record in records:
            site_counts[record.get("sourceSite", "未知来源")] += 1
        return records, site_counts
    return [], Counter()
def modify_cover(doc):
    """Rewrite the cover page: course title, report title, date and info table.

    Paragraphs 5 and 6 receive the two title lines, any of the first 28
    paragraphs containing "2026" is replaced by the report date, and the
    rows of the first table are filled with label/value pairs.

    Args:
        doc: Document whose cover is modified in place.
    """
    paragraphs = doc.paragraphs
    replace_paragraph_text(paragraphs[5], "高级程序设计(Java)", "黑体", 24, True)
    replace_paragraph_text(paragraphs[6], "期末实验报告", "黑体", 24, True)
    for paragraph in paragraphs[:28]:
        # NOTE(review): this test previously also required `"" in text`
        # twice, which is always true, so matching on "2026" alone is
        # equivalent. If those literals originally held characters that were
        # lost (e.g. the year/month markers), restore them here.
        if "2026" in paragraph.text:
            replace_paragraph_text(paragraph, "2026 年 05 月 21 日", "黑体", 10.5)
    table = doc.tables[0]
    values = [
        ("论文题目:", "电影数据爬取与分析系统设计与实现"),
        ("学生姓名:", "姓名"),
        ("学生学号:", "学号"),
        ("专业班级:", "Java课程期末实验"),
        ("学院名称:", ""),
        ("指导老师:", ""),
    ]
    # zip stops at the shorter side, so extra table rows are left untouched.
    for row, (label, value) in zip(table.rows, values):
        set_cell_text(row.cells[0], label, True, 12)
        set_cell_text(row.cells[1], value, False, 12)
def add_catalog(doc):
    """Append the static table of contents and figure/table index page.

    Entries and page numbers are fixed strings. The two section titles
    ("目录" and "图表索引") are centred 16pt bold 黑体; all other lines are
    plain 12pt 宋体 without first-line indent. A page break follows.

    Args:
        doc: Document to append to.
    """
    centered = WD_ALIGN_PARAGRAPH.CENTER

    def section_title(title):
        add_para(doc, title, centered, "黑体", 16, True, False)

    def plain_line(entry):
        add_para(doc, entry, None, "宋体", 12, False, False)

    contents = (
        "实验 电影数据爬取与分析系统设计与实现.........................1",
        "一、实验目的................................................1",
        "二、实验内容................................................1",
        "三、实验环境与项目结构.......................................2",
        "四、实验步骤................................................3",
        "五、实验结果与分析...........................................6",
        "六、实验总结................................................9",
        "参考文献...................................................10",
    )
    figures_and_tables = (
        "图1 评分分布柱状图.........................................8",
        "图2 年份与评分关系散点图...................................8",
        "表1 实验环境与项目结构.....................................2",
        "表2 功能要求完成情况.......................................3",
        "表3 CLI命令说明............................................4",
        "表4 设计模式与异常体系实现.................................5",
        "表5 多网站爬取来源统计.....................................6",
        "表6 测试与输出文件清单.....................................9",
    )

    section_title("目录")
    for entry in contents:
        plain_line(entry)
    # Empty spacer paragraph between the contents and the index.
    plain_line("")
    section_title("图表索引")
    for entry in figures_and_tables:
        plain_line(entry)
    doc.add_page_break()
def add_single_experiment(doc, data, counts):
    """Append the main body of the report (sections one to six).

    Emits the report title, the six numbered sections with their body
    paragraphs, six captioned tables and, when the image files exist under
    ``ROOT``, the two chart figures with captions.

    Args:
        doc: Document to append to.
        data: Loaded movie records; only its length is used in the text.
        counts: Mapping of source site to record count, rendered as table 5.
    """

    def captioned_table(caption, headers, rows, widths):
        # Every table in the report is preceded by its caption.
        add_caption(doc, caption)
        add_table(doc, headers, rows, widths)

    add_report_title(doc, "实验 电影数据爬取与分析系统设计与实现")

    # Section 1: purpose.
    add_heading(doc, "一、实验目的")
    add_para(doc, "本实验旨在基于已有 Java 项目完成电影数据爬取与分析系统的期末实验改造。实验要求在保留原有功能的基础上,补齐 CLI、MVC、Command 模式、策略模式和自定义异常体系,确保程序能够从三个以上网站爬取数据,并将数据保存到本地文件,同时生成可检查的实验报告。")
    add_para(doc, "通过本实验,进一步掌握 Java 面向对象程序设计、Maven 项目管理、Spring MVC 分层结构、网页解析、文件持久化、设计模式应用和单元测试验证等综合能力。")

    # Section 2: content.
    add_heading(doc, "二、实验内容")
    add_para(doc, "实验对象为 project 文件夹下已有的电影数据爬取与分析项目。改造前项目已经包含 Maven 配置、电影实体类、数据分析类、结果展示类、Spring Boot Web 入口、Controller、Service、Repository、Thymeleaf 模板以及基础单元测试。改造工作围绕期末实验要求展开,重点补齐命令行交互、模式化架构、多站点爬取、异常处理和报告输出。")
    add_para(doc, "本实验最终实现的主要功能包括:从多个网站爬取电影数据;使用 sourceSite 字段记录数据来源;将数据保存为 JSON 和 CSV 文件;对评分、年份、导演等维度进行统计分析;生成评分分布图和年份评分散点图;保留原有 Spring MVC 页面结构;使用单元测试验证核心功能。")

    # Section 3: environment and project layout.
    add_heading(doc, "三、实验环境与项目结构")
    environment_rows = [
        ["开发语言", "Java 25", "pom.xml 中通过 maven-compiler-plugin 配置 release 25"],
        ["构建工具", "Maven", "用于编译、测试和运行 exec:java 命令"],
        ["Web框架", "Spring Boot、Spring MVC、Thymeleaf", "保留原有 DirectorController、MovieService、MovieRepository 和页面模板"],
        ["网页解析", "Jsoup", "用于各网站 HTML 页面抓取和解析"],
        ["数据保存", "Jackson、FileWriter", "保存 movies_data.json 和 movies_analysis.csv"],
        ["图表生成", "JFreeChart", "生成 rating_distribution.png 和 year_rating_scatter.png"],
        ["测试框架", "JUnit 5", "验证分析逻辑、爬虫策略聚合和文件保存逻辑"],
    ]
    captioned_table(
        "表1 实验环境与项目结构",
        ["类别", "内容", "说明"],
        environment_rows,
        [Inches(1.3), Inches(2.2), Inches(3.0)],
    )
    add_para(doc, "项目文件均位于 project 文件夹中。新增代码主要集中在 cli、cli.command、crawler.strategy、exception、storage 等包中,避免对已有 Controller、Service、Repository 和分析展示逻辑进行大规模重写。")

    # Section 4: steps.
    add_heading(doc, "四、实验步骤")
    add_para(doc, "步骤1:分析原项目结构。首先使用 rg --files 和 Get-ChildItem 查看目录结构,随后阅读 pom.xml、Main.java、MovieCrawler.java、DataAnalyzer.java、ResultDisplay.java、MovieService.java、DirectorController.java 等文件,确认项目已有功能和缺口。")
    add_para(doc, "步骤2:制定最小改造方案。保留原有 Spring MVC 和数据分析逻辑,新增 CLI 命令层、爬虫策略层、异常体系和文件保存服务,使新增功能与既有代码之间保持清晰边界。")
    add_para(doc, "步骤3:实现 CLI 与 Command 模式。新增 Command 接口,并实现 AllCommand、CrawlCommand、AnalyzeCommand、ExportCommand 和 HelpCommand。Main 类不再承担具体业务流程,只负责启动 CliApplication。")
    requirement_rows = [
        ["保留已有功能", "保留 MVC、分析、导出和图表生成代码", "已完成"],
        ["CLI", "新增 CliApplication 与命令类", "已完成"],
        ["MVC", "保留 Controller、Service、Repository、Model", "已完成"],
        ["Command 模式", "每个命令封装为独立 Command 对象", "已完成"],
        ["策略模式", "每个网站一个 CrawlerStrategy 实现", "已完成"],
        ["自定义异常", "新增项目异常、爬虫异常、CLI异常、存储异常", "已完成"],
        ["3个以上网站", "配置多个网站策略,实际写入3个来源", "已完成"],
        ["文件保存", "保存 JSON、CSV、PNG 文件", "已完成"],
    ]
    captioned_table(
        "表2 功能要求完成情况",
        ["实验要求", "实现方式", "完成情况"],
        requirement_rows,
        [Inches(1.6), Inches(3.2), Inches(1.2)],
    )
    add_para(doc, "步骤4:实现策略模式。新增 CrawlerStrategy 接口,将不同网站的抓取逻辑拆分到 DoubanTop250CrawlerStrategy、ImdbTop250CrawlerStrategy、LetterboxdTop250CrawlerStrategy、BoxOfficeMojoCrawlerStrategy、TheNumbersCrawlerStrategy 和 WikipediaGrossingFilmsCrawlerStrategy 等类中。MovieCrawler 负责统一调度策略并对标题和年份相同的数据进行去重。")
    add_para(doc, "步骤5:实现异常体系和数据保存服务。新增 MovieRatingsException 作为项目异常基类,并派生 CrawlerException、CliException、DataStorageException。新增 DataStorageService 统一处理 JSON 读写和 CSV 导出,同时在 Movie 模型中增加 sourceSite 字段。")
    command_rows = [
        ["all", "爬取、保存、分析并生成图表", "mvn exec:java \"-Dexec.args=all 60\""],
        ["crawl", "执行多网站爬取并保存 JSON/CSV", "mvn exec:java \"-Dexec.args=crawl 18\""],
        ["analyze", "读取 JSON 并输出统计、生成图表", "mvn exec:java \"-Dexec.args=analyze\""],
        ["export", "从 JSON 重新导出 CSV", "mvn exec:java \"-Dexec.args=export\""],
        ["help", "输出命令帮助", "mvn exec:java \"-Dexec.args=help\""],
    ]
    captioned_table(
        "表3 CLI命令说明",
        ["命令", "功能", "示例"],
        command_rows,
        [Inches(1.0), Inches(2.6), Inches(2.8)],
    )
    pattern_rows = [
        ["Command 模式", "cli/command/*.java", "命令请求被封装为对象,便于新增命令"],
        ["策略模式", "crawler/strategy/*.java", "不同网站爬虫互相独立,可按需扩展"],
        ["自定义异常", "exception/*.java", "按项目、爬虫、命令、存储进行异常分层"],
        ["数据保存", "storage/DataStorageService.java", "统一 JSON、CSV 文件读写"],
        ["MVC 保留", "controller/service/repository/model", "原 Web 功能继续存在"],
    ]
    captioned_table(
        "表4 设计模式与异常体系实现",
        ["设计要求", "核心文件", "说明"],
        pattern_rows,
        [Inches(1.3), Inches(2.4), Inches(2.6)],
    )

    # Section 5: results and analysis.
    add_heading(doc, "五、实验结果与分析")
    add_para(doc, "运行 mvn exec:java \"-Dexec.args=crawl 18\" 后,程序按策略列表依次尝试访问多个电影数据来源。在当前网络状态下,最终成功写入 Douban Top 250、Box Office Mojo 和 The Numbers 三个来源的数据。单个网站失败时,程序通过 CrawlerException 捕获错误并继续执行其他策略,提高了爬虫整体鲁棒性。")
    source_rows = [
        [site, str(total), "已写入 movies_data.json"]
        for site, total in counts.items()
    ]
    captioned_table(
        "表5 多网站爬取来源统计",
        ["数据来源", "记录数", "保存状态"],
        source_rows,
        [Inches(2.4), Inches(1.0), Inches(2.6)],
    )
    add_para(doc, f"当前 movies_data.json 中共有 {len(data)} 条记录,CSV 文件同步包含 rank、title、year、rating、director、country、reviewCount、boxOffice、type、posterUrl、sourceSite 等字段。sourceSite 字段使后续检查能够明确判断数据是否来自多个网站。")

    # Each chart is optional: picture and caption are emitted only when the
    # image file is present.
    figures = (
        ("rating_distribution.png", "图1 评分分布柱状图"),
        ("year_rating_scatter.png", "图2 年份与评分关系散点图"),
    )
    for filename, caption in figures:
        image_path = ROOT / filename
        if image_path.exists():
            doc.add_picture(str(image_path), width=Inches(5.5))
            add_caption(doc, caption)

    output_rows = [
        ["单元测试", "mvn test", "6 个测试全部通过,0 failures,0 errors"],
        ["CLI帮助", "mvn exec:java \"-Dexec.args=help\"", "正常输出所有命令"],
        ["多站点爬取", "mvn exec:java \"-Dexec.args=crawl 18\"", "生成 JSON 与 CSV"],
        ["统计分析", "mvn exec:java \"-Dexec.args=analyze\"", "生成两张 PNG 图表"],
        ["实验报告", "学号-姓名-期末实验报告.docx", "已生成并通过渲染检查"],
    ]
    captioned_table(
        "表6 测试与输出文件清单",
        ["项目", "命令或文件", "结果"],
        output_rows,
        [Inches(1.4), Inches(2.7), Inches(2.1)],
    )

    # Section 6: summary.
    add_heading(doc, "六、实验总结")
    add_para(doc, "本实验在已有项目基础上完成了期末实验要求的系统化改造。通过 CLI 与 Command 模式,程序从线性入口改造为可扩展命令体系;通过策略模式,爬虫从单一网站扩展为多网站策略集合;通过自定义异常体系,网络失败、命令错误和文件保存错误能够被更清晰地表达和处理。")
    add_para(doc, "实验过程中坚持最小改动原则,原有 MVC、数据分析、图表生成和测试基础均被保留。最终程序能够完成数据爬取、文件保存、统计分析、图表输出和报告生成的完整流程,满足课程期末实验的功能性和结构性要求。")
def add_references(doc):
    """Append the reference list on a new page.

    Adds a page break, the "参考文献" heading, and one 11pt 宋体 paragraph
    (no first-line indent) per reference.

    Args:
        doc: Document to append to.
    """
    references = (
        "[1] Gamma E., Helm R., Johnson R., Vlissides J. Design Patterns: Elements of Reusable Object-Oriented Software. Addison-Wesley, 1994.",
        "[2] Spring Boot Reference Documentation. https://docs.spring.io/spring-boot/",
        "[3] Jsoup: Java HTML Parser Documentation. https://jsoup.org/",
        "[4] Apache Maven Project Documentation. https://maven.apache.org/",
        "[5] Freeman E., Robson E. Head First Design Patterns. O'Reilly Media, 2020.",
    )
    doc.add_page_break()
    add_heading(doc, "参考文献")
    for entry in references:
        add_para(doc, entry, None, "宋体", 11, False, False)
def build():
    """Generate the report document from the reference template.

    Loads ``REFERENCE``, strips everything after its cover, rewrites the
    cover, appends the catalog, the experiment body and the references,
    then saves the result to ``OUT`` and prints that path.

    Raises:
        FileNotFoundError: If the reference template does not exist.
    """
    template_missing = not REFERENCE.exists()
    if template_missing:
        raise FileNotFoundError("reference_report.docx not found. Copy the reference report into project first.")

    movie_records, site_counts = read_data()
    report = Document(str(REFERENCE))

    # Cover handling and catalog only need the document itself.
    for step in (clear_after_cover, modify_cover, add_catalog):
        step(report)
    add_single_experiment(report, movie_records, site_counts)
    add_references(report)

    report.save(OUT)
    print(OUT)


if __name__ == "__main__":
    build()