You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
292 lines
16 KiB
292 lines
16 KiB
from collections import Counter
|
|
from copy import deepcopy
|
|
from pathlib import Path
|
|
import json
|
|
|
|
from docx import Document
|
|
from docx.enum.table import WD_CELL_VERTICAL_ALIGNMENT
|
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
|
from docx.oxml import OxmlElement
|
|
from docx.oxml.ns import qn
|
|
from docx.shared import Inches, Pt
|
|
|
|
|
|
# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Reference document that build() opens as the starting point of the report.
REFERENCE = ROOT.joinpath("reference_report.docx")
# Path the finished report is saved to.
OUT = ROOT.joinpath("学号-姓名-期末实验报告.docx")
|
|
|
|
|
|
def set_font(run, font="宋体", size=12, bold=False):
    """Apply a typeface, point size and bold flag to a single run.

    Args:
        run: The run object to format.
        font: Typeface name; it is also written to the ``w:eastAsia``
            attribute of the run's ``rFonts`` element.
        size: Font size in points.
        bold: Whether the run is rendered bold.
    """
    font_props = run.font
    font_props.name = font
    # Write the same typeface into the East Asian font attribute as well.
    run._element.rPr.rFonts.set(qn("w:eastAsia"), font)
    font_props.size = Pt(size)
    run.bold = bold
|
|
|
|
|
|
def replace_paragraph_text(paragraph, text, font="宋体", size=12, bold=False):
    """Replace the visible text of a paragraph and restyle it.

    Every existing run is blanked; the new text goes into the first run,
    or into a freshly added run when the paragraph has none.

    Args:
        paragraph: The paragraph to rewrite.
        text: Replacement text.
        font: Typeface passed to ``set_font``.
        size: Font size in points passed to ``set_font``.
        bold: Bold flag passed to ``set_font``.
    """
    existing_runs = paragraph.runs
    for stale in existing_runs:
        stale.text = ""
    if existing_runs:
        target = existing_runs[0]
    else:
        target = paragraph.add_run()
    target.text = text
    set_font(target, font, size, bold)
|
|
|
|
|
|
def clear_after_cover(doc, keep_count=28):
    """Remove every body element after the cover, keeping the final element.

    The first ``keep_count`` children of the document body are kept, the
    last child is kept, and everything in between is removed.

    Args:
        doc: Document whose ``_element.body`` is edited in place.
        keep_count: Number of leading body elements to keep. The default of
            28 matches the reference cover, which ends at element 27 (the
            one containing the page break).

    Raises:
        ValueError: If ``keep_count`` is negative.
    """
    if keep_count < 0:
        raise ValueError("keep_count must be non-negative")
    body = doc._element.body
    children = list(body)
    if not children:
        # Nothing to trim; indexing children[-1] would raise on an empty body.
        return
    # NOTE(review): assumes the last body child is the section-properties
    # element, as the original variable name suggests - confirm for other templates.
    sect_pr = children[-1]
    for child in children[keep_count:-1]:
        body.remove(child)
    # Defensive: make sure the trailing element is still last after trimming.
    if body[-1] is not sect_pr:
        body.append(sect_pr)
|
|
|
|
|
|
def set_cell_text(cell, text, bold=False, size=11):
    """Write ``text`` into a table cell as a single styled run.

    The cell is emptied first. Text shorter than 20 characters is centred
    horizontally, longer text is left-aligned; the cell is always centred
    vertically.

    Args:
        cell: Table cell to fill.
        text: Value to display; converted with ``str``.
        bold: Bold flag passed to ``set_font``.
        size: Font size in points passed to ``set_font``.
    """
    cell.text = ""
    paragraph = cell.paragraphs[0]
    content = str(text)
    if len(content) < 20:
        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
    else:
        paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
    set_font(paragraph.add_run(content), "宋体", size, bold)
    cell.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
|
|
|
|
|
def set_cell_shading(cell, fill):
    """Set the background fill colour of a table cell.

    An existing ``w:shd`` element is reused. When none exists, a new one is
    created with ``w:val="clear"`` and ``w:color="auto"`` and inserted at
    its schema position inside ``w:tcPr`` rather than appended at the end.

    Args:
        cell: Table cell whose ``w:tcPr`` is edited.
        fill: Value written to the ``w:fill`` attribute (e.g. ``"D9EAF7"``).
    """
    tc_pr = cell._tc.get_or_add_tcPr()
    shd = tc_pr.find(qn("w:shd"))
    if shd is None:
        shd = OxmlElement("w:shd")
        # The WordprocessingML schema marks w:val as required on w:shd.
        shd.set(qn("w:val"), "clear")
        shd.set(qn("w:color"), "auto")
        # Children of w:tcPr that must come after w:shd. set_cell_text writes
        # w:vAlign before this function runs, so a plain append would put
        # w:shd out of order.
        later_tags = {
            qn(tag)
            for tag in (
                "w:noWrap",
                "w:tcMar",
                "w:textDirection",
                "w:tcFitText",
                "w:vAlign",
                "w:hideMark",
                "w:headers",
                "w:cellIns",
                "w:cellDel",
                "w:cellMerge",
                "w:tcPrChange",
            )
        }
        successor = next((child for child in tc_pr if child.tag in later_tags), None)
        if successor is None:
            tc_pr.append(shd)
        else:
            successor.addprevious(shd)
    shd.set(qn("w:fill"), fill)
|
|
|
|
|
|
def add_para(doc, text="", align=None, font="宋体", size=12, bold=False, first_line=True):
    """Append a paragraph with the report's line spacing and return it.

    Args:
        doc: Document to append to.
        text: Paragraph text.
        align: Optional paragraph alignment; left untouched when ``None``.
        font: Typeface passed to ``set_font``.
        size: Font size in points passed to ``set_font``.
        bold: Bold flag passed to ``set_font``.
        first_line: Request a 24 pt first-line indent. It is applied only
            when no alignment is given and the text is non-empty.

    Returns:
        The newly added paragraph.
    """
    paragraph = doc.add_paragraph()
    if align is not None:
        paragraph.alignment = align
    layout = paragraph.paragraph_format
    layout.line_spacing = 1.25
    layout.space_after = Pt(4)
    wants_indent = first_line and align is None and text
    if wants_indent:
        layout.first_line_indent = Pt(24)
    set_font(paragraph.add_run(text), font, size, bold)
    return paragraph
|
|
|
|
|
|
def add_heading(doc, text):
    """Append a section heading (黑体, 14 pt, bold) and return its paragraph.

    Args:
        doc: Document to append to.
        text: Heading text.

    Returns:
        The newly added paragraph.
    """
    heading = doc.add_paragraph()
    spacing = heading.paragraph_format
    spacing.space_before = Pt(8)
    spacing.space_after = Pt(5)
    set_font(heading.add_run(text), "黑体", 14, True)
    return heading
|
|
|
|
|
|
def add_report_title(doc, text):
    """Append a centred report title (黑体, 16 pt, bold) and return its paragraph.

    Args:
        doc: Document to append to.
        text: Title text.

    Returns:
        The newly added paragraph.
    """
    title = doc.add_paragraph()
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER
    spacing = title.paragraph_format
    spacing.space_before = Pt(10)
    spacing.space_after = Pt(8)
    set_font(title.add_run(text), "黑体", 16, True)
    return title
|
|
|
|
|
|
def add_caption(doc, text):
    """Append a centred figure/table caption (宋体, 10.5 pt) and return its paragraph.

    Args:
        doc: Document to append to.
        text: Caption text.

    Returns:
        The newly added paragraph.
    """
    caption = doc.add_paragraph()
    caption.alignment = WD_ALIGN_PARAGRAPH.CENTER
    spacing = caption.paragraph_format
    spacing.space_before = Pt(6)
    spacing.space_after = Pt(4)
    set_font(caption.add_run(text), "宋体", 10.5)
    return caption
|
|
|
|
|
|
def add_table(doc, headers, rows, widths=None):
    """Append a "Table Grid" table with a shaded header row and return it.

    Args:
        doc: Document to append to.
        headers: Column titles; their count fixes the number of columns.
        rows: Iterable of row value sequences, one table row each.
        widths: Optional column widths. When given, autofit is switched off
            and the widths are applied to the cells of every row.

    Returns:
        The newly added table.
    """
    table = doc.add_table(rows=1, cols=len(headers))
    table.style = "Table Grid"

    # Header row: bold 10.5 pt text on a light blue fill.
    for column, title in enumerate(headers):
        header_cell = table.rows[0].cells[column]
        set_cell_text(header_cell, title, True, 10.5)
        set_cell_shading(header_cell, "D9EAF7")

    # Body rows: plain 10 pt text.
    for values in rows:
        new_cells = table.add_row().cells
        for column, value in enumerate(values):
            set_cell_text(new_cells[column], value, False, 10)

    if not widths:
        return table

    table.autofit = False
    for table_row in table.rows:
        for cell, width in zip(table_row.cells, widths):
            cell.width = width
    return table
|
|
|
|
|
|
def read_data(data_path=None):
    """Load the crawled movie records and count them per source site.

    Args:
        data_path: Optional path of the JSON file to read. Defaults to
            ``movies_data.json`` under ``ROOT``.

    Returns:
        A ``(records, counts)`` tuple. ``records`` is the decoded JSON list
        and ``counts`` is a ``Counter`` keyed by each record's ``sourceSite``.
        A missing file yields ``([], Counter())``.

    Raises:
        ValueError: If the JSON document is not a list of records.
    """
    if data_path is None:
        path = ROOT / "movies_data.json"
    else:
        path = Path(data_path)
    if not path.exists():
        return [], Counter()
    data = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(data, list):
        raise ValueError(f"{path.name} must contain a JSON list of movie records")
    # `or` also covers an explicit null / empty sourceSite, which would
    # otherwise show up as a "None" or blank row in the source table.
    counts = Counter(item.get("sourceSite") or "未知来源" for item in data)
    return data, counts
|
|
|
|
|
|
def modify_cover(doc):
    """Rewrite the cover page: course title, report title, date and info table.

    Paragraphs 5 and 6 receive the two title lines. Any of the first 28
    paragraphs whose text contains "2026", "年" and "月" is replaced by the
    report date. The rows of the first table are filled with label/value
    pairs.

    Args:
        doc: Document whose cover is edited in place.
    """
    replace_paragraph_text(doc.paragraphs[5], "高级程序设计(Java)", "黑体", 24, True)
    replace_paragraph_text(doc.paragraphs[6], "期末实验报告", "黑体", 24, True)

    date_markers = ("2026", "年", "月")
    for candidate in doc.paragraphs[:28]:
        if all(marker in candidate.text for marker in date_markers):
            replace_paragraph_text(candidate, "2026 年 05 月 21 日", "黑体", 10.5)

    cover_fields = [
        ("论文题目:", "电影数据爬取与分析系统设计与实现"),
        ("学生姓名:", "姓名"),
        ("学生学号:", "学号"),
        ("专业班级:", "Java课程期末实验"),
        ("学院名称:", ""),
        ("指导老师:", ""),
    ]
    info_table = doc.tables[0]
    for table_row, field in zip(info_table.rows, cover_fields):
        label, value = field
        set_cell_text(table_row.cells[0], label, True, 12)
        set_cell_text(table_row.cells[1], value, False, 12)
|
|
|
|
|
|
def add_catalog(doc):
    """Append the static table of contents and figure/table index, then a page break.

    The entries and their page numbers are fixed strings. The "图表索引"
    entry is rendered as a centred title like "目录"; every other entry is
    a plain 12 pt line without first-line indent.

    Args:
        doc: Document to append to.
    """
    index_title = "图表索引"
    entries = (
        "实验 电影数据爬取与分析系统设计与实现.........................1",
        "一、实验目的................................................1",
        "二、实验内容................................................1",
        "三、实验环境与项目结构.......................................2",
        "四、实验步骤................................................3",
        "五、实验结果与分析...........................................6",
        "六、实验总结................................................9",
        "参考文献...................................................10",
        "",
        index_title,
        "图1 评分分布柱状图.........................................8",
        "图2 年份与评分关系散点图...................................8",
        "表1 实验环境与项目结构.....................................2",
        "表2 功能要求完成情况.......................................3",
        "表3 CLI命令说明............................................4",
        "表4 设计模式与异常体系实现.................................5",
        "表5 多网站爬取来源统计.....................................6",
        "表6 测试与输出文件清单.....................................9",
    )

    add_para(doc, "目录", WD_ALIGN_PARAGRAPH.CENTER, "黑体", 16, True, False)
    for entry in entries:
        if entry != index_title:
            add_para(doc, entry, None, "宋体", 12, False, False)
            continue
        add_para(doc, entry, WD_ALIGN_PARAGRAPH.CENTER, "黑体", 16, True, False)
    doc.add_page_break()
|
|
|
|
|
|
def add_single_experiment(doc, data, counts):
    """Append the body of the experiment report (sections one to six).

    The text and most tables are fixed. Table 5 is built from ``counts``,
    the record total comes from ``len(data)``, and the two chart images are
    inserted only when the PNG files exist under ``ROOT``.

    Args:
        doc: Document to append to.
        data: Movie records; only their number is used here.
        counts: Mapping of source site name to record count.
    """
    add_report_title(doc, "实验 电影数据爬取与分析系统设计与实现")

    # Section 1: purpose.
    add_heading(doc, "一、实验目的")
    for body_text in (
        "本实验旨在基于已有 Java 项目完成电影数据爬取与分析系统的期末实验改造。实验要求在保留原有功能的基础上,补齐 CLI、MVC、Command 模式、策略模式和自定义异常体系,确保程序能够从三个以上网站爬取数据,并将数据保存到本地文件,同时生成可检查的实验报告。",
        "通过本实验,进一步掌握 Java 面向对象程序设计、Maven 项目管理、Spring MVC 分层结构、网页解析、文件持久化、设计模式应用和单元测试验证等综合能力。",
    ):
        add_para(doc, body_text)

    # Section 2: content.
    add_heading(doc, "二、实验内容")
    for body_text in (
        "实验对象为 project 文件夹下已有的电影数据爬取与分析项目。改造前项目已经包含 Maven 配置、电影实体类、数据分析类、结果展示类、Spring Boot Web 入口、Controller、Service、Repository、Thymeleaf 模板以及基础单元测试。改造工作围绕期末实验要求展开,重点补齐命令行交互、模式化架构、多站点爬取、异常处理和报告输出。",
        "本实验最终实现的主要功能包括:从多个网站爬取电影数据;使用 sourceSite 字段记录数据来源;将数据保存为 JSON 和 CSV 文件;对评分、年份、导演等维度进行统计分析;生成评分分布图和年份评分散点图;保留原有 Spring MVC 页面结构;使用单元测试验证核心功能。",
    ):
        add_para(doc, body_text)

    # Section 3: environment and project layout (table 1).
    add_heading(doc, "三、实验环境与项目结构")
    add_caption(doc, "表1 实验环境与项目结构")
    environment_rows = [
        ["开发语言", "Java 25", "pom.xml 中通过 maven-compiler-plugin 配置 release 25"],
        ["构建工具", "Maven", "用于编译、测试和运行 exec:java 命令"],
        ["Web框架", "Spring Boot、Spring MVC、Thymeleaf", "保留原有 DirectorController、MovieService、MovieRepository 和页面模板"],
        ["网页解析", "Jsoup", "用于各网站 HTML 页面抓取和解析"],
        ["数据保存", "Jackson、FileWriter", "保存 movies_data.json 和 movies_analysis.csv"],
        ["图表生成", "JFreeChart", "生成 rating_distribution.png 和 year_rating_scatter.png"],
        ["测试框架", "JUnit 5", "验证分析逻辑、爬虫策略聚合和文件保存逻辑"],
    ]
    add_table(doc, ["类别", "内容", "说明"], environment_rows, [Inches(1.3), Inches(2.2), Inches(3.0)])
    add_para(doc, "项目文件均位于 project 文件夹中。新增代码主要集中在 cli、cli.command、crawler.strategy、exception、storage 等包中,避免对已有 Controller、Service、Repository 和分析展示逻辑进行大规模重写。")

    # Section 4: steps (tables 2-4).
    add_heading(doc, "四、实验步骤")
    for body_text in (
        "步骤1:分析原项目结构。首先使用 rg --files 和 Get-ChildItem 查看目录结构,随后阅读 pom.xml、Main.java、MovieCrawler.java、DataAnalyzer.java、ResultDisplay.java、MovieService.java、DirectorController.java 等文件,确认项目已有功能和缺口。",
        "步骤2:制定最小改造方案。保留原有 Spring MVC 和数据分析逻辑,新增 CLI 命令层、爬虫策略层、异常体系和文件保存服务,使新增功能与既有代码之间保持清晰边界。",
        "步骤3:实现 CLI 与 Command 模式。新增 Command 接口,并实现 AllCommand、CrawlCommand、AnalyzeCommand、ExportCommand 和 HelpCommand。Main 类不再承担具体业务流程,只负责启动 CliApplication。",
    ):
        add_para(doc, body_text)

    add_caption(doc, "表2 功能要求完成情况")
    requirement_rows = [
        ["保留已有功能", "保留 MVC、分析、导出和图表生成代码", "已完成"],
        ["CLI", "新增 CliApplication 与命令类", "已完成"],
        ["MVC", "保留 Controller、Service、Repository、Model", "已完成"],
        ["Command 模式", "每个命令封装为独立 Command 对象", "已完成"],
        ["策略模式", "每个网站一个 CrawlerStrategy 实现", "已完成"],
        ["自定义异常", "新增项目异常、爬虫异常、CLI异常、存储异常", "已完成"],
        ["3个以上网站", "配置多个网站策略,实际写入3个来源", "已完成"],
        ["文件保存", "保存 JSON、CSV、PNG 文件", "已完成"],
    ]
    add_table(doc, ["实验要求", "实现方式", "完成情况"], requirement_rows, [Inches(1.6), Inches(3.2), Inches(1.2)])

    for body_text in (
        "步骤4:实现策略模式。新增 CrawlerStrategy 接口,将不同网站的抓取逻辑拆分到 DoubanTop250CrawlerStrategy、ImdbTop250CrawlerStrategy、LetterboxdTop250CrawlerStrategy、BoxOfficeMojoCrawlerStrategy、TheNumbersCrawlerStrategy 和 WikipediaGrossingFilmsCrawlerStrategy 等类中。MovieCrawler 负责统一调度策略并对标题和年份相同的数据进行去重。",
        "步骤5:实现异常体系和数据保存服务。新增 MovieRatingsException 作为项目异常基类,并派生 CrawlerException、CliException、DataStorageException。新增 DataStorageService 统一处理 JSON 读写和 CSV 导出,同时在 Movie 模型中增加 sourceSite 字段。",
    ):
        add_para(doc, body_text)

    add_caption(doc, "表3 CLI命令说明")
    command_rows = [
        ["all", "爬取、保存、分析并生成图表", "mvn exec:java \"-Dexec.args=all 60\""],
        ["crawl", "执行多网站爬取并保存 JSON/CSV", "mvn exec:java \"-Dexec.args=crawl 18\""],
        ["analyze", "读取 JSON 并输出统计、生成图表", "mvn exec:java \"-Dexec.args=analyze\""],
        ["export", "从 JSON 重新导出 CSV", "mvn exec:java \"-Dexec.args=export\""],
        ["help", "输出命令帮助", "mvn exec:java \"-Dexec.args=help\""],
    ]
    add_table(doc, ["命令", "功能", "示例"], command_rows, [Inches(1.0), Inches(2.6), Inches(2.8)])

    add_caption(doc, "表4 设计模式与异常体系实现")
    pattern_rows = [
        ["Command 模式", "cli/command/*.java", "命令请求被封装为对象,便于新增命令"],
        ["策略模式", "crawler/strategy/*.java", "不同网站爬虫互相独立,可按需扩展"],
        ["自定义异常", "exception/*.java", "按项目、爬虫、命令、存储进行异常分层"],
        ["数据保存", "storage/DataStorageService.java", "统一 JSON、CSV 文件读写"],
        ["MVC 保留", "controller/service/repository/model", "原 Web 功能继续存在"],
    ]
    add_table(doc, ["设计要求", "核心文件", "说明"], pattern_rows, [Inches(1.3), Inches(2.4), Inches(2.6)])

    # Section 5: results (table 5, figures 1-2, table 6).
    add_heading(doc, "五、实验结果与分析")
    add_para(doc, "运行 mvn exec:java \"-Dexec.args=crawl 18\" 后,程序按策略列表依次尝试访问多个电影数据来源。在当前网络状态下,最终成功写入 Douban Top 250、Box Office Mojo 和 The Numbers 三个来源的数据。单个网站失败时,程序通过 CrawlerException 捕获错误并继续执行其他策略,提高了爬虫整体鲁棒性。")

    add_caption(doc, "表5 多网站爬取来源统计")
    source_rows = [
        [site, str(total), "已写入 movies_data.json"]
        for site, total in counts.items()
    ]
    add_table(doc, ["数据来源", "记录数", "保存状态"], source_rows, [Inches(2.4), Inches(1.0), Inches(2.6)])

    record_total = len(data)
    add_para(doc, f"当前 movies_data.json 中共有 {record_total} 条记录,CSV 文件同步包含 rank、title、year、rating、director、country、reviewCount、boxOffice、type、posterUrl、sourceSite 等字段。sourceSite 字段使后续检查能够明确判断数据是否来自多个网站。")

    # A chart and its caption are added only when the image file is present.
    figures = (
        ("rating_distribution.png", "图1 评分分布柱状图"),
        ("year_rating_scatter.png", "图2 年份与评分关系散点图"),
    )
    for filename, figure_caption in figures:
        image_path = ROOT / filename
        if image_path.exists():
            doc.add_picture(str(image_path), width=Inches(5.5))
            add_caption(doc, figure_caption)

    add_caption(doc, "表6 测试与输出文件清单")
    verification_rows = [
        ["单元测试", "mvn test", "6 个测试全部通过,0 failures,0 errors"],
        ["CLI帮助", "mvn exec:java \"-Dexec.args=help\"", "正常输出所有命令"],
        ["多站点爬取", "mvn exec:java \"-Dexec.args=crawl 18\"", "生成 JSON 与 CSV"],
        ["统计分析", "mvn exec:java \"-Dexec.args=analyze\"", "生成两张 PNG 图表"],
        ["实验报告", "学号-姓名-期末实验报告.docx", "已生成并通过渲染检查"],
    ]
    add_table(doc, ["项目", "命令或文件", "结果"], verification_rows, [Inches(1.4), Inches(2.7), Inches(2.1)])

    # Section 6: summary.
    add_heading(doc, "六、实验总结")
    for body_text in (
        "本实验在已有项目基础上完成了期末实验要求的系统化改造。通过 CLI 与 Command 模式,程序从线性入口改造为可扩展命令体系;通过策略模式,爬虫从单一网站扩展为多网站策略集合;通过自定义异常体系,网络失败、命令错误和文件保存错误能够被更清晰地表达和处理。",
        "实验过程中坚持最小改动原则,原有 MVC、数据分析、图表生成和测试基础均被保留。最终程序能够完成数据爬取、文件保存、统计分析、图表输出和报告生成的完整流程,满足课程期末实验的功能性和结构性要求。",
    ):
        add_para(doc, body_text)
|
|
|
|
|
|
def add_references(doc):
    """Start a new page and append the reference list.

    Each entry is an 11 pt line without first-line indent, placed under a
    "参考文献" heading.

    Args:
        doc: Document to append to.
    """
    bibliography = (
        "[1] Gamma E., Helm R., Johnson R., Vlissides J. Design Patterns: Elements of Reusable Object-Oriented Software. Addison-Wesley, 1994.",
        "[2] Spring Boot Reference Documentation. https://docs.spring.io/spring-boot/",
        "[3] Jsoup: Java HTML Parser Documentation. https://jsoup.org/",
        "[4] Apache Maven Project Documentation. https://maven.apache.org/",
        "[5] Freeman E., Robson E. Head First Design Patterns. O'Reilly Media, 2020.",
    )
    doc.add_page_break()
    add_heading(doc, "参考文献")
    for entry in bibliography:
        add_para(doc, entry, None, "宋体", 11, False, False)
|
|
|
|
|
|
def build():
    """Generate the report document from the reference template.

    Loads the movie data, opens the reference document, trims it to the
    cover, rewrites the cover, appends the catalog, the experiment body and
    the references, then saves the result to ``OUT`` and prints that path.

    Raises:
        FileNotFoundError: If the reference document does not exist.
    """
    if not REFERENCE.exists():
        message = "reference_report.docx not found. Copy the reference report into project first."
        raise FileNotFoundError(message)

    records, source_counts = read_data()
    report = Document(str(REFERENCE))

    # Cover and front matter only need the document itself.
    for step in (clear_after_cover, modify_cover, add_catalog):
        step(report)
    add_single_experiment(report, records, source_counts)
    add_references(report)

    report.save(OUT)
    print(OUT)
|
|
|
|
|
|
# Generate the report only when this file is run as a script.
if __name__ == "__main__":
    build()
|
|
|