From cdd982888fc49b70ef968ce1077a70b148544908 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Kurowski?= Date: Thu, 11 Dec 2025 19:16:14 +0100 Subject: [PATCH 01/13] Integrated basic MD2DOCX code --- LICENSE.MD2DOCX | 23 +++++++++++ examples/generate_so_list.sh | 3 +- requirements.txt | 3 +- setup.py | 4 +- templateprocessor/cli.py | 44 +++++++++++++++++--- templateprocessor/md2docx.py | 51 +++++++++++++++++++++++ templateprocessor/postprocessor.py | 66 ++++++++++++++++++++++++++++++ 7 files changed, 186 insertions(+), 8 deletions(-) create mode 100644 LICENSE.MD2DOCX create mode 100644 templateprocessor/md2docx.py create mode 100644 templateprocessor/postprocessor.py diff --git a/LICENSE.MD2DOCX b/LICENSE.MD2DOCX new file mode 100644 index 0000000..c643323 --- /dev/null +++ b/LICENSE.MD2DOCX @@ -0,0 +1,23 @@ +This license applies to templateprocessor/md2docx.py file + +MIT License + +Copyright (c) 2024 Shlok T + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/examples/generate_so_list.sh b/examples/generate_so_list.sh index 49ad0eb..6f4137a 100755 --- a/examples/generate_so_list.sh +++ b/examples/generate_so_list.sh @@ -1,4 +1,5 @@ #!/bin/bash mkdir -p output template-processor --verbosity info --system-objects ../data/events.csv -o output -t so_list.tmplt -pandoc --pdf-engine=pdfroff --output=output/so_list.pdf output/so_list.md \ No newline at end of file +pandoc --pdf-engine=pdfroff --output=output/so_list.pdf output/so_list.md +template-processor --verbosity info --system-objects ../data/events.csv -o output -t so_list.tmplt -p md2docx diff --git a/requirements.txt b/requirements.txt index 0c82c2b..6a946b4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ pytest==7.4.2 black==24.3.0 mako==1.3.10 - +python-docx==1.2.0 +bs4==0.0.2 diff --git a/setup.py b/setup.py index 55d60ff..a36c1f9 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,9 @@ include_package_data=True, python_requires='>=3.8', install_requires=[ - "mako==1.3.10" + "mako==1.3.10", + "python-docx==1.2.0", + "bs4==0.0.2" ], extras_require={ 'dev': [ diff --git a/templateprocessor/cli.py b/templateprocessor/cli.py index e8a18ea..6a42119 100644 --- a/templateprocessor/cli.py +++ b/templateprocessor/cli.py @@ -14,6 +14,12 @@ from templateprocessor.soreader import SOReader from templateprocessor.dvreader import DVReader from templateprocessor.so import SystemObjectType +from templateprocessor.postprocessor import ( + PostprocessorType, + Md2docxPostprocessor, + PassthroughPostprocessor, + Postprocessor, +) def parse_arguments() -> argparse.Namespace: @@ -107,6 +113,16 @@ def get_log_level(level_str: str) -> int: return log_levels.get(level_str.lower(), logging.WARNING) +def get_postprocessor_type(type_str: str) -> PostprocessorType: + types = { + PostprocessorType.NONE.value: PostprocessorType.NONE, + PostprocessorType.HTML2DOCX.value: PostprocessorType.HTML2DOCX, + PostprocessorType.MD2DOCX.value: PostprocessorType.MD2DOCX, + } + + return types.get(type_str.lower(), PostprocessorType.NONE) + + def get_values_dictionary(values: list[str]) -> dict[str, str]: if not values or not isinstance(values, list): return {} @@ -143,8 +159,10 @@ def read_sots(file_names: list[str]) -> dict[str, SystemObjectType]: def instantiate( instantiator: TemplateInstantiator, + postprocessor: Postprocessor, template_file: str, module_directory: str, + type: PostprocessorType, output_directory: str, ): try: @@ -157,10 +175,9 @@ def instantiate( logging.debug(f"Instantiating template:\n {template}") instantiated_template = instantiator.instantiate(template, module_directory) logging.debug(f"Instantiation:\n {instantiated_template}") - output = Path(output_directory) / f"{name}.md" - logging.debug(f"Saving to {output}") - with open(output, "w") as f: - f.write(instantiated_template) + output = Path(output_directory) / f"{name}" + logging.debug(f"Postprocessing") + postprocessor.process(type, instantiated_template, output) except FileNotFoundError as e: logging.error(f"File not found: {e.filename}") except Exception as e: @@ -173,6 +190,7 @@ def main(): args = parse_arguments() logging_level = get_log_level(args.verbosity) logging.basicConfig(level=logging_level) + type = get_postprocessor_type(args.postprocess) logging.info("Template Processor") logging.debug(f"Interface View: {args.iv}") @@ -182,6 +200,7 @@ def main(): logging.debug(f"Templates: {args.template}") logging.debug(f"Output Directory: {args.output}") logging.debug(f"Module directory: {args.module_directory}") + logging.debug(f"Postprocessing: {type.value}") logging.info(f"Reading Interface View from {args.iv}") iv = IVReader().read(args.iv) if args.iv else InterfaceView() @@ -198,10 +217,25 @@ def main(): logging.info(f"Instantiating the TemplateInstantiator") instantiator = TemplateInstantiator(iv, dv, sots, values) + logging.info(f"Instantiating the Postprocessor") + postprocessor = Postprocessor( + { + PostprocessorType.NONE: PassthroughPostprocessor(), + PostprocessorType.MD2DOCX: Md2docxPostprocessor(), + } + ) + if args.template: logging.info(f"Instantiating templates") for template_file in args.template: - instantiate(instantiator, template_file, args.module_directory, args.output) + instantiate( + instantiator, + postprocessor, + template_file, + args.module_directory, + type, + args.output, + ) return 0 diff --git a/templateprocessor/md2docx.py b/templateprocessor/md2docx.py new file mode 100644 index 0000000..7c68a49 --- /dev/null +++ b/templateprocessor/md2docx.py @@ -0,0 +1,51 @@ +""" + +Markdown to DOCX conversion module extracted from md2docx-python project. + +Project address: https://github.com/shloktech/md2docx-python/ +Project LICENSE: LICENSE.MD2DOCX + +The reason for extraction is to align the API and features with the needs + +""" + +import markdown +from docx import Document +from bs4 import BeautifulSoup + + +def markdown_to_word(markdown_source, word_file_path): + # Converting Markdown to HTML + html_content = markdown.markdown(markdown_source) + + # Creating a new Word Document + doc = Document() + + # Converting HTML to text and add it to the Word Document + soup = BeautifulSoup(html_content, "html.parser") + + # Adding content to the Word Document + for element in soup: + if element.name == "h1": + doc.add_heading(element.text, level=1) + elif element.name == "h2": + doc.add_heading(element.text, level=2) + elif element.name == "h3": + doc.add_heading(element.text, level=3) + elif element.name == "p": + paragraph = doc.add_paragraph() + for child in element.children: + if child.name == "strong": + paragraph.add_run(child.text).bold = True + elif child.name == "em": + paragraph.add_run(child.text).italic = True + else: + paragraph.add_run(child) + elif element.name == "ul": + for li in element.find_all("li"): + doc.add_paragraph(li.text.strip(), style="List Bullet") + elif element.name == "ol": + for li in element.find_all("li"): + doc.add_paragraph(li.text.strip(), style="List Number") + + doc.save(word_file_path) diff --git a/templateprocessor/postprocessor.py b/templateprocessor/postprocessor.py new file mode 100644 index 0000000..f82777a --- /dev/null +++ b/templateprocessor/postprocessor.py @@ -0,0 +1,66 @@ +""" +Postprocessor. + +This module is responsible for postprocessing the instantiated text into the target format. +""" + +from enum import Enum +from pathlib import Path +from templateprocessor import md2docx +from abc import ABC, abstractmethod +from typing import Dict + + +class PostprocessorType(Enum): + NONE = "none" + HTML2DOCX = "html2docx" + MD2DOCX = "md2docx" + + +class AbstractPostprocessor(ABC): + + @abstractmethod + def process(self, text: str, base_file_name: str) -> None: + """ + Process the input text and write to output file. + + Args: + text: Input text string to process + base_file_name: Path to output file, without extension + """ + pass + + +class Md2docxPostprocessor(AbstractPostprocessor): + + def process(self, text: str, base_file_name: str) -> None: + output_file_name = f"{base_file_name}.docx" + md2docx.markdown_to_word(text, output_file_name) + + +class PassthroughPostprocessor(AbstractPostprocessor): + + def process(self, text: str, base_file_name: str) -> None: + output_file_name = f"{base_file_name}.md" + with open(output_file_name, "w") as f: + f.write(text) + + +class Postprocessor: + registry: Dict[PostprocessorType, AbstractPostprocessor] + + def __init__(self, registry: Dict[PostprocessorType, AbstractPostprocessor]): + self.registry = registry + + def process(self, type: PostprocessorType, text: str, base_file_name: str) -> None: + """ + Process the input text and write to output file based on processor type. + + Args: + type: Desired postprocessor type + text: Input text string to process + base_file_name: Path to output file, without extension + """ + if not type in self.registry.keys(): + raise ValueError("Not supported postprocessor {type.value}") + self.registry[type].process(text, base_file_name) From a66b0078a857870061dc5efede70580489424cfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Kurowski?= Date: Thu, 11 Dec 2025 20:11:14 +0100 Subject: [PATCH 02/13] Switched to Markdown2 and added HTML output --- requirements.txt | 1 + setup.py | 3 ++- templateprocessor/cli.py | 7 ++++-- templateprocessor/md2docx.py | 35 +++++++++++++++++++++++++++--- templateprocessor/postprocessor.py | 11 ++++++++++ 5 files changed, 51 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6a946b4..06fb71a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ black==24.3.0 mako==1.3.10 python-docx==1.2.0 bs4==0.0.2 +markdown2==2.5.4 \ No newline at end of file diff --git a/setup.py b/setup.py index a36c1f9..60511db 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,8 @@ install_requires=[ "mako==1.3.10", "python-docx==1.2.0", - "bs4==0.0.2" + "bs4==0.0.2", + "markdown2==2.5.4" ], extras_require={ 'dev': [ diff --git a/templateprocessor/cli.py b/templateprocessor/cli.py index 6a42119..cdcb0d5 100644 --- a/templateprocessor/cli.py +++ b/templateprocessor/cli.py @@ -17,6 +17,7 @@ from templateprocessor.postprocessor import ( PostprocessorType, Md2docxPostprocessor, + Md2HtmlPostprocessor, PassthroughPostprocessor, Postprocessor, ) @@ -94,7 +95,7 @@ def parse_arguments() -> argparse.Namespace: parser.add_argument( "-p", "--postprocess", - choices=["none", "md2docx"], + choices=["none", "md2docx", "md2html"], help="Output postprocessing", default="none", ) @@ -118,6 +119,7 @@ def get_postprocessor_type(type_str: str) -> PostprocessorType: PostprocessorType.NONE.value: PostprocessorType.NONE, PostprocessorType.HTML2DOCX.value: PostprocessorType.HTML2DOCX, PostprocessorType.MD2DOCX.value: PostprocessorType.MD2DOCX, + PostprocessorType.MD2HTML.value: PostprocessorType.MD2HTML, } return types.get(type_str.lower(), PostprocessorType.NONE) @@ -175,7 +177,7 @@ def instantiate( logging.debug(f"Instantiating template:\n {template}") instantiated_template = instantiator.instantiate(template, module_directory) logging.debug(f"Instantiation:\n {instantiated_template}") - output = Path(output_directory) / f"{name}" + output = str(Path(output_directory) / f"{name}") logging.debug(f"Postprocessing") postprocessor.process(type, instantiated_template, output) except FileNotFoundError as e: @@ -222,6 +224,7 @@ def main(): { PostprocessorType.NONE: PassthroughPostprocessor(), PostprocessorType.MD2DOCX: Md2docxPostprocessor(), + PostprocessorType.MD2HTML: Md2HtmlPostprocessor(), } ) diff --git a/templateprocessor/md2docx.py b/templateprocessor/md2docx.py index 7c68a49..5f52a45 100644 --- a/templateprocessor/md2docx.py +++ b/templateprocessor/md2docx.py @@ -5,18 +5,22 @@ Project address: https://github.com/shloktech/md2docx-python/ Project LICENSE: LICENSE.MD2DOCX -The reason for extraction is to align the API and features with the needs +The reason for extraction is to align the API and features with the needs. +Changes: +- input is text, not file +- markdown2 is used instead of markdown +- table support is added via markdown2 extras and additional HTML processing """ -import markdown +import markdown2 from docx import Document from bs4 import BeautifulSoup def markdown_to_word(markdown_source, word_file_path): # Converting Markdown to HTML - html_content = markdown.markdown(markdown_source) + html_content = markdown2.markdown(markdown_source, extras=["tables", "wiki-tables"]) # Creating a new Word Document doc = Document() @@ -47,5 +51,30 @@ def markdown_to_word(markdown_source, word_file_path): elif element.name == "ol": for li in element.find_all("li"): doc.add_paragraph(li.text.strip(), style="List Number") + elif element.name == "table": + rows_data = [] + for row in element.find_all("tr"): + cells = row.find_all(["th", "td"]) + row_data = [cell.get_text(strip=True) for cell in cells] + if row_data: + rows_data.append(row_data) + + if rows_data: + columns_count = len(rows_data[0]) + table = doc.add_table(rows=len(rows_data), cols=columns_count) + table.style = "Table Grid" + + for row_index, row_data in enumerate(rows_data): + for column_index, cell_text in enumerate(row_data): + if column_index < columns_count: + table.rows[row_index].cells[column_index].text = cell_text + + # Make the first row bold if it is a header + first_row = element.find("tr") + if first_row and first_row.find("th"): + for cell in table.rows[0].cells: + for paragraph in cell.paragraphs: + for run in paragraph.runs: + run.bold = True doc.save(word_file_path) diff --git a/templateprocessor/postprocessor.py b/templateprocessor/postprocessor.py index f82777a..549ab30 100644 --- a/templateprocessor/postprocessor.py +++ b/templateprocessor/postprocessor.py @@ -9,10 +9,12 @@ from templateprocessor import md2docx from abc import ABC, abstractmethod from typing import Dict +import markdown2 class PostprocessorType(Enum): NONE = "none" + MD2HTML = "md2html" HTML2DOCX = "html2docx" MD2DOCX = "md2docx" @@ -38,6 +40,15 @@ def process(self, text: str, base_file_name: str) -> None: md2docx.markdown_to_word(text, output_file_name) +class Md2HtmlPostprocessor(AbstractPostprocessor): + + def process(self, text: str, base_file_name: str) -> None: + output_file_name = f"{base_file_name}.html" + html_content = markdown2.markdown(text, extras=["tables", "wiki-tables"]) + with open(output_file_name, "w") as f: + f.write(html_content) + + class PassthroughPostprocessor(AbstractPostprocessor): def process(self, text: str, base_file_name: str) -> None: From fa937fc33165a5bfe58b7472fc37e65546d88cc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Kurowski?= Date: Thu, 11 Dec 2025 20:11:28 +0100 Subject: [PATCH 03/13] Demo script refactor --- examples/generate_ecss_demo.sh | 43 +++++++++++++++------------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/examples/generate_ecss_demo.sh b/examples/generate_ecss_demo.sh index ba562eb..092509c 100755 --- a/examples/generate_ecss_demo.sh +++ b/examples/generate_ecss_demo.sh @@ -1,26 +1,21 @@ #!/bin/bash mkdir -p output - -template-processor --verbosity info --value TARGET=ASW --iv demo-project/interfaceview.xml --dv demo-project/deploymentview.dv.xml -o output -t ../data/ecss-template/ecss-e-st-40c_4_1_software_static_architecture.tmplt -pandoc --pdf-engine=pdfroff --output=output/ecss-e-st-40c_4_1_software_static_architecture.pdf output/ecss-e-st-40c_4_1_software_static_architecture.md - -template-processor --verbosity info --value TARGET=ASW --iv demo-project/interfaceview.xml --dv demo-project/deploymentview.dv.xml -o output -t ../data/ecss-template/ecss-e-st-40c_4_2_software_dynamic_architecture.tmplt -pandoc --pdf-engine=pdfroff --output=output/ecss-e-st-40c_4_2_software_dynamic_architecture.pdf output/ecss-e-st-40c_4_2_software_dynamic_architecture.md - -template-processor --verbosity info --value TARGET=ASW --iv demo-project/interfaceview.xml --dv demo-project/deploymentview.dv.xml -o output -t ../data/ecss-template/ecss-e-st-40c_4_4_interfaces_context.tmplt -pandoc --pdf-engine=pdfroff --output=output/ecss-e-st-40c_4_4_interfaces_context.pdf output/ecss-e-st-40c_4_4_interfaces_context.md - -template-processor --verbosity info --value TARGET=ASW --iv demo-project/interfaceview.xml --dv demo-project/deploymentview.dv.xml -o output -t ../data/ecss-template/ecss-e-st-40c_5_2_overall_architecture.tmplt -pandoc --pdf-engine=pdfroff --output=output/ecss-e-st-40c_5_2_overall_architecture.pdf output/ecss-e-st-40c_5_2_overall_architecture.md - -template-processor --verbosity info --value TARGET=ASW --iv demo-project/interfaceview.xml --dv demo-project/deploymentview.dv.xml -o output -t ../data/ecss-template/ecss-e-st-40c_5_3_software_components_design.tmplt -pandoc --pdf-engine=pdfroff --output=output/ecss-e-st-40c_5_3_software_components_design.pdf output/ecss-e-st-40c_5_3_software_components_design.md - -template-processor --verbosity info --value TARGET=ASW --iv demo-project/interfaceview.xml --dv demo-project/deploymentview.dv.xml -o output -t ../data/ecss-template/ecss-e-st-40c_5_4_aspects_of_each_component.tmplt -pandoc --pdf-engine=pdfroff --output=output/ecss-e-st-40c_5_4_aspects_of_each_component.pdf output/ecss-e-st-40c_5_4_aspects_of_each_component.md - -template-processor --verbosity info --value TARGET=ASW --iv demo-project/interfaceview.xml --dv demo-project/deploymentview.dv.xml -o output -t ../data/ecss-template/ecss-e-st-40c_5_5_internal_interface_design.tmplt -pandoc --pdf-engine=pdfroff --output=output/ecss-e-st-40c_5_5_internal_interface_design.pdf output/ecss-e-st-40c_5_5_internal_interface_design.md - -template-processor --verbosity info --value TARGET=ASW --iv demo-project/interfaceview.xml --dv demo-project/deploymentview.dv.xml -o output -t ../data/ecss-template/ecss-e-st-40c_6_requirement_traceability.tmplt -pandoc --pdf-engine=pdfroff --output=output/ecss-e-st-40c_6_requirement_traceability.pdf output/ecss-e-st-40c_6_requirement_traceability.md \ No newline at end of file +# List of template names +templates=( + "ecss-e-st-40c_4_1_software_static_architecture" + "ecss-e-st-40c_4_2_software_dynamic_architecture" + "ecss-e-st-40c_4_4_interfaces_context" + "ecss-e-st-40c_5_2_overall_architecture" + "ecss-e-st-40c_5_3_software_components_design" + "ecss-e-st-40c_5_4_aspects_of_each_component" + "ecss-e-st-40c_5_5_internal_interface_design" + "ecss-e-st-40c_6_requirement_traceability" +) + +# Loop through templates +for template in "${templates[@]}"; do + template-processor --verbosity info --value TARGET=ASW --iv demo-project/interfaceview.xml --dv demo-project/deploymentview.dv.xml -o output -t ../data/ecss-template/${template}.tmplt + template-processor --verbosity info --value TARGET=ASW --iv demo-project/interfaceview.xml --dv demo-project/deploymentview.dv.xml -o output -t ../data/ecss-template/${template}.tmplt -p md2docx + template-processor --verbosity info --value TARGET=ASW --iv demo-project/interfaceview.xml --dv demo-project/deploymentview.dv.xml -o output -t ../data/ecss-template/${template}.tmplt -p md2html + pandoc --pdf-engine=pdfroff --output=output/${template}.pdf output/${template}.md +done \ No newline at end of file From 9bf7e750e2d8cc7f5ef6d937e362da1c622d9e7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Kurowski?= Date: Thu, 11 Dec 2025 20:37:45 +0100 Subject: [PATCH 04/13] Added support for nested tags --- templateprocessor/md2docx.py | 43 +++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/templateprocessor/md2docx.py b/templateprocessor/md2docx.py index 5f52a45..840fa9b 100644 --- a/templateprocessor/md2docx.py +++ b/templateprocessor/md2docx.py @@ -15,10 +15,43 @@ import markdown2 from docx import Document -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag -def markdown_to_word(markdown_source, word_file_path): +def get_element_text(element: Tag) -> str: + if hasattr(element, "get_text"): + return element.get_text(strip=True) + else: + return str(element).strip() + + +def process_list_items(list_element: Tag, doc: Document, style_base: str, level=0): + # Get direct children li elements only (not nested) + for li in list_element.find_all("li", recursive=False): + # Get text content, excluding nested lists + text_parts = [] + for child in li.children: + if child.name not in ["ul", "ol"]: + text_parts.append(get_element_text(child)) + + text = " ".join(text_parts).strip() + + # Add paragraph with appropriate indentation level + if text: + style = style_base if level == 0 else f"{style_base} {level + 1}" + doc.add_paragraph(text, style=style) + + # Process nested lists + nested_ul = li.find("ul", recursive=False) + nested_ol = li.find("ol", recursive=False) + + if nested_ul: + process_list_items(nested_ul, doc, "List Bullet", level + 1) + if nested_ol: + process_list_items(nested_ol, doc, "List Number", level + 1) + + +def markdown_to_word(markdown_source: str, word_file_path: str): # Converting Markdown to HTML html_content = markdown2.markdown(markdown_source, extras=["tables", "wiki-tables"]) @@ -46,11 +79,9 @@ def markdown_to_word(markdown_source, word_file_path): else: paragraph.add_run(child) elif element.name == "ul": - for li in element.find_all("li"): - doc.add_paragraph(li.text.strip(), style="List Bullet") + process_list_items(element, doc, "List Bullet") elif element.name == "ol": - for li in element.find_all("li"): - doc.add_paragraph(li.text.strip(), style="List Number") + process_list_items(element, doc, "List Number") elif element.name == "table": rows_data = [] for row in element.find_all("tr"): From df595b82dcc1ab57cd5f15b8bd1498d0e09dae10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Kurowski?= Date: Thu, 11 Dec 2025 20:41:21 +0100 Subject: [PATCH 05/13] Refactored md2docx for testing --- templateprocessor/md2docx.py | 9 +++++++-- templateprocessor/postprocessor.py | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/templateprocessor/md2docx.py b/templateprocessor/md2docx.py index 840fa9b..6af2c7b 100644 --- a/templateprocessor/md2docx.py +++ b/templateprocessor/md2docx.py @@ -51,7 +51,12 @@ def process_list_items(list_element: Tag, doc: Document, style_base: str, level= process_list_items(nested_ol, doc, "List Number", level + 1) -def markdown_to_word(markdown_source: str, word_file_path: str): +def markdown_to_word_file(markdown_source: str, word_file_path: str): + doc = markdown_to_word_object(markdown_source) + doc.save(word_file_path) + + +def markdown_to_word_object(markdown_source: str) -> Document: # Converting Markdown to HTML html_content = markdown2.markdown(markdown_source, extras=["tables", "wiki-tables"]) @@ -108,4 +113,4 @@ def markdown_to_word(markdown_source: str, word_file_path: str): for run in paragraph.runs: run.bold = True - doc.save(word_file_path) + return doc diff --git a/templateprocessor/postprocessor.py b/templateprocessor/postprocessor.py index 549ab30..a3733fe 100644 --- a/templateprocessor/postprocessor.py +++ b/templateprocessor/postprocessor.py @@ -37,7 +37,7 @@ class Md2docxPostprocessor(AbstractPostprocessor): def process(self, text: str, base_file_name: str) -> None: output_file_name = f"{base_file_name}.docx" - md2docx.markdown_to_word(text, output_file_name) + md2docx.markdown_to_word_file(text, output_file_name) class Md2HtmlPostprocessor(AbstractPostprocessor): From 8a8cba5e0bc08187c89c1eaf5884fac281187452 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Kurowski?= Date: Thu, 11 Dec 2025 20:52:48 +0100 Subject: [PATCH 06/13] Added tests --- tests/Makefile | 3 +- tests/test_md2docx.py | 157 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+), 1 deletion(-) create mode 100644 tests/test_md2docx.py diff --git a/tests/Makefile b/tests/Makefile index 6eff3a1..04217df 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -5,7 +5,8 @@ TESTS = \ test_ivreader.py \ test_dvreader.py \ test_soreader.py \ - test_templateinstantiator.py + test_templateinstantiator.py \ + test_md2docx.py .PHONY: \ check diff --git a/tests/test_md2docx.py b/tests/test_md2docx.py new file mode 100644 index 0000000..21957dc --- /dev/null +++ b/tests/test_md2docx.py @@ -0,0 +1,157 @@ +""" +Tests for md2docx module +""" + +import pytest +from docx import Document +from docx.document import Document as DocumentType +from templateprocessor.md2docx import markdown_to_word_object + + +class TestMarkdownToWordObject: + """ + Test cases for markdown_to_word_object function. + markdown_to_word_file is not tested, as it is a simple file write. + """ + + def test_simple_text(self): + """Test converting simple text paragraphs.""" + # Prepare + markdown = "This is a simple paragraph.\n\nThis is another paragraph." + + # Execute + doc = markdown_to_word_object(markdown) + # Verify + assert isinstance(doc, DocumentType) + paragraphs = [p.text for p in doc.paragraphs if p.text.strip()] + paragraphs = [p.text for p in doc.paragraphs if p.text.strip()] + assert len(paragraphs) == 2 + assert "This is a simple paragraph." in paragraphs[0] + assert "This is another paragraph." in paragraphs[1] + + def test_simple_list(self): + """Test converting a simple bullet list.""" + # Prepare + markdown = """ +- First item +- Second item +- Third item +""" + + # Execute + doc = markdown_to_word_object(markdown) + # Verify + assert isinstance(doc, DocumentType) + paragraphs = [p for p in doc.paragraphs if p.text.strip()] + assert len(paragraphs) == 3 + assert len(paragraphs) == 3 + assert paragraphs[0].text == "First item" + assert paragraphs[1].text == "Second item" + assert paragraphs[2].text == "Third item" + # Verify style + assert "List Bullet" in paragraphs[0].style.name + assert "List Bullet" in paragraphs[1].style.name + assert "List Bullet" in paragraphs[2].style.name + + def test_nested_list_two_levels(self): + """Test converting a nested list with 2 levels.""" + # Prepare + markdown = """ +- Top level item 1 + - Nested item 1.1 + - Nested item 1.2 +- Top level item 2 + - Nested item 2.1 +""" + + # Execute + doc = markdown_to_word_object(markdown) + # Verify + assert isinstance(doc, DocumentType) + paragraphs = [p for p in doc.paragraphs if p.text.strip()] + assert len(paragraphs) == 5 + assert len(paragraphs) == 5 + + # Check text content + assert paragraphs[0].text == "Top level item 1" + assert paragraphs[1].text == "Nested item 1.1" + assert paragraphs[2].text == "Nested item 1.2" + assert paragraphs[3].text == "Top level item 2" + assert paragraphs[4].text == "Nested item 2.1" + + # Verify top level uses base style + assert "List Bullet" in paragraphs[0].style.name + assert "List Bullet" in paragraphs[3].style.name + + # Verify nested items use appropriate style + assert "List Bullet 2" in paragraphs[1].style.name + assert "List Bullet 2" in paragraphs[2].style.name + + def test_table(self): + """Test converting a markdown table.""" + # Prepare + markdown = """ +| Header 1 | Header 2 | Header 3 | +|----------|----------|----------| +| Row 1 Col 1 | Row 1 Col 2 | Row 1 Col 3 | +| Row 2 Col 1 | Row 2 Col 2 | Row 2 Col 3 | +""" + + # Execute + doc = markdown_to_word_object(markdown) + # Verify + assert isinstance(doc, DocumentType) + assert len(doc.tables) == 1 + assert len(doc.tables) == 1 + + table = doc.tables[0] + assert len(table.rows) == 3 + assert len(table.columns) == 3 + + # Check header row + assert table.rows[0].cells[0].text == "Header 1" + assert table.rows[0].cells[1].text == "Header 2" + assert table.rows[0].cells[2].text == "Header 3" + + # Check data rows + assert table.rows[1].cells[0].text == "Row 1 Col 1" + assert table.rows[1].cells[1].text == "Row 1 Col 2" + assert table.rows[1].cells[2].text == "Row 1 Col 3" + + assert table.rows[2].cells[0].text == "Row 2 Col 1" + assert table.rows[2].cells[1].text == "Row 2 Col 2" + assert table.rows[2].cells[2].text == "Row 2 Col 3" + + # Verify header row is bold + first_cell_runs = table.rows[0].cells[0].paragraphs[0].runs + assert first_cell_runs is not None + assert first_cell_runs[0].bold + + def test_header(self): + """Test converting markdown headers.""" + # Prepare + markdown = """ +# Header 1 + +## Header 2 + +### Header 3 +""" + + # Execute + doc = markdown_to_word_object(markdown) + + # Verify + assert isinstance(doc, DocumentType) + paragraphs = [p for p in doc.paragraphs if p.text.strip()] + assert len(paragraphs) == 3 + + # Check header text + assert paragraphs[0].text == "Header 1" + assert paragraphs[1].text == "Header 2" + assert paragraphs[2].text == "Header 3" + + # Verify heading styles + assert "Heading 1" in paragraphs[0].style.name + assert "Heading 2" in paragraphs[1].style.name + assert "Heading 3" in paragraphs[2].style.name From d37d1e179a4e2bc33c04d6e18049614c341c4e95 Mon Sep 17 00:00:00 2001 From: Lurkerpas Date: Thu, 11 Dec 2025 21:02:59 +0100 Subject: [PATCH 07/13] Update templateprocessor/postprocessor.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- templateprocessor/postprocessor.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/templateprocessor/postprocessor.py b/templateprocessor/postprocessor.py index a3733fe..d724f08 100644 --- a/templateprocessor/postprocessor.py +++ b/templateprocessor/postprocessor.py @@ -63,15 +63,15 @@ class Postprocessor: def __init__(self, registry: Dict[PostprocessorType, AbstractPostprocessor]): self.registry = registry - def process(self, type: PostprocessorType, text: str, base_file_name: str) -> None: + def process(self, postprocessor_type: PostprocessorType, text: str, base_file_name: str) -> None: """ Process the input text and write to output file based on processor type. Args: - type: Desired postprocessor type + postprocessor_type: Desired postprocessor type text: Input text string to process base_file_name: Path to output file, without extension """ - if not type in self.registry.keys(): - raise ValueError("Not supported postprocessor {type.value}") - self.registry[type].process(text, base_file_name) + if not postprocessor_type in self.registry.keys(): + raise ValueError(f"Not supported postprocessor {postprocessor_type.value}") + self.registry[postprocessor_type].process(text, base_file_name) From a9102acbc4399949b2d74df5e596829eba4e01ee Mon Sep 17 00:00:00 2001 From: Lurkerpas Date: Thu, 11 Dec 2025 21:03:21 +0100 Subject: [PATCH 08/13] Update templateprocessor/md2docx.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- templateprocessor/md2docx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templateprocessor/md2docx.py b/templateprocessor/md2docx.py index 6af2c7b..56588b1 100644 --- a/templateprocessor/md2docx.py +++ b/templateprocessor/md2docx.py @@ -63,7 +63,7 @@ def markdown_to_word_object(markdown_source: str) -> Document: # Creating a new Word Document doc = Document() - # Converting HTML to text and add it to the Word Document + # Converting HTML to text and adding it to the Word Document soup = BeautifulSoup(html_content, "html.parser") # Adding content to the Word Document From 4c6213cc9f426c321be58b7dd6347dc18a768b8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Kurowski?= Date: Thu, 11 Dec 2025 21:06:02 +0100 Subject: [PATCH 09/13] Review fixes --- tests/test_md2docx.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/test_md2docx.py b/tests/test_md2docx.py index 21957dc..6f65365 100644 --- a/tests/test_md2docx.py +++ b/tests/test_md2docx.py @@ -24,7 +24,6 @@ def test_simple_text(self): # Verify assert isinstance(doc, DocumentType) paragraphs = [p.text for p in doc.paragraphs if p.text.strip()] - paragraphs = [p.text for p in doc.paragraphs if p.text.strip()] assert len(paragraphs) == 2 assert "This is a simple paragraph." in paragraphs[0] assert "This is another paragraph." in paragraphs[1] @@ -44,7 +43,6 @@ def test_simple_list(self): assert isinstance(doc, DocumentType) paragraphs = [p for p in doc.paragraphs if p.text.strip()] assert len(paragraphs) == 3 - assert len(paragraphs) == 3 assert paragraphs[0].text == "First item" assert paragraphs[1].text == "Second item" assert paragraphs[2].text == "Third item" @@ -70,7 +68,6 @@ def test_nested_list_two_levels(self): assert isinstance(doc, DocumentType) paragraphs = [p for p in doc.paragraphs if p.text.strip()] assert len(paragraphs) == 5 - assert len(paragraphs) == 5 # Check text content assert paragraphs[0].text == "Top level item 1" @@ -102,7 +99,6 @@ def test_table(self): # Verify assert isinstance(doc, DocumentType) assert len(doc.tables) == 1 - assert len(doc.tables) == 1 table = doc.tables[0] assert len(table.rows) == 3 From 343e96c4202c7404772ab18c0575939f9be439bb Mon Sep 17 00:00:00 2001 From: Lurkerpas Date: Thu, 11 Dec 2025 21:08:27 +0100 Subject: [PATCH 10/13] Update tests/test_md2docx.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/test_md2docx.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_md2docx.py b/tests/test_md2docx.py index 6f65365..d122a81 100644 --- a/tests/test_md2docx.py +++ b/tests/test_md2docx.py @@ -3,7 +3,6 @@ """ import pytest -from docx import Document from docx.document import Document as DocumentType from templateprocessor.md2docx import markdown_to_word_object From 4bc1605d613e1156fb0d9eb79181ef4dbc069662 Mon Sep 17 00:00:00 2001 From: Lurkerpas Date: Thu, 11 Dec 2025 21:08:53 +0100 Subject: [PATCH 11/13] Update templateprocessor/postprocessor.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- templateprocessor/postprocessor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/templateprocessor/postprocessor.py b/templateprocessor/postprocessor.py index d724f08..dc9e552 100644 --- a/templateprocessor/postprocessor.py +++ b/templateprocessor/postprocessor.py @@ -5,7 +5,6 @@ """ from enum import Enum -from pathlib import Path from templateprocessor import md2docx from abc import ABC, abstractmethod from typing import Dict From b08c1d11cfea3eac90b1be9522f1338dee815cde Mon Sep 17 00:00:00 2001 From: Lurkerpas Date: Thu, 11 Dec 2025 21:10:51 +0100 Subject: [PATCH 12/13] Update setup.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 60511db..1359fc0 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ install_requires=[ "mako==1.3.10", "python-docx==1.2.0", - "bs4==0.0.2", + "beautifulsoup4==4.12.3", "markdown2==2.5.4" ], extras_require={ From 14ef8c0583224fd8892e7e1242e5b79cd6a0d654 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Kurowski?= Date: Fri, 12 Dec 2025 10:51:53 +0100 Subject: [PATCH 13/13] Review fixes --- templateprocessor/cli.py | 13 ++++++------- templateprocessor/postprocessor.py | 7 ++++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/templateprocessor/cli.py b/templateprocessor/cli.py index cdcb0d5..4e0770d 100644 --- a/templateprocessor/cli.py +++ b/templateprocessor/cli.py @@ -117,7 +117,6 @@ def get_log_level(level_str: str) -> int: def get_postprocessor_type(type_str: str) -> PostprocessorType: types = { PostprocessorType.NONE.value: PostprocessorType.NONE, - PostprocessorType.HTML2DOCX.value: PostprocessorType.HTML2DOCX, PostprocessorType.MD2DOCX.value: PostprocessorType.MD2DOCX, PostprocessorType.MD2HTML.value: PostprocessorType.MD2HTML, } @@ -164,7 +163,7 @@ def instantiate( postprocessor: Postprocessor, template_file: str, module_directory: str, - type: PostprocessorType, + postprocessor_type: PostprocessorType, output_directory: str, ): try: @@ -178,8 +177,8 @@ def instantiate( instantiated_template = instantiator.instantiate(template, module_directory) logging.debug(f"Instantiation:\n {instantiated_template}") output = str(Path(output_directory) / f"{name}") - logging.debug(f"Postprocessing") - postprocessor.process(type, instantiated_template, output) + logging.debug(f"Postprocessing with {postprocessor_type}") + postprocessor.process(postprocessor_type, instantiated_template, output) except FileNotFoundError as e: logging.error(f"File not found: {e.filename}") except Exception as e: @@ -192,7 +191,7 @@ def main(): args = parse_arguments() logging_level = get_log_level(args.verbosity) logging.basicConfig(level=logging_level) - type = get_postprocessor_type(args.postprocess) + postprocessor_type = get_postprocessor_type(args.postprocess) logging.info("Template Processor") logging.debug(f"Interface View: {args.iv}") @@ -202,7 +201,7 @@ def main(): logging.debug(f"Templates: {args.template}") logging.debug(f"Output Directory: {args.output}") logging.debug(f"Module directory: {args.module_directory}") - logging.debug(f"Postprocessing: {type.value}") + logging.debug(f"Postprocessing: {postprocessor_type.value}") logging.info(f"Reading Interface View from {args.iv}") iv = IVReader().read(args.iv) if args.iv else InterfaceView() @@ -236,7 +235,7 @@ def main(): postprocessor, template_file, args.module_directory, - type, + postprocessor_type, args.output, ) diff --git a/templateprocessor/postprocessor.py b/templateprocessor/postprocessor.py index dc9e552..ccee679 100644 --- a/templateprocessor/postprocessor.py +++ b/templateprocessor/postprocessor.py @@ -14,7 +14,6 @@ class PostprocessorType(Enum): NONE = "none" MD2HTML = "md2html" - HTML2DOCX = "html2docx" MD2DOCX = "md2docx" @@ -62,7 +61,9 @@ class Postprocessor: def __init__(self, registry: Dict[PostprocessorType, AbstractPostprocessor]): self.registry = registry - def process(self, postprocessor_type: PostprocessorType, text: str, base_file_name: str) -> None: + def process( + self, postprocessor_type: PostprocessorType, text: str, base_file_name: str + ) -> None: """ Process the input text and write to output file based on processor type. @@ -71,6 +72,6 @@ def process(self, postprocessor_type: PostprocessorType, text: str, base_file_na text: Input text string to process base_file_name: Path to output file, without extension """ - if not postprocessor_type in self.registry.keys(): + if postprocessor_type not in self.registry: raise ValueError(f"Not supported postprocessor {postprocessor_type.value}") self.registry[postprocessor_type].process(text, base_file_name)