diff --git a/LICENSE.MD2DOCX b/LICENSE.MD2DOCX new file mode 100644 index 0000000..c643323 --- /dev/null +++ b/LICENSE.MD2DOCX @@ -0,0 +1,23 @@ +This license applies to templateprocessor/md2docx.py file + +MIT License + +Copyright (c) 2024 Shlok T + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
#!/bin/bash
# Generate the ECSS-E-ST-40C demo documents from the demo project.
# For every template this writes Markdown, DOCX and HTML via template-processor
# and a PDF via pandoc, all into ./output.

mkdir -p output

# List of template names (file names in ../data/ecss-template without .tmplt)
templates=(
    "ecss-e-st-40c_4_1_software_static_architecture"
    "ecss-e-st-40c_4_2_software_dynamic_architecture"
    "ecss-e-st-40c_4_4_interfaces_context"
    "ecss-e-st-40c_5_2_overall_architecture"
    "ecss-e-st-40c_5_3_software_components_design"
    "ecss-e-st-40c_5_4_aspects_of_each_component"
    "ecss-e-st-40c_5_5_internal_interface_design"
    "ecss-e-st-40c_6_requirement_traceability"
)

# Arguments shared by every template-processor invocation
common_args=(
    --verbosity info
    --value TARGET=ASW
    --iv demo-project/interfaceview.xml
    --dv demo-project/deploymentview.dv.xml
    -o output
)

# Loop through templates
for template in "${templates[@]}"; do
    template_file="../data/ecss-template/${template}.tmplt"

    # Without -p the Markdown file is produced; pandoc converts it to PDF below
    template-processor "${common_args[@]}" -t "${template_file}"
    template-processor "${common_args[@]}" -t "${template_file}" -p md2docx
    template-processor "${common_args[@]}" -t "${template_file}" -p md2html
    pandoc --pdf-engine=pdfroff --output="output/${template}.pdf" "output/${template}.md"
done
def get_postprocessor_type(type_str: str) -> PostprocessorType:
    """Map a postprocessor name to its PostprocessorType member.

    The lookup is case-insensitive and goes through the enum values
    themselves, so every member of PostprocessorType is recognised without
    keeping a separate name table in sync.

    Args:
        type_str: Postprocessor name, e.g. "md2docx".

    Returns:
        The matching PostprocessorType, or PostprocessorType.NONE when the
        name does not match any member.
    """
    try:
        return PostprocessorType(type_str.lower())
    except ValueError:
        # Unknown names fall back to plain Markdown output.
        return PostprocessorType.NONE
= get_log_level(args.verbosity) logging.basicConfig(level=logging_level) + postprocessor_type = get_postprocessor_type(args.postprocess) logging.info("Template Processor") logging.debug(f"Interface View: {args.iv}") @@ -182,6 +201,7 @@ def main(): logging.debug(f"Templates: {args.template}") logging.debug(f"Output Directory: {args.output}") logging.debug(f"Module directory: {args.module_directory}") + logging.debug(f"Postprocessing: {postprocessor_type.value}") logging.info(f"Reading Interface View from {args.iv}") iv = IVReader().read(args.iv) if args.iv else InterfaceView() @@ -198,10 +218,26 @@ def main(): logging.info(f"Instantiating the TemplateInstantiator") instantiator = TemplateInstantiator(iv, dv, sots, values) + logging.info(f"Instantiating the Postprocessor") + postprocessor = Postprocessor( + { + PostprocessorType.NONE: PassthroughPostprocessor(), + PostprocessorType.MD2DOCX: Md2docxPostprocessor(), + PostprocessorType.MD2HTML: Md2HtmlPostprocessor(), + } + ) + if args.template: logging.info(f"Instantiating templates") for template_file in args.template: - instantiate(instantiator, template_file, args.module_directory, args.output) + instantiate( + instantiator, + postprocessor, + template_file, + args.module_directory, + postprocessor_type, + args.output, + ) return 0 diff --git a/templateprocessor/md2docx.py b/templateprocessor/md2docx.py new file mode 100644 index 0000000..56588b1 --- /dev/null +++ b/templateprocessor/md2docx.py @@ -0,0 +1,116 @@ +""" + +Markdown to DOCX conversion module extracted from md2docx-python project. + +Project address: https://github.com/shloktech/md2docx-python/ +Project LICENSE: LICENSE.MD2DOCX + +The reason for extraction is to align the API and features with the needs. 
# python-docx's default template defines numbered variants of the list styles
# only up to "List Bullet 3" / "List Number 3"; deeper levels reuse the last one.
_MAX_LIST_STYLE_LEVEL = 3

# HTML heading tags mapped to the Word heading level they produce.
_HEADING_LEVELS = {f"h{level}": level for level in range(1, 7)}

# Word paragraph style used for each HTML list tag.
_LIST_STYLES = {"ul": "List Bullet", "ol": "List Number"}


def get_element_text(element: Tag) -> str:
    """Return the whitespace-stripped text of a parsed HTML node.

    Args:
        element: Parsed node; objects without ``get_text`` are converted
            with ``str()`` instead.
    """
    if hasattr(element, "get_text"):
        return element.get_text(strip=True)
    return str(element).strip()


def process_list_items(list_element: Tag, doc: Document, style_base: str, level=0):
    """Add the items of an HTML list to *doc* as list-styled paragraphs.

    Nested lists are converted recursively in document order. Each nested
    list picks its own style from its tag, so a numbered list inside a bullet
    list stays numbered.

    Args:
        list_element: ``<ul>`` or ``<ol>`` element whose direct ``<li>``
            children are converted.
        doc: Target document; paragraphs are appended with ``add_paragraph``.
        style_base: Paragraph style of the outermost level, e.g. "List Bullet".
        level: Zero-based nesting depth of *list_element*.
    """
    # Level 0 uses the bare style; deeper levels use "<style> 2", "<style> 3".
    # The number is capped so very deep nesting does not request a style that
    # is missing from the template.
    style_number = min(level + 1, _MAX_LIST_STYLE_LEVEL)
    style = style_base if style_number == 1 else f"{style_base} {style_number}"

    # Direct <li> children only; nested items are reached through recursion.
    for li in list_element.find_all("li", recursive=False):
        # Text of the item itself, excluding nested lists. Empty fragments
        # (whitespace between tags) are dropped so they add no stray spaces.
        fragments = (
            get_element_text(child)
            for child in li.children
            if child.name not in _LIST_STYLES
        )
        text = " ".join(fragment for fragment in fragments if fragment)

        if text:
            doc.add_paragraph(text, style=style)

        # Every nested list of this item, not only the first of each kind.
        for nested in li.find_all(list(_LIST_STYLES), recursive=False):
            process_list_items(nested, doc, _LIST_STYLES[nested.name], level + 1)


def markdown_to_word_file(markdown_source: str, word_file_path: str):
    """Convert Markdown text to a Word document and save it.

    Args:
        markdown_source: Markdown text to convert.
        word_file_path: Path of the .docx file to write.
    """
    markdown_to_word_object(markdown_source).save(word_file_path)


def _add_paragraph(doc: Document, element: Tag) -> None:
    """Add a ``<p>`` element as a paragraph, keeping bold and italic runs."""
    paragraph = doc.add_paragraph()
    for child in element.children:
        if not isinstance(child, Tag):
            # Plain text node; str() keeps the surrounding spaces intact.
            paragraph.add_run(str(child))
            continue

        # Runs take strings, so the text of any inline tag is extracted
        # explicitly; only <strong> and <em> carry formatting over.
        run = paragraph.add_run(child.get_text())
        if child.name == "strong":
            run.bold = True
        elif child.name == "em":
            run.italic = True


def _add_table(doc: Document, element: Tag) -> None:
    """Add a ``<table>`` element as a grid table with a bold header row."""
    rows_data = []
    for row in element.find_all("tr"):
        cells = row.find_all(["th", "td"])
        if cells:
            rows_data.append([cell.get_text(strip=True) for cell in cells])

    if not rows_data:
        return

    # Size the table for the widest row so that no cell is dropped when the
    # rows have different lengths.
    columns_count = max(len(row_data) for row_data in rows_data)
    table = doc.add_table(rows=len(rows_data), cols=columns_count)
    table.style = "Table Grid"

    for table_row, row_data in zip(table.rows, rows_data):
        for cell, cell_text in zip(table_row.cells, row_data):
            cell.text = cell_text

    # Make the first row bold if it is a header
    first_row = element.find("tr")
    if first_row is not None and first_row.find("th") is not None:
        for cell in table.rows[0].cells:
            for paragraph in cell.paragraphs:
                for run in paragraph.runs:
                    run.bold = True


def markdown_to_word_object(markdown_source: str) -> Document:
    """Convert Markdown text to an in-memory Word document.

    The Markdown is rendered to HTML with markdown2 (with the "tables" and
    "wiki-tables" extras) and the top-level HTML elements are translated:
    headings ``h1``-``h6``, paragraphs with bold/italic runs, bullet and
    numbered lists (including nested ones) and tables. Other top-level
    elements are ignored.

    Args:
        markdown_source: Markdown text to convert.

    Returns:
        The populated document; it is not saved to disk.
    """
    html_content = markdown2.markdown(markdown_source, extras=["tables", "wiki-tables"])
    doc = Document()
    soup = BeautifulSoup(html_content, "html.parser")

    # Text nodes between the top-level elements have name None and therefore
    # match none of the branches below.
    for element in soup:
        name = element.name
        if name in _HEADING_LEVELS:
            doc.add_heading(element.text, level=_HEADING_LEVELS[name])
        elif name == "p":
            _add_paragraph(doc, element)
        elif name in _LIST_STYLES:
            process_list_items(element, doc, _LIST_STYLES[name])
        elif name == "table":
            _add_table(doc, element)

    return doc
class PostprocessorType(Enum):
    """Supported output postprocessing modes."""

    NONE = "none"
    MD2HTML = "md2html"
    MD2DOCX = "md2docx"


class AbstractPostprocessor(ABC):
    """Interface of a postprocessor that writes instantiated text to a file."""

    @abstractmethod
    def process(self, text: str, base_file_name: str) -> None:
        """
        Process the input text and write to output file.

        Args:
            text: Input text string to process
            base_file_name: Path to output file, without extension
        """
        pass


class Md2docxPostprocessor(AbstractPostprocessor):
    """Converts Markdown text to a Word document."""

    def process(self, text: str, base_file_name: str) -> None:
        """
        Convert the Markdown text and save it as ``<base_file_name>.docx``.

        Args:
            text: Markdown text to convert
            base_file_name: Path to output file, without extension
        """
        output_file_name = f"{base_file_name}.docx"
        md2docx.markdown_to_word_file(text, output_file_name)


class Md2HtmlPostprocessor(AbstractPostprocessor):
    """Converts Markdown text to HTML."""

    def process(self, text: str, base_file_name: str) -> None:
        """
        Render the Markdown text and save it as ``<base_file_name>.html``.

        Args:
            text: Markdown text to convert
            base_file_name: Path to output file, without extension
        """
        output_file_name = f"{base_file_name}.html"
        html_content = markdown2.markdown(text, extras=["tables", "wiki-tables"])
        # Explicit UTF-8 so the result does not depend on the platform locale
        # and non-ASCII characters can always be written.
        with open(output_file_name, "w", encoding="utf-8") as f:
            f.write(html_content)


class PassthroughPostprocessor(AbstractPostprocessor):
    """Writes the text unchanged."""

    def process(self, text: str, base_file_name: str) -> None:
        """
        Save the text as is to ``<base_file_name>.md``.

        Args:
            text: Text to write
            base_file_name: Path to output file, without extension
        """
        output_file_name = f"{base_file_name}.md"
        # Explicit UTF-8 so the result does not depend on the platform locale
        # and non-ASCII characters can always be written.
        with open(output_file_name, "w", encoding="utf-8") as f:
            f.write(text)


class Postprocessor:
    """Dispatches text to the postprocessor registered for a given type."""

    # Postprocessor implementation to use for each supported type
    registry: Dict[PostprocessorType, AbstractPostprocessor]

    def __init__(self, registry: Dict[PostprocessorType, AbstractPostprocessor]):
        """
        Args:
            registry: Postprocessor implementation for each supported type
        """
        self.registry = registry

    def process(
        self, postprocessor_type: PostprocessorType, text: str, base_file_name: str
    ) -> None:
        """
        Process the input text and write to output file based on processor type.

        Args:
            postprocessor_type: Desired postprocessor type
            text: Input text string to process
            base_file_name: Path to output file, without extension

        Raises:
            ValueError: If no postprocessor is registered for the type
        """
        if postprocessor_type not in self.registry:
            raise ValueError(f"Not supported postprocessor {postprocessor_type.value}")
        self.registry[postprocessor_type].process(text, base_file_name)
in paragraphs[1] + + def test_simple_list(self): + """Test converting a simple bullet list.""" + # Prepare + markdown = """ +- First item +- Second item +- Third item +""" + + # Execute + doc = markdown_to_word_object(markdown) + # Verify + assert isinstance(doc, DocumentType) + paragraphs = [p for p in doc.paragraphs if p.text.strip()] + assert len(paragraphs) == 3 + assert paragraphs[0].text == "First item" + assert paragraphs[1].text == "Second item" + assert paragraphs[2].text == "Third item" + # Verify style + assert "List Bullet" in paragraphs[0].style.name + assert "List Bullet" in paragraphs[1].style.name + assert "List Bullet" in paragraphs[2].style.name + + def test_nested_list_two_levels(self): + """Test converting a nested list with 2 levels.""" + # Prepare + markdown = """ +- Top level item 1 + - Nested item 1.1 + - Nested item 1.2 +- Top level item 2 + - Nested item 2.1 +""" + + # Execute + doc = markdown_to_word_object(markdown) + # Verify + assert isinstance(doc, DocumentType) + paragraphs = [p for p in doc.paragraphs if p.text.strip()] + assert len(paragraphs) == 5 + + # Check text content + assert paragraphs[0].text == "Top level item 1" + assert paragraphs[1].text == "Nested item 1.1" + assert paragraphs[2].text == "Nested item 1.2" + assert paragraphs[3].text == "Top level item 2" + assert paragraphs[4].text == "Nested item 2.1" + + # Verify top level uses base style + assert "List Bullet" in paragraphs[0].style.name + assert "List Bullet" in paragraphs[3].style.name + + # Verify nested items use appropriate style + assert "List Bullet 2" in paragraphs[1].style.name + assert "List Bullet 2" in paragraphs[2].style.name + + def test_table(self): + """Test converting a markdown table.""" + # Prepare + markdown = """ +| Header 1 | Header 2 | Header 3 | +|----------|----------|----------| +| Row 1 Col 1 | Row 1 Col 2 | Row 1 Col 3 | +| Row 2 Col 1 | Row 2 Col 2 | Row 2 Col 3 | +""" + + # Execute + doc = markdown_to_word_object(markdown) + # Verify + 
assert isinstance(doc, DocumentType) + assert len(doc.tables) == 1 + + table = doc.tables[0] + assert len(table.rows) == 3 + assert len(table.columns) == 3 + + # Check header row + assert table.rows[0].cells[0].text == "Header 1" + assert table.rows[0].cells[1].text == "Header 2" + assert table.rows[0].cells[2].text == "Header 3" + + # Check data rows + assert table.rows[1].cells[0].text == "Row 1 Col 1" + assert table.rows[1].cells[1].text == "Row 1 Col 2" + assert table.rows[1].cells[2].text == "Row 1 Col 3" + + assert table.rows[2].cells[0].text == "Row 2 Col 1" + assert table.rows[2].cells[1].text == "Row 2 Col 2" + assert table.rows[2].cells[2].text == "Row 2 Col 3" + + # Verify header row is bold + first_cell_runs = table.rows[0].cells[0].paragraphs[0].runs + assert first_cell_runs is not None + assert first_cell_runs[0].bold + + def test_header(self): + """Test converting markdown headers.""" + # Prepare + markdown = """ +# Header 1 + +## Header 2 + +### Header 3 +""" + + # Execute + doc = markdown_to_word_object(markdown) + + # Verify + assert isinstance(doc, DocumentType) + paragraphs = [p for p in doc.paragraphs if p.text.strip()] + assert len(paragraphs) == 3 + + # Check header text + assert paragraphs[0].text == "Header 1" + assert paragraphs[1].text == "Header 2" + assert paragraphs[2].text == "Header 3" + + # Verify heading styles + assert "Heading 1" in paragraphs[0].style.name + assert "Heading 2" in paragraphs[1].style.name + assert "Heading 3" in paragraphs[2].style.name