jsickcodes · jonathansick · Apr 23, 2021 · Apr 23, 2021
diff --git a/src/docxplain/cli.py b/src/docxplain/cli.py
@@ -4,13 +4,17 @@
 import sys
 
 from docxplain.converter import convert_file
+from docxplain.formats import get_format, supported_formats
 
 
 def main() -> None:
     """Command-line entrypoint."""
     parser = create_parser()
     args = parser.parse_args()
-    changed = convert_file(args.source, suffix=args.suffix, header=args.header)
+    fmt = get_format(args.format)
+    changed = convert_file(
+        args.source, output_format=fmt, suffix=args.suffix, header=args.header
+    )
     if changed:
         sys.exit(1)
     else:
@@ -21,7 +25,15 @@ def create_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(description="Convert docx to plain text.")
     parser.add_argument("source")
     parser.add_argument(
-        "--suffix", default=".txt", help="File suffix for plain text file."
+        "--format",
+        default="plain",
+        choices=[f.name for f in supported_formats],
+        help="Output format of the converted file.",
+    )
+    parser.add_argument(
+        "--suffix",
+        default=None,
+        help="Custom file suffix for plain text file.",
     )
     parser.add_argument(
         "--header",

diff --git a/src/docxplain/converter.py b/src/docxplain/converter.py
@@ -2,26 +2,34 @@
 
 import hashlib
 from pathlib import Path
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
 
 import pypandoc
 
+if TYPE_CHECKING:
+    from docxplain.formats import PandocFormat
+
 __all__ = ["convert_file", "get_hash"]
 
 
 def convert_file(
-    filename: str, suffix: str = ".txt", header: Optional[str] = None
+    filename: str,
+    output_format: "PandocFormat",
+    suffix: Optional[str] = None,
+    header: Optional[str] = None,
 ) -> bool:
     """Convert the docx file to plaintext.
 
     Parameters
     ----------
     filename : `str`
         Path of the docx file.
-    suffix : `str`
-        Suffix for the output plain text file, including ``"."`` prefix.
-        Default is ``".txt"``, but a suffix like ``".extracted.txt"``
-        could be useful.
+    output_format : `docxplain.formats.PandocFormat`
+        The output format for the converted plain text file.
+    suffix : `str`, optional
+        Custom suffix for the output plain text file, including ``"."`` prefix.
+        Default is based on the output format, but a custom suffix like
+        ``".extracted.txt"`` can be useful.
     header : `str`, optional
         Content that is added to the top of the plain text file.
 
@@ -34,7 +42,12 @@ def convert_file(
     if not docx_path.is_file():
         raise RuntimeError(f"Source file {docx_path} does not exist.")
 
-    plain_path = docx_path.with_suffix(suffix)
+    # Without a custom suffix, use the output format's own default suffix.
+    if suffix is None:
+        file_suffix = output_format.suffix
+    else:
+        file_suffix = suffix
+    plain_path = docx_path.with_suffix(file_suffix)
     if plain_path.is_file():
         exists = True
         initial_hash = get_hash(plain_path)

diff --git a/src/docxplain/formats.py b/src/docxplain/formats.py
@@ -0,0 +1,44 @@
+"""Information about supported formats."""
+
+from dataclasses import dataclass
+
+__all__ = ["PandocFormat", "supported_formats", "get_format"]
+
+
+# Frozen: the instances in ``supported_formats`` are shared by every caller.
+@dataclass(frozen=True)
+class PandocFormat:
+    """A plain text format supported by pandoc."""
+
+    name: str
+    """Pandoc's name for the format."""
+
+    suffix: str
+    """The default suffix for the format."""
+
+
+supported_formats = (PandocFormat(name="plain", suffix=".txt"),)
+
+
+def get_format(name: str) -> PandocFormat:
+    """Look up a supported format by its pandoc name.
+
+    Parameters
+    ----------
+    name : `str`
+        Pandoc's name for the format (for example, ``"plain"``).
+
+    Returns
+    -------
+    fmt : `PandocFormat`
+        The matching entry from `supported_formats`.
+
+    Raises
+    ------
+    ValueError
+        Raised if no supported format has this name.
+    """
+    for fmt in supported_formats:
+        if fmt.name == name:
+            return fmt
+    raise ValueError(f"Format '{name}' is unknown.")
diff --git a/tests/converter_test.py b/tests/converter_test.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 
 from docxplain.converter import convert_file, trim_trailing_whitespace
+from docxplain.formats import get_format
 
 
 def test_unchanged(tmp_path: Path) -> None:
@@ -12,7 +13,7 @@ def test_unchanged(tmp_path: Path) -> None:
     work_dir = tmp_path / "unchanged"
     shutil.copytree(repo_data, work_dir)
     docxpath = work_dir.joinpath("test_doc.docx")
-    assert convert_file(str(docxpath)) is False
+    assert convert_file(str(docxpath), get_format("plain")) is False
 
 
 def test_changed(tmp_path: Path) -> None:
@@ -21,7 +22,7 @@ def test_changed(tmp_path: Path) -> None:
     work_dir = tmp_path / "changed"
     shutil.copytree(repo_data, work_dir)
     docxpath = work_dir.joinpath("test_doc.docx")
-    assert convert_file(str(docxpath)) is True
+    assert convert_file(str(docxpath), get_format("plain")) is True
 
 
 def test_new(tmp_path: Path) -> None:
@@ -30,7 +31,7 @@ def test_new(tmp_path: Path) -> None:
     work_dir = tmp_path / "new"
     shutil.copytree(repo_data, work_dir)
     docxpath = work_dir.joinpath("test_doc.docx")
-    assert convert_file(str(docxpath)) is True
+    assert convert_file(str(docxpath), get_format("plain")) is True
 
 
 def test_suffix(tmp_path: Path) -> None:
@@ -39,7 +40,12 @@ def test_suffix(tmp_path: Path) -> None:
     work_dir = tmp_path / "suffix"
     shutil.copytree(repo_data, work_dir)
     docxpath = work_dir.joinpath("test_doc.docx")
-    assert convert_file(str(docxpath), suffix=".extracted.txt") is True
+    assert (
+        convert_file(
+            str(docxpath), get_format("plain"), suffix=".extracted.txt"
+        )
+        is True
+    )
     plain_path = work_dir.joinpath("test_doc.extracted.txt")
     assert plain_path.is_file()
 
@@ -51,7 +57,9 @@ def test_header(tmp_path: Path) -> None:
     shutil.copytree(repo_data, work_dir)
     docxpath = work_dir.joinpath("test_doc.docx")
     header = "This file is autogenerated."
-    assert convert_file(str(docxpath), header=header) is True
+    assert (
+        convert_file(str(docxpath), get_format("plain"), header=header) is True
+    )
     plain_path = docxpath.with_suffix(".txt")
     assert plain_path.is_file()
     content = plain_path.read_text().splitlines()
@@ -67,7 +75,9 @@ def test_header_templating(tmp_path: Path) -> None:
     shutil.copytree(repo_data, work_dir)
     docxpath = work_dir.joinpath("test_doc.docx")
     header = "This file is autogenerated from {docx}."
-    assert convert_file(str(docxpath), header=header) is True
+    assert (
+        convert_file(str(docxpath), get_format("plain"), header=header) is True
+    )
     plain_path = docxpath.with_suffix(".txt")
     assert plain_path.is_file()
     content = plain_path.read_text().splitlines()