Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support additional formats such as markdown #10

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions src/docxplain/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,17 @@
import sys

from docxplain.converter import convert_file
from docxplain.formats import get_format, supported_formats


def main() -> None:
"""Command-line entrypoint."""
parser = create_parser()
args = parser.parse_args()
changed = convert_file(args.source, suffix=args.suffix, header=args.header)
fmt = get_format(args.format)
changed = convert_file(
args.source, output_format=fmt, suffix=args.suffix, header=args.header
)
if changed:
sys.exit(1)
else:
Expand All @@ -21,7 +25,14 @@ def create_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description="Convert docx to plain text.")
parser.add_argument("source")
parser.add_argument(
"--suffix", default=".txt", help="File suffix for plain text file."
"--format",
default="plain",
choices=[f.name for f in supported_formats],
)
parser.add_argument(
"--suffix",
default=None,
help="Custom file suffix for plain text file.",
)
parser.add_argument(
"--header",
Expand Down
27 changes: 20 additions & 7 deletions src/docxplain/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,34 @@

import hashlib
from pathlib import Path
from typing import Optional
from typing import TYPE_CHECKING, Optional

import pypandoc

if TYPE_CHECKING:
from docxplain.formats import PandocFormat

__all__ = ["convert_file", "get_hash"]


def convert_file(
filename: str, suffix: str = ".txt", header: Optional[str] = None
filename: str,
output_format: PandocFormat,
suffix: Optional[str] = None,
header: Optional[str] = None,
) -> bool:
"""Convert the docx file to plaintext.

Parameters
----------
filename : `str`
Path of the docx file.
suffix : `str`
Suffix for the output plain text file, including ``"."`` prefix.
Default is ``".txt"``, but a suffix like ``".extracted.txt"``
could be useful.
output_format : `docxplain.formats.PandocFormat`
The output format for the converted plain text file.
suffix : `str`, optional
Custom suffix for the output plain text file, including ``"."`` prefix.
Default is based on the output format, but a custom suffix like
``".extracted.txt"`` can be useful.
header : `str`, optional
Content that is added to the top of the plain text file.

Expand All @@ -34,7 +42,12 @@ def convert_file(
if not docx_path.is_file():
raise RuntimeError(f"Source file {docx_path} does not exist.")

plain_path = docx_path.with_suffix(suffix)
if suffix is None:
file_suffix = ".txt"
else:
file_suffix = suffix

plain_path = docx_path.with_suffix(file_suffix)
if plain_path.is_file():
exists = True
initial_hash = get_hash(plain_path)
Expand Down
26 changes: 26 additions & 0 deletions src/docxplain/formats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Information about supported formats."""

from dataclasses import dataclass

__all__ = ["PandocFormat", "supported_formats", "get_format"]


@dataclass(frozen=True)
class PandocFormat:
    """A plain text format supported by pandoc.

    Instances are immutable: the same objects are stored in the module-level
    format registry and handed out to callers, so allowing attribute
    assignment would let one caller silently alter the format for everyone
    else. Being frozen also makes instances hashable.
    """

    name: str
    """Pandoc's name for the format."""

    suffix: str
    """The default suffix for the format."""


# Every format that can be looked up by name. One entry per line so that
# adding a format is a single-line change.
supported_formats = (
    PandocFormat(name="plain", suffix=".txt"),
)


def get_format(name: str) -> PandocFormat:
    """Look up a supported format by its pandoc name.

    Parameters
    ----------
    name : `str`
        Pandoc's name for the format, compared exactly against the ``name``
        attribute of each entry in ``supported_formats``.

    Returns
    -------
    `PandocFormat`
        The first entry in ``supported_formats`` whose name matches.

    Raises
    ------
    ValueError
        Raised if no entry in ``supported_formats`` has the given name.
    """
    found = next(
        (candidate for candidate in supported_formats if candidate.name == name),
        None,
    )
    if found is None:
        raise ValueError(f"Format '{name}' is unknown.")
    return found
22 changes: 16 additions & 6 deletions tests/converter_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path

from docxplain.converter import convert_file, trim_trailing_whitespace
from docxplain.formats import get_format


def test_unchanged(tmp_path: Path) -> None:
Expand All @@ -12,7 +13,7 @@ def test_unchanged(tmp_path: Path) -> None:
work_dir = tmp_path / "unchanged"
shutil.copytree(repo_data, work_dir)
docxpath = work_dir.joinpath("test_doc.docx")
assert convert_file(str(docxpath)) is False
assert convert_file(str(docxpath), get_format("plain")) is False


def test_changed(tmp_path: Path) -> None:
Expand All @@ -21,7 +22,7 @@ def test_changed(tmp_path: Path) -> None:
work_dir = tmp_path / "changed"
shutil.copytree(repo_data, work_dir)
docxpath = work_dir.joinpath("test_doc.docx")
assert convert_file(str(docxpath)) is True
assert convert_file(str(docxpath), get_format("plain")) is True


def test_new(tmp_path: Path) -> None:
Expand All @@ -30,7 +31,7 @@ def test_new(tmp_path: Path) -> None:
work_dir = tmp_path / "new"
shutil.copytree(repo_data, work_dir)
docxpath = work_dir.joinpath("test_doc.docx")
assert convert_file(str(docxpath)) is True
assert convert_file(str(docxpath), get_format("plain")) is True


def test_suffix(tmp_path: Path) -> None:
Expand All @@ -39,7 +40,12 @@ def test_suffix(tmp_path: Path) -> None:
work_dir = tmp_path / "suffix"
shutil.copytree(repo_data, work_dir)
docxpath = work_dir.joinpath("test_doc.docx")
assert convert_file(str(docxpath), suffix=".extracted.txt") is True
assert (
convert_file(
str(docxpath), get_format("plain"), suffix=".extracted.txt"
)
is True
)
plain_path = work_dir.joinpath("test_doc.extracted.txt")
assert plain_path.is_file()

Expand All @@ -51,7 +57,9 @@ def test_header(tmp_path: Path) -> None:
shutil.copytree(repo_data, work_dir)
docxpath = work_dir.joinpath("test_doc.docx")
header = "This file is autogenerated."
assert convert_file(str(docxpath), header=header) is True
assert (
convert_file(str(docxpath), get_format("plain"), header=header) is True
)
plain_path = docxpath.with_suffix(".txt")
assert plain_path.is_file()
content = plain_path.read_text().splitlines()
Expand All @@ -67,7 +75,9 @@ def test_header_templating(tmp_path: Path) -> None:
shutil.copytree(repo_data, work_dir)
docxpath = work_dir.joinpath("test_doc.docx")
header = "This file is autogenerated from {docx}."
assert convert_file(str(docxpath), header=header) is True
assert (
convert_file(str(docxpath), get_format("plain"), header=header) is True
)
plain_path = docxpath.with_suffix(".txt")
assert plain_path.is_file()
content = plain_path.read_text().splitlines()
Expand Down