Skip to content

Commit

Permalink
refactor(utils): separate utils helper functions
Browse files Browse the repository at this point in the history
  • Loading branch information
musicEnfanthen committed Apr 28, 2024
1 parent 48b7a3e commit 5ce4854
Show file tree
Hide file tree
Showing 4 changed files with 770 additions and 734 deletions.
14 changes: 9 additions & 5 deletions convert_source_description/convert_source_description.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
import argparse

from bs4 import BeautifulSoup
from file_utils import FileUtils
from utils import ConversionUtils


Expand All @@ -63,24 +64,27 @@ def convert_source_description(directory: str, file_name: str):
A JSON file with the source description.
"""

file_utils = FileUtils()
conversion_utils = ConversionUtils()

# Define file path
file_path = directory + file_name

# Get HTML from Word file
html = ConversionUtils().read_html_from_word_file(file_path)
html = file_utils.read_html_from_word_file(file_path)

# Parse HTML
soup = BeautifulSoup(html, 'html.parser')

# Create the full sourceList object
source_list = ConversionUtils().create_source_list(soup)
source_list = conversion_utils.create_source_list(soup)

# Create the full textcritics object
textcritics = ConversionUtils().create_textcritics(soup)
textcritics = conversion_utils.create_textcritics(soup)

# Output
ConversionUtils().write_json(source_list, file_path + '_source-description')
ConversionUtils().write_json(textcritics, file_path + '_textcritics')
file_utils.write_json(source_list, file_path + '_source-description')
file_utils.write_json(textcritics, file_path + '_textcritics')


def main():
Expand Down
66 changes: 66 additions & 0 deletions convert_source_description/file_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"""Utility functions for file operations: reading Word (.docx) files and writing JSON files."""

import json
import os
from typing import Dict

import mammoth


############################################
# Public class: FileUtils
############################################
class FileUtils:
    """A class that contains utility functions for reading Word files and writing JSON files.

    None of the methods read instance state; the class only groups the helpers.
    """

    ############################################
    # Public class function: read_html_from_word_file
    ############################################
    def read_html_from_word_file(self, file_path: str) -> str:
        """
        Reads a Word file in .docx format and returns its content as an HTML string.

        Args:
            file_path (str): The path to the Word file to be read, without the .docx extension.

        Returns:
            str: The content of the Word file as an HTML string.

        Raises:
            FileNotFoundError: If nothing exists at file_path + ".docx".
            ValueError: If the conversion raises a ValueError (re-raised with a
                prefixed message and chained to the original error).
        """
        source_file_name = file_path + ".docx"
        if not os.path.exists(source_file_name):
            raise FileNotFoundError("File not found: " + source_file_name)

        with open(source_file_name, "rb") as source_file:
            try:
                result = mammoth.convert_to_html(source_file)
            except ValueError as error:
                raise ValueError('Error converting file: ' + str(error)) from error

        return result.value  # The generated HTML

    ############################################
    # Public class function: write_json
    ############################################
    def write_json(self, data: Dict, file_path: str) -> None:
        """
        Serializes a data dictionary as a JSON formatted string and writes it to a file.

        A failure to open or write the target file is reported on stdout and
        not raised, so callers are not interrupted by an I/O error.

        Args:
            data (Dict): The data dictionary to be serialized and written.
            file_path (str): The path to the file to be written, without the .json extension.

        Returns:
            None
        """
        # Serialize first so that a non-serializable object fails before the
        # target file is opened (and truncated). Non-ASCII characters are kept
        # as-is; the UTF-8 encoding happens once, when the file is written.
        json_object = json.dumps(data, indent=4, ensure_ascii=False)

        # Writing to target file
        target_file_name = file_path + ".json"
        try:
            with open(target_file_name, "w", encoding='utf-8') as target_file:
                target_file.write(json_object)
        except OSError:
            print(f"Error writing data to {target_file_name}.")
        else:
            print(f"Data written to {target_file_name} successfully.")
Loading

0 comments on commit 5ce4854

Please sign in to comment.