Skip to content

Commit

Permalink
feat: Support encoding parameter in partition_csv (#3564)
Browse files Browse the repository at this point in the history
See added test file. Added support for the encoding parameter, which can
be passed directly to `pd.read_csv`.
  • Loading branch information
awalker4 authored Aug 28, 2024
1 parent f21c853 commit f440eb4
Show file tree
Hide file tree
Showing 7 changed files with 26 additions and 4 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
## 0.15.9-dev0
## 0.15.9-dev1

### Enhancements

### Features

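* **Add support for encoding parameter in partition_csv** The `encoding` argument of `partition_csv` is passed through to `pd.read_csv` and is also used to decode the sample read for delimiter sniffing. When `encoding` is `None`, utf-8 is used.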

### Fixes

* **Fix disk space leaks and Windows errors when accessing file.name on a NamedTemporaryFile** Uses of `NamedTemporaryFile(..., delete=False)` and/or uses of `file.name` of NamedTemporaryFiles have been replaced with TemporaryFileDirectory to avoid a known issue: https://docs.python.org/3/library/tempfile.html#tempfile.NamedTemporaryFile
Expand Down
Binary file added example-docs/stanley-cups-utf-16.csv
Binary file not shown.
8 changes: 8 additions & 0 deletions test_unstructured/partition/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,12 @@ def test_partition_csv_from_filename_with_metadata_filename():
assert elements[0].metadata.filename == "test"


def test_partition_csv_with_encoding():
    """A UTF-16 encoded CSV partitions to the expected text when `encoding` is given."""
    utf_16_csv_path = example_doc_path("stanley-cups-utf-16.csv")

    elements = partition_csv(utf_16_csv_path, encoding="utf-16")

    # -- only the first element is checked; its whitespace is normalized before comparing --
    first_element_text = clean_extra_whitespace(elements[0].text)
    assert first_element_text == EXPECTED_TEXT


@pytest.mark.parametrize(
("filename", "expected_text", "expected_table"),
[
Expand Down Expand Up @@ -279,6 +285,7 @@ def it_provides_a_validating_alternate_constructor(self):
ctx = _CsvPartitioningContext.load(
file_path=example_doc_path("stanley-cups.csv"),
file=None,
encoding=None,
metadata_file_path=None,
metadata_last_modified=None,
include_header=True,
Expand All @@ -292,6 +299,7 @@ def and_the_validating_constructor_raises_on_an_invalid_context(self):
_CsvPartitioningContext.load(
file_path=None,
file=None,
encoding=None,
metadata_file_path=None,
metadata_last_modified=None,
include_header=True,
Expand Down
1 change: 1 addition & 0 deletions typings/pandas/io/parsers/readers.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ from pandas.core.frame import DataFrame
# -- Type stub for `read_csv`; this signature declares only the parameters listed below. --
def read_csv(
filepath_or_buffer: str | IO[bytes],
# -- every option after the bare `*` must be passed by keyword --
*,
encoding: str | None = ...,
sep: str | None = ...,
header: int | None | Literal["infer"] = ...,
) -> DataFrame: ...
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.15.9-dev0" # pragma: no cover
__version__ = "0.15.9-dev1" # pragma: no cover
1 change: 1 addition & 0 deletions unstructured/partition/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ def partition(
elements = partition_csv(
filename=filename,
file=file,
encoding=encoding,
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
Expand Down
14 changes: 12 additions & 2 deletions unstructured/partition/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
def partition_csv(
filename: str | None = None,
file: IO[bytes] | None = None,
encoding: str | None = None,
metadata_filename: str | None = None,
metadata_last_modified: str | None = None,
include_header: bool = False,
Expand All @@ -47,6 +48,8 @@ def partition_csv(
A string defining the target filename path.
file
A file-like object using "rb" mode --> open(filename, "rb").
encoding
The encoding method used to decode the text input. If None, utf-8 will be used.
metadata_filename
The filename to use for the metadata.
metadata_last_modified
Expand All @@ -73,6 +76,7 @@ def partition_csv(
ctx = _CsvPartitioningContext(
file_path=filename,
file=file,
encoding=encoding,
metadata_file_path=metadata_filename,
metadata_last_modified=metadata_last_modified,
include_header=include_header,
Expand All @@ -81,7 +85,7 @@ def partition_csv(
)

with ctx.open() as file:
dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter)
dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter, encoding=encoding)

html_text = dataframe.to_html(index=False, header=include_header, na_rep="")
text = soupparser_fromstring(html_text).text_content()
Expand Down Expand Up @@ -110,6 +114,7 @@ def __init__(
self,
file_path: str | None = None,
file: IO[bytes] | None = None,
encoding: str | None = None,
metadata_file_path: str | None = None,
metadata_last_modified: str | None = None,
include_header: bool = False,
Expand All @@ -118,6 +123,7 @@ def __init__(
):
self._file_path = file_path
self._file = file
self._encoding = encoding
self._metadata_file_path = metadata_file_path
self._metadata_last_modified = metadata_last_modified
self._include_header = include_header
Expand All @@ -129,6 +135,7 @@ def load(
cls,
file_path: str | None,
file: IO[bytes] | None,
encoding: str | None,
metadata_file_path: str | None,
metadata_last_modified: str | None,
include_header: bool,
Expand All @@ -138,6 +145,7 @@ def load(
return cls(
file_path=file_path,
file=file,
encoding=encoding,
metadata_file_path=metadata_file_path,
metadata_last_modified=metadata_last_modified,
include_header=include_header,
Expand All @@ -156,7 +164,9 @@ def delimiter(self) -> str | None:

with self.open() as file:
# -- read whole lines, sniffer can be confused by a trailing partial line --
data = "\n".join(ln.decode("utf-8") for ln in file.readlines(num_bytes))
data = "\n".join(
ln.decode(self._encoding or "utf-8") for ln in file.readlines(num_bytes)
)

try:
return sniffer.sniff(data, delimiters=",;").delimiter
Expand Down

0 comments on commit f440eb4

Please sign in to comment.