Skip to content

Commit

Permalink
fix: allow recursive pdf file searching
Browse files (browse the repository at this point in the history)
  • Loading branch information
percevalw committed Mar 6, 2024
1 parent 22f20f2 commit ca4ad97
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 2 deletions.
9 changes: 8 additions & 1 deletion edspdf/data/files.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# ruff: noqa: F401
import json
import os
+ import sys
from collections import Counter
from pathlib import Path
from typing import (
Expand Down Expand Up @@ -39,6 +40,7 @@ def __init__(
keep_ipynb_checkpoints: bool = False,
load_annotations: bool = False,
filesystem: Optional[Any] = None,
+ recursive: bool = False,
):
super().__init__()

Expand Down Expand Up @@ -66,9 +68,14 @@ def __init__(
if not self.filesystem.exists(path):
raise FileNotFoundError(f"Path {path} does not exist")

+ assert sys.version_info >= (3, 8) or not recursive, (
+ "Recursive reading is only supported with Python 3.8 or higher. "
+ "Please upgrade your Python version or set `recursive=False`."
+ )
+ glob_str = "**/*.pdf" if recursive else "*.pdf"
self.files: List[str] = [
file
- for file in self.filesystem.glob(os.path.join(str(self.path), "*.pdf"))
+ for file in self.filesystem.glob(os.path.join(str(self.path), glob_str))
if (keep_ipynb_checkpoints or ".ipynb_checkpoints" not in str(file))
and (
not load_annotations
Expand Down
6 changes: 5 additions & 1 deletion tests/core/test_data.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import os
+ import sys
from pathlib import Path

import pandas as pd
Expand Down Expand Up @@ -97,7 +98,10 @@ def parquet_file(tmp_path_factory, request):
os.chdir(request.fspath.dirname)
tmp_path = tmp_path_factory.mktemp("test_input_parquet")
path = tmp_path / "input_test.pq"
- docs = edspdf.data.read_files("file://" + os.path.abspath("../resources"))
+ docs = edspdf.data.read_files(
+ "file://" + os.path.abspath("../resources"),
+ recursive=sys.version_info >= (3, 8),
+ )
docs.write_parquet(
path,
converter=lambda x: {
Expand Down

0 comments on commit ca4ad97

Please sign in to comment.