Skip to content

Commit

Permalink
Improve logging when ingesting an entire folder
Browse files Browse the repository at this point in the history
This adds a total document count and optionally logs the start, completion, and failure of each document's ingestion to a file.
  • Loading branch information
NetroScript committed Oct 29, 2023
1 parent 24cfddd commit a0ead63
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 6 deletions.
6 changes: 6 additions & 0 deletions docs/description.md
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,12 @@ and optionally watch changes on it with the command:
make ingest /path/to/folder -- --watch
```

To also record which files were processed and which failed in a separate log file, use:

```bash
make ingest /path/to/folder -- --watch --log-file /path/to/log/file.log
```

After ingestion is complete, you should be able to chat with your documents
by navigating to http://localhost:8001 and using the option `Query documents`,
or using the completions / chat API.
Expand Down
55 changes: 49 additions & 6 deletions scripts/ingest_folder.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import argparse
from loguru import logger
import sys
from pathlib import Path

Expand All @@ -8,7 +9,6 @@

ingest_service = root_injector.get(IngestService)


parser = argparse.ArgumentParser(prog="ingest_folder.py")
parser.add_argument("folder", help="Folder to ingest")
parser.add_argument(
Expand All @@ -17,29 +17,72 @@
action=argparse.BooleanOptionalAction,
default=False,
)
parser.add_argument(
    "--log-file",
    help="Optional path to a log file. If provided, logs will be written to this file.",
    type=str,
    default=None,
)
args = parser.parse_args()

# Set up loguru logging.
# Remove loguru's pre-configured handler (id 0). If something imported earlier
# has already removed it, loguru raises ValueError for the unknown id; that is
# harmless here, so it is ignored instead of aborting the script.
try:
    logger.remove(0)
except ValueError:
    pass
# Console output: colorized, without line and function info.
logger.add(
    sys.stdout,
    level="INFO",
    colorize=True,
    format=(
        "<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | "
        "<level>{level: <8}</level> | "
        "<level>{message}</level>"
    ),
)
# File output (only when --log-file was given): plain, timestamped format,
# with the file rotated once it reaches 10 MB.
if args.log_file:
    logger.add(
        args.log_file,
        rotation="10 MB",
        level="INFO",
        format="[{time:YYYY-MM-DD HH:mm:ss}] [{level}] {message}",
    )

# Progress counters: total_documents is filled in by count_documents and
# current_document_count is advanced by _recursive_ingest_folder.
total_documents = 0
current_document_count = 0


def count_documents(folder_path: Path) -> int:
    """Count the files under *folder_path*, descending into subfolders.

    Every file found is also added to the module-level ``total_documents``
    counter, which the progress messages during ingestion read.

    Args:
        folder_path: Directory to scan.

    Returns:
        The number of files found in ``folder_path`` and all of its
        subdirectories during this call.
    """
    global total_documents
    counted = 0
    for file_path in folder_path.iterdir():
        if file_path.is_file():
            # Bump the global once per file, at the level where it is seen,
            # so recursion never adds the same file twice.
            total_documents += 1
            counted += 1
        elif file_path.is_dir():
            counted += count_documents(file_path)
    return counted


def _recursive_ingest_folder(folder_path: Path) -> None:
    """Ingest every file under *folder_path*, descending into subfolders.

    Before each file is handed to ``_do_ingest``, a progress line is logged
    using the module-level ``current_document_count`` and ``total_documents``
    counters.

    Args:
        folder_path: Directory whose files should be ingested.
    """
    global current_document_count
    for file_path in folder_path.iterdir():
        if file_path.is_file():
            current_document_count += 1
            if total_documents > 0:
                progress_msg = (
                    f"Document {current_document_count} of {total_documents} "
                    f"({(current_document_count / total_documents) * 100:.2f}%)"
                )
            else:
                # The total can still be zero if files showed up after the
                # counting pass (or counting never ran); avoid dividing by 0.
                progress_msg = (
                    f"Document {current_document_count} "
                    "(total document count unknown)"
                )
            logger.info(progress_msg)
            _do_ingest(file_path)
        elif file_path.is_dir():
            _recursive_ingest_folder(file_path)


def _do_ingest(changed_path: Path) -> None:
    """Ingest a single file, logging start, completion and failure.

    Paths that do not exist are skipped silently. Any exception raised while
    ingesting is logged as an error and not re-raised, so one bad document
    does not abort the rest of the run.

    Args:
        changed_path: Path of the file to ingest.
    """
    try:
        if changed_path.exists():
            logger.info(f"Started ingesting {changed_path}")
            ingest_service.ingest(changed_path.name, changed_path)
            logger.info(f"Completed ingesting {changed_path}")
    except Exception as e:
        # Per-document boundary: record the failure and carry on with the
        # remaining documents.
        logger.error(f"Failed to ingest document: {changed_path}. Error: {e}")


# Resolve the folder given on the command line and fail early if it is missing.
path = Path(args.folder)
if not path.exists():
    raise ValueError(f"Path {args.folder} does not exist")

# Count total documents before ingestion so progress can be reported.
count_documents(path)

_recursive_ingest_folder(path)
if args.watch:
    # With --watch, keep running and ingest files as they change.
    logger.info(f"Watching {args.folder} for changes, press Ctrl+C to stop...")
    watcher = IngestWatcher(args.folder, _do_ingest)
    watcher.start()

0 comments on commit a0ead63

Please sign in to comment.