diff --git a/docs/description.md b/docs/description.md index 56c1fd41f9..c558fb5b78 100644 --- a/docs/description.md +++ b/docs/description.md @@ -416,6 +416,12 @@ and optionally watch changes on it with the command: make ingest /path/to/folder -- --watch ``` +To log the processed and failed files to an additional file, use: + +```bash +make ingest /path/to/folder -- --watch --log-file /path/to/log/file.log +``` + After ingestion is complete, you should be able to chat with your documents by navigating to http://localhost:8001 and using the option `Query documents`, or using the completions / chat API. diff --git a/scripts/ingest_folder.py b/scripts/ingest_folder.py index 8936d91d73..b788f7f995 100644 --- a/scripts/ingest_folder.py +++ b/scripts/ingest_folder.py @@ -1,4 +1,5 @@ import argparse +from loguru import logger import sys from pathlib import Path @@ -8,7 +9,6 @@ ingest_service = root_injector.get(IngestService) - parser = argparse.ArgumentParser(prog="ingest_folder.py") parser.add_argument("folder", help="Folder to ingest") parser.add_argument( @@ -17,29 +17,72 @@ action=argparse.BooleanOptionalAction, default=False, ) +parser.add_argument( + "--log-file", + help="Optional path to a log file. If provided, logs will be written to this file.", + type=str, + default=None +) args = parser.parse_args() +# Set up loguru logging +# Remove pre-configured logging handler +logger.remove(0) +# For console colorized output without line and function info: +logger.add(sys.stdout, level="INFO", colorize=True, format=( + "{time:YYYY-MM-DD HH:mm:ss.SSS} | " + "{level: <8} | " + "{message}" + ), +) +# For file output, using a clear and timestamped format: +if args.log_file: + logger.add(args.log_file, rotation="10 MB", level="INFO", format="[{time:YYYY-MM-DD HH:mm:ss}] [{level}] {message}") + +total_documents = 0 +current_document_count = 0 + + +def count_documents(folder_path: Path) -> int: + global total_documents + for file_path in folder_path.iterdir(): + if file_path.is_file(): + total_documents += 1 + elif file_path.is_dir(): + count_documents(file_path) + def _recursive_ingest_folder(folder_path: Path) -> None: + global current_document_count, total_documents for file_path in folder_path.iterdir(): if file_path.is_file(): + current_document_count += 1 + progress_msg = f"Document {current_document_count} of {total_documents} ({(current_document_count / total_documents) * 100:.2f}%)" + logger.info(progress_msg) _do_ingest(file_path) elif file_path.is_dir(): _recursive_ingest_folder(file_path) def _do_ingest(changed_path: Path) -> None: - if changed_path.exists(): - print(f"\nIngesting {changed_path}") - ingest_service.ingest(changed_path.name, changed_path) + try: + if changed_path.exists(): + logger.info(f"Started ingesting {changed_path}") + ingest_service.ingest(changed_path.name, changed_path) + logger.info(f"Completed ingesting {changed_path}") + except Exception as e: + logger.error(f"Failed to ingest document: {changed_path}. Error: {e}") path = Path(args.folder) if not path.exists(): raise ValueError(f"Path {args.folder} does not exist") +# Count total documents before ingestion +count_documents(path) + _recursive_ingest_folder(path) if args.watch: - print(f"Watching {args.folder} for changes, press Ctrl+C to stop...") + logger.info(f"Watching {args.folder} for changes, press Ctrl+C to stop...") watcher = IngestWatcher(args.folder, _do_ingest) - watcher.start() + watcher.start() \ No newline at end of file