diff --git a/code/backend/batch/utilities/helpers/embedders/push_embedder.py b/code/backend/batch/utilities/helpers/embedders/push_embedder.py
index 33fa1f6f1..2cec6520b 100644
--- a/code/backend/batch/utilities/helpers/embedders/push_embedder.py
+++ b/code/backend/batch/utilities/helpers/embedders/push_embedder.py
@@ -38,7 +38,7 @@ def __init__(self, blob_client: AzureBlobStorageClient, env_helper: EnvHelper):
             self.embedding_configs[ext] = processor
 
     def embed_file(self, source_url: str, file_name: str):
-        file_extension = file_name.split(".")[-1]
+        file_extension = file_name.split(".")[-1].lower()
         embedding_config = self.embedding_configs.get(file_extension)
         self.__embed(
             source_url=source_url,
diff --git a/code/tests/functional/conftest.py b/code/tests/functional/conftest.py
index f416d9c65..d29c62024 100644
--- a/code/tests/functional/conftest.py
+++ b/code/tests/functional/conftest.py
@@ -334,6 +334,12 @@ def setup_config_mocking(httpserver: HTTPServer):
                     "loading": {"strategy": "web"},
                     "use_advanced_image_processing": False,
                 },
+                {
+                    "document_type": "htm",
+                    "chunking": {"strategy": "layout", "size": 500, "overlap": 100},
+                    "loading": {"strategy": "web"},
+                    "use_advanced_image_processing": False,
+                },
                 {
                     "document_type": "docx",
                     "chunking": {"strategy": "layout", "size": 500, "overlap": 100},
diff --git a/code/tests/utilities/helpers/test_push_embedder.py b/code/tests/utilities/helpers/test_push_embedder.py
index 658c58cef..c1031a49c 100644
--- a/code/tests/utilities/helpers/test_push_embedder.py
+++ b/code/tests/utilities/helpers/test_push_embedder.py
@@ -347,6 +347,22 @@ def test_embed_file_chunks_documents(document_loading_mock, document_chunking_mo
     )
 
 
+def test_embed_file_chunks_documents_upper_case(document_loading_mock, document_chunking_mock, env_helper_mock):
+    # given
+    push_embedder = PushEmbedder(MagicMock(), env_helper_mock)
+
+    # when
+    push_embedder.embed_file(
+        "some-url",
+        "some-file-name.PDF",
+    )
+
+    # then
+    document_chunking_mock.return_value.chunk.assert_called_once_with(
+        document_loading_mock.return_value.load.return_value, CHUNKING_SETTINGS
+    )
+
+
 def test_embed_file_generates_embeddings_for_documents(llm_helper_mock, env_helper_mock):
     # given
     push_embedder = PushEmbedder(MagicMock(), env_helper_mock)