From f6d80f57896ebc095a87fde767ed75454b336367 Mon Sep 17 00:00:00 2001 From: Ali YAZICI Date: Mon, 6 Nov 2023 08:57:35 +0300 Subject: [PATCH] Update ingest_service.py to fix the "'utf-8' codec can't decode" error Fixes issue #1166 (https://github.com/imartinez/privateGPT/issues/1166): Error: 'utf-8' codec can't decode byte 0xd0 in position 0: invalid continuation byte. Text files are now read with the encoding detected by chardet instead of the platform default encoding. --- private_gpt/server/ingest/ingest_service.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/private_gpt/server/ingest/ingest_service.py b/private_gpt/server/ingest/ingest_service.py index 6a34e6fbb..ddd89eb67 100644 --- a/private_gpt/server/ingest/ingest_service.py +++ b/private_gpt/server/ingest/ingest_service.py @@ -1,4 +1,5 @@ import tempfile +import chardet # Chardet must be put in requirements or manually install with pip install chardet from pathlib import Path from typing import TYPE_CHECKING, Any, AnyStr @@ -77,7 +78,10 @@ def ingest(self, file_name: str, file_data: AnyStr | Path) -> list[IngestedDoc]: # Read as a plain text string_reader = StringIterableReader() if isinstance(file_data, Path): - text = file_data.read_text() + with open(file_data, 'rb') as f2: + result2 = chardet.detect(f2.read()) + text = file_data.read_text(encoding=result2['encoding']) + #text = file_data.read_text() documents = string_reader.load_data([text]) elif isinstance(file_data, bytes): documents = string_reader.load_data([file_data.decode("utf-8")])