diff --git a/gpt4all-bindings/python/gpt4all/_pyllmodel.py b/gpt4all-bindings/python/gpt4all/_pyllmodel.py
index 208d834c580c..2313729aa398 100644
--- a/gpt4all-bindings/python/gpt4all/_pyllmodel.py
+++ b/gpt4all-bindings/python/gpt4all/_pyllmodel.py
@@ -493,6 +493,7 @@ def prompt_model(
         context_erase: float = 0.75,
         reset_context: bool = False,
         special: bool = False,
+        fake_reply: str = "",
     ):
         """
         Generate response from model from a prompt.
@@ -537,7 +538,7 @@ def prompt_model(
             True,
             self.context,
             special,
-            ctypes.c_char_p(),
+            ctypes.c_char_p(fake_reply.encode()) if fake_reply else ctypes.c_char_p(),
         )
diff --git a/gpt4all-bindings/python/gpt4all/gpt4all.py b/gpt4all-bindings/python/gpt4all/gpt4all.py
index 027f28df357c..d4fd76751151 100644
--- a/gpt4all-bindings/python/gpt4all/gpt4all.py
+++ b/gpt4all-bindings/python/gpt4all/gpt4all.py
@@ -496,6 +496,7 @@ def generate(
         n_batch: int = 8,
         n_predict: int | None = None,
         streaming: bool = False,
+        fake_reply: str = "",
         callback: ResponseCallbackType = empty_response_callback,
     ) -> Any:
         """
@@ -513,6 +514,7 @@ def generate(
             n_batch: Number of prompt tokens processed in parallel. Larger values decrease latency but increase resource requirements.
             n_predict: Equivalent to max_tokens, exists for backwards compatibility.
             streaming: If True, this method will instead return a generator that yields tokens as the model generates them.
+            fake_reply: A spoofed reply for the given prompt, used as a way to load chat history.
             callback: A function with arguments token_id:int and response:str, which receives the tokens from the model as they are generated and stops the generation by returning False.

         Returns:
@@ -529,6 +531,7 @@ def generate(
             repeat_last_n=repeat_last_n,
             n_batch=n_batch,
             n_predict=n_predict if n_predict is not None else max_tokens,
+            fake_reply=fake_reply,
         )

         if self._history is not None:
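
For context, a minimal usage sketch of the new parameter follows. It is a hypothetical illustration, not part of the patch: the model filename is a placeholder, and the example assumes a chat session so the replayed turn becomes part of the conversation context.

```python
from gpt4all import GPT4All

# Placeholder model file; substitute any locally available GPT4All model.
model = GPT4All("mistral-7b-instruct-v0.1.Q4_0.gguf")

with model.chat_session():
    # Replay a stored exchange: the prompt and fake_reply are fed into the
    # model's context in place of newly sampled tokens, which is how prior
    # chat history can be loaded (per the new docstring).
    model.generate("What is the capital of France?", fake_reply="Paris.")

    # A follow-up generation now sees the replayed turn as context.
    print(model.generate("What is that city's population?", max_tokens=60))
```

At the C-API boundary, an empty `fake_reply` maps to a NULL `ctypes.c_char_p()`, so the default behavior of real generation is unchanged for existing callers.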