Fix ollama embedding integration and CSV format documentation. (#1761)

theopenconversationkit · Oct 10, 2024 · 67b96ee · 67b96ee
1 parent 50354c2
commit 67b96ee
Show file tree

Hide file tree

Showing 4 changed files with 13 additions and 11 deletions.
diff --git a/gen-ai/orchestrator-server/src/main/python/tock-llm-indexing-tools/README.md b/gen-ai/orchestrator-server/src/main/python/tock-llm-indexing-tools/README.md
@@ -45,10 +45,10 @@ Options:
 -v          Verbose output for debugging (without this option, script will be silent but for errors)
 ```
 
-Turns a Smart Tribune CSV export file into a ready-to-index CSV file (one 'title'|'url'|'text' line per filtered entry):
+Turns a Smart Tribune CSV export file into a ready-to-index CSV file (one 'title'|'source'|'text' line per filtered entry):
 
 
-| Title      | URL                | Text                  |
+| title      | source             | text                  |
 |------------|--------------------|-----------------------|
 | Some title | http://example.com | This is example text. |
 | ...        | ...                | ...                   |
@@ -76,14 +76,14 @@ Options:
                 be silent but for errors)
 
 Import and Format a Smart Tribune data by API  into a ready-to-index CSV file
-(one 'title'|'url'|'text' line per filtered entry).
+(one 'title'|'source'|'text' line per filtered entry).
 ```
 Set your APIKEY and APISECRET in a .env file.
 
-Import data from smart tribune API and return a ready-to-index CSV file (one 'title'|'url'|'text' line per filtered entry):
+Import data from the Smart Tribune API and return a ready-to-index CSV file (one 'title'|'source'|'text' line per filtered entry):
 
 
-| Title      | URL                | Text                  |
+| title      | source             | text                  |
 |------------|--------------------|-----------------------|
 | Some title | http://example.com | This is example text. |
 | ...        | ...                | ...                   |

diff --git a/gen-ai/orchestrator-server/src/main/python/tock-llm-indexing-tools/index_documents.py b/gen-ai/orchestrator-server/src/main/python/tock-llm-indexing-tools/index_documents.py
@@ -59,6 +59,7 @@
 import pandas as pd
 from docopt import docopt
 from gen_ai_orchestrator.models.em.azureopenai.azure_openai_em_setting import AzureOpenAIEMSetting
+from gen_ai_orchestrator.models.em.ollama.ollama_em_setting import OllamaEMSetting
 from gen_ai_orchestrator.models.em.em_provider import EMProvider
 from gen_ai_orchestrator.models.em.em_setting import BaseEMSetting
 from gen_ai_orchestrator.models.em.openai.openai_em_setting import OpenAIEMSetting
@@ -102,7 +103,7 @@ async def index_documents(args):
     )
 
     logging.debug(f"Read input CSV file {args['<input_csv>']}")
-    df = pd.read_csv(args['<input_csv>'], delimiter='|', quotechar='"', names=['title', 'source', 'text'])
+    df = pd.read_csv(args['<input_csv>'], delimiter='|', quotechar='"', header=0) # names=['title', 'source', 'text']
     # Prevent NaN value in the 'source' column with a default value 'UNKNOWN', then replace it with None
     df['source'] = df['source'].fillna('UNKNOWN')
     df['source'] = df['source'].replace('UNKNOWN', None)
@@ -132,7 +133,8 @@ async def index_documents(args):
         data=config_dict,
         provider_mapping={
             EMProvider.OPEN_AI: OpenAIEMSetting,
-            EMProvider.AZURE_OPEN_AI_SERVICE: AzureOpenAIEMSetting
+            EMProvider.AZURE_OPEN_AI_SERVICE: AzureOpenAIEMSetting,
+            EMProvider.OLLAMA: OllamaEMSetting,
         },
         base_class=BaseEMSetting
     )

diff --git a/gen-ai/orchestrator-server/src/main/python/tock-llm-indexing-tools/smarttribune_consumer.py b/gen-ai/orchestrator-server/src/main/python/tock-llm-indexing-tools/smarttribune_consumer.py
@@ -33,7 +33,7 @@
                 be silent but for errors)
 
 Import and Format a Smart Tribune data by API  into a ready-to-index CSV file
-(one 'title'|'url'|'text' line per filtered entry).
+(one 'title'|'source'|'text' line per filtered entry).
 """
 import asyncio
 import logging
@@ -288,7 +288,7 @@ async def _main(args, body_credentials):
     logging.info(
         f'finished {len(df_all_questions)} questions in {time() - _start:.2f} seconds'
     )
-    df_all_questions.get(['Title', 'URL', 'Text']).to_csv(
+    df_all_questions.get(['title', 'source', 'text']).to_csv(
         args.get('<output_csv>'), sep='|', index=False
     )
 

diff --git a/gen-ai/orchestrator-server/src/main/python/tock-llm-indexing-tools/webscraper.py b/gen-ai/orchestrator-server/src/main/python/tock-llm-indexing-tools/webscraper.py
@@ -38,7 +38,7 @@
 
 Recursively browse web URLs (follow links from these base URLs), then scrape
 links' contents based on a list of BeautifulSoup filters, then export these
-contents into a ready-to-index CSV file (one 'title'|'url'|'text' line per
+contents into a ready-to-index CSV file (one 'title'|'source'|'text' line per
 URL with scraped contents).
 """
 import logging
@@ -206,7 +206,7 @@ def scrape_urls(soup_filters, output_file, target_dir='.', base_domain='domain')
 
                             # Add URL with title and text to output file
                             results.append(
-                                {'title': title, 'url': line, 'text': full_text}
+                                {'title': title, 'source': line, 'text': full_text}
                             )
                         else:
                             logging.debug(