Skip to content

Commit

Permalink
Fix ollama embedding integration and CSV format documentation. (#1761)
Browse files Browse the repository at this point in the history
  • Loading branch information
Benvii authored Oct 10, 2024
1 parent 50354c2 commit 67b96ee
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,10 @@ Options:
-v Verbose output for debugging (without this option, script will be silent but for errors)
```

Turns a Smart Tribune CSV export file into a ready-to-index CSV file (one 'title'|'url'|'text' line per filtered entry):
Turns a Smart Tribune CSV export file into a ready-to-index CSV file (one 'title'|'source'|'text' line per filtered entry):


| Title | URL | Text |
| title | source | text |
|------------|--------------------|-----------------------|
| Some title | http://example.com | This is example text. |
| ... | ... | ... |
Expand Down Expand Up @@ -76,14 +76,14 @@ Options:
be silent but for errors)
Import and Format a Smart Tribune data by API into a ready-to-index CSV file
(one 'title'|'url'|'text' line per filtered entry).
(one 'title'|'source'|'text' line per filtered entry).
```
Set your APIKEY and APISECRET in a .env file.

Import data from smart tribune API and return a ready-to-index CSV file (one 'title'|'url'|'text' line per filtered entry):
Imports data from the Smart Tribune API and returns a ready-to-index CSV file (one 'title'|'source'|'text' line per filtered entry):


| Title | URL | Text |
| title | source | text |
|------------|--------------------|-----------------------|
| Some title | http://example.com | This is example text. |
| ... | ... | ... |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
import pandas as pd
from docopt import docopt
from gen_ai_orchestrator.models.em.azureopenai.azure_openai_em_setting import AzureOpenAIEMSetting
from gen_ai_orchestrator.models.em.ollama.ollama_em_setting import OllamaEMSetting
from gen_ai_orchestrator.models.em.em_provider import EMProvider
from gen_ai_orchestrator.models.em.em_setting import BaseEMSetting
from gen_ai_orchestrator.models.em.openai.openai_em_setting import OpenAIEMSetting
Expand Down Expand Up @@ -102,7 +103,7 @@ async def index_documents(args):
)

logging.debug(f"Read input CSV file {args['<input_csv>']}")
df = pd.read_csv(args['<input_csv>'], delimiter='|', quotechar='"', names=['title', 'source', 'text'])
df = pd.read_csv(args['<input_csv>'], delimiter='|', quotechar='"', header=0) # names=['title', 'source', 'text']
# Prevent NaN value in the 'source' column with a default value 'UNKNOWN', then replace it with None
df['source'] = df['source'].fillna('UNKNOWN')
df['source'] = df['source'].replace('UNKNOWN', None)
Expand Down Expand Up @@ -132,7 +133,8 @@ async def index_documents(args):
data=config_dict,
provider_mapping={
EMProvider.OPEN_AI: OpenAIEMSetting,
EMProvider.AZURE_OPEN_AI_SERVICE: AzureOpenAIEMSetting
EMProvider.AZURE_OPEN_AI_SERVICE: AzureOpenAIEMSetting,
EMProvider.OLLAMA: OllamaEMSetting,
},
base_class=BaseEMSetting
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
be silent but for errors)
Import and Format a Smart Tribune data by API into a ready-to-index CSV file
(one 'title'|'url'|'text' line per filtered entry).
(one 'title'|'source'|'text' line per filtered entry).
"""
import asyncio
import logging
Expand Down Expand Up @@ -288,7 +288,7 @@ async def _main(args, body_credentials):
logging.info(
f'finished {len(df_all_questions)} questions in {time() - _start:.2f} seconds'
)
df_all_questions.get(['Title', 'URL', 'Text']).to_csv(
df_all_questions.get(['title', 'source', 'text']).to_csv(
args.get('<output_csv>'), sep='|', index=False
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
Recursively browse web URLs (follow links from these base URLs), then scrape
links' contents based on a list of BeautifulSoup filters, then export these
contents into a ready-to-index CSV file (one 'title'|'url'|'text' line per
contents into a ready-to-index CSV file (one 'title'|'source'|'text' line per
URL with scraped contents).
"""
import logging
Expand Down Expand Up @@ -206,7 +206,7 @@ def scrape_urls(soup_filters, output_file, target_dir='.', base_domain='domain')

# Add URL with title and text to output file
results.append(
{'title': title, 'url': line, 'text': full_text}
{'title': title, 'source': line, 'text': full_text}
)
else:
logging.debug(
Expand Down

0 comments on commit 67b96ee

Please sign in to comment.