Skip to content

Commit

Permalink
[DERCBOT-1168] WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
assouktim committed Oct 31, 2024
1 parent fd65c13 commit 2e769e4
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ Documents will be indexed in OpenSearch DB under index_name index (index_name sh
| id | a uuid for each document (one per line in the input file) |
| chunk | the number of the chunk if the original document was split: 'n/N' |
| title | the 'title' column from original input CSV |
| source | the 'source' column from original input CSV |

#### Sample result:
<pre>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def append_example_runs(dataset_example, _session_ids):
if __name__ == '__main__':
start_time = time.time()

cli_args = docopt(__doc__, version='Webscraper 0.1.0')
cli_args = docopt(__doc__, version='Export dataset result runs')
# Set logging level
log_format = '%(levelname)s:%(module)s:%(message)s'
logging.basicConfig(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
vector_store_json_config path to a vector store configuration file (JSON format)
(shall describe settings for one of OpenSearch or PGVector store)
chunks_size size of the embedded chunks of documents
ignore_source To ignore source
TODO MASS
ignore_source To ignore source, useful if sources aren't valid URLs
Options:
-h --help Show this screen
--version Show version
Expand Down Expand Up @@ -116,8 +116,7 @@ async def index_documents(args):
if bool(args['<ignore_source>']):
df_filtered['source'] = None
else:
df_filtered['source'] = df_filtered['source'].fillna('UNKNOWN')
df_filtered['source'] = df_filtered['source'].replace('UNKNOWN', None)
df_filtered['source'] = df_filtered['source'].fillna()

loader = DataFrameLoader(df_filtered, page_content_column='text')
docs = loader.load()
Expand Down Expand Up @@ -354,7 +353,7 @@ async def main(args):

if __name__ == '__main__':
# Parse command-line arguments
cli_args = docopt(__doc__, version='Webscraper 0.1.0')
cli_args = docopt(__doc__, version='Document Indexing tool')

# Set up logging
setup_logging(cli_args)
Expand Down

0 comments on commit 2e769e4

Please sign in to comment.