Merge branch 'main' into feature/ignore-frontmatter
Signed-off-by: Richard Decal <[email protected]>
crypdick committed Feb 28, 2024
2 parents d0d16ce + 5c47fb2 commit 564cb30
Showing 8 changed files with 42 additions and 12 deletions.
3 changes: 2 additions & 1 deletion Dockerfile
@@ -20,6 +20,7 @@ FROM python:3.9.16-slim as app
WORKDIR /obsidian-copilot

COPY --from=base /usr/local/lib/python3.9/site-packages/ /usr/local/lib/python3.9/site-packages/
# copy the repo contents into the container
COPY . .

RUN chmod +x /obsidian-copilot/build.sh
12 changes: 7 additions & 5 deletions Makefile
@@ -1,8 +1,10 @@
# Update this path to your obsidian vault directory
# now you can also pass the path in your make argument or as an env variable
# echo "export OBSIDIAN_PATH=/path/to/obsidian/" >> ~/.bashrc and source ~/.bashrc
export OBSIDIAN_PATH ?= /Users/eugene/obsidian-vault/
export TRANSFORMER_CACHE ?= /Users/eugene/.cache/huggingface/hub
# Check whether the `OBSIDIAN_PATH` and `TRANSFORMER_CACHE` env vars are set
ifndef OBSIDIAN_PATH
$(error OBSIDIAN_PATH is not set, please refer to the README for more information)
endif
ifndef TRANSFORMER_CACHE
$(error TRANSFORMER_CACHE is not set, please refer to the README for more information)
endif

# These generally do not need to be changed
PWD_PATH = ${PWD}
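
With these guards in place, both variables can also be supplied inline when invoking make instead of being exported in a shell profile. A minimal sketch (the `build` target name is an assumption and is not shown in this hunk; note the trailing slash on the vault path):

```
OBSIDIAN_PATH=/path/to/obsidian-vault/ TRANSFORMER_CACHE=/path/to/.cache/huggingface/hub make build
```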
13 changes: 10 additions & 3 deletions README.md
@@ -20,12 +20,19 @@ Clone and update the path to your obsidian-vault and huggingface hub cache

```
git clone https://github.com/eugeneyan/obsidian-copilot.git
```

Update your `~/.bashrc` or `~/.zshrc` with the `OBSIDIAN_PATH` and `TRANSFORMER_CACHE` paths and then source it.
Note: the trailing slash is important.

# Open Makefile and update the following paths
export OBSIDIAN_PATH = /Users/eugene/obsidian-vault/
export TRANSFORMER_CACHE = /Users/eugene/.cache/huggingface/hub
```
export OBSIDIAN_PATH=/path/to/obsidian-vault/
export TRANSFORMER_CACHE=/path/to/.cache/huggingface/hub
```

If you don't already have a huggingface hub cache, you can create the directory with `mkdir -p $TRANSFORMER_CACHE`
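
For example, to persist both variables in a zsh setup and create the cache directory (the paths below are placeholders; adjust them to your machine):

```
echo 'export OBSIDIAN_PATH=/path/to/obsidian-vault/' >> ~/.zshrc
echo 'export TRANSFORMER_CACHE=/path/to/.cache/huggingface/hub' >> ~/.zshrc
source ~/.zshrc
mkdir -p $TRANSFORMER_CACHE
```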

Build the OpenSearch and semantic indices
```
# Build the docker image
Empty file modified data/.gitkeep
100644 → 100755
Empty file.
5 changes: 5 additions & 0 deletions src/app.py
@@ -120,6 +120,7 @@ def get_chunks_from_hits(hits: List[dict], model_name: str = 'gpt-3.5-turbo', ma
# Combine os and semantic hits and rank them
df = pd.DataFrame(hits)
df['score'] = df['rank'].apply(lambda x: 10 - x)
# deduplicate chunks by ID, summing their OS and semantic scores
ranked = df.groupby('id').agg({'score': 'sum'}).sort_values('score', ascending=False).reset_index()

# Get context based on ranked IDs
@@ -142,6 +143,10 @@ def get_chunks_from_hits(hits: List[dict], model_name: str = 'gpt-3.5-turbo', ma

@app.get('/get_chunks')
def get_chunks(query: str):
if not query:
raise ValueError(
f"Query is empty: {query}. Did you try to draft using an empty selection?"
)
# Get hits from opensearch
os_response = query_opensearch(query, os_client, INDEX_NAME)
os_hits = parse_os_response(os_response)
2 changes: 1 addition & 1 deletion src/prep/build_opensearch_index.py
@@ -122,7 +122,7 @@ def index_vault(vault: dict[str, dict], client: OpenSearch, index_name: str) ->
chunk_header = chunk[0]
docs_indexed += 1
if docs_indexed % 100 == 0:
logger.info(f'Indexing {chunk_id} - Path: {path} ({docs_indexed:,} docs)')
logger.info(f'Indexing {chunk_id} - Path: {path} (progress: {docs_indexed:,} docs)')

docs.append({'_index': index_name, '_id': chunk_id, 'title': title, 'type': doc_type,
'path': path, 'chunk_header': chunk_header, 'chunk': chunk})
4 changes: 3 additions & 1 deletion src/prep/build_semantic_index.py
@@ -88,7 +88,9 @@ def build_embedding_array(vault: dict, tokenizer, model, batch_size=4) -> np.nda

# Get path and chunks
if docs_embedded % 100 == 0:
logger.info(f'Embedding document: {chunk_id} ({docs_embedded:,})')
logger.info(
f"Embedding document: {chunk_id} (progress: {docs_embedded:,} docs embedded)"
)
docs_embedded += 1
processed_chunk = 'passage: ' + ' '.join(doc['chunk'].split()) # Remove extra whitespace and add prefix

15 changes: 14 additions & 1 deletion src/prep/build_vault_dict.py
@@ -70,7 +70,7 @@ def chunk_doc_to_dict(lines: List[str], min_chunk_lines=3) -> dict[str, List[str
if '![](assets' in line: # Skip lines that are images
continue

if '##' in line: # Chunk header = Section header
if line.startswith("#"): # Chunk header = Section header
current_header = line

if line.startswith('- '): # Top-level bullet
@@ -134,7 +134,20 @@ def create_vault_dict(vault_path: str, paths: List[str]) -> dict[str, dict[str,
'path': str(filename),
'chunk': ''.join(lines)}

# sometimes, notes follow a template and thus are quite repetitive. For example, stubs for meeting notes.
# here, we want to detect if a chunk is an exact duplicate of a previously seen chunk, and if so, skip it
seen_chunks = set()

for chunk_id, chunk in chunks.items():
# check if the chunk is a duplicate
chunk_str = "".join(chunk)
chunk_hash = hash(chunk_str)
if chunk_hash in seen_chunks:
logger.debug(
f"Skipping duplicate chunk in {filename}: {chunk_str}"
)
continue

chunk_id = f'{filename}-{chunk_id}'

# Add chunk to vault dict (for shorter context length)
