Merge branch 'main' into feature/ignore-frontmatter
Signed-off-by: Richard Decal <[email protected]>
crypdick committed Feb 28, 2024
2 parents d0d16ce + 5c47fb2 commit 564cb30
Showing 8 changed files with 42 additions and 12 deletions.
3 changes: 2 additions & 1 deletion Dockerfile
@@ -20,6 +20,7 @@ FROM python:3.9.16-slim as app
WORKDIR /obsidian-copilot

COPY --from=base /usr/local/lib/python3.9/site-packages/ /usr/local/lib/python3.9/site-packages/
# copy the repo contents into the container
COPY . .

RUN chmod +x /obsidian-copilot/build.sh
12 changes: 7 additions & 5 deletions Makefile
@@ -1,8 +1,10 @@
# Update this path to your obsidian vault directory
# now you can also pass the path in your make argument or as an env variable
# echo "export OBSIDIAN_PATH=/path/to/obsidian/" >> ~/.bashrc and source ~/.bashrc
export OBSIDIAN_PATH ?= /Users/eugene/obsidian-vault/
export TRANSFORMER_CACHE ?= /Users/eugene/.cache/huggingface/hub
# Check whether the `OBSIDIAN_PATH` and `TRANSFORMER_CACHE` env vars are set
ifndef OBSIDIAN_PATH
$(error OBSIDIAN_PATH is not set, please refer to the README for more information)
endif
ifndef TRANSFORMER_CACHE
$(error TRANSFORMER_CACHE is not set, please refer to the README for more information)
endif

# These generally do not need to be changed
PWD_PATH = ${PWD}
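
With these guards in place, both variables can also be supplied inline when invoking make instead of being exported in a shell profile. A minimal sketch (the `build` target name is an assumption and is not shown in this hunk; note the trailing slash on the vault path):

```
OBSIDIAN_PATH=/path/to/obsidian-vault/ TRANSFORMER_CACHE=/path/to/.cache/huggingface/hub make build
```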
13 changes: 10 additions & 3 deletions README.md
@@ -20,12 +20,19 @@ Clone and update the path to your obsidian-vault and huggingface hub cache

```
git clone https://github.com/eugeneyan/obsidian-copilot.git
```

Update your `~/.bashrc` or `~/.zshrc` with the `OBSIDIAN_PATH` and `TRANSFORMER_CACHE` paths and then source it.
Note: the trailing slash is important.

# Open Makefile and update the following paths
export OBSIDIAN_PATH = /Users/eugene/obsidian-vault/
export TRANSFORMER_CACHE = /Users/eugene/.cache/huggingface/hub
```
export OBSIDIAN_PATH=/path/to/obsidian-vault/
export TRANSFORMER_CACHE=/path/to/.cache/huggingface/hub
```

If you don't already have a huggingface hub cache, you can create the directory with `mkdir -p $TRANSFORMER_CACHE`
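
For example, to persist both variables in a zsh setup and create the cache directory (the paths below are placeholders; adjust them to your machine):

```
echo 'export OBSIDIAN_PATH=/path/to/obsidian-vault/' >> ~/.zshrc
echo 'export TRANSFORMER_CACHE=/path/to/.cache/huggingface/hub' >> ~/.zshrc
source ~/.zshrc
mkdir -p $TRANSFORMER_CACHE
```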

Build the OpenSearch and semantic indices
```
# Build the docker image
Empty file modified data/.gitkeep
100644 → 100755
Empty file.
5 changes: 5 additions & 0 deletions src/app.py
@@ -120,6 +120,7 @@ def get_chunks_from_hits(hits: List[dict], model_name: str = 'gpt-3.5-turbo', ma
# Combine os and semantic hits and rank them
df = pd.DataFrame(hits)
df['score'] = df['rank'].apply(lambda x: 10 - x)
# deduplicate chunks by ID, summing their OS and semantic scores
ranked = df.groupby('id').agg({'score': 'sum'}).sort_values('score', ascending=False).reset_index()

# Get context based on ranked IDs
@@ -142,6 +143,10 @@ def get_chunks_from_hits(hits: List[dict], model_name: str = 'gpt-3.5-turbo', ma

@app.get('/get_chunks')
def get_chunks(query: str):
if not query:
raise ValueError(
f"Query is empty: {query}. Did you try to draft using an empty selection?"
)
# Get hits from opensearch
os_response = query_opensearch(query, os_client, INDEX_NAME)
os_hits = parse_os_response(os_response)
2 changes: 1 addition & 1 deletion src/prep/build_opensearch_index.py
@@ -122,7 +122,7 @@ def index_vault(vault: dict[str, dict], client: OpenSearch, index_name: str) ->
chunk_header = chunk[0]
docs_indexed += 1
if docs_indexed % 100 == 0:
logger.info(f'Indexing {chunk_id} - Path: {path} ({docs_indexed:,} docs)')
logger.info(f'Indexing {chunk_id} - Path: {path} (progress: {docs_indexed:,} docs)')

docs.append({'_index': index_name, '_id': chunk_id, 'title': title, 'type': doc_type,
'path': path, 'chunk_header': chunk_header, 'chunk': chunk})
4 changes: 3 additions & 1 deletion src/prep/build_semantic_index.py
@@ -88,7 +88,9 @@ def build_embedding_array(vault: dict, tokenizer, model, batch_size=4) -> np.nda

# Get path and chunks
if docs_embedded % 100 == 0:
logger.info(f'Embedding document: {chunk_id} ({docs_embedded:,})')
logger.info(
f"Embedding document: {chunk_id} (progress: {docs_embedded:,} docs embedded)"
)
docs_embedded += 1
processed_chunk = 'passage: ' + ' '.join(doc['chunk'].split()) # Remove extra whitespace and add prefix

15 changes: 14 additions & 1 deletion src/prep/build_vault_dict.py
@@ -70,7 +70,7 @@ def chunk_doc_to_dict(lines: List[str], min_chunk_lines=3) -> dict[str, List[str
if '![](assets' in line: # Skip lines that are images
continue

if '##' in line: # Chunk header = Section header
if line.startswith("#"): # Chunk header = Section header
current_header = line

if line.startswith('- '): # Top-level bullet
@@ -134,7 +134,20 @@ def create_vault_dict(vault_path: str, paths: List[str]) -> dict[str, dict[str,
'path': str(filename),
'chunk': ''.join(lines)}

# sometimes, notes follow a template and thus are quite repetitive. For example, stubs for meeting notes.
# here, we want to detect if a chunk is an exact duplicate of a previously seen chunk, and if so, skip it
seen_chunks = set()

for chunk_id, chunk in chunks.items():
# check if the chunk is a duplicate
chunk_str = "".join(chunk)
chunk_hash = hash(chunk_str)
if chunk_hash in seen_chunks:
logger.debug(
f"Skipping duplicate chunk in {filename}: {chunk_str}"
)
continue

chunk_id = f'{filename}-{chunk_id}'

# Add chunk to vault dict (for shorter context length)
