Skip to content

Commit

Permalink
Merge pull request monarch-initiative#88 from monarch-initiative/version_update
Browse files Browse the repository at this point in the history

Updates for v0.2.0
  • Loading branch information
caufieldjh authored Sep 20, 2024
2 parents e35bed8 + 39fb4c4 commit 3bb5f37
Show file tree
Hide file tree
Showing 8 changed files with 478 additions and 418 deletions.
846 changes: 447 additions & 399 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "curate-gpt"
version = "0.0.0.post2.dev0+335f59e"
version = "0.2.0"
description = "curate-gpt"
authors = ["Author 1 <[email protected]>"]
license = "BSD-3"
Expand Down
13 changes: 9 additions & 4 deletions src/curate_gpt/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2402,10 +2402,15 @@ def load_embeddings(path, collection, append, embedding_format, model, file_or_u
db.insert(embeddings, model=model, collection=collection)
print(f"Successfully indexed embeddings into collection '{collection}'.")


@embeddings.command(name="upload")
@path_option
@collection_option
@click.option("--repo-id", required=True, help="Repository ID on Hugging Face, e.g., 'biomedical-translator/[repo_name]'.")
@click.option(
"--repo-id",
required=True,
help="Repository ID on Hugging Face, e.g., 'biomedical-translator/[repo_name]'.",
)
@click.option("--private/--public", default=False, help="Whether the repository should be private.")
@click.option("--adapter", default="huggingface", help="Adapter to use for uploading embeddings.")
@database_type_option
Expand All @@ -2429,15 +2434,15 @@ def upload_embeddings(path, collection, repo_id, private, adapter, database_type
if adapter == "huggingface":
agent = HuggingFaceAgent()
else:
raise ValueError(f"Unsupported adapter: {adapter} "
f"currently only huggingface adapter is supported")
raise ValueError(
f"Unsupported adapter: {adapter} " f"currently only huggingface adapter is supported"
)
try:
agent.upload(objects=objects, metadata=metadata, repo_id=repo_id, private=private)
except Exception as e:
print(f"Error uploading collection to {repo_id}: {e}")



@main.group()
def view():
"Virtual store/wrapper"
Expand Down
14 changes: 10 additions & 4 deletions src/curate_gpt/store/chromadb_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,7 +526,7 @@ def _is_openai(self, collection: Collection):
if collection.metadata.get("model", "").startswith("openai:"):
return True

def peek(self, collection: str = None, limit=5, offset: int = 0, **kwargs) -> Iterator[OBJECT]:
def peek(self, collection: str = None, limit=5, offset: int = 0, **kwargs) -> Iterator[OBJECT]:
c = self.client.get_collection(name=self._get_collection(collection))
logger.debug(f"Peeking at {collection} with limit={limit}, offset={offset}")
results = c.peek(limit=limit)
Expand All @@ -536,16 +536,22 @@ def peek(self, collection: str = None, limit=5, offset: int = 0, **kwargs) -> I
for i in range(0, len(metadatas)):
yield self._unjson(metadatas[i])

def fetch_all_objects_memory_safe(self, collection: str = None, batch_size: int = 100, **kwargs) -> Iterator[
OBJECT]:
def fetch_all_objects_memory_safe(
self, collection: str = None, batch_size: int = 100, **kwargs
) -> Iterator[OBJECT]:
"""
Fetch all objects from a collection, in batches to avoid memory overload.
"""
offset = 0
client = self.client
collection_obj = client.get_collection(name=self._get_collection(collection))
while True:
results = collection_obj.get(offset=offset, limit=batch_size, include=["metadatas", "embeddings", "documents"], **kwargs)
results = collection_obj.get(
offset=offset,
limit=batch_size,
include=["metadatas", "embeddings", "documents"],
**kwargs,
)
logger.info(f"Fetching batch from {offset}...")
metadatas = results["metadatas"]
documents = results["documents"]
Expand Down
5 changes: 3 additions & 2 deletions src/curate_gpt/store/db_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,8 +332,9 @@ def peek(self, collection: str = None, limit=5, **kwargs) -> Iterator[OBJECT]:
# Schema operations

@abstractmethod
def fetch_all_objects_memory_safe(self, collection: str = None, batch_size: int = 100, **kwargs) -> Iterator[
OBJECT]:
def fetch_all_objects_memory_safe(
self, collection: str = None, batch_size: int = 100, **kwargs
) -> Iterator[OBJECT]:
"""
Fetch all objects from a collection, in batches to avoid memory overload.
"""
Expand Down
7 changes: 3 additions & 4 deletions src/curate_gpt/store/duckdb_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -807,8 +807,9 @@ def peek(

yield from self.parse_duckdb_result(results, include)

def fetch_all_objects_memory_safe(self, collection: str = None, batch_size: int = 100, include=None, **kwargs) -> Iterator[
OBJECT]:
def fetch_all_objects_memory_safe(
self, collection: str = None, batch_size: int = 100, include=None, **kwargs
) -> Iterator[OBJECT]:
"""
Fetch all objects from a collection, in batches to avoid memory overload.
"""
Expand All @@ -830,8 +831,6 @@ def fetch_all_objects_memory_safe(self, collection: str = None, batch_size: int
else:
break



def get_raw_objects(self, collection) -> Iterator[Dict]:
"""
Get all raw objects in the collection as they were inserted into the database
Expand Down
4 changes: 2 additions & 2 deletions src/curate_gpt/store/duckdb_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@ class DuckDBSearchResult(BaseModel):
include: Optional[Set[str]] = None

def to_json(self, indent: int = 2):
return self.json(include=self.include, indent=indent)
return self.model_dump_json(include=self.include, indent=indent)

def to_dict(self):
if self.include:
return self.model_dump(include=self.include)
return self.model_dump()

def __repr__(self, indent: int = 2):
return self.json(include=self.include, indent=indent)
return self.model_dump_json(include=self.include, indent=indent)

def __iter__(self) -> Iterator[SEARCH_RESULT]:
# TODO vocab.py for 'VARIABLES', but circular import
Expand Down
5 changes: 3 additions & 2 deletions src/curate_gpt/store/in_memory_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ def peek(self, collection: str = None, limit=5, **kwargs) -> Iterator[OBJECT]:
collection_obj = self._get_collection_object(collection)
yield from collection_obj.objects[:limit]

def fetch_all_objects_memory_safe(self, collection: str = None, batch_size: int = 100, **kwargs) -> Iterator[
OBJECT]:
def fetch_all_objects_memory_safe(
self, collection: str = None, batch_size: int = 100, **kwargs
) -> Iterator[OBJECT]:
pass

0 comments on commit 3bb5f37

Please sign in to comment.