Skip to content

Commit

Permalink
community, genai, vertexai[major]: release 2.0 (langchain-ai#489)
Browse files (browse the repository at this point in the history)
* update deps

* delete check_pydantic script

* to_pydantic_2

* model_before_rewrite

* model_after_rewrite

* Self

* format

* clean up

* model_before_rewrite

* change VertexAI.validate_environment to pre

* lint

* update chat and embeddings validation to pre

* update some features to pydantic 2

* remove unused type ignores

* fix validate_environment in llm and chat models

* more validation updates

* change maas model garden validation to post

* add protected namespaces to embeddings

* fix embeddings init

* update docstrings

* delete check_pydantic script

* update dependencies

* to_pydantic_2

* model_after_rewrite

* Self

* revert change to _genai_extension

* format

* remove check_pydantic from Makefile

* remove unused imports

* fix

* upgrade mypy

* fix type hints

* update serialization test

* add test_watch to makefile

* merge

* add snapshots

* schema -> model_json_schema

* update _format_json_schema_to_gapic

* support v1 function

* add test for union types

* add integration test workflow

* Revert "add integration test workflow"

This reverts commit 2589fd6.

* lock

* infra: remove pydantic compatibility checks

* delete check_pydantic script

* extra_migrate

* to_pydantic_2

* model_before_rewrite

* model_after_rewrite

* Self

* format

* update deps

* change some post validators to pre

* set private attributes with PrivateAttr instead of config

* resolve lint errors

* increment version to 2.0.0.dev1

* increment version to 2.0.0.dev1

* add snapshots for serialization standard test

* bump core dep

* json_schema_extra in test

* protected namespaces

* fix warnings

* fix warnings

* fix mistral dependency

* fix warning

* increment version to 2.0.0.dev1

* fix mistral dep and lock

* update docstrings

* todo: figure out if we need google-cloud-core in package deps

* increment version to get around test.pypi

* assign missed default

* catch pydantic v2 schemas in dict (langchain-ai#488)

* catch pydantic v2 schemas in dict

* lock

* update test

* fix VertexFSVectorStore

* fix equality check in community integration tests for pydantic 2

* update genai + lock

* update vertexai + lock

* update snapshots

* update genai snapshots

* update community + lock

---------

Co-authored-by: Leonid Kuligin <[email protected]>
Co-authored-by: Bagatur <[email protected]>
  • Loading branch information
3 people authored Sep 13, 2024
1 parent 72d1b5a commit b4b3c1c
Show file tree
Hide file tree
Showing 71 changed files with 1,962 additions and 1,719 deletions.
7 changes: 0 additions & 7 deletions .github/workflows/_all_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,6 @@ jobs:
working-directory: ${{ inputs.working-directory }}
secrets: inherit

dependencies:
name: "-"
uses: ./.github/workflows/_dependencies.yml
with:
working-directory: ${{ inputs.working-directory }}
secrets: inherit

test:
name: "-"
uses: ./.github/workflows/_test.yml
Expand Down
103 changes: 0 additions & 103 deletions .github/workflows/_dependencies.yml

This file was deleted.

1 change: 0 additions & 1 deletion libs/community/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ lint_tests: PYTHON_FILES=tests
lint_tests: MYPY_CACHE=.mypy_cache_test

lint lint_diff lint_package lint_tests:
./scripts/check_pydantic.sh .
./scripts/lint_imports.sh
poetry run ruff .
poetry run ruff format $(PYTHON_FILES) --diff
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@
from langchain_community.vectorstores.utils import maximal_marginal_relevance
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import BaseModel, ConfigDict, root_validator
from langchain_core.vectorstores import VectorStore
from pydantic import BaseModel, ConfigDict, model_validator
from typing_extensions import Self

from langchain_google_community._utils import get_client_info
from langchain_google_community.bq_storage_vectorstores.utils import (
Expand Down Expand Up @@ -75,8 +76,9 @@ class BaseBigQueryVectorStore(VectorStore, BaseModel, ABC):
_logger: Any = None
_full_table_id: Optional[str] = None

class Config:
arbitrary_types_allowed = True
model_config = ConfigDict(
arbitrary_types_allowed=True,
)

@abstractmethod
def sync_data(self) -> None:
Expand Down Expand Up @@ -113,8 +115,8 @@ def _similarity_search_by_vectors_with_scores_and_embeddings(
) -> list[list[list[Any]]]:
...

@root_validator(pre=False, skip_on_failure=True)
def validate_vals(cls, values: dict) -> dict:
@model_validator(mode="after")
def validate_vals(self) -> Self:
try:
import pandas # noqa: F401
from google.cloud import bigquery # type: ignore[attr-defined]
Expand All @@ -127,41 +129,37 @@ def validate_vals(cls, values: dict) -> dict:
"Please, install feature store dependency group: "
"`pip install langchain-google-community[featurestore]`"
)
values["_logger"] = base.Logger(__name__)
values["_bq_client"] = bigquery.Client(
project=values["project_id"],
location=values["location"],
credentials=values["credentials"],
self._logger = base.Logger(__name__)
self._bq_client = bigquery.Client(
project=self.project_id,
location=self.location,
credentials=self.credentials,
client_info=get_client_info(module="bigquery-vector-search"),
)
if values["embedding_dimension"] is None:
values["embedding_dimension"] = len(values["embedding"].embed_query("test"))
full_table_id = (
f"{values['project_id']}.{values['dataset_name']}.{values['table_name']}"
)
values["_full_table_id"] = full_table_id
temp_dataset_id = f"{values['dataset_name']}_temp"
if self.embedding_dimension is None:
self.embedding_dimension = len(self.embedding.embed_query("test"))
full_table_id = f"{self.project_id}.{self.dataset_name}.{self.table_name}"
self._full_table_id = full_table_id
temp_dataset_id = f"{self.dataset_name}_temp"
if not check_bq_dataset_exists(
client=values["_bq_client"], dataset_id=values["dataset_name"]
client=self._bq_client, dataset_id=self.dataset_name
):
values["_bq_client"].create_dataset(
dataset=values["dataset_name"], exists_ok=True
)
self._bq_client.create_dataset(dataset=self.dataset_name, exists_ok=True)
if not check_bq_dataset_exists(
client=values["_bq_client"], dataset_id=temp_dataset_id
client=self._bq_client, dataset_id=temp_dataset_id
):
values["_bq_client"].create_dataset(dataset=temp_dataset_id, exists_ok=True)
self._bq_client.create_dataset(dataset=temp_dataset_id, exists_ok=True)
table_ref = bigquery.TableReference.from_string(full_table_id)
values["_bq_client"].create_table(table_ref, exists_ok=True)
values["_logger"].info(
self._bq_client.create_table(table_ref, exists_ok=True)
self._logger.info(
f"BigQuery table {full_table_id} "
f"initialized/validated as persistent storage. "
f"Access via BigQuery console:\n "
f"https://console.cloud.google.com/bigquery?project={values['project_id']}"
f"&ws=!1m5!1m4!4m3!1s{values['project_id']}!2s{values['dataset_name']}!3s"
f"{values['table_name']}"
f"https://console.cloud.google.com/bigquery?project={self.project_id}"
f"&ws=!1m5!1m4!4m3!1s{self.project_id}!2s{self.dataset_name}!3s"
f"{self.table_name}"
)
return values
return self

@property
def embeddings(self) -> Optional[Embeddings]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@
from google.api_core.exceptions import ClientError
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import root_validator
from pydantic import model_validator

if TYPE_CHECKING:
from google.cloud.bigquery.table import Table

from typing_extensions import Self

from langchain_google_community.bq_storage_vectorstores._base import (
BaseBigQueryVectorStore,
)
Expand Down Expand Up @@ -114,67 +116,67 @@ def get_documents(
docs.append(doc)
return docs

@root_validator(pre=False, skip_on_failure=True)
def initialize_bq_vector_index(cls, values: dict) -> dict:
@model_validator(mode="after")
def initialize_bq_vector_index(self) -> Self:
"""
A vector index in BigQuery table enables efficient
approximate vector search.
"""
from google.cloud import bigquery # type: ignore[attr-defined]

values["_creating_index"] = values.get("_creating_index", False)
values["_have_index"] = values.get("_have_index", False)
values["_last_index_check"] = values.get("_last_index_check", datetime.min)
self._creating_index = self._creating_index
self._have_index = self._have_index
self._last_index_check = self._last_index_check

if values.get("_have_index") or values.get("_creating_index"):
return values
if self._have_index or self._creating_index:
return self

table = values["_bq_client"].get_table(values["_full_table_id"]) # type: ignore[union-attr]
table = self._bq_client.get_table(self._full_table_id) # type: ignore[union-attr]

# Update existing table schema
schema = table.schema.copy()
if schema: ## Check if table has a schema
values["table_schema"] = {field.name: field.field_type for field in schema}
self.table_schema = {field.name: field.field_type for field in schema}

if (table.num_rows or 0) < MIN_INDEX_ROWS:
values["_logger"].debug("Not enough rows to create a vector index.")
return values
self._logger.debug("Not enough rows to create a vector index.")
return self

if datetime.utcnow() - values["_last_index_check"] < INDEX_CHECK_INTERVAL:
return values
if datetime.utcnow() - self._last_index_check < INDEX_CHECK_INTERVAL:
return self

with _vector_table_lock:
values["_last_index_check"] = datetime.utcnow()
self._last_index_check = datetime.utcnow()
# Check if index exists, create if necessary
check_query = (
f"SELECT 1 FROM `{values['project_id']}."
f"{values['dataset_name']}"
f"SELECT 1 FROM `{self.project_id}."
f"{self.dataset_name}"
".INFORMATION_SCHEMA.VECTOR_INDEXES` WHERE"
f" table_name = '{values['table_name']}'"
f" table_name = '{self.table_name}'"
)
job = values["_bq_client"].query( # type: ignore[union-attr]
job = self._bq_client.query( # type: ignore[union-attr]
check_query, api_method=bigquery.enums.QueryApiMethod.QUERY
)
if job.result().total_rows == 0:
# Need to create an index. Make it in a separate thread.
values["_logger"].debug("Trying to create a vector index.")
self._logger.debug("Trying to create a vector index.")
Thread(
target=_create_bq_index,
kwargs={
"bq_client": values["_bq_client"],
"table_name": values["table_name"],
"full_table_id": values["_full_table_id"],
"embedding_field": values["embedding_field"],
"distance_type": values["distance_type"],
"logger": values["_logger"],
"bq_client": self._bq_client,
"table_name": self.table_name,
"full_table_id": self._full_table_id,
"embedding_field": self.embedding_field,
"distance_type": self.distance_type,
"logger": self._logger,
},
daemon=True,
).start()

else:
values["_logger"].debug("Vector index already exists.")
values["_have_index"] = True
return values
self._logger.debug("Vector index already exists.")
self._have_index = True
return self

def _similarity_search_by_vectors_with_scores_and_embeddings(
self,
Expand Down Expand Up @@ -565,7 +567,9 @@ def to_vertex_fs_vector_store(self, **kwargs: Any) -> Any:
VertexFSVectorStore,
)

base_params = self.dict(include=BaseBigQueryVectorStore.__fields__.keys())
base_params = self.model_dump(
include=set(BaseBigQueryVectorStore.model_fields.keys())
)
base_params["embedding"] = self.embedding
all_params = {**base_params, **kwargs}
fs_obj = VertexFSVectorStore(**all_params)
Expand Down
Loading

0 comments on commit b4b3c1c

Please sign in to comment.