Skip to content

Commit

Permalink
test: Pytest with PySpark (#15)
Browse files Browse the repository at this point in the history
* test: Pytest

* ci: renamed workflow

* ci: release JAR

* ci: YAML formatting
  • Loading branch information
Anush008 authored Feb 29, 2024
1 parent e027c1a commit 469d62d
Show file tree
Hide file tree
Showing 12 changed files with 1,629 additions and 72 deletions.
106 changes: 53 additions & 53 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,56 +7,56 @@ jobs:
release:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: '0'

- name: Get Author Name and Email
run: |
AUTHOR_NAME=$(git log -1 --pretty=format:%an ${{ github.sha }})
AUTHOR_EMAIL=$(git log -1 --pretty=format:%ae ${{ github.sha }})
echo "AUTHOR_NAME=$AUTHOR_NAME" >> $GITHUB_OUTPUT
echo "AUTHOR_EMAIL=$AUTHOR_EMAIL" >> $GITHUB_OUTPUT
id: author_info
- name: Set up Java 8
uses: actions/setup-java@v3
with:
java-version: "8"
distribution: temurin
server-id: ossrh
server-username: OSSRH_JIRA_USERNAME
server-password: OSSRH_JIRA_PASSWORD
gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }}
gpg-passphrase: GPG_PASSPHRASE

- name: Cache local Maven repository
uses: actions/cache@v3
with:
path: ~/.m2/repository
key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
${{ runner.os }}-maven-
- uses: actions/setup-node@v4
with:
node-version: 20

- name: "🔧 setup Bun"
uses: oven-sh/setup-bun@v1

- name: Semantic Release
run: |
bun install @conveyal/maven-semantic-release semantic-release @semantic-release/git conventional-changelog-conventionalcommits
bun x semantic-release --prepare @conveyal/maven-semantic-release --publish @semantic-release/github,@conveyal/maven-semantic-release --verify-conditions @semantic-release/github,@conveyal/maven-semantic-release --verify-release @conveyal/maven-semantic-release
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GPG_KEY_NAME: ${{ secrets.GPG_KEY_NAME }}
GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
OSSRH_JIRA_USERNAME: ${{ secrets.OSSRH_JIRA_USERNAME }}
OSSRH_JIRA_PASSWORD: ${{ secrets.OSSRH_JIRA_PASSWORD }}
GIT_COMMITTER_NAME: "github-actions[bot]"
GIT_COMMITTER_EMAIL: "41898282+github-actions[bot]@users.noreply.github.com"
GIT_AUTHOR_NAME: ${{ steps.author_info.outputs.AUTHOR_NAME }}
GIT_AUTHOR_EMAIL: ${{ steps.author_info.outputs.AUTHOR_EMAIL }}
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: '0'

- name: Get Author Name and Email
run: |
AUTHOR_NAME=$(git log -1 --pretty=format:%an ${{ github.sha }})
AUTHOR_EMAIL=$(git log -1 --pretty=format:%ae ${{ github.sha }})
echo "AUTHOR_NAME=$AUTHOR_NAME" >> $GITHUB_OUTPUT
echo "AUTHOR_EMAIL=$AUTHOR_EMAIL" >> $GITHUB_OUTPUT
id: author_info

- name: Set up Java 8
uses: actions/setup-java@v3
with:
java-version: "8"
distribution: temurin
server-id: ossrh
server-username: OSSRH_JIRA_USERNAME
server-password: OSSRH_JIRA_PASSWORD
gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }}
gpg-passphrase: GPG_PASSPHRASE

- name: Cache local Maven repository
uses: actions/cache@v3
with:
path: ~/.m2/repository
key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
${{ runner.os }}-maven-
- uses: actions/setup-node@v4
with:
node-version: 20

- name: "🔧 setup Bun"
uses: oven-sh/setup-bun@v1

- name: Semantic Release
run: |
bun install @conveyal/maven-semantic-release semantic-release @semantic-release/git conventional-changelog-conventionalcommits
bun x semantic-release --prepare @conveyal/maven-semantic-release --publish @semantic-release/github,@conveyal/maven-semantic-release --verify-conditions @semantic-release/github,@conveyal/maven-semantic-release --verify-release @conveyal/maven-semantic-release
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GPG_KEY_NAME: ${{ secrets.GPG_KEY_NAME }}
GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
OSSRH_JIRA_USERNAME: ${{ secrets.OSSRH_JIRA_USERNAME }}
OSSRH_JIRA_PASSWORD: ${{ secrets.OSSRH_JIRA_PASSWORD }}
GIT_COMMITTER_NAME: "github-actions[bot]"
GIT_COMMITTER_EMAIL: "41898282+github-actions[bot]@users.noreply.github.com"
GIT_AUTHOR_NAME: ${{ steps.author_info.outputs.AUTHOR_NAME }}
GIT_AUTHOR_EMAIL: ${{ steps.author_info.outputs.AUTHOR_EMAIL }}
43 changes: 29 additions & 14 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -1,24 +1,39 @@
name: Maven Tests
name: Maven and Python Tests
"on":
pull_request:
types:
- opened
- edited
- synchronize
- reopened
- opened
- edited
- synchronize
- reopened
env:
QDRANT_URL: "${{ secrets.QDRANT_URL }}"
QDRANT_API_KEY: "${{ secrets.QDRANT_API_KEY }}"
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-java@v3
with:
java-version: "8"
distribution: temurin
- name: Run the Maven tests
run: mvn test
- name: Generate assembly fat JAR
run: mvn clean package
- uses: actions/checkout@v4
- uses: actions/setup-java@v3
with:
java-version: "8"
distribution: temurin
- name: Cache local Maven repository
uses: actions/cache@v3
with:
path: ~/.m2/repository
key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
${{ runner.os }}-maven-
- name: Run Maven tests
run: mvn test
- name: Generate JARs
run: mvn clean package -DskipTests
- uses: actions/setup-python@v4
with:
python-version: '3.11'
cache: 'pip'
- name: Install Python test dependencies
run: pip install -r src/test/python/requirements.txt
- name: Run Python tests
run: pytest
34 changes: 34 additions & 0 deletions .github/workflows/upload-binaries.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Builds the project JARs when a GitHub release is published and attaches
# target/spark-<project.version>.jar to that release.
name: Build and release JAR files

on:
  release:
    types:
      - published

jobs:
  upload-jar:
    runs-on: ubuntu-latest
    permissions:
      contents: write  # release changes require contents write

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - uses: actions/setup-java@v3
        with:
          java-version: "8"
          distribution: temurin

      - name: Generate JARs
        run: mvn clean package -DskipTests

      # Exported through GITHUB_ENV so later steps see PROJECT_VERSION as a
      # regular shell environment variable.
      - name: Set project version env variable
        run: |
          echo "PROJECT_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)" >> "$GITHUB_ENV"

      # The tag name is passed through the environment instead of being
      # interpolated into the script text, so its content is never parsed as
      # shell syntax.
      - name: Build and upload JAR
        env:
          GH_TOKEN: ${{ github.token }}
          RELEASE_TAG: ${{ github.event.release.tag_name }}
        run: |
          gh release upload "$RELEASE_TAG" "target/spark-${PROJECT_VERSION}.jar"
8 changes: 7 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,10 @@ buildNumber.properties
# JDT-specific (Eclipse Java Development Tools)
.classpath
.vscode/
.DS_Store
.DS_Store

poetry.lock
.pytest_cache/
*_pycache__

senv
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ For use with Java and Scala projects, the package can be found [here](https://ce
<dependency>
<groupId>io.qdrant</groupId>
<artifactId>spark</artifactId>
<version>2.0</version>
<version>2.0.1</version>
</dependency>
```

Expand All @@ -43,7 +43,7 @@ from pyspark.sql import SparkSession

spark = SparkSession.builder.config(
"spark.jars",
"spark-2.0.jar", # specify the downloaded JAR file
"spark-2.0.1.jar", # specify the downloaded JAR file
)
.master("local[*]")
.appName("qdrant")
Expand Down Expand Up @@ -75,7 +75,7 @@ You can use the `qdrant-spark` connector as a library in Databricks to ingest da

- Go to the `Libraries` section in your cluster dashboard.
- Select `Install New` to open the library installation modal.
- Search for `io.qdrant:spark:2.0` in the Maven packages and click `Install`.
- Search for `io.qdrant:spark:2.0.1` in the Maven packages and click `Install`.

<img width="1064" alt="Screenshot 2024-01-05 at 17 20 01 (1)" src="https://github.com/qdrant/qdrant-spark/assets/46051506/d95773e0-c5c6-4ff2-bf50-8055bb08fd1b">

Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>io.qdrant</groupId>
<artifactId>spark</artifactId>
<version>2.0.0</version>
<version>2.0.1</version>
<name>qdrant-spark</name>
<url>https://github.com/qdrant/qdrant-spark</url>
<description>An Apache Spark connector for the Qdrant vector database</description>
Expand Down
5 changes: 5 additions & 0 deletions src/main/java/io/qdrant/spark/QdrantValueFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ public class QdrantValueFactory {
private QdrantValueFactory() {}

public static Value value(InternalRow record, StructField field, int fieldIndex) {

if (record.isNullAt(fieldIndex)) {
return nullValue();
}

DataType dataType = field.dataType();

switch (dataType.typeName()) {
Expand Down
Empty file added src/test/python/__init__.py
Empty file.
96 changes: 96 additions & 0 deletions src/test/python/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import pytest
from testcontainers.core.container import DockerContainer # type: ignore
from testcontainers.core.waiting_utils import wait_for_logs # type: ignore
from qdrant_client import QdrantClient, models
import uuid
from pyspark.sql import SparkSession
from typing import NamedTuple


# Container-side gRPC port: exposed on the test container and later resolved
# to the mapped host port when the client is created.
QDRANT_GRPC_PORT = 6334
# Size applied to every non-sparse vector configured on the test collection.
QDRANT_EMBEDDING_DIM = 6
# Distance metric applied to every non-sparse vector on the test collection.
QDRANT_DISTANCE = models.Distance.COSINE


# Bundle handed to tests by the `qdrant` fixture: the endpoint URL, the name of
# the collection created for the test, and a client connected to that instance.
Qdrant = NamedTuple(
    "Qdrant",
    [
        ("url", str),
        ("collection_name", str),
        ("client", QdrantClient),
    ],
)


# Module-level container object shared by the fixtures in this file; only the
# gRPC port is exposed.
# NOTE(review): the image reference carries no tag — consider pinning a version
# so test runs stay reproducible.
qdrant_container = DockerContainer("qdrant/qdrant").with_exposed_ports(QDRANT_GRPC_PORT)


# Reference: https://gist.github.com/dizzythinks/f3bb37fd8ab1484bfec79d39ad8a92d3
def get_pom_version(pom_path="pom.xml"):
    """Return the project version declared in a Maven POM file.

    Args:
        pom_path: Path to the POM file. Defaults to ``pom.xml``, resolved
            against the current working directory.

    Returns:
        The text of the top-level ``<version>`` element, with surrounding
        whitespace removed.

    Raises:
        ValueError: If the POM has no top-level ``<version>`` element or the
            element is empty.
    """
    from xml.etree import ElementTree as et

    ns = "http://maven.apache.org/POM/4.0.0"
    root = et.parse(pom_path).getroot()

    # Only the project's own <version> (a direct child of the root) is wanted,
    # not versions nested under <parent>, <dependencies> or <plugins>.
    version = root.find("{%s}version" % ns)
    text = (version.text or "").strip() if version is not None else ""
    if not text:
        raise ValueError(
            "No top-level <version> element found in %s" % pom_path
        )
    return text


@pytest.fixture(scope="module", autouse=True)
def setup_container(request):
    """Start the shared Qdrant container once per test module and stop it afterwards.

    Args:
        request: pytest's built-in ``request`` fixture, used to register the
            teardown callback.
    """
    qdrant_container.start()

    # Register teardown as soon as the container is running so it is stopped
    # even when the readiness wait below times out.
    request.addfinalizer(qdrant_container.stop)

    # Block until the container logs the line matched below (timeout: 60).
    wait_for_logs(
        qdrant_container, ".*Actix runtime found; starting in Actix runtime.*", 60
    )


@pytest.fixture(scope="session")
def spark_session():
    """Provide a session-scoped local SparkSession.

    The JAR at ``target/spark-<pom version>.jar`` is passed through the
    ``spark.jars`` setting, so the project must be packaged before the tests
    run.
    """
    jar_path = f"target/spark-{get_pom_version()}.jar"

    builder = SparkSession.builder.config("spark.jars", jar_path)
    builder = builder.master("local[*]").appName("qdrant")
    return builder.getOrCreate()


@pytest.fixture()
def qdrant(request) -> Qdrant:
    """Create a fresh, uniquely named collection for one test.

    The collection is configured with a vector named ``dense``, a vector under
    the empty-string name, and a sparse vector named ``sparse``. After the test
    the collection is deleted and the client is closed.

    Args:
        request: pytest's built-in ``request`` fixture, used to register the
            teardown callbacks.

    Returns:
        A ``Qdrant`` tuple holding the endpoint URL, the collection name and a
        connected client.
    """
    host = qdrant_container.get_container_host_ip()
    grpc_port = qdrant_container.get_exposed_port(QDRANT_GRPC_PORT)

    client = QdrantClient(
        host=host,
        grpc_port=grpc_port,
        prefer_grpc=True,
    )
    # Registered before any request is made so the client is released even if
    # collection creation fails.
    request.addfinalizer(client.close)

    collection_name = str(uuid.uuid4())
    client.create_collection(
        collection_name=collection_name,
        vectors_config={
            "dense": models.VectorParams(
                size=QDRANT_EMBEDDING_DIM,
                distance=QDRANT_DISTANCE,
            ),
            "": models.VectorParams(
                size=QDRANT_EMBEDDING_DIM,
                distance=QDRANT_DISTANCE,
            ),
        },
        sparse_vectors_config={
            "sparse": models.SparseVectorParams(),
        },
    )

    def drop_collection():
        client.delete_collection(collection_name)

    # Finalizers run last-registered-first, so the collection is dropped
    # before the client is closed.
    request.addfinalizer(drop_collection)

    return Qdrant(
        url=f"http://{host}:{grpc_port}",
        client=client,
        collection_name=collection_name,
    )
4 changes: 4 additions & 0 deletions src/test/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
pyspark==3.5.1
pytest==8.0.2
qdrant-client==1.7.3
testcontainers==3.7.1
Loading

0 comments on commit 469d62d

Please sign in to comment.