Added support for search (search_after parameter) (#859)

* Added a sample that uses the search_after parameter.

  Signed-off-by: Nathalie Jonathan <[email protected]>

* Moved the search_after sample to the samples/search folder, updated CHANGELOG and the _sync sample, and added an _async sample.

  Signed-off-by: Nathalie Jonathan <[email protected]>

* Resolved conflicts in CHANGELOG.md.

  Signed-off-by: Nathalie Jonathan <[email protected]>

---------

Signed-off-by: Nathalie Jonathan <[email protected]>
opensearch-project · Dec 2, 2024 · 87aebcd · 87aebcd
1 parent 6f761ab
commit 87aebcd
Show file tree

Hide file tree

Showing 3 changed files with 222 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 ## [Unreleased]
 ### Added
 - Added option to pass custom headers to 'AWSV4SignerAsyncAuth' ([863](https://github.com/opensearch-project/opensearch-py/pull/863))
+- Added sync and async samples that use the `search_after` parameter ([859](https://github.com/opensearch-project/opensearch-py/pull/859))
 ### Updated APIs
 ### Changed
 ### Deprecated

diff --git a/samples/search/search_after_async.py b/samples/search/search_after_async.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+
+# SPDX-License-Identifier: Apache-2.0
+#
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
+#
+# Modifications Copyright OpenSearch Contributors. See
+# GitHub history for details.
+
+import asyncio
+import os
+
+from opensearchpy import AsyncOpenSearch
+
+
+async def main() -> None:
+    """
+    This sample uses asyncio and AsyncOpenSearch to asynchronously
+    connect to local OpenSearch cluster, performs a search query on an index,
+    retrieves the first page of results, and fetches the next page of results
+    using the search_after parameter.
+    """
+
+    # connect to OpenSearch
+    host = "localhost"
+    port = 9200
+    auth = (
+        "admin",
+        os.getenv("OPENSEARCH_PASSWORD", "admin"),
+    )  # For testing only. Don't store credentials in code.
+
+    client = AsyncOpenSearch(
+        hosts=[{"host": host, "port": port}],
+        http_auth=auth,
+        use_ssl=True,
+        verify_certs=False,
+        ssl_show_warn=False,
+    )
+
+    # create an index
+    await client.indices.create(index="movies")
+
+    try:
+        # add a large dataset (100 movies)
+        for i in range(15):
+            await client.index(
+                index="movies",
+                id=i,
+                body={
+                    "title": f"The Dark Knight {i}",
+                    "director": "Christopher Nolan",
+                    "year": 2008 + i,
+                },
+            )
+
+        for i in range(95):
+            await client.index(
+                index="movies",
+                id=i + 15,
+                body={
+                    "title": f"Movie Title {i + 15}",
+                    "director": f"Director {i + 15}",
+                    "year": 1950 + i + 15,
+                },
+            )
+
+        # refresh the index to make the documents searchable
+        await client.indices.refresh(index="movies")
+
+        # define the search query with sorting and pagination options
+        search_body = {
+            "query": {"match": {"title": "dark knight"}},
+            "sort": [{"year": {"order": "asc"}}],
+            "size": 10,
+        }
+
+        page = 1
+        total_hits = 0
+        while True:
+            # execute the search
+            response = await client.search(index="movies", body=search_body)
+            hits = response["hits"]["hits"]
+
+            # break if no more results
+            if not hits:
+                break
+
+            print(f"\nPage {page}:")
+
+            for hit in hits:
+                print(hit)
+                total_hits += 1
+
+            # get the sort values of the last document for the next page
+            last_sort = hits[-1]["sort"]
+            search_body["search_after"] = last_sort
+            page += 1
+
+        print("\nPagination Summary:")
+        print(f"Total pages: {page - 1}")
+        print(f"Total hits: {total_hits}")
+        print(f"Results per page: {search_body['size']}")
+    finally:
+        # delete the index
+        await client.indices.delete(index="movies")
+        await client.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/samples/search/search_after_sync.py b/samples/search/search_after_sync.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+
+# SPDX-License-Identifier: Apache-2.0
+#
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
+#
+# Modifications Copyright OpenSearch Contributors. See
+# GitHub history for details.
+
+import os
+
+from opensearchpy import OpenSearch
+
+
+def main() -> None:
+    """
+    This sample shows how to use search_after to paginate through the search results.
+    It performs a search query on an index, retrieves the first page of results,
+    and then fetches the next page of results using the search_after parameter.
+    """
+
+    # connect to OpenSearch
+    host = "localhost"
+    port = 9200
+    auth = (
+        "admin",
+        os.getenv("OPENSEARCH_PASSWORD", "admin"),
+    )  # For testing only. Don't store credentials in code.
+
+    client = OpenSearch(
+        hosts=[{"host": host, "port": port}],
+        http_auth=auth,
+        use_ssl=True,
+        verify_certs=False,
+        ssl_show_warn=False,
+    )
+
+    # create an index
+    client.indices.create(index="movies")
+
+    try:
+        # add a large dataset (100 movies)
+        for i in range(15):
+            client.index(
+                index="movies",
+                id=i,
+                body={
+                    "title": f"The Dark Knight {i}",
+                    "director": "Christopher Nolan",
+                    "year": 2008 + i,
+                },
+            )
+
+        for i in range(95):
+            client.index(
+                index="movies",
+                id=i + 15,
+                body={
+                    "title": f"Movie Title {i + 15}",
+                    "director": f"Director {i + 15}",
+                    "year": 1950 + i + 15,
+                },
+            )
+
+        # refresh the index to make the documents searchable
+        client.indices.refresh(index="movies")
+
+        # define the search query with sorting and pagination options
+        search_body = {
+            "query": {"match": {"title": "dark knight"}},
+            "sort": [{"year": {"order": "asc"}}],
+            "size": 10,
+        }
+
+        page = 1
+        total_hits = 0
+        while True:
+            # execute the search
+            response = client.search(index="movies", body=search_body)
+            hits = response["hits"]["hits"]
+
+            # break if no more results
+            if not hits:
+                break
+
+            print(f"\nPage {page}:")
+
+            for hit in hits:
+                print(hit)
+                total_hits += 1
+
+            # get the sort values of the last document for the next page
+            last_sort = hits[-1]["sort"]
+            search_body["search_after"] = last_sort
+            page += 1
+
+        print("\nPagination Summary:")
+        print(f"Total pages: {page - 1}")
+        print(f"Total hits: {total_hits}")
+        print(f"Results per page: {search_body['size']}")
+    finally:
+        # delete the index
+        client.indices.delete(index="movies")
+
+
+if __name__ == "__main__":
+    main()