From f6f053c784971635dfa75537c773708d116622df Mon Sep 17 00:00:00 2001 From: Test User Date: Thu, 10 Aug 2023 23:18:57 +0000 Subject: [PATCH 01/10] =?UTF-8?q?fix:=20=F0=9F=90=9B=20update=20the=20Open?= =?UTF-8?q?API=20spec?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit also fix some types --- .spectral.yml | 3 + .vscode/monorepo.code-workspace | 6 +- docs/source/openapi.json | 4341 ++++++++++++-------- libs/libcommon/src/libcommon/exceptions.py | 92 +- libs/libcommon/src/libcommon/utils.py | 12 +- services/rows/src/rows/routes/rows.py | 13 +- services/worker/src/worker/dtos.py | 32 +- 7 files changed, 2806 insertions(+), 1693 deletions(-) create mode 100644 .spectral.yml diff --git a/.spectral.yml b/.spectral.yml new file mode 100644 index 0000000000..1f74425c4c --- /dev/null +++ b/.spectral.yml @@ -0,0 +1,3 @@ +extends: spectral:oas +rules: + operation-tags: off diff --git a/.vscode/monorepo.code-workspace b/.vscode/monorepo.code-workspace index 6fe77b3de5..d76454ebdc 100644 --- a/.vscode/monorepo.code-workspace +++ b/.vscode/monorepo.code-workspace @@ -67,12 +67,14 @@ "python.formatting.provider": "black", "python.linting.enabled": true, "python.linting.mypyEnabled": true, - "python.linting.flake8Enabled": true + "python.linting.flake8Enabled": true, + "spectral.rulesetFile": ".spectral.yml" }, "extensions": { "recommendations": [ "ms-python.python", - "ms-kubernetes-tools.vscode-kubernetes-tools" + "ms-kubernetes-tools.vscode-kubernetes-tools", + "stoplight.spectral" ] } } diff --git a/docs/source/openapi.json b/docs/source/openapi.json index a6c7c14b96..f270e0d02d 100644 --- a/docs/source/openapi.json +++ b/docs/source/openapi.json @@ -1,14 +1,25 @@ { - "openapi": "3.0.2", + "openapi": "3.1.0", "info": { "title": "Datasets server API", "description": "The 🤗 Datasets server API gives access to the contents, metadata and basic statistics of the Hugging Face Hub datasets.", "contact": { "name": "API Support", - "email": 
"website@huggingface.co" + "email": "website@huggingface.co", + "url": "https://github.com/huggingface/datasets-server/" }, - "version": "1.0" + "version": "1.0", + "license": { + "name": "Apache 2.0", + "url": "https://www.apache.org/licenses/LICENSE-2.0" + } }, + "tags": [ + { + "name": "datasets", + "description": "API to access datasets" + } + ], "servers": [ { "url": "https://datasets-server.huggingface.co", @@ -19,358 +30,142 @@ "headers": { "Cache-Control": { "description": "Directives that control caching in browsers and shared caches. This directive is used internally for caching the responses: the API will serve the same response until the cache has expired.", - "schema": { "type": "string" }, - "examples": { - "no-cache": { "summary": "No cache.", "value": "no-cache" }, - "max-age": { "summary": "Cache TTL.", "value": "max-age=120" } - }, - "required": true - }, - "Access-Control-Allow-Origin": { - "description": "Indicates whether the response can be shared with requesting code from the given origin.", - "schema": { "type": "string" }, - "example": "*", - "required": true - }, - "X-Error-Code-splits-401": { - "description": "A string that identifies the underlying error for 401 on /splits.", - "schema": { - "type": "string", - "enum": ["ExternalUnauthenticatedError"] - }, - "examples": { - "ExternalUnauthenticatedError": { - "summary": "The dataset does not exist, or is not accessible without authentication (private or gated). 
Please check the spelling of the dataset name or retry with authentication.", - "value": "ExternalUnauthenticatedError" - } - }, - "required": true - }, - "X-Error-Code-splits-404": { - "description": "A string that identifies the underlying error for 404 on /splits.", "schema": { - "type": "string", - "enum": [ - "ExternalAuthenticatedError", - "DatasetNotFoundError", - "SplitsResponseNotFound" - ] + "type": "string" }, "examples": { - "ExternalAuthenticatedError": { - "summary": "The dataset does not exist, or is not accessible with the current credentials (private or gated). Please check the spelling of the dataset name or retry with other authentication credentials.", - "value": "ExternalAuthenticatedError" + "no-cache": { + "summary": "No cache.", + "value": "no-cache" }, - "DatasetNotFoundError": { - "summary": "The dataset does not exist on the Hub.", - "value": "DatasetNotFoundError" - }, - "SplitsResponseNotFound": { - "summary": "Not found.", - "value": "SplitsResponseNotFound" + "max-age": { + "summary": "Cache TTL.", + "value": "max-age=120" } }, "required": true }, - "X-Error-Code-splits-422": { - "description": "A string that identifies the underlying error for 422 on /splits.", + "Access-Control-Allow-Origin": { + "description": "Indicates whether the response can be shared with requesting code from the given origin.", "schema": { - "type": "string", - "enum": ["MissingRequiredParameter"] - }, - "examples": { - "MissingRequiredParameter": { - "summary": "Parameter 'dataset' is required", - "value": "MissingRequiredParameter" - } + "type": "string" }, + "example": "*", "required": true }, - "X-Error-Code-splits-500": { - "description": "A string that identifies the underlying error for 500 on /splits.", + "X-Error-Code-401": { + "description": "A string that identifies the underlying error for 401.", "schema": { - "type": "string", - "enum": [ - "SplitsResponseNotReadyError", - "SplitsNamesError", - "UnexpectedError" + "oneOf": [ + { + "$ref": 
"#/components/schemas/X-Error-Code-ExternalUnauthenticatedError" + } ] }, - "examples": { - "SplitsResponseNotReadyError": { - "summary": "The server is busier than usual and the list of splits is not ready yet. Please retry later.", - "value": "SplitsResponseNotReadyError" - }, - "SplitsNamesError": { - "summary": "Cannot get the split names for the dataset.", - "value": "SplitsNamesError" - }, - "UnexpectedError": { - "summary": "Unexpected error.", - "value": "UnexpectedError" - } - }, - "required": true - }, - "X-Error-Code-first-rows-401": { - "description": "A string that identifies the underlying error for 401 on /first-rows.", - "schema": { - "type": "string", - "enum": ["ExternalUnauthenticatedError"] - }, - "examples": { - "ExternalUnauthenticatedError": { - "summary": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication.", - "value": "ExternalUnauthenticatedError" - } - }, "required": true }, - "X-Error-Code-first-rows-404": { - "description": "A string that identifies the underlying error for 404 on /first-rows.", + "X-Error-Code-404": { + "description": "A string that identifies the underlying error for 404.", "schema": { - "type": "string", - "enum": [ - "ExternalAuthenticatedError", - "DatasetNotFoundError", - "ConfigNotFoundError", - "SplitNotFoundError", - "FirstRowsResponseNotFound" + "oneOf": [ + { + "$ref": "#/components/schemas/X-Error-Code-ExternalAuthenticatedError" + }, + { "$ref": "#/components/schemas/X-Error-Code-ResponseNotFound" } ] }, - "examples": { - "ExternalAuthenticatedError": { - "summary": "The dataset does not exist, or is not accessible with the current credentials (private or gated). 
Please check the spelling of the dataset name or retry with other authentication credentials.", - "value": "ExternalAuthenticatedError" - }, - "DatasetNotFoundError": { - "summary": "The dataset does not exist on the Hub.", - "value": "DatasetNotFoundError" - }, - "ConfigNotFoundError": { - "summary": "config yyy does not exist for dataset xxx", - "value": "ConfigNotFoundError" - }, - "SplitNotFoundError": { - "summary": "The config or the split does not exist in the dataset", - "value": "SplitNotFoundError" - }, - "FirstRowsResponseNotFound": { - "summary": "Not found.", - "value": "FirstRowsResponseNotFound" - } - }, - "required": true - }, - "X-Error-Code-first-rows-422": { - "description": "A string that identifies the underlying error for 422 on /first-rows.", - "schema": { - "type": "string", - "enum": ["MissingRequiredParameter"] - }, - "examples": { - "MissingRequiredParameter": { - "summary": "Parameters 'dataset', 'config' and 'split' are required", - "value": "MissingRequiredParameter" - } - }, "required": true }, - "X-Error-Code-first-rows-500": { - "description": "A string that identifies the underlying error for 500 on /first-rows.", + "X-Error-Code-422": { + "description": "A string that identifies the underlying error for 422.", "schema": { - "type": "string", - "enum": [ - "FirstRowsResponseNotReady", - "InfoError", - "FeaturesError", - "StreamingRowsError", - "NormalRowsError", - "RowsPostProcessingError", - "UnexpectedError" + "oneOf": [ + { + "$ref": "#/components/schemas/X-Error-Code-MissingRequiredParameter" + } ] }, - "examples": { - "FirstRowsResponseNotReady": { - "summary": "The list of the first rows is not ready yet. 
Please retry later.", - "value": "FirstRowsResponseNotReady" - }, - "InfoError": { - "summary": "The info cannot be fetched for the config of the dataset.", - "value": "InfoError" - }, - "FeaturesError": { - "summary": "Cannot extract the features (columns) for the split of the config of the dataset.", - "value": "FeaturesError" - }, - "StreamingRowsError": { - "summary": "Cannot load the dataset split (in streaming mode) to extract the first rows.", - "value": "StreamingRowsError" - }, - "NormalRowsError": { - "summary": "Cannot load the dataset split (in normal download mode) to extract the first rows.", - "value": "NormalRowsError" - }, - "RowsPostProcessingError": { - "summary": "Server error while post-processing the split rows. Please report the issue.", - "value": "RowsPostProcessingError" - }, - "UnexpectedError": { - "summary": "Unexpected error.", - "value": "UnexpectedError" - } - }, - "required": true - }, - "X-Error-Code-rows-401": { - "description": "A string that identifies the underlying error for 401 on /rows.", - "schema": { - "type": "string", - "enum": ["ExternalUnauthenticatedError"] - }, - "examples": { - "ExternalUnauthenticatedError": { - "summary": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication.", - "value": "ExternalUnauthenticatedError" - } - }, "required": true }, - "X-Error-Code-rows-404": { - "description": "A string that identifies the underlying error for 404 on /rows.", + "X-Error-Code-500": { + "description": "A string that identifies the underlying error for 500. 
It's marked as required: false because the header can be missing on text-plain response.", "schema": { - "type": "string", - "enum": [ - "ExternalAuthenticatedError", - "DatasetNotFoundError", - "ConfigNotFoundError", - "SplitNotFoundError", - "RowsResponseNotFound" + "oneOf": [ + { + "$ref": "#/components/schemas/X-Error-Code-ResponseNotReadyError" + }, + { "$ref": "#/components/schemas/X-Error-Code-UnexpectedError" } ] }, - "examples": { - "ExternalAuthenticatedError": { - "summary": "The dataset does not exist, or is not accessible with the current credentials (private or gated). Please check the spelling of the dataset name or retry with other authentication credentials.", - "value": "ExternalAuthenticatedError" - }, - "DatasetNotFoundError": { - "summary": "The dataset does not exist on the Hub.", - "value": "DatasetNotFoundError" - }, - "ConfigNotFoundError": { - "summary": "config yyy does not exist for dataset xxx", - "value": "ConfigNotFoundError" - }, - "SplitNotFoundError": { - "summary": "The config or the split does not exist in the dataset", - "value": "SplitNotFoundError" - }, - "RowsResponseNotFound": { - "summary": "Not found.", - "value": "RowsResponseNotFound" - } - }, - "required": true - }, - "X-Error-Code-rows-422": { - "description": "A string that identifies the underlying error for 422 on /rows.", - "schema": { - "type": "string", - "enum": ["MissingRequiredParameter"] - }, - "examples": { - "MissingRequiredParameter": { - "summary": "Parameters 'dataset', 'config', 'split', 'offset' and 'length' are required", - "value": "MissingRequiredParameter" - } - }, - "required": true + "required": false }, - "X-Error-Code-rows-500": { - "description": "A string that identifies the underlying error for 500 on /first-rows.", + "X-Error-Code-500-first-rows": { + "description": "A string that identifies the underlying error for 500 on /first-rows. 
It's marked as required: false because the header can be missing on text-plain response.", "schema": { - "type": "string", - "enum": ["RowsPostProcessingError", "UnexpectedError"] - }, - "examples": { - "RowsPostProcessingError": { - "summary": "Server error while post-processing the split rows. Please report the issue.", - "value": "RowsPostProcessingError" - }, - "UnexpectedError": { - "summary": "Unexpected error.", - "value": "UnexpectedError" - } + "oneOf": [ + { + "$ref": "#/components/schemas/X-Error-Code-ResponseNotReadyError" + }, + { "$ref": "#/components/schemas/X-Error-Code-UnexpectedError" }, + { "$ref": "#/components/schemas/X-Error-Code-StreamingRowsError" } + ] }, - "required": true + "required": false }, - "X-Error-Code-valid-500": { - "description": "A string that identifies the underlying error for 500 on /valid.", + "X-Error-Code-500-is-valid": { + "description": "A string that identifies the underlying error for 500 on /is-valid. It's marked as required: false because the header can be missing on text-plain response.", "schema": { - "type": "string", - "enum": ["UnexpectedError"] - }, - "examples": { - "UnexpectedError": { - "summary": "Unexpected error.", - "value": "UnexpectedError" - } + "oneOf": [ + { "$ref": "#/components/schemas/X-Error-Code-UnexpectedError" } + ] }, "required": true }, - "X-Error-Code-is-valid-401": { - "description": "A string that identifies the underlying error for 401 on /is-valid.", + "X-Error-Code-500-common": { + "description": "A string that identifies the underlying error for 500 on /parquet. It's marked as required: false because the header can be missing on text-plain response.", "schema": { - "type": "string", - "enum": ["ExternalUnauthenticatedError"] - }, - "examples": { - "ExternalUnauthenticatedError": { - "summary": "Cannot access the route. 
Please retry with authentication.", - "value": "ExternalUnauthenticatedError" - } + "oneOf": [ + { + "$ref": "#/components/schemas/X-Error-Code-ResponseNotReadyError" + }, + { "$ref": "#/components/schemas/X-Error-Code-UnexpectedError" }, + { + "$ref": "#/components/schemas/X-Error-Code-ExternalFilesSizeRequestHTTPError" + } + ] }, "required": true }, - "X-Error-Code-is-valid-404": { - "description": "A string that identifies the underlying error for 404 on /is-valid.", + "X-Error-Code-500-rows": { + "description": "A string that identifies the underlying error for 500 on /rows. It's marked as required: false because the header can be missing on text-plain response.", "schema": { - "type": "string", - "enum": ["ExternalAuthenticatedError"] - }, - "examples": { - "ExternalAuthenticatedError": { - "summary": "Cannot access the route with the current credentials. Please retry with other authentication credentials.", - "value": "ExternalAuthenticatedError" - } + "oneOf": [ + { "$ref": "#/components/schemas/X-Error-Code-UnexpectedError" }, + { + "$ref": "#/components/schemas/X-Error-Code-RowsPostProcessingError" + } + ] }, "required": true }, - "X-Error-Code-is-valid-422": { - "description": "A string that identifies the underlying error for 422 on /is-valid.", + "X-Error-Code-500-search": { + "description": "A string that identifies the underlying error for 500 on /search. 
It's marked as required: false because the header can be missing on text-plain response.", "schema": { - "type": "string", - "enum": ["MissingRequiredParameter"] - }, - "examples": { - "MissingRequiredParameter": { - "summary": "Parameter 'dataset' is required", - "value": "MissingRequiredParameter" - } + "oneOf": [ + { "$ref": "#/components/schemas/X-Error-Code-UnexpectedError" }, + { + "$ref": "#/components/schemas/X-Error-Code-RowsPostProcessingError" + } + ] }, "required": true }, - "X-Error-Code-is-valid-500": { - "description": "A string that identifies the underlying error for 500 on /is-valid.", + "X-Error-Code-500-valid": { + "description": "A string that identifies the underlying error for 500 on /valid. It's marked as required: false because the header can be missing on text-plain response.", "schema": { - "type": "string", - "enum": ["UnexpectedError"] - }, - "examples": { - "UnexpectedError": { - "summary": "Unexpected error.", - "value": "UnexpectedError" - } + "oneOf": [ + { "$ref": "#/components/schemas/X-Error-Code-UnexpectedError" } + ] }, "required": true } @@ -380,13 +175,60 @@ "type": "string", "example": "Internal Server Error" }, + "ConfigItem": { + "type": "object", + "required": ["dataset", "config"], + "properties": { + "dataset": { + "type": "string" + }, + "config": { + "type": "string" + } + } + }, + "ConfigItems": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ConfigItem" + } + }, + "FailedConfigItem": { + "type": "object", + "required": ["dataset", "config", "error"], + "properties": { + "dataset": { + "type": "string" + }, + "config": { + "type": "string" + }, + "error": { + "type": "object" + } + } + }, + "FailedConfigItems": { + "type": "array", + "items": { + "$ref": "#/components/schemas/FailedConfigItem" + } + }, "SplitsResponse": { "type": "object", "required": ["splits"], "properties": { "splits": { "type": "array", - "items": { "$ref": "#/components/schemas/SplitItem" } + "items": { + "$ref": 
"#/components/schemas/SplitItem" + } + }, + "pending": { + "$ref": "#/components/schemas/ConfigItems" + }, + "failed": { + "$ref": "#/components/schemas/FailedConfigItems" } } }, @@ -402,12 +244,6 @@ }, "split": { "type": "string" - }, - "num_bytes": { - "type": "integer" - }, - "num_examples": { - "type": "integer" } } }, @@ -447,25 +283,36 @@ }, "features": { "type": "array", - "items": { "$ref": "#/components/schemas/FeatureItem" } + "items": { + "$ref": "#/components/schemas/FeatureItem" + } }, "rows": { "type": "array", - "items": { "$ref": "#/components/schemas/FirstRowItem" } + "items": { + "$ref": "#/components/schemas/RowItem" + } } } }, - "RowsResponse": { + "PaginatedResponse": { "type": "object", - "required": ["features", "rows"], + "required": ["features", "rows", "num_total_rows"], "properties": { "features": { "type": "array", - "items": { "$ref": "#/components/schemas/FeatureItem" } + "items": { + "$ref": "#/components/schemas/FeatureItem" + } }, "rows": { "type": "array", - "items": { "$ref": "#/components/schemas/FirstRowItem" } + "items": { + "$ref": "#/components/schemas/RowItem" + } + }, + "num_total_rows": { + "type": "integer" } } }, @@ -486,10 +333,18 @@ }, "Feature": { "oneOf": [ - { "$ref": "#/components/schemas/ValueFeature" }, - { "$ref": "#/components/schemas/ClassLabelFeature" }, - { "$ref": "#/components/schemas/ArrayXDFeature" }, - { "$ref": "#/components/schemas/TranslationFeature" }, + { + "$ref": "#/components/schemas/ValueFeature" + }, + { + "$ref": "#/components/schemas/ClassLabelFeature" + }, + { + "$ref": "#/components/schemas/ArrayXDFeature" + }, + { + "$ref": "#/components/schemas/TranslationFeature" + }, { "$ref": "#/components/schemas/TranslationVariableLanguagesFeature" }, @@ -514,11 +369,6 @@ "type": "object", "required": ["_type", "dtype"], "properties": { - "id": { - "type": "string", - "nullable": true, - "enum": [null] - }, "_type": { "type": "string", "enum": ["Value"] @@ -560,18 +410,10 @@ "type": "object", 
"required": ["_type", "names"], "properties": { - "id": { - "type": "string", - "nullable": true, - "enum": [null] - }, "_type": { "type": "string", "enum": ["ClassLabel"] }, - "num_classes": { - "type": "integer" - }, "names": { "type": "array", "items": { @@ -584,11 +426,6 @@ "type": "object", "required": ["_type", "shape"], "properties": { - "id": { - "type": "string", - "nullable": true, - "enum": [null] - }, "_type": { "type": "string", "enum": ["Array2D", "Array3D", "Array4D", "Array5D"] @@ -606,11 +443,6 @@ "type": "object", "required": ["_type", "languages"], "properties": { - "id": { - "type": "string", - "nullable": true, - "enum": [null] - }, "_type": { "type": "string", "enum": ["Translation"] @@ -627,11 +459,6 @@ "type": "object", "required": ["_type", "languages"], "properties": { - "id": { - "type": "string", - "nullable": true, - "enum": [null] - }, "_type": { "type": "string", "enum": ["TranslationVariableLanguages"] @@ -651,11 +478,6 @@ "type": "object", "required": ["_type", "feature"], "properties": { - "id": { - "type": "string", - "nullable": true, - "enum": [null] - }, "_type": { "type": "string", "enum": ["Sequence"] @@ -684,11 +506,6 @@ "type": "object", "required": ["_type", "sampling_rate"], "properties": { - "id": { - "type": "string", - "nullable": true, - "enum": [null] - }, "_type": { "type": "string", "enum": ["Audio"] @@ -708,11 +525,6 @@ "type": "object", "required": ["_type"], "properties": { - "id": { - "type": "string", - "nullable": true, - "enum": [null] - }, "_type": { "type": "string", "enum": ["Image"] @@ -722,7 +534,7 @@ } } }, - "FirstRowItem": { + "RowItem": { "type": "object", "required": ["row_idx", "row", "truncated_cells"], "properties": { @@ -737,19 +549,35 @@ }, "truncated_cells": { "type": "array", - "items": { "type": "string" } + "items": { + "type": "string" + } } } }, "Cell": { "oneOf": [ - { "$ref": "#/components/schemas/ValueCell" }, - { "$ref": "#/components/schemas/ClassLabelCell" }, - { "$ref": 
"#/components/schemas/Array2DCell" }, - { "$ref": "#/components/schemas/Array3DCell" }, - { "$ref": "#/components/schemas/Array4DCell" }, - { "$ref": "#/components/schemas/Array5DCell" }, - { "$ref": "#/components/schemas/TranslationCell" }, + { + "$ref": "#/components/schemas/ValueCell" + }, + { + "$ref": "#/components/schemas/ClassLabelCell" + }, + { + "$ref": "#/components/schemas/Array2DCell" + }, + { + "$ref": "#/components/schemas/Array3DCell" + }, + { + "$ref": "#/components/schemas/Array4DCell" + }, + { + "$ref": "#/components/schemas/Array5DCell" + }, + { + "$ref": "#/components/schemas/TranslationCell" + }, { "$ref": "#/components/schemas/TranslationVariableLanguagesCell" }, @@ -767,15 +595,26 @@ }, { "$ref": "#/components/schemas/ImageCell" + }, + { + "$ref": "#/components/schemas/NullableImagesListCell" } ] }, "ValueCell": { "oneOf": [ - { "type": "boolean" }, - { "type": "integer" }, - { "type": "number" }, - { "type": "string" } + { + "type": "boolean" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "string" + } ], "nullable": true }, @@ -835,8 +674,12 @@ }, "SequenceCell": { "oneOf": [ - { "$ref": "#/components/schemas/ListCell" }, - { "$ref": "#/components/schemas/DictionaryOfListsCell" } + { + "$ref": "#/components/schemas/ListCell" + }, + { + "$ref": "#/components/schemas/DictionaryOfListsCell" + } ] }, "ListCell": { @@ -861,6 +704,7 @@ "type": "array", "items": { "type": "object", + "required": ["src", "type"], "properties": { "src": { "type": "string", @@ -874,56 +718,112 @@ } }, "ImageCell": { + "type": "object", + "properties": { + "src": { + "type": "string", + "format": "uri" + }, + "height": { + "type": "integer" + }, + "width": { + "type": "integer" + } + }, + "required": ["src", "height", "width"] + }, + "NullableImagesListCell": { "type": "array", "items": { - "type": "object", - "properties": { - "src": { - "type": "string", - "format": "uri" - }, - "height": { - "type": "integer" + "oneOf": [ + { + "$ref": 
"#/components/schemas/ImageCell" }, - "width": { - "type": "integer" + { + "type": "null" } - } + ] } }, "ValidResponse": { "type": "object", - "required": ["preview", "viewer", "valid"], + "required": ["preview", "viewer"], "properties": { "viewer": { "type": "array", - "items": { "type": "string" } - }, + "items": { + "type": "string" + } + }, "preview": { "type": "array", - "items": { "type": "string" } + "items": { + "type": "string" + } } } }, "IsValidResponse": { "type": "object", - "required": ["preview", "viewer"], + "required": ["preview", "viewer", "search"], "properties": { "viewer": { "type": "boolean" }, "preview": { "type": "boolean" + }, + "search": { + "type": "boolean" } } }, - "ParquetFilesResponse": { + "PreviousJob": { + "type": "object", + "required": ["dataset", "config", "split", "kind"], + "properties": { + "dataset": { + "type": "string" + }, + "kind": { + "type": "string" + }, + "config": { + "type": "string" + }, + "split": { + "type": ["string", "null"] + } + } + }, + "PreviousJobs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/PreviousJob" + } + }, + "ParquetResponse": { "type": "object", - "required": ["parquet_files"], + "required": ["parquet_files", "partial"], "properties": { "parquet_files": { "type": "array", - "items": { "$ref": "#/components/schemas/SplitHubFile" } + "items": { + "$ref": "#/components/schemas/SplitHubFile" + } + }, + "features": { + "type": "object" + }, + "pending": { + "$ref": "#/components/schemas/PreviousJobs" + }, + "failed": { + "$ref": "#/components/schemas/PreviousJobs" + }, + "partial": { + "$ref": "#/components/schemas/Partial" } } }, @@ -951,6 +851,364 @@ "type": "integer" } } + }, + "InfoResponse": { + "type": "object", + "required": ["dataset_info", "partial"], + "properties": { + "dataset_info": { + "type": "object", + "description": "A dump of the DatasetInfo object from the datasets library. 
See https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.DatasetInfo. We don't describe the contents of these metadata for now." + }, + "pending": { + "$ref": "#/components/schemas/PreviousJobs" + }, + "failed": { + "$ref": "#/components/schemas/PreviousJobs" + }, + "partial": { + "$ref": "#/components/schemas/Partial" + } + } + }, + "Partial": { + "type": "boolean", + "description": "True means that the response has been computed on part of the dataset (typically the first 5GB). False means that the complete dataset was used." + }, + "DatasetSize": { + "type": "object", + "required": [ + "dataset", + "num_bytes_parquet_files", + "num_bytes_memory", + "num_rows" + ], + "properties": { + "dataset": { + "type": "string" + }, + "num_bytes_original_files": { + "type": "integer" + }, + "num_bytes_parquet_files": { + "type": "integer" + }, + "num_bytes_memory": { + "type": "integer" + }, + "num_rows": { + "type": "integer" + } + } + }, + "ConfigSize": { + "type": "object", + "required": [ + "dataset", + "config", + "num_bytes_parquet_files", + "num_bytes_memory", + "num_rows", + "num_columns" + ], + "properties": { + "dataset": { + "type": "string" + }, + "config": { + "type": "string" + }, + "num_bytes_original_files": { + "type": "integer" + }, + "num_bytes_parquet_files": { + "type": "integer" + }, + "num_bytes_memory": { + "type": "integer" + }, + "num_rows": { + "type": "integer" + }, + "num_columns": { + "type": "integer" + } + } + }, + "SplitSize": { + "type": "object", + "required": [ + "dataset", + "config", + "split", + "num_bytes_parquet_files", + "num_bytes_memory", + "num_rows", + "num_columns" + ], + "properties": { + "dataset": { + "type": "string" + }, + "config": { + "type": "string" + }, + "split": { + "type": "string" + }, + "num_bytes_parquet_files": { + "type": "integer" + }, + "num_bytes_memory": { + "type": "integer" + }, + "num_rows": { + "type": "integer" + }, + "num_columns": { + "type": "integer" + } + } + }, + 
"SizeResponse": { + "type": "object", + "required": ["size", "partial"], + "properties": { + "size": { + "type": "object", + "required": ["splits"], + "properties": { + "dataset": { + "$ref": "#/components/schemas/DatasetSize" + }, + "config": { + "$ref": "#/components/schemas/ConfigSize" + }, + "configs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ConfigSize" + } + }, + "splits": { + "type": "array", + "items": { + "$ref": "#/components/schemas/SplitSize" + } + } + } + }, + "pending": { + "$ref": "#/components/schemas/PreviousJobs" + }, + "failed": { + "$ref": "#/components/schemas/PreviousJobs" + }, + "partial": { + "$ref": "#/components/schemas/Partial" + } + } + }, + "OptInOutUrlsCountResponse": { + "type": "object", + "required": [ + "urls_columns", + "num_opt_in_urls", + "num_opt_out_urls", + "num_urls", + "num_scanned_rows", + "has_urls_columns" + ], + "properties": { + "urls_columns": { + "type": "array", + "items": { + "type": "string" + } + }, + "num_opt_in_urls": { + "type": "integer" + }, + "num_opt_out_urls": { + "type": "integer" + }, + "num_urls": { + "type": "integer" + }, + "num_scanned_rows": { + "type": "integer" + }, + "has_urls_columns": { + "type": "boolean" + }, + "full_scan": { + "type": "boolean" + } + } + }, + "ColumnType": { + "type": "string", + "enum": ["float", "int", "class_label"] + }, + "Histogram": { + "type": "object", + "required": ["hist", "bin_edges"], + "properties": { + "hist": { + "type": "array", + "items": { + "type": "integer" + } + }, + "bin_edges": { + "type": "array", + "items": { + "type": "number" + } + } + } + }, + "NumericalStatisticsItem": { + "type": "object", + "required": [ + "nan_count", + "nan_proportion", + "min", + "max", + "mean", + "median", + "std", + "histogram" + ], + "properties": { + "nan_count": { + "type": "integer" + }, + "nan_proportion": { + "type": "number" + }, + "min": { + "type": "number" + }, + "max": { + "type": "number" + }, + "mean": { + "type": "number" + }, + 
"median": { + "type": "number" + }, + "std": { + "type": "number" + }, + "histogram": { + "$ref": "#/components/schemas/Histogram" + } + } + }, + "CategoricalStatisticsItem": { + "type": "object", + "required": ["nan_count", "nan_proportion", "n_unique", "frequencies"], + "properties": { + "nan_count": { + "type": "integer" + }, + "nan_proportion": { + "type": "number" + }, + "n_unique": { + "type": "integer" + }, + "frequencies": { + "type": "object", + "additionalProperties": { + "type": "integer" + } + } + } + }, + "StatisticsPerColumnItem": { + "type": "object", + "required": ["column_name", "column_type", "column_statistics"], + "properties": { + "column_name": { + "type": "string" + }, + "column_type": { + "$ref": "#/components/schemas/ColumnType" + }, + "column_statistics": { + "oneOf": [ + { + "$ref": "#/components/schemas/NumericalStatisticsItem" + }, + { + "$ref": "#/components/schemas/CategoricalStatisticsItem" + } + ] + } + } + }, + "StatisticsResponse": { + "type": "object", + "required": ["statistics", "num_examples"], + "properties": { + "statistics": { + "type": "array", + "items": { + "$ref": "#/components/schemas/StatisticsPerColumnItem" + } + }, + "num_examples": { + "type": "integer" + } + } + }, + "X-Error-Code-ExternalAuthenticatedError": { + "type": "string", + "const": "ExternalAuthenticatedError", + "description": "Raised when the external authentication check failed while the user was authenticated. Even if the external authentication server returns 403 in that case, we return 404 because we don't know if the dataset exists or not. It's also coherent with how the Hugging Face Hub works." + }, + "X-Error-Code-ExternalFilesSizeRequestHTTPError": { + "type": "string", + "const": "ExternalFilesSizeRequestHTTPError", + "description": "We failed to get the size of the external files."
+ }, + "X-Error-Code-ExternalUnauthenticatedError": { + "type": "string", + "const": "ExternalUnauthenticatedError", + "description": "The external authentication check failed while the user was unauthenticated." + }, + "X-Error-Code-ResponseNotFound": { + "type": "string", + "const": "ResponseNotFound", + "description": "Raised when the response has not been found." + }, + "X-Error-Code-MissingRequiredParameter": { + "type": "string", + "const": "MissingRequiredParameter", + "description": "A required parameter is missing." + }, + "X-Error-Code-ResponseNotReadyError": { + "type": "string", + "const": "ResponseNotReadyError", + "description": "The response has not been processed yet." + }, + "X-Error-Code-RowsPostProcessingError": { + "type": "string", + "const": "RowsPostProcessingError", + "description": "The rows could not be post-processed successfully." + }, + "X-Error-Code-StreamingRowsError": { + "type": "string", + "const": "StreamingRowsError", + "description": "The rows could not be fetched in streaming mode." + }, + "X-Error-Code-UnexpectedError": { + "type": "string", + "const": "UnexpectedError", + "description": "The job runner raised an unexpected error." } }, "securitySchemes": { @@ -966,6 +1224,394 @@ "scheme": "bearer", "bearerFormat": "A User Access Token is prefixed with `hf_`, while an Organization API token is prefixed with `api_org_`." } + }, + "examples": { + "InexistentConfigError": { + "summary": "The response is not found because the config does not exist.", + "description": "try with config=inexistent-config.", + "value": { + "error": "Not found." + } + }, + "InexistentDatasetError": { + "summary": "The dataset does not exist.", + "description": "try with dataset=inexistent-dataset.", + "value": { + "error": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication." 
+ } + }, + "InexistentSplitError": { + "summary": "The response is not found because the split does not exist.", + "description": "try with split=inexistent-split.", + "value": { + "error": "Not found." + } + }, + "UnauthorizedPrivateDatasetError": { + "summary": "The dataset is private, and you are not authorized.", + "description": "try with dataset=severo/test_private.", + "value": { + "error": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication." + } + }, + "AuthorizedPrivateDatasetError": { + "summary": "The dataset is private, and you are authorized, but private datasets are not supported yet.", + "description": "try with dataset=severo/test_private.", + "value": { + "error": "Not found." + } + }, + "UnauthorizedGatedDatasetError": { + "summary": "The dataset is public but gated, and you are not authenticated or authorized.", + "description": "try with dataset=severo/test_gated.", + "value": { + "error": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication."
+ } + }, + "MissingDatasetParameterError": { + "summary": "The dataset parameter is missing.", + "description": "try without setting ?dataset", + "value": { + "error": "Parameter 'dataset' is required" + } + }, + "EmptyDatasetParameterError": { + "summary": "The dataset parameter is empty.", + "description": "try with ?dataset=", + "value": { + "error": "Parameter 'dataset' is required" + } + }, + "MissingDatasetConfigSplitParameterError": { + "summary": "One of the dataset, config or split parameters is missing.", + "description": "try without setting ?dataset", + "value": { + "error": "Parameters 'split', 'config' and 'dataset' are required" + } + }, + "EmptyDatasetConfigSplitParameterError": { + "summary": "One of the dataset, config or split parameters is empty.", + "description": "try with ?dataset=", + "value": { + "error": "Parameters 'split', 'config' and 'dataset' are required" + } + }, + "ResponseNotReadyError": { + "summary": "The response is not ready yet. You can retry later. The response header 'x-error-code' contains 'ResponseNotReady'.", + "description": "Create a new dataset and try immediately, before the response could be generated.", + "value": { + "error": "The server is busier than usual and the response is not ready yet. Please retry later." + } + }, + "UnexpectedJsonError": { + "summary": "The server encountered an unexpected error", + "description": "This error indicates a bug in the code or a failure in the infrastructure. It can be reported to https://github.com/huggingface/datasets-server/issues.", + "value": { + "error": "Unexpected error." + } + }, + "UnexpectedTextError": { + "summary": "The server encountered an unexpected error", + "description": "This error indicates a bug in the code or a failure in the infrastructure. It can be reported to https://github.com/huggingface/datasets-server/issues.", + "value": "Internal Server Error." 
+ } + }, + "parameters": { + "RequiredDataset": { + "name": "dataset", + "in": "query", + "description": "The identifier of the dataset on the Hub.", + "required": true, + "schema": { + "type": "string" + }, + "examples": { + "glue": { + "summary": "A canonical dataset", + "value": "glue" + }, + "Helsinki-NLP/tatoeba_mt": { + "summary": "A namespaced dataset", + "value": "Helsinki-NLP/tatoeba_mt" + } + } + }, + "RequiredConfig": { + "name": "config", + "in": "query", + "description": "The dataset configuration (or subset).", + "required": true, + "schema": { + "type": "string" + }, + "examples": { + "cola": { + "summary": "A subset of the glue dataset", + "value": "cola" + }, + "yangdong/ecqa": { + "summary": "The default configuration given by the 🤗 Datasets library", + "value": "yangdong--ecqa" + } + } + }, + "RequiredSplit": { + "name": "split", + "in": "query", + "description": "The split name.", + "required": true, + "schema": { + "type": "string" + }, + "examples": { + "train": { + "summary": "train split", + "value": "train" + }, + "test": { + "summary": "test split", + "value": "test" + }, + "validation": { + "summary": "validation split", + "value": "validation" + } + } + }, + "OptionalConfig": { + "name": "config", + "in": "query", + "description": "The dataset configuration (or subset) on which to filter the response.", + "schema": { + "type": "string" + }, + "examples": { + "cola": { + "summary": "A subset of the glue dataset", + "value": "cola" + }, + "yangdong/ecqa": { + "summary": "The default configuration given by the 🤗 Datasets library", + "value": "yangdong--ecqa" + } + } + }, + "OptionalSplit": { + "name": "split", + "in": "query", + "description": "The split name.", + "schema": { + "type": "string" + }, + "examples": { + "train": { + "summary": "train split", + "value": "train" + }, + "test": { + "summary": "test split", + "value": "test" + }, + "validation": { + "summary": "validation split", + "value": "validation" + } + } + } + }, + 
"responses": { + "Common401": { + "description": "If the external authentication step on the Hugging Face Hub failed, and no authentication mechanism has been provided. Retry with authentication.", + "headers": { + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, + "Access-Control-Allow-Origin": { + "$ref": "#/components/headers/Access-Control-Allow-Origin" + }, + "X-Error-Code": { + "$ref": "#/components/headers/X-Error-Code-401" + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CustomError" + }, + "examples": { + "inexistent dataset, and not authenticated": { + "$ref": "#/components/examples/InexistentDatasetError" + }, + "private dataset, and not authenticated or authorized": { + "$ref": "#/components/examples/UnauthorizedPrivateDatasetError" + } + } + } + } + }, + "Dataset404": { + "description": "If the repository to download from cannot be found. This may be because it doesn't exist, or because it is set to `private` and you do not have access.", + "headers": { + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, + "Access-Control-Allow-Origin": { + "$ref": "#/components/headers/Access-Control-Allow-Origin" + }, + "X-Error-Code": { + "$ref": "#/components/headers/X-Error-Code-404" + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CustomError" + }, + "examples": { + "inexistent dataset, while authenticated": { + "$ref": "#/components/examples/InexistentDatasetError" + }, + "private dataset, while authenticated and authorized": { + "$ref": "#/components/examples/AuthorizedPrivateDatasetError" + }, + "gated dataset, and not authenticated or authorized": { + "$ref": "#/components/examples/UnauthorizedGatedDatasetError" + } + } + } + } + }, + "Dataset422": { + "description": "The `dataset` parameter has not been provided.", + "headers": { + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, + 
"Access-Control-Allow-Origin": { + "$ref": "#/components/headers/Access-Control-Allow-Origin" + }, + "X-Error-Code": { + "$ref": "#/components/headers/X-Error-Code-422" + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CustomError" + }, + "examples": { + "missing dataset parameter": { + "$ref": "#/components/examples/MissingDatasetParameterError" + }, + "empty dataset parameter": { + "$ref": "#/components/examples/EmptyDatasetParameterError" + } + } + } + } + }, + "DatasetConfig404": { + "description": "If the repository to download from cannot be found. This may be because it doesn't exist, or because it is set to `private` and you do not have access.", + "headers": { + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, + "Access-Control-Allow-Origin": { + "$ref": "#/components/headers/Access-Control-Allow-Origin" + }, + "X-Error-Code": { + "$ref": "#/components/headers/X-Error-Code-404" + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CustomError" + }, + "examples": { + "inexistent dataset, while authenticated": { + "$ref": "#/components/examples/InexistentDatasetError" + }, + "private dataset, while authenticated and authorized": { + "$ref": "#/components/examples/AuthorizedPrivateDatasetError" + }, + "gated dataset, and not authenticated or authorized": { + "$ref": "#/components/examples/UnauthorizedGatedDatasetError" + }, + "inexistent config": { + "$ref": "#/components/examples/InexistentConfigError" + } + } + } + } + }, + "DatasetConfigSplit404": { + "description": "If the repository to download from cannot be found, or if the config or split does not exist in the dataset. 
Note that this may be because the dataset doesn't exist, or because it is set to `private` and you do not have access.", + "headers": { + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, + "Access-Control-Allow-Origin": { + "$ref": "#/components/headers/Access-Control-Allow-Origin" + }, + "X-Error-Code": { + "$ref": "#/components/headers/X-Error-Code-404" + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CustomError" + }, + "examples": { + "inexistent dataset, while authenticated": { + "$ref": "#/components/examples/InexistentDatasetError" + }, + "private dataset, while authenticated and authorized": { + "$ref": "#/components/examples/AuthorizedPrivateDatasetError" + }, + "gated dataset, and not authenticated or authorized": { + "$ref": "#/components/examples/UnauthorizedGatedDatasetError" + }, + "inexistent config": { + "$ref": "#/components/examples/InexistentConfigError" + }, + "inexistent split": { + "$ref": "#/components/examples/InexistentSplitError" + } + } + } + } + }, + "DatasetConfigSplit422": { + "description": "Some of the `dataset`, `config`, or `split` parameters have not been provided or are invalid.", + "headers": { + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, + "Access-Control-Allow-Origin": { + "$ref": "#/components/headers/Access-Control-Allow-Origin" + }, + "X-Error-Code": { + "$ref": "#/components/headers/X-Error-Code-422" + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CustomError" + }, + "examples": { + "missing required parameter": { + "$ref": "#/components/examples/MissingDatasetConfigSplitParameterError" + }, + "empty required parameter": { + "$ref": "#/components/examples/EmptyDatasetConfigSplitParameterError" + } + } + } + } + } } }, "paths": { @@ -989,25 +1635,19 @@ ], "parameters": [ { - "name": "dataset", - "in": "query", - "description": "The identifier of the dataset on the Hub.", - "required": 
true, - "schema": { "type": "string" }, - "examples": { - "glue": { "summary": "a canonical dataset", "value": "glue" }, - "Helsinki-NLP/tatoeba_mt": { - "summary": "a namespaced dataset", - "value": "Helsinki-NLP/tatoeba_mt" - } - } + "$ref": "#/components/parameters/RequiredDataset" + }, + { + "$ref": "#/components/parameters/OptionalConfig" } ], "responses": { "200": { "description": "A list of splits.
Beware: the response is not paginated.", "headers": { - "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" } @@ -1018,79 +1658,55 @@ "$ref": "#/components/schemas/SplitsResponse" }, "examples": { - "duorc": { + "all splits in a dataset": { "summary": "duorc: two configs, six splits", + "description": "try it at https://datasets-server.huggingface.co/splits?dataset=duorc.", "value": { "splits": [ { "dataset": "duorc", - "config": "SelfRC", - "split": "train", - "num_bytes": 239852925, - "num_examples": 60721 - }, - { - "dataset": "duorc", - "config": "SelfRC", - "split": "validation", - "num_bytes": 51662575, - "num_examples": 12961 + "config": "ParaphraseRC", + "split": "train" }, { "dataset": "duorc", - "config": "SelfRC", - "split": "test", - "num_bytes": 49142766, - "num_examples": 12559 + "config": "ParaphraseRC", + "split": "validation" }, { "dataset": "duorc", "config": "ParaphraseRC", - "split": "train", - "num_bytes": 496683105, - "num_examples": 69524 + "split": "test" }, { "dataset": "duorc", - "config": "ParaphraseRC", - "split": "validation", - "num_bytes": 106510545, - "num_examples": 15591 + "config": "SelfRC", + "split": "train" }, { "dataset": "duorc", - "config": "ParaphraseRC", - "split": "test", - "num_bytes": 115215816, - "num_examples": 15857 + "config": "SelfRC", + "split": "validation" + }, + { + "dataset": "duorc", + "config": "SelfRC", + "split": "test" } - ] + ], + "pending": [], + "failed": [] } }, - "emotion": { - "summary": "emotion: one config, three splits", + "splits for a single config": { + "summary": "emotion has two configs. 
Setting config=unsplit only returns the splits for this config.", + "description": "try it at https://datasets-server.huggingface.co/splits?dataset=emotion&config=unsplit.", "value": { "splits": [ { "dataset": "emotion", - "config": "default", - "split": "train", - "num_bytes": 1741541, - "num_examples": 16000 - }, - { - "dataset": "emotion", - "config": "default", - "split": "validation", - "num_bytes": 214699, - "num_examples": 2000 - }, - { - "dataset": "emotion", - "config": "default", - "split": "test", - "num_bytes": 217177, - "num_examples": 2000 + "config": "unsplit", + "split": "train" } ] } @@ -1100,117 +1716,13 @@ } }, "401": { - "description": "If the external authentication step on the Hugging Face Hub failed, and no authentication mechanism has been provided. Retry with authentication.", - "headers": { - "Cache-Control": { - "$ref": "#/components/headers/Cache-Control" - }, - "Access-Control-Allow-Origin": { - "$ref": "#/components/headers/Access-Control-Allow-Origin" - }, - "X-Error-Code": { - "$ref": "#/components/headers/X-Error-Code-splits-401" - } - }, - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CustomError" - }, - "examples": { - "inexistent-dataset": { - "summary": "The dataset does not exist.", - "value": { - "error": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication." - } - }, - "gated-dataset": { - "summary": "The dataset is gated.", - "value": { - "error": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication." - } - }, - "private-dataset": { - "summary": "The dataset is private.", - "value": { - "error": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication." 
- } - } - } - } - } + "$ref": "#/components/responses/Common401" }, "404": { - "description": "If the repository to download from cannot be found. This may be because it doesn't exist, or because it is set to `private` and you do not have access.", - "headers": { - "Cache-Control": { - "$ref": "#/components/headers/Cache-Control" - }, - "Access-Control-Allow-Origin": { - "$ref": "#/components/headers/Access-Control-Allow-Origin" - }, - "X-Error-Code": { - "$ref": "#/components/headers/X-Error-Code-splits-404" - } - }, - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CustomError" - }, - "examples": { - "inexistent-dataset": { - "summary": "The dataset does not exist, while authentication was provided in the request.", - "value": { - "error": "The dataset does not exist, or is not accessible with the current credentials (private or gated). Please check the spelling of the dataset name or retry with other authentication credentials." - } - }, - "gated-dataset": { - "summary": "The dataset is private, while authentication was provided in the request.", - "value": { - "error": "The dataset does not exist, or is not accessible with the current credentials (private or gated). Please check the spelling of the dataset name or retry with other authentication credentials." - } - }, - "private-dataset": { - "summary": "The dataset is private, while authentication was provided in the request.", - "value": { - "error": "The dataset does not exist, or is not accessible with the current credentials (private or gated). Please check the spelling of the dataset name or retry with other authentication credentials." 
- } - } - } - } - } + "$ref": "#/components/responses/DatasetConfig404" }, "422": { - "description": "The `dataset` parameter has not been provided.", - "headers": { - "Cache-Control": { - "$ref": "#/components/headers/Cache-Control" - }, - "Access-Control-Allow-Origin": { - "$ref": "#/components/headers/Access-Control-Allow-Origin" - }, - "X-Error-Code": { - "$ref": "#/components/headers/X-Error-Code-splits-422" - } - }, - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CustomError" - }, - "examples": { - "missing-parameter": { - "summary": "The dataset parameter is missing.", - "value": { "error": "Parameter 'dataset' is required" } - }, - "empty-parameter": { - "summary": "The dataset parameter is empty (?dataset=).", - "value": { "error": "Parameter 'dataset' is required" } - } - } - } - } + "$ref": "#/components/responses/Dataset422" }, "500": { "description": "The server crashed, the response still hasn't been generated (the process is asynchronous), or the response couldn't be generated successfully due to an error in the dataset itself. The client can retry after a time, in particular in the case of the response still being processed. 
If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.", @@ -1222,7 +1734,7 @@ "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { - "$ref": "#/components/headers/X-Error-Code-splits-500" + "$ref": "#/components/headers/X-Error-Code-500" } }, "content": { @@ -1231,54 +1743,30 @@ "$ref": "#/components/schemas/CustomError" }, "examples": { - "SplitsNotFoundError": { - "summary": "SplitsNotFoundError", - "value": { - "error": "Cannot get the split names for the dataset.", - "cause_exception": "SplitsNotFoundError", - "cause_message": "The split names could not be parsed from the dataset config.", - "cause_traceback": [ - "Traceback (most recent call last):\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/inspect.py\", line 354, in get_dataset_config_info\n for split_generator in builder._split_generators(\n", - "TypeError: _split_generators() missing 1 required positional argument: 'pipeline'\n", - "\nThe above exception was the direct cause of the following exception:\n\n", - "Traceback (most recent call last):\n", - " File \"/src/services/worker/src/worker/responses/splits.py\", line 74, in get_splits_response\n split_full_names = get_dataset_split_full_names(dataset, hf_token)\n", - " File \"/src/services/worker/src/worker/responses/splits.py\", line 35, in get_dataset_split_full_names\n return [\n", - " File \"/src/services/worker/src/worker/responses/splits.py\", line 38, in \n for split in get_dataset_split_names(dataset, config, token=hf_token)\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/inspect.py\", line 404, in get_dataset_split_names\n info = get_dataset_config_info(\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/inspect.py\", line 359, in get_dataset_config_info\n raise SplitsNotFoundError(\"The split names could not be parsed from the dataset config.\") from err\n", - 
"datasets.inspect.SplitsNotFoundError: The split names could not be parsed from the dataset config.\n" - ] - } - }, - "FileNotFoundError": { - "summary": "FileNotFoundError", + "error in the dataset itself": { + "summary": "The dataset is empty, or a file is missing, or some other error that prevents the response to be created.", + "description": "Try with dataset=severo/empty", "value": { - "error": "Cannot get the split names for the dataset.", - "cause_exception": "FileNotFoundError", - "cause_message": "Couldn't find a dataset script at /src/services/worker/akhaliq/test/test.py or any data file in the same directory. Couldn't find 'akhaliq/test' on the Hugging Face Hub either: FileNotFoundError: The dataset repository at 'akhaliq/test' doesn't contain any data file.", + "error": "The dataset is empty.", + "cause_exception": "EmptyDatasetError", + "cause_message": "The directory at hf://datasets/severo/empty@5db043c2aee5fe0f2118c134de45f7b2e3230fbc doesn't contain any data files", "cause_traceback": [ "Traceback (most recent call last):\n", - " File \"/src/services/worker/src/worker/responses/splits.py\", line 74, in get_splits_response\n split_full_names = get_dataset_split_full_names(dataset, hf_token)\n", - " File \"/src/services/worker/src/worker/responses/splits.py\", line 37, in get_dataset_split_full_names\n for config in get_dataset_config_names(dataset, token=hf_token)\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/inspect.py\", line 289, in get_dataset_config_names\n dataset_module = dataset_module_factory(\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/load.py\", line 1243, in dataset_module_factory\n raise FileNotFoundError(\n", - "FileNotFoundError: Couldn't find a dataset script at /src/services/worker/akhaliq/test/test.py or any data file in the same directory. 
Couldn't find 'akhaliq/test' on the Hugging Face Hub either: FileNotFoundError: The dataset repository at 'akhaliq/test' doesn't contain any data file.\n" + " File \"/src/services/worker/src/worker/job_runners/dataset/config_names.py\", line 56, in compute_config_names_response\n for config in sorted(get_dataset_config_names(path=dataset, use_auth_token=use_auth_token))\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/inspect.py\", line 351, in get_dataset_config_names\n dataset_module = dataset_module_factory(\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/load.py\", line 1486, in dataset_module_factory\n raise e1 from None\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/load.py\", line 1469, in dataset_module_factory\n return HubDatasetModuleFactoryWithoutScript(\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/load.py\", line 1032, in get_module\n else get_data_patterns(base_path, download_config=self.download_config)\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/data_files.py\", line 459, in get_data_patterns\n raise EmptyDatasetError(f\"The directory at {base_path} doesn't contain any data files\") from None\n", + "datasets.data_files.EmptyDatasetError: The directory at hf://datasets/severo/empty@5db043c2aee5fe0f2118c134de45f7b2e3230fbc doesn't contain any data files\n" ] } }, - "not-ready": { - "summary": "the response is not ready yet.", - "value": { - "error": "The server is busier than usual and the list of splits is not ready yet. Please retry later." - } + "response not ready": { + "$ref": "#/components/examples/ResponseNotReadyError" }, - "internal": { - "summary": "internal error", - "value": { - "error": "Unexpected error." 
- } + "unexpected error": { + "$ref": "#/components/examples/UnexpectedJsonError" } } }, @@ -1287,16 +1775,16 @@ "$ref": "#/components/schemas/ServerErrorResponse" }, "examples": { - "internal": { - "summary": "internal error", - "value": { - "error": "Internal Server Error" - } + "internal server error": { + "$ref": "#/components/examples/UnexpectedTextError" } } } } } + }, + "requestBody": { + "content": {} } } }, @@ -1320,56 +1808,13 @@ ], "parameters": [ { - "name": "dataset", - "in": "query", - "description": "The identifier of the dataset on the Hub.", - "required": true, - "schema": { "type": "string" }, - "examples": { - "glue": { "summary": "a canonical dataset", "value": "glue" }, - "Helsinki-NLP/tatoeba_mt": { - "summary": "a namespaced dataset", - "value": "Helsinki-NLP/tatoeba_mt" - } - } + "$ref": "#/components/parameters/RequiredDataset" }, { - "name": "config", - "in": "query", - "description": "The dataset configuration (or subset).", - "required": true, - "schema": { "type": "string" }, - "examples": { - "cola": { - "summary": "a subset of the glue dataset", - "value": "cola" - }, - "yangdong/ecqa": { - "summary": "the default configuration given by the 🤗 Datasets library", - "value": "yangdong--ecqa" - } - } + "$ref": "#/components/parameters/RequiredConfig" }, { - "name": "split", - "in": "query", - "description": "The split name.", - "required": true, - "schema": { "type": "string" }, - "examples": { - "train": { - "summary": "train split", - "value": "train" - }, - "test": { - "summary": "test split", - "value": "test" - }, - "validation": { - "summary": "validation split", - "value": "validation" - } - } + "$ref": "#/components/parameters/RequiredSplit" } ], "responses": { @@ -1389,8 +1834,9 @@ "$ref": "#/components/schemas/FirstRowsResponse" }, "examples": { - "imdb": { - "summary": "text, and label column (only 3 rows are shown for brevity)", + "A simple dataset (imdb) with text and label": { + "summary": "Text, and label column. 
Only 3 rows are shown for brevity.", + "description": "Try it at https://datasets-server.huggingface.co/first-rows?dataset=imdb&config=plain_text&split=train.", "value": { "dataset": "imdb", "config": "plain_text", @@ -1408,7 +1854,6 @@ "feature_idx": 1, "name": "label", "type": { - "num_classes": 2, "names": ["neg", "pos"], "_type": "ClassLabel" } @@ -1438,20 +1883,13 @@ "label": 0 }, "truncated_cells": [] - }, - { - "row_idx": 3, - "row": { - "text": "This film was probably inspired by Godard's Masculin, féminin and I urge you to see that film instead.

The film has two strong elements and those are, (1) the realistic acting (2) the impressive, undeservedly good, photo. Apart from that, what strikes me most is the endless stream of silliness. Lena Nyman has to be most annoying actress in the world. She acts so stupid and with all the nudity in this film,...it's unattractive. Comparing to Godard's film, intellectuality has been replaced with stupidity. Without going too far on this subject, I would say that follows from the difference in ideals between the French and the Swedish society.

A movie of its time, and place. 2/10.", - "label": 0 - }, - "truncated_cells": [] } ] } }, - "truncated": { - "summary": "truncated cells due to the response size (has a timestamp column)", + "Truncated cells": { + "summary": "Truncated cells due to the response size (has a timestamp column). Only 3 rows are shown for brevity.", + "description": "Try it at https://datasets-server.huggingface.co/first-rows?dataset=ett&config=m2&split=test.", "value": { "dataset": "ett", "config": "m2", @@ -1543,23 +1981,13 @@ "item_id": "OT" }, "truncated_cells": ["target", "feat_dynamic_real"] - }, - { - "row_idx": 3, - "row": { - "start": "2016-07-01T00:00:00", - "target": "[38.6619987487793,38.222999572753906,37.34400177001953,37.124000549316406,37.124000549316406,36.9039", - "feat_static_cat": [0], - "feat_dynamic_real": "[[41.130001068115234,39.62200164794922,38.86800003051758,35.518001556396484,37.52799987792969,37.611", - "item_id": "OT" - }, - "truncated_cells": ["target", "feat_dynamic_real"] } ] } }, - "image": { - "summary": "a column with images (only 3 rows are shown for brevity)", + "Image column": { + "summary": "A column with images. 
Only 3 rows are shown for brevity.", + "description": "Try it at https://datasets-server.huggingface.co/first-rows?dataset=huggan/horse2zebra&config=huggan--horse2zebra-aligned&split=train.", "value": { "dataset": "huggan/horse2zebra", "config": "huggan--horse2zebra-aligned", @@ -1585,12 +2013,12 @@ "row_idx": 0, "row": { "imageA": { - "url": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/0/imageA/image.jpg", + "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/0/imageA/image.jpg", "height": 256, "width": 256 }, "imageB": { - "url": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/0/imageB/image.jpg", + "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/0/imageB/image.jpg", "height": 256, "width": 256 } @@ -1601,12 +2029,12 @@ "row_idx": 1, "row": { "imageA": { - "url": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/1/imageA/image.jpg", + "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/1/imageA/image.jpg", "height": 256, "width": 256 }, "imageB": { - "url": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/1/imageB/image.jpg", + "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/1/imageB/image.jpg", "height": 256, "width": 256 } @@ -1617,28 +2045,12 @@ "row_idx": 2, "row": { "imageA": { - "url": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/2/imageA/image.jpg", - "height": 256, - "width": 256 - }, - "imageB": { - "url": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/2/imageB/image.jpg", - "height": 256, - 
"width": 256 - } - }, - "truncated_cells": [] - }, - { - "row_idx": 3, - "row": { - "imageA": { - "url": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/3/imageA/image.jpg", + "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/2/imageA/image.jpg", "height": 256, "width": 256 }, "imageB": { - "url": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/3/imageB/image.jpg", + "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/2/imageB/image.jpg", "height": 256, "width": 256 } @@ -1648,16 +2060,17 @@ ] } }, - "audio": { - "summary": "a column with audio files (only 3 rows are shown for brevity)", + "Audio column": { + "summary": "A column with audio files. Only 3 rows are shown for brevity.", + "description": "Try it at https://datasets-server.huggingface.co/first-rows?dataset=asapp%2Fslue&config=voxceleb&split=train.", "value": { - "dataset": "mozilla-foundation/common_voice_9_0", - "config": "en", + "dataset": "asapp/slue", + "config": "voxceleb", "split": "train", "features": [ { "feature_idx": 0, - "name": "client_id", + "name": "id", "type": { "dtype": "string", "_type": "Value" @@ -1665,81 +2078,49 @@ }, { "feature_idx": 1, - "name": "path", - "type": { - "dtype": "string", - "_type": "Value" - } - }, - { - "feature_idx": 2, "name": "audio", "type": { - "sampling_rate": 48000, + "sampling_rate": 16000, "_type": "Audio" } }, { - "feature_idx": 3, - "name": "sentence", - "type": { - "dtype": "string", - "_type": "Value" - } - }, - { - "feature_idx": 4, - "name": "up_votes", - "type": { - "dtype": "int64", - "_type": "Value" - } - }, - { - "feature_idx": 5, - "name": "down_votes", - "type": { - "dtype": "int64", - "_type": "Value" - } - }, - { - "feature_idx": 6, - "name": "age", + "feature_idx": 2, + "name": "speaker_id", "type": { "dtype": 
"string", "_type": "Value" } }, { - "feature_idx": 7, - "name": "gender", + "feature_idx": 3, + "name": "normalized_text", "type": { "dtype": "string", "_type": "Value" } }, { - "feature_idx": 8, - "name": "accent", + "feature_idx": 4, + "name": "sentiment", "type": { "dtype": "string", "_type": "Value" } }, { - "feature_idx": 9, - "name": "locale", + "feature_idx": 5, + "name": "start_second", "type": { - "dtype": "string", + "dtype": "float64", "_type": "Value" } }, { - "feature_idx": 10, - "name": "segment", + "feature_idx": 6, + "name": "end_second", "type": { - "dtype": "string", + "dtype": "float64", "_type": "Value" } } @@ -1748,236 +2129,84 @@ { "row_idx": 0, "row": { - "client_id": "04960d53cc851eeb6d93f21a09e09ab36fe16943acb226ced1211d7250ab2f1b9a1d655c1cc03d50006e396010851ad52d4c53f49dd77b080b01c4230704c68d", - "path": null, + "id": "id10059_229vKIGbxrI_00001", "audio": [ { - "src": "https://datasets-server.us.dev.moon.huggingface.tech/assets/mozilla-foundation/common_voice_9_0/--/en/train/0/audio/audio.mp3", + "src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/voxceleb/train/0/audio/audio.mp3", "type": "audio/mpeg" }, { - "src": "https://datasets-server.us.dev.moon.huggingface.tech/assets/mozilla-foundation/common_voice_9_0/--/en/train/0/audio/audio.wav", + "src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/voxceleb/train/0/audio/audio.wav", "type": "audio/wav" } ], - "sentence": "Why does Melissandre look like she wants to consume Jon Snow on the ride up the wall?", - "up_votes": 2, - "down_votes": 0, - "age": "fourties", - "gender": "male", - "accent": "United States English", - "locale": "en", - "segment": "" + "speaker_id": "id10059", + "normalized_text": "and i i don't believe in god no religion says yet i was", + "sentiment": "Neutral", + "start_second": 0, + "end_second": 4.24 }, "truncated_cells": [] }, { "row_idx": 1, "row": { - "client_id": 
"f9f1f96bae1390dfe61ff298abb90975c079e913c712d57d97307ed797469eac446abb149daaad24cacffcc24e1e3275fefeb97f977eb74ce2233e0e5c1d437e", - "path": null, - "audio": [ - { - "src": "https://datasets-server.us.dev.moon.huggingface.tech/assets/mozilla-foundation/common_voice_9_0/--/en/train/1/audio/audio.mp3", - "type": "audio/mpeg" - }, - { - "src": "https://datasets-server.us.dev.moon.huggingface.tech/assets/mozilla-foundation/common_voice_9_0/--/en/train/1/audio/audio.wav", - "type": "audio/wav" - } - ], - "sentence": "\"I'm getting them for twelve dollars a night.\"", - "up_votes": 2, - "down_votes": 0, - "age": "", - "gender": "", - "accent": "", - "locale": "en", - "segment": "" - }, - "truncated_cells": [] - }, - { - "row_idx": 2, - "row": { - "client_id": "a6c7706a220eeea7ee3687c1122fe7ac17962d2449d25b6db37cc41cdaace442683e11945b6f581e73941c3083cd4eecfafc938840459cd8c571dae7774ee687", - "path": null, + "id": "id10059_229vKIGbxrI_00002", "audio": [ - { - "src": "https://datasets-server.us.dev.moon.huggingface.tech/assets/mozilla-foundation/common_voice_9_0/--/en/train/2/audio/audio.mp3", - "type": "audio/mpeg" - }, - { - "src": "https://datasets-server.us.dev.moon.huggingface.tech/assets/mozilla-foundation/common_voice_9_0/--/en/train/2/audio/audio.wav", - "type": "audio/wav" - } - ], - "sentence": "Tower of strength", - "up_votes": 2, - "down_votes": 0, - "age": "", - "gender": "", - "accent": "", - "locale": "en", - "segment": "" - }, - "truncated_cells": [] - } - ] - } - } - } - } - } - }, - "401": { - "description": "If the external authentication step on the Hugging Face Hub failed, and no authentication mechanism has been provided. 
Retry with authentication.", - "headers": { - "Cache-Control": { - "$ref": "#/components/headers/Cache-Control" - }, - "Access-Control-Allow-Origin": { - "$ref": "#/components/headers/Access-Control-Allow-Origin" - }, - "X-Error-Code": { - "$ref": "#/components/headers/X-Error-Code-first-rows-401" - } - }, - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CustomError" - }, - "examples": { - "inexistent-dataset": { - "summary": "The dataset does not exist.", - "value": { - "error": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication." - } - }, - "gated-dataset": { - "summary": "The dataset is gated.", - "value": { - "error": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication." - } - }, - "private-dataset": { - "summary": "The dataset is private.", - "value": { - "error": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication." 
+ { + "src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/voxceleb/train/1/audio/audio.mp3", + "type": "audio/mpeg" + }, + { + "src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/voxceleb/train/1/audio/audio.wav", + "type": "audio/wav" + } + ], + "speaker_id": "id10059", + "normalized_text": "the question because of my mother till i was fourteen when i thought about it when i emerged with", + "sentiment": "Neutral", + "start_second": 0, + "end_second": 5.8 + }, + "truncated_cells": [] + }, + { + "row_idx": 2, + "row": { + "id": "id10059_229vKIGbxrI_00003", + "audio": [ + { + "src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/voxceleb/train/2/audio/audio.mp3", + "type": "audio/mpeg" + }, + { + "src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/voxceleb/train/2/audio/audio.wav", + "type": "audio/wav" + } + ], + "speaker_id": "id10059", + "normalized_text": "from my own culture things changed i i think about it a lot i value our", + "sentiment": "Neutral", + "start_second": 0, + "end_second": 5.67 + }, + "truncated_cells": [] + } + ] } } } } } }, + "401": { + "$ref": "#/components/responses/Common401" + }, "404": { - "description": "If the repository to download from cannot be found, or if the config or split does not exist in the dataset. 
Note that this may be because the dataset doesn't exist, or because it is set to `private` and you do not have access.", - "headers": { - "Cache-Control": { - "$ref": "#/components/headers/Cache-Control" - }, - "Access-Control-Allow-Origin": { - "$ref": "#/components/headers/Access-Control-Allow-Origin" - }, - "X-Error-Code": { - "$ref": "#/components/headers/X-Error-Code-first-rows-404" - } - }, - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CustomError" - }, - "examples": { - "inexistent-dataset": { - "summary": "The dataset does not exist, while authentication was provided in the request.", - "value": { - "error": "The dataset does not exist, or is not accessible with the current credentials (private or gated). Please check the spelling of the dataset name or retry with other authentication credentials." - } - }, - "gated-dataset": { - "summary": "The dataset is private, while authentication was provided in the request.", - "value": { - "error": "The dataset does not exist, or is not accessible with the current credentials (private or gated). Please check the spelling of the dataset name or retry with other authentication credentials." - } - }, - "private-dataset": { - "summary": "The dataset is private, while authentication was provided in the request.", - "value": { - "error": "The dataset does not exist, or is not accessible with the current credentials (private or gated). Please check the spelling of the dataset name or retry with other authentication credentials." - } - }, - "inexistent-config": { - "summary": "The config does not exist in the dataset.", - "value": { "error": "Not found." } - }, - "inexistent-split": { - "summary": "The soplit does not exist in the dataset.", - "value": { "error": "Not found." 
} - } - } - } - } + "$ref": "#/components/responses/DatasetConfigSplit404" }, "422": { - "description": "Some of the `dataset`, `config`, or `split` parameters have not been provided or are invalid.", - "headers": { - "Cache-Control": { - "$ref": "#/components/headers/Cache-Control" - }, - "Access-Control-Allow-Origin": { - "$ref": "#/components/headers/Access-Control-Allow-Origin" - }, - "X-Error-Code": { - "$ref": "#/components/headers/X-Error-Code-first-rows-422" - } - }, - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CustomError" - }, - "examples": { - "missing-dataset": { - "summary": "The dataset parameter is missing.", - "value": { - "error": "Parameters 'split', 'config' and 'dataset' are required" - } - }, - "missing-config": { - "summary": "The config parameter is missing.", - "value": { - "error": "Parameters 'split', 'config' and 'dataset' are required" - } - }, - "missing-split": { - "summary": "The split parameter is missing.", - "value": { - "error": "Parameters 'split', 'config' and 'dataset' are required" - } - }, - "empty-dataset": { - "summary": "The dataset parameter is empty.", - "value": { - "error": "Parameters 'split', 'config' and 'dataset' are required" - } - }, - "empty-config": { - "summary": "The config parameter is empty.", - "value": { - "error": "Parameters 'split', 'config' and 'dataset' are required" - } - }, - "empty-split": { - "summary": "The split parameter is empty.", - "value": { - "error": "Parameters 'split', 'config' and 'dataset' are required" - } - } - } - } - } + "$ref": "#/components/responses/DatasetConfigSplit422" }, "500": { "description": "The server crashed, the response still hasn't been generated (the process is asynchronous), or the response couldn't be generated successfully due to an error in the dataset itself. The client can retry after a time, in particular in the case of the response still being processed. 
If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.", @@ -1989,7 +2218,7 @@ "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { - "$ref": "#/components/headers/X-Error-Code-first-rows-500" + "$ref": "#/components/headers/X-Error-Code-500-first-rows" } }, "content": { @@ -1998,109 +2227,47 @@ "$ref": "#/components/schemas/CustomError" }, "examples": { - "NonMatchingCheckError": { - "summary": "NonMatchingCheckError", - "value": { - "error": "Cannot load the dataset split (in normal download mode) to extract the first rows.", - "cause_exception": "NonMatchingChecksumError", - "cause_message": "Checksums didn't match for dataset source files:\n['https://gitlab.com/bigirqu/ArCOV-19/-/archive/master/ArCOV-19-master.zip']", - "cause_traceback": [ - "Traceback (most recent call last):\n", - " File \"/src/services/worker/src/worker/responses/first_rows.py\", line 345, in get_first_rows_response\n rows = get_rows(\n", - " File \"/src/services/worker/src/worker/utils.py\", line 123, in decorator\n return func(*args, **kwargs)\n", - " File \"/src/services/worker/src/worker/responses/first_rows.py\", line 80, in get_rows\n rows_plus_one = list(itertools.islice(dataset, rows_max_number + 1))\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/iterable_dataset.py\", line 718, in __iter__\n for key, example in self._iter():\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/iterable_dataset.py\", line 708, in _iter\n yield from ex_iterable\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/iterable_dataset.py\", line 112, in __iter__\n yield from self.generate_examples_fn(**self.kwargs)\n", - " File \"/root/.cache/huggingface/modules/datasets_modules/datasets/ar_cov19/818d9b774f4b70542b6807e6ddb6db32c916aafeba4fbdcd228ec79d21edaeab/ar_cov19.py\", line 131, in _generate_examples\n for fname in 
sorted(glob.glob(os.path.join(data_dir, \"ArCOV-19-master/dataset/all_tweets/2020-*\"))):\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/streaming.py\", line 67, in wrapper\n return function(*args, token=token, **kwargs)\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/download/streaming_download_manager.py\", line 522, in xglob\n fs, *_ = fsspec.get_fs_token_paths(urlpath, storage_options=storage_options)\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/core.py\", line 632, in get_fs_token_paths\n fs = filesystem(protocol, **inkwargs)\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/registry.py\", line 262, in filesystem\n return cls(**storage_options)\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/spec.py\", line 76, in __call__\n obj = super().__call__(*args, **kwargs)\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/implementations/zip.py\", line 58, in __init__\n self.zip = zipfile.ZipFile(self.fo)\n", - " File \"/usr/local/lib/python3.9/zipfile.py\", line 1257, in __init__\n self._RealGetContents()\n", - " File \"/usr/local/lib/python3.9/zipfile.py\", line 1320, in _RealGetContents\n endrec = _EndRecData(fp)\n", - " File \"/usr/local/lib/python3.9/zipfile.py\", line 263, in _EndRecData\n fpin.seek(0, 2)\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/implementations/http.py\", line 684, in seek\n raise ValueError(\"Cannot seek streaming HTTP file\")\n", - "ValueError: Cannot seek streaming HTTP file\n", - "\nDuring handling of the above exception, another exception occurred:\n\n", - "Traceback (most recent call last):\n", - " File \"/src/services/worker/src/worker/responses/first_rows.py\", line 355, in get_first_rows_response\n rows = get_rows(\n", - " File \"/src/services/worker/src/worker/utils.py\", line 123, in decorator\n return func(*args, 
**kwargs)\n", - " File \"/src/services/worker/src/worker/responses/first_rows.py\", line 68, in get_rows\n dataset = load_dataset(\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/load.py\", line 1746, in load_dataset\n builder_instance.download_and_prepare(\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/builder.py\", line 704, in download_and_prepare\n self._download_and_prepare(\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/builder.py\", line 1227, in _download_and_prepare\n super()._download_and_prepare(dl_manager, verify_infos, check_duplicate_keys=verify_infos)\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/builder.py\", line 775, in _download_and_prepare\n verify_checksums(\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/utils/info_utils.py\", line 40, in verify_checksums\n raise NonMatchingChecksumError(error_msg + str(bad_urls))\n", - "datasets.utils.info_utils.NonMatchingChecksumError: Checksums didn't match for dataset source files:\n['https://gitlab.com/bigirqu/ArCOV-19/-/archive/master/ArCOV-19-master.zip']\n" - ] - } - }, - "FileNotFoundError": { - "summary": "FileNotFoundError", + "error in the dataset itself": { + "summary": "An error while processing the dataset prevents the response to be created.", + "description": "Try with /first-rows?dataset=atomic&config=atomic&split=train", "value": { - "error": "Cannot load the dataset split (in normal download mode) to extract the first rows.", + "error": "Cannot load the dataset split (in streaming mode) to extract the first rows.", "cause_exception": "FileNotFoundError", - "cause_message": "Couldn't find file at https://homes.cs.washington.edu/~msap/atomic/data/atomic_data.tgz", + "cause_message": "https://homes.cs.washington.edu/~msap/atomic/data/atomic_data.tgz", "cause_traceback": [ "Traceback (most recent call last):\n", - " File 
\"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/implementations/http.py\", line 391, in _info\n await _file_info(\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/implementations/http.py\", line 772, in _file_info\n r.raise_for_status()\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/aiohttp/client_reqrep.py\", line 1004, in raise_for_status\n raise ClientResponseError(\n", - "aiohttp.client_exceptions.ClientResponseError: 404, message='Not Found', url=URL('https://homes.cs.washington.edu/~msap/atomic/data/atomic_data.tgz')\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/implementations/http.py\", line 417, in _info\n await _file_info(\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/implementations/http.py\", line 837, in _file_info\n r.raise_for_status()\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/aiohttp/client_reqrep.py\", line 1005, in raise_for_status\n raise ClientResponseError(\n", + "aiohttp.client_exceptions.ClientResponseError: 404, message='Not Found', url=URL('https://maartensap.com/atomic/data/atomic_data.tgz')\n", "\nThe above exception was the direct cause of the following exception:\n\n", "Traceback (most recent call last):\n", - " File \"/src/services/worker/src/worker/responses/first_rows.py\", line 345, in get_first_rows_response\n rows = get_rows(\n", - " File \"/src/services/worker/src/worker/utils.py\", line 123, in decorator\n return func(*args, **kwargs)\n", - " File \"/src/services/worker/src/worker/responses/first_rows.py\", line 80, in get_rows\n rows_plus_one = list(itertools.islice(dataset, rows_max_number + 1))\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/iterable_dataset.py\", line 718, in __iter__\n for key, example in self._iter():\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/iterable_dataset.py\", line 708, in _iter\n 
yield from ex_iterable\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/iterable_dataset.py\", line 112, in __iter__\n yield from self.generate_examples_fn(**self.kwargs)\n", - " File \"/root/.cache/huggingface/modules/datasets_modules/datasets/atomic/c0f0ec7d10713c41dfc87f0cf17f936b122d22e19216051217c99134d38f6d7b/atomic.py\", line 123, in _generate_examples\n for path, f in files:\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/download/streaming_download_manager.py\", line 760, in __iter__\n yield from self.generator(*self.args, **self.kwargs)\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/download/streaming_download_manager.py\", line 787, in _iter_from_urlpath\n with xopen(urlpath, \"rb\", token=token) as f:\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/download/streaming_download_manager.py\", line 453, in xopen\n file_obj = fsspec.open(file, mode=mode, *args, **kwargs).open()\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/core.py\", line 141, in open\n out = self.__enter__()\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/core.py\", line 104, in __enter__\n f = self.fs.open(self.path, mode=mode)\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/spec.py\", line 1037, in open\n f = self._open(\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/implementations/http.py\", line 340, in _open\n size = size or self.info(path, **kwargs)[\"size\"]\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/asyn.py\", line 86, in wrapper\n return sync(self.loop, func, *args, **kwargs)\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/asyn.py\", line 66, in sync\n raise return_result\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/asyn.py\", line 26, in _runner\n 
result[0] = await coro\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/implementations/http.py\", line 404, in _info\n raise FileNotFoundError(url) from exc\n", - "FileNotFoundError: https://homes.cs.washington.edu/~msap/atomic/data/atomic_data.tgz\n", - "\nDuring handling of the above exception, another exception occurred:\n\n", - "Traceback (most recent call last):\n", - " File \"/src/services/worker/src/worker/responses/first_rows.py\", line 355, in get_first_rows_response\n rows = get_rows(\n", - " File \"/src/services/worker/src/worker/utils.py\", line 123, in decorator\n return func(*args, **kwargs)\n", - " File \"/src/services/worker/src/worker/responses/first_rows.py\", line 68, in get_rows\n dataset = load_dataset(\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/load.py\", line 1746, in load_dataset\n builder_instance.download_and_prepare(\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/builder.py\", line 704, in download_and_prepare\n self._download_and_prepare(\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/builder.py\", line 1227, in _download_and_prepare\n super()._download_and_prepare(dl_manager, verify_infos, check_duplicate_keys=verify_infos)\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/builder.py\", line 771, in _download_and_prepare\n split_generators = self._split_generators(dl_manager, **split_generators_kwargs)\n", - " File \"/root/.cache/huggingface/modules/datasets_modules/datasets/atomic/c0f0ec7d10713c41dfc87f0cf17f936b122d22e19216051217c99134d38f6d7b/atomic.py\", line 95, in _split_generators\n archive = dl_manager.download(my_urls)\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/download/download_manager.py\", line 309, in download\n downloaded_path_or_paths = map_nested(\n", - " File 
\"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/utils/py_utils.py\", line 385, in map_nested\n return function(data_struct)\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/download/download_manager.py\", line 335, in _download\n return cached_path(url_or_filename, download_config=download_config)\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/utils/file_utils.py\", line 185, in cached_path\n output_path = get_from_cache(\n", - " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/utils/file_utils.py\", line 530, in get_from_cache\n raise FileNotFoundError(f\"Couldn't find file at {url}\")\n", - "FileNotFoundError: Couldn't find file at https://homes.cs.washington.edu/~msap/atomic/data/atomic_data.tgz\n" + " File \"/src/services/worker/src/worker/utils.py\", line 363, in get_rows_or_raise\n return get_rows(\n", + " File \"/src/services/worker/src/worker/utils.py\", line 305, in decorator\n return func(*args, **kwargs)\n", + " File \"/src/services/worker/src/worker/utils.py\", line 341, in get_rows\n rows_plus_one = list(itertools.islice(ds, rows_max_number + 1))\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/iterable_dataset.py\", line 981, in __iter__\n for key, example in ex_iterable:\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/iterable_dataset.py\", line 116, in __iter__\n yield from self.generate_examples_fn(**self.kwargs)\n", + " File \"/tmp/modules-cache/datasets_modules/datasets/atomic/c0f0ec7d10713c41dfc87f0cf17f936b122d22e19216051217c99134d38f6d7b/atomic.py\", line 123, in _generate_examples\n for path, f in files:\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/download/streaming_download_manager.py\", line 866, in __iter__\n yield from self.generator(*self.args, **self.kwargs)\n", + " File 
\"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/download/streaming_download_manager.py\", line 917, in _iter_from_urlpath\n with xopen(urlpath, \"rb\", use_auth_token=use_auth_token) as f:\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/download/streaming_download_manager.py\", line 498, in xopen\n file_obj = fsspec.open(file, mode=mode, *args, **kwargs).open()\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/core.py\", line 134, in open\n return self.__enter__()\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/core.py\", line 102, in __enter__\n f = self.fs.open(self.path, mode=mode)\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/spec.py\", line 1199, in open\n f = self._open(\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/implementations/http.py\", line 356, in _open\n size = size or self.info(path, **kwargs)[\"size\"]\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/asyn.py\", line 115, in wrapper\n return sync(self.loop, func, *args, **kwargs)\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/asyn.py\", line 100, in sync\n raise return_result\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/asyn.py\", line 55, in _runner\n result[0] = await coro\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/implementations/http.py\", line 430, in _info\n raise FileNotFoundError(url) from exc\n", + "FileNotFoundError: https://homes.cs.washington.edu/~msap/atomic/data/atomic_data.tgz\n" ] } }, - "not-ready": { - "summary": "the response is not ready yet.", - "value": { - "error": "The list of the first rows is not ready yet. Please retry later." 
- } + "response not ready": { + "$ref": "#/components/examples/ResponseNotReadyError" }, - "internal": { - "summary": "internal error", - "value": { - "error": "Unexpected error." - } + "unexpected error": { + "$ref": "#/components/examples/UnexpectedJsonError" } } }, @@ -2109,11 +2276,8 @@ "$ref": "#/components/schemas/ServerErrorResponse" }, "examples": { - "internal": { - "summary": "internal error", - "value": { - "error": "Internal Server Error" - } + "internal server error": { + "$ref": "#/components/examples/UnexpectedTextError" } } } @@ -2125,7 +2289,7 @@ "/rows": { "get": { "summary": "A slice of rows of a split", - "description": "The list of rows of a dataset split at a given slice location (offset).", + "description": "The list of rows of a dataset split at a given slice location (offset). Up to 100 rows are returned, use the length parameter to get less.", "externalDocs": { "description": "See rows (Hub docs)", "url": "https://huggingface.co/docs/datasets-server/rows" @@ -2142,64 +2306,23 @@ ], "parameters": [ { - "name": "dataset", - "in": "query", - "description": "The identifier of the dataset on the Hub.", - "required": true, - "schema": { "type": "string" }, - "examples": { - "glue": { "summary": "a canonical dataset", "value": "glue" }, - "Helsinki-NLP/tatoeba_mt": { - "summary": "a namespaced dataset", - "value": "Helsinki-NLP/tatoeba_mt" - } - } + "$ref": "#/components/parameters/RequiredDataset" }, { - "name": "config", - "in": "query", - "description": "The dataset configuration (or subset).", - "required": true, - "schema": { "type": "string" }, - "examples": { - "cola": { - "summary": "a subset of the glue dataset", - "value": "cola" - }, - "yangdong/ecqa": { - "summary": "the default configuration given by the 🤗 Datasets library", - "value": "yangdong--ecqa" - } - } + "$ref": "#/components/parameters/RequiredConfig" }, { - "name": "split", - "in": "query", - "description": "The split name.", - "required": true, - "schema": { "type": 
"string" }, - "examples": { - "train": { - "summary": "train split", - "value": "train" - }, - "test": { - "summary": "test split", - "value": "test" - }, - "validation": { - "summary": "validation split", - "value": "validation" - } - } + "$ref": "#/components/parameters/RequiredSplit" }, { "name": "offset", "in": "query", "description": "The offset of the slice.", - "default": 0, - "minimum": 0, - "schema": { "type": "integer" }, + "schema": { + "type": "integer", + "default": 0, + "minimum": 0 + }, "examples": { "0": { "summary": "from the beginning", @@ -2215,10 +2338,12 @@ "name": "length", "in": "query", "description": "The length of the slice", - "default": 100, - "minimum": 0, - "maximum": 100, - "schema": { "type": "integer" }, + "schema": { + "type": "integer", + "default": 100, + "minimum": 0, + "maximum": 100 + }, "examples": { "100": { "summary": "a slice of 100 rows", @@ -2229,7 +2354,7 @@ ], "responses": { "200": { - "description": "The features, and the list of rows of the requested slice.", + "description": "The features, and the list of rows of the requested slice. 
Audio and bytes columns are not supported at the moment, and their content will be 'null'.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" @@ -2241,26 +2366,23 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/RowsResponse" + "$ref": "#/components/schemas/PaginatedResponse" }, "examples": { - "imdb": { - "summary": "text, and label column (only 4 rows are shown for brevity)", + "A slice of a simple dataset (imdb)": { + "summary": "Get a slice of length 3 from row 234 (offset=234&length=3).", + "description": "Try it at https://datasets-server.huggingface.co/rows?dataset=imdb&config=plain_text&split=train&offset=234&length=3.", "value": { "features": [ { "feature_idx": 0, "name": "text", - "type": { - "dtype": "string", - "_type": "Value" - } + "type": { "dtype": "string", "_type": "Value" } }, { "feature_idx": 1, "name": "label", "type": { - "num_classes": 2, "names": ["neg", "pos"], "_type": "ClassLabel" } @@ -2268,86 +2390,60 @@ ], "rows": [ { - "row_idx": 0, - "row": { - "text": "I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered \"controversial\" I really had to see this for myself.

The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.

What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, even then it's not shot like some cheaply made porno. While my countrymen mind find it shocking, in reality sex and nudity are a major staple in Swedish cinema. Even Ingmar Bergman, arguably their answer to good old boy John Ford, had sex scenes in his films.

I do commend the filmmakers for the fact that any sex shown in the film is shown for artistic purposes rather than just to shock people and make money to be shown in pornographic theaters in America. I AM CURIOUS-YELLOW is a good film for anyone wanting to study the meat and potatoes (no pun intended) of Swedish cinema. But really, this film doesn't have much of a plot.", - "label": 0 - }, - "truncated_cells": [] - }, - { - "row_idx": 1, + "row_idx": 234, "row": { - "text": "\"I Am Curious: Yellow\" is a risible and pretentious steaming pile. It doesn't matter what one's political views are because this film can hardly be taken seriously on any level. As for the claim that frontal male nudity is an automatic NC-17, that isn't true. I've seen R-rated films with male nudity. Granted, they only offer some fleeting views, but where are the R-rated films with gaping vulvas and flapping labia? Nowhere, because they don't exist. The same goes for those crappy cable shows: schlongs swinging in the breeze but not a clitoris in sight. And those pretentious indie movies like The Brown Bunny, in which we're treated to the site of Vincent Gallo's throbbing johnson, but not a trace of pink visible on Chloe Sevigny. Before crying (or implying) \"double-standard\" in matters of nudity, the mentally obtuse should take into account one unavoidably obvious anatomical difference between men and women: there are no genitals on display when actresses appears nude, and the same cannot be said for a man. In fact, you generally won't see female genitals in an American film in anything short of porn or explicit erotica. This alleged double-standard is less a double standard than an admittedly depressing ability to come to terms culturally with the insides of women's bodies.", + "text": "Well, you know the rest! This has to be the worst movie I've seen in a long long time. I can only imagine that Stephanie Beaham had some bills to pay when taking on this role.

The lead role is played by (to me) a complete unknown and I would imagine disappeared right back into obscurity right after this turkey.

Bruce Lee led the martial arts charge in the early 70's and since then fight scenes have to be either martial arts based or at least brutal if using street fighting techniques. This movie uses fast cuts to show off the martial arts, however, even this can't disguise the fact that the lady doesn't know how to throw a punch. An average 8 year old boy would take her apart on this showing.

Sorry, the only mystery on show here is how this didn't win the golden raspberry for its year.", "label": 0 }, "truncated_cells": [] }, { - "row_idx": 2, + "row_idx": 235, "row": { - "text": "If only to avoid making this type of film in the future. This film is interesting as an experiment but tells no cogent story.

One might feel virtuous for sitting thru it because it touches on so many IMPORTANT issues but it does so without any discernable motive. The viewer comes away with no new perspectives (unless one comes up with one while one's mind wanders, as it will invariably do during this pointless film).

One might better spend one's time staring out a window at a tree growing.

", + "text": "I'm in Iraq right now doing a job that gives plenty of time for watching movies. We also have access to plenty of pirated movies, this gem came along with 11 other movies, and this is easily the worst I've seen in a long time. I've seen a few other reviews that claim this movie doesn't take itself too seriously, but really, I think that's a cover up for the fact that its horrible. It's not tongue in cheek, the writers really thought they were improving on the movie Blade. This movie is just one notch above Vampire Assassin, which if you haven't seen, i recommend. At least that movie is so unbelievably bad that you'll laugh harder than you thought possible. This is right at that cusp of no redeeming qualities what so ever. from the bad acting, to cliché visual (ie opening credits), to the adobe premier special effects. they couldn't even get blanks for the guns, which may have to do with where the movie was filmed, but if you're going to use effects, make them close to accurate. as for the cast, it seems like they just went to a tae bo class and picked up the first not to ugly chick that walked out. Once again, like Ron Hall in Vampire Assassin, don't let stunt folk act, they can't. Also, the comment about this being a \"return of old vampire movies\"...no, it's not. This is exactly what all new vampire movies are about. Buffy the Vampire Slayer, Blade, Underworld, they're all about some super star fighting the vampires. This is the newest vampire genre, with bad blood, fake screams, and cheesy over acting. obviously anyone who wrote a good review about this is somehow connected to the movie, or friends of the cast. But what do I care, I paid 33 cents for it. Anyway, to wrap this up, someone in their first semester of film school decided to make a movie, I give them credit because it's better than I could do. Of course I also know I can't make movies so I don't try. I do know how to watch movies though. 
I work 12 hour nights, 6 days a week, I've seen several thousand in the year I've been out here and this was so bad that half way through i was hoping for a mortar attack.", "label": 0 }, "truncated_cells": [] }, { - "row_idx": 3, + "row_idx": 236, "row": { - "text": "This film was probably inspired by Godard's Masculin, féminin and I urge you to see that film instead.

The film has two strong elements and those are, (1) the realistic acting (2) the impressive, undeservedly good, photo. Apart from that, what strikes me most is the endless stream of silliness. Lena Nyman has to be most annoying actress in the world. She acts so stupid and with all the nudity in this film,...it's unattractive. Comparing to Godard's film, intellectuality has been replaced with stupidity. Without going too far on this subject, I would say that follows from the difference in ideals between the French and the Swedish society.

A movie of its time, and place. 2/10.", + "text": "\"Valentine\" is another horror movie to add to the stalk and slash movie list (think \"Halloween\", \"Friday the 13th\", \"Scream\", and \"I Know What You Did Last Summer\"). It certainly isn't as good as those movies that I have listed about, but it's better than most of the ripoffs that came out after the first \"Friday the 13th\" film. One of those films was the 1981 Canadian made \"My Bloody Valentine\", which I hated alot. \"Valentine\" is a better film than that one, but it's not saying much. The plot: a nerdy young boy is teased and pranked by a couple of his classmates at the beginning of the film. Then the film moves years later when those classmates are all grown up, then they're picked off one-by-one. The killer is presumed to be the young boy now all grown up looking for revenge. But is it him? Or could it be somebody else? \"Valentine\" has an attractive cast which includes Denise Richards, David Boreanaz, Marley Shelton, Jessica Capshaw, and Katherine Heigl. They do what they can with the material they've got, but a lackluster script doesn't really do them any justice. There are some scary moments throughout, however.

** (out of four)", "label": 0 }, "truncated_cells": [] } - ] + ], + "num_total_rows": 25000 } }, - "image": { - "summary": "a column with images (only 4 rows are shown for brevity)", + "A slice of an image dataset (huggan/horse2zebra)": { + "summary": "Get a slice of length 3 from row 234 (offset=234&length=3).", + "description": "Try it at https://datasets-server.huggingface.co/rows?dataset=huggan/horse2zebra&config=huggan--horse2zebra-aligned&split=train&offset=234&length=3.", "value": { "features": [ { "feature_idx": 0, "name": "imageA", - "type": { - "_type": "Image" - } + "type": { "_type": "Image" } }, { "feature_idx": 1, "name": "imageB", - "type": { - "_type": "Image" - } + "type": { "_type": "Image" } } ], "rows": [ { - "row_idx": 0, - "row": { - "imageA": { - "url": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/0/imageA/image.jpg", - "height": 256, - "width": 256 - }, - "imageB": { - "url": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/0/imageB/image.jpg", - "height": 256, - "width": 256 - } - }, - "truncated_cells": [] - }, - { - "row_idx": 1, + "row_idx": 234, "row": { "imageA": { - "url": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/1/imageA/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/234/imageA/image.jpg", "height": 256, "width": 256 }, "imageB": { - "url": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/1/imageB/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/234/imageB/image.jpg", "height": 256, "width": 256 } @@ -2355,15 +2451,15 @@ "truncated_cells": [] }, { - "row_idx": 2, + "row_idx": 235, "row": { "imageA": { - "url": 
"https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/2/imageA/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/235/imageA/image.jpg", "height": 256, "width": 256 }, "imageB": { - "url": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/2/imageB/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/235/imageB/image.jpg", "height": 256, "width": 256 } @@ -2371,197 +2467,108 @@ "truncated_cells": [] }, { - "row_idx": 3, + "row_idx": 236, "row": { "imageA": { - "url": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/3/imageA/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/236/imageA/image.jpg", "height": 256, "width": 256 }, "imageB": { - "url": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/3/imageB/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/huggan--horse2zebra-aligned/train/236/imageB/image.jpg", "height": 256, "width": 256 } }, "truncated_cells": [] } - ] + ], + "num_total_rows": 1334 } }, - "audio": { - "summary": "a column with audio files (only 4 rows are shown for brevity)", + "Audio is not supported at the moment (example: asapp/slue)": { + "summary": "Get a slice of length 3 from row 234 (offset=234&length=3). 
The audio column is 'null'", + "description": "Try it at https://datasets-server.huggingface.co/rows?dataset=asapp/slue&config=voxceleb&split=train&offset=234&length=3.", "value": { "features": [ { "feature_idx": 0, - "name": "client_id", - "type": { - "dtype": "string", - "_type": "Value" - } + "name": "id", + "type": { "dtype": "string", "_type": "Value" } }, { "feature_idx": 1, - "name": "path", - "type": { - "dtype": "string", - "_type": "Value" - } + "name": "audio", + "type": { "sampling_rate": 16000, "_type": "Audio" } }, { "feature_idx": 2, - "name": "audio", - "type": { - "sampling_rate": 48000, - "_type": "Audio" - } + "name": "speaker_id", + "type": { "dtype": "string", "_type": "Value" } }, { "feature_idx": 3, - "name": "sentence", - "type": { - "dtype": "string", - "_type": "Value" - } + "name": "normalized_text", + "type": { "dtype": "string", "_type": "Value" } }, { "feature_idx": 4, - "name": "up_votes", - "type": { - "dtype": "int64", - "_type": "Value" - } + "name": "sentiment", + "type": { "dtype": "string", "_type": "Value" } }, { "feature_idx": 5, - "name": "down_votes", - "type": { - "dtype": "int64", - "_type": "Value" - } + "name": "start_second", + "type": { "dtype": "float64", "_type": "Value" } }, { "feature_idx": 6, - "name": "age", - "type": { - "dtype": "string", - "_type": "Value" - } - }, - { - "feature_idx": 7, - "name": "gender", - "type": { - "dtype": "string", - "_type": "Value" - } - }, - { - "feature_idx": 8, - "name": "accent", - "type": { - "dtype": "string", - "_type": "Value" - } - }, - { - "feature_idx": 9, - "name": "locale", - "type": { - "dtype": "string", - "_type": "Value" - } - }, - { - "feature_idx": 10, - "name": "segment", - "type": { - "dtype": "string", - "_type": "Value" - } + "name": "end_second", + "type": { "dtype": "float64", "_type": "Value" } } ], - "rows": [ - { - "row_idx": 0, - "row": { - "client_id": 
"04960d53cc851eeb6d93f21a09e09ab36fe16943acb226ced1211d7250ab2f1b9a1d655c1cc03d50006e396010851ad52d4c53f49dd77b080b01c4230704c68d", - "path": null, - "audio": [ - { - "src": "https://datasets-server.us.dev.moon.huggingface.tech/cached-assets/mozilla-foundation/common_voice_9_0/--/en/train/0/audio/audio.mp3", - "type": "audio/mpeg" - }, - { - "src": "https://datasets-server.us.dev.moon.huggingface.tech/cached-assets/mozilla-foundation/common_voice_9_0/--/en/train/0/audio/audio.wav", - "type": "audio/wav" - } - ], - "sentence": "Why does Melissandre look like she wants to consume Jon Snow on the ride up the wall?", - "up_votes": 2, - "down_votes": 0, - "age": "fourties", - "gender": "male", - "accent": "United States English", - "locale": "en", - "segment": "" + "rows": [ + { + "row_idx": 234, + "row": { + "id": "id10080_Xp1eLGN_fHI_00006", + "audio": null, + "speaker_id": "id10080", + "normalized_text": "well i i wasn't boasting to what happened is that i it was it was a few years ago now i hit thirty i thought it's so mad that i don't want to reach into", + "sentiment": "Neutral", + "start_second": 0.0, + "end_second": 8.46 }, "truncated_cells": [] }, { - "row_idx": 1, + "row_idx": 235, "row": { - "client_id": "f9f1f96bae1390dfe61ff298abb90975c079e913c712d57d97307ed797469eac446abb149daaad24cacffcc24e1e3275fefeb97f977eb74ce2233e0e5c1d437e", - "path": null, - "audio": [ - { - "src": "https://datasets-server.us.dev.moon.huggingface.tech/cached-assets/mozilla-foundation/common_voice_9_0/--/en/train/1/audio/audio.mp3", - "type": "audio/mpeg" - }, - { - "src": "https://datasets-server.us.dev.moon.huggingface.tech/cached-assets/mozilla-foundation/common_voice_9_0/--/en/train/1/audio/audio.wav", - "type": "audio/wav" - } - ], - "sentence": "\"I'm getting them for twelve dollars a night.\"", - "up_votes": 2, - "down_votes": 0, - "age": "", - "gender": "", - "accent": "", - "locale": "en", - "segment": "" + "id": "id10080_Xp1eLGN_fHI_00007", + "audio": null, + "speaker_id": 
"id10080", + "normalized_text": "there's none of there's none of the kind i've had to survive in the world i've done that as a book and and a lot of people", + "sentiment": "Neutral", + "start_second": 0.0, + "end_second": 5.12 }, "truncated_cells": [] }, { - "row_idx": 2, + "row_idx": 236, "row": { - "client_id": "a6c7706a220eeea7ee3687c1122fe7ac17962d2449d25b6db37cc41cdaace442683e11945b6f581e73941c3083cd4eecfafc938840459cd8c571dae7774ee687", - "path": null, - "audio": [ - { - "src": "https://datasets-server.us.dev.moon.huggingface.tech/cached-assets/mozilla-foundation/common_voice_9_0/--/en/train/2/audio/audio.mp3", - "type": "audio/mpeg" - }, - { - "src": "https://datasets-server.us.dev.moon.huggingface.tech/cached-assets/mozilla-foundation/common_voice_9_0/--/en/train/2/audio/audio.wav", - "type": "audio/wav" - } - ], - "sentence": "Tower of strength", - "up_votes": 2, - "down_votes": 0, - "age": "", - "gender": "", - "accent": "", - "locale": "en", - "segment": "" + "id": "id10080_Xp1eLGN_fHI_00008", + "audio": null, + "speaker_id": "id10080", + "normalized_text": "but what are the things you've learned and and and nobody told me at school stuff like you know how to achieve goals how to thrive when it's difficult how to what sort of", + "sentiment": "Neutral", + "start_second": 0.0, + "end_second": 8.85 }, "truncated_cells": [] } - ] + ], + "num_total_rows": 5777 } } } @@ -2569,7 +2576,10 @@ } }, "401": { - "description": "If the external authentication step on the Hugging Face Hub failed, and no authentication mechanism has been provided. Retry with authentication.", + "$ref": "#/components/responses/Common401" + }, + "404": { + "description": "If the repository to download from cannot be found, or if the config or split does not exist in the dataset. 
Note that this may be because the dataset doesn't exist, or because it is set to `private` and you do not have access.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" @@ -2578,7 +2588,7 @@ "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { - "$ref": "#/components/headers/X-Error-Code-rows-401" + "$ref": "#/components/headers/X-Error-Code-404" } }, "content": { @@ -2587,30 +2597,32 @@ "$ref": "#/components/schemas/CustomError" }, "examples": { - "inexistent-dataset": { - "summary": "The dataset does not exist.", - "value": { - "error": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication." - } + "inexistent dataset, while authenticated": { + "$ref": "#/components/examples/InexistentDatasetError" }, - "gated-dataset": { - "summary": "The dataset is gated.", - "value": { - "error": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication." - } + "private dataset, while authenticated and authorized": { + "$ref": "#/components/examples/AuthorizedPrivateDatasetError" }, - "private-dataset": { - "summary": "The dataset is private.", - "value": { - "error": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication." 
- } + "gated dataset, and not authenticated or authorized": { + "$ref": "#/components/examples/UnauthorizedGatedDatasetError" + }, + "inexistent config": { + "$ref": "#/components/examples/InexistentConfigError" + }, + "inexistent split": { + "$ref": "#/components/examples/InexistentSplitError" + }, + "error in the dataset itself": { + "summary": "An error while processing the dataset prevents the response to be created.", + "description": "Try with /rows?dataset=atomic&config=atomic&split=train. It's a bug, it should be a 500 error, see https://github.com/huggingface/datasets-server/issues/1661.", + "value": { "error": "Not found." } } } } } }, - "404": { - "description": "If the repository to download from cannot be found, or if the config or split does not exist in the dataset. Note that this may be because the dataset doesn't exist, or because it is set to `private` and you do not have access.", + "422": { + "description": "Some of the `dataset`, `config`, `split`, `offset` or `length` parameters have not been provided or are invalid.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" @@ -2619,7 +2631,7 @@ "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { - "$ref": "#/components/headers/X-Error-Code-rows-404" + "$ref": "#/components/headers/X-Error-Code-422" } }, "content": { @@ -2628,36 +2640,250 @@ "$ref": "#/components/schemas/CustomError" }, "examples": { - "inexistent-dataset": { - "summary": "The dataset does not exist, while authentication was provided in the request.", + "missing-dataset": { + "summary": "The dataset parameter is missing.", + "value": { + "error": "Parameters 'split', 'config' and 'dataset' are required" + } + }, + "missing-config": { + "summary": "The config parameter is missing.", "value": { - "error": "The dataset does not exist, or is not accessible with the current credentials (private or gated). 
Please check the spelling of the dataset name or retry with other authentication credentials." + "error": "Parameters 'split', 'config' and 'dataset' are required" } }, - "gated-dataset": { - "summary": "The dataset is private, while authentication was provided in the request.", + "missing-split": { + "summary": "The split parameter is missing.", "value": { - "error": "The dataset does not exist, or is not accessible with the current credentials (private or gated). Please check the spelling of the dataset name or retry with other authentication credentials." + "error": "Parameters 'split', 'config' and 'dataset' are required" } }, - "private-dataset": { - "summary": "The dataset is private, while authentication was provided in the request.", + "empty-dataset": { + "summary": "The dataset parameter is empty.", "value": { - "error": "The dataset does not exist, or is not accessible with the current credentials (private or gated). Please check the spelling of the dataset name or retry with other authentication credentials." + "error": "Parameters 'split', 'config' and 'dataset' are required" } }, - "inexistent-config": { - "summary": "The config does not exist in the dataset.", - "value": { "error": "Not found." } + "empty-config": { + "summary": "The config parameter is empty.", + "value": { + "error": "Parameters 'split', 'config' and 'dataset' are required" + } }, - "inexistent-split": { - "summary": "The soplit does not exist in the dataset.", - "value": { "error": "Not found." 
} + "empty-split": { + "summary": "The split parameter is empty.", + "value": { + "error": "Parameters 'split', 'config' and 'dataset' are required" + } + }, + "negative-offset": { + "summary": "The offset must be positive.", + "value": { + "error": "Offset must be positive" + } + }, + "negative-length": { + "summary": "The length must be positive.", + "value": { + "error": "Length must be positive" + } + } + } + } + } + }, + "500": { + "description": "The server crashed, or the response couldn't be generated successfully due to an error in the dataset itself. If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.", + "headers": { + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, + "Access-Control-Allow-Origin": { + "$ref": "#/components/headers/Access-Control-Allow-Origin" + }, + "X-Error-Code": { + "$ref": "#/components/headers/X-Error-Code-500-rows" + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CustomError" + }, + "examples": { + "unexpected error": { + "$ref": "#/components/examples/UnexpectedJsonError" + } + } + }, + "text/plain": { + "schema": { + "$ref": "#/components/schemas/ServerErrorResponse" + }, + "examples": { + "internal server error": { + "$ref": "#/components/examples/UnexpectedTextError" + } + } + } + } + } + } + } + }, + "/search": { + "get": { + "summary": "Full-text search in the text columns of a split", + "description": "Returns the rows matching the query, ordered by row index. Up to 100 rows are returned. The offset and length parameters allow to navigate the results.", + "externalDocs": { + "description": "See search (Hub docs). 
The doc is still missing for the endpoint, see https://github.com/huggingface/datasets-server/issues/1663.", + "url": "https://huggingface.co/docs/datasets-server/" + }, + "operationId": "searchRows", + "security": [ + {}, + { + "HuggingFaceCookie": [] + }, + { + "HuggingFaceToken": [] + } + ], + "parameters": [ + { + "$ref": "#/components/parameters/RequiredDataset" + }, + { + "$ref": "#/components/parameters/RequiredConfig" + }, + { + "$ref": "#/components/parameters/RequiredSplit" + }, + { + "name": "query", + "in": "query", + "description": "The search query.", + "required": true, + "schema": { + "type": "string" + }, + "examples": { + "dog": { + "summary": "search the rows that contain the text 'dog'", + "value": "dog" + } + } + }, + { + "name": "offset", + "in": "query", + "description": "The offset of the returned rows.", + "schema": { + "type": "integer", + "default": 0, + "minimum": 0 + }, + "examples": { + "0": { + "summary": "from the beginning", + "value": 0 + }, + "100": { + "summary": "ignore the first 100 results", + "value": 100 + } + } + }, + { + "name": "length", + "in": "query", + "description": "The maximum number of returned rows", + "schema": { + "type": "integer", + "default": 100, + "minimum": 0, + "maximum": 100 + }, + "examples": { + "100": { + "summary": "up to 100 rows in the response", + "value": 100 + } + } + } + ], + "responses": { + "200": { + "description": "The features, and the list of rows that match the search query. The query will only be searched among the string columns. 
Audio and bytes columns are not supported at the moment, and their content will be 'null'.", + "headers": { + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, + "Access-Control-Allow-Origin": { + "$ref": "#/components/headers/Access-Control-Allow-Origin" + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/PaginatedResponse" + }, + "examples": { + "A slice of a simple dataset (imdb)": { + "summary": "The first 3 rows that match the 'dog' search query (query=dog&length=3).", + "description": "Try it at https://datasets-server.huggingface.co/search?dataset=imdb&config=plain_text&split=train&query=dog&length=3.", + "value": { + "features": [ + { + "feature_idx": 0, + "name": "text", + "type": { "dtype": "string", "_type": "Value" } + }, + { + "feature_idx": 1, + "name": "label", + "type": { "dtype": "int64", "_type": "Value" } + } + ], + "rows": [ + { + "row_idx": 27, + "row": { + "text": "Pedantic, overlong fabrication which attempts to chronicle the birth of the Federal Bureau of Investigations. Begins quite promisingly, with a still-relevant probe into an airplane explosion, however the melodrama involving James Stewart and wife Vera Miles just gets in the way (Miles had a habit of playing tepid wives under duress, and her frayed nerves arrive here right on schedule). Esteemed director Mervyn LeRoy helmed this adaptation of Don Whitehead's book, but despite the talent involved, the picture fails to make much of an impression. Best performance is turned in by Murray Hamilton as Stewart's partner, however most of the dialogue is ludicrous and the dogged pacing causes the movie to seem twice as long as it is. *1/2 from ****", + "label": 0 + }, + "truncated_cells": [] + }, + { + "row_idx": 51, + "row": { + "text": "The opening shot was the best thing about this movie, because it gave you hope that you would be seeing a passionate, well-crafted independent film. Damn that opening shot for filling me hope. 
As the \"film\" progressed in a slow, plodding manner, my thoughts were varied in relation to this \"film\": Was there too much butter in my popcorn? Did the actors have to PAY the director to be in this \"film\"? Did I get my ticket validated at the Box Office? Yes, dear reader. I saw this film in the Theatre! This would be the only exception I will make about seeing a film at home over a Movie Theatre, because at home you can TURN IT OFF. Were there any redeeming values? Peter Lemongelli as the standard college \"nerd\" had his moments, especially in a dog collar. Other than that this \"film\" went from trying to be a comedy, to a family drama to a spiritual uplifter. It succeeded on none of these fronts. Oh, and the girlfriend was realllllllllly bad. Her performance was the only comedy I found.", + "label": 0 + }, + "truncated_cells": [] + }, + { + "row_idx": 106, + "row": { + "text": "I saw this movie at the AFI Dallas festival. Most of the audience, including my wife, enjoyed this comedy-drama, but I didn't. It stars Lucas Haas (Brick, Alpha Dog), Molly Parker (Kissed, The Five Senses, Hollywoodland) and Adam Scott (First Snow, Art School Confidential). The director is Matt Bissonnette, who's married to Molly Parker. All three actors do a fine job in this movie about 3 friends, the marriage of two of them and infidelity involving the third. It all takes place at a lake house and it looks wonderful. 
The film wants to treat its subject as a comedy first and then a drama, and I thought it needed to be the other way around.", + "label": 0 + }, + "truncated_cells": [] + } + ], + "num_total_rows": 624 + } } } } } }, + "401": { + "$ref": "#/components/responses/Common401" + }, + "404": { + "$ref": "#/components/responses/DatasetConfigSplit404" + }, "422": { "description": "Some of the `dataset`, `config`, `split`, `offset` or `length` parameters have not been provided or are invalid.", "headers": { @@ -2668,7 +2894,7 @@ "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { - "$ref": "#/components/headers/X-Error-Code-rows-422" + "$ref": "#/components/headers/X-Error-Code-422" } }, "content": { @@ -2680,37 +2906,49 @@ "missing-dataset": { "summary": "The dataset parameter is missing.", "value": { - "error": "Parameters 'split', 'config' and 'dataset' are required" + "error": "Parameter 'dataset', 'config', 'split' and 'query' are required" } }, "missing-config": { "summary": "The config parameter is missing.", "value": { - "error": "Parameters 'split', 'config' and 'dataset' are required" + "error": "Parameter 'dataset', 'config', 'split' and 'query' are required" } }, "missing-split": { "summary": "The split parameter is missing.", "value": { - "error": "Parameters 'split', 'config' and 'dataset' are required" + "error": "Parameter 'dataset', 'config', 'split' and 'query' are required" + } + }, + "missing-query": { + "summary": "The query parameter is missing.", + "value": { + "error": "Parameter 'dataset', 'config', 'split' and 'query' are required" } }, "empty-dataset": { "summary": "The dataset parameter is empty.", "value": { - "error": "Parameters 'split', 'config' and 'dataset' are required" + "error": "Parameter 'dataset', 'config', 'split' and 'query' are required" } }, "empty-config": { "summary": "The config parameter is empty.", "value": { - "error": "Parameters 'split', 'config' and 'dataset' are required" + "error": "Parameter 
'dataset', 'config', 'split' and 'query' are required" } }, "empty-split": { "summary": "The split parameter is empty.", "value": { - "error": "Parameters 'split', 'config' and 'dataset' are required" + "error": "Parameter 'dataset', 'config', 'split' and 'query' are required" + } + }, + "empty-query": { + "summary": "The query parameter is empty.", + "value": { + "error": "Parameter 'dataset', 'config', 'split' and 'query' are required" } }, "negative-offset": { @@ -2739,7 +2977,7 @@ "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { - "$ref": "#/components/headers/X-Error-Code-rows-500" + "$ref": "#/components/headers/X-Error-Code-500-search" } }, "content": { @@ -2748,17 +2986,29 @@ "$ref": "#/components/schemas/CustomError" }, "examples": { - "not-ready": { - "summary": "the response is not ready yet.", + "error in the dataset itself": { + "summary": "An error while processing the dataset prevents the response to be created.", + "description": "Try with /search?dataset=atomic&config=atomic&split=train&query=dog", "value": { - "error": "The list of rows is not ready yet. Please retry later." + "error": "Couldn't get the size of external files in `_split_generators` because a request failed:\n404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\nPlease consider moving your data files in this dataset repository instead (e.g. 
inside a data/ folder).", + "cause_exception": "HTTPError", + "cause_message": "404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz", + "cause_traceback": [ + "Traceback (most recent call last):\n", + " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 506, in raise_if_too_big_from_external_data_files\n for i, size in enumerate(pool.imap_unordered(get_size, ext_data_files)):\n", + " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 870, in next\n raise value\n", + " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 125, in worker\n result = (True, func(*args, **kwds))\n", + " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 402, in _request_size\n response.raise_for_status()\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/requests/models.py\", line 1021, in raise_for_status\n raise HTTPError(http_error_msg, response=self)\n", + "requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\n" + ] } }, - "internal": { - "summary": "internal error", - "value": { - "error": "Unexpected error." 
- } + "response not ready": { + "$ref": "#/components/examples/ResponseNotReadyError" + }, + "unexpected error": { + "$ref": "#/components/examples/UnexpectedJsonError" } } }, @@ -2767,11 +3017,8 @@ "$ref": "#/components/schemas/ServerErrorResponse" }, "examples": { - "internal": { - "summary": "internal error", - "value": { - "error": "Internal Server Error" - } + "internal server error": { + "$ref": "#/components/examples/UnexpectedTextError" } } } @@ -2799,43 +3046,20 @@ } ], "parameters": [ - { - "name": "dataset", - "in": "query", - "description": "The identifier of the dataset on the Hub.", - "required": true, - "schema": { "type": "string" }, - "examples": { - "glue": { "summary": "a canonical dataset", "value": "glue" }, - "Helsinki-NLP/tatoeba_mt": { - "summary": "a namespaced dataset", - "value": "Helsinki-NLP/tatoeba_mt" - } - } - }, - { - "name": "config", - "in": "query", - "description": "The dataset configuration (or subset).", - "required": false, - "schema": { "type": "string" }, - "examples": { - "cola": { - "summary": "a subset of the glue dataset", - "value": "cola" - }, - "yangdong/ecqa": { - "summary": "the default configuration given by the 🤗 Datasets library", - "value": "yangdong--ecqa" - } - } + { + "$ref": "#/components/parameters/RequiredDataset" + }, + { + "$ref": "#/components/parameters/OptionalConfig" } ], "responses": { "200": { "description": "A list of parquet files.
Beware: the response is not paginated.", "headers": { - "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" } @@ -2843,11 +3067,12 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ParquetFilesResponse" + "$ref": "#/components/schemas/ParquetResponse" }, "examples": { "duorc": { "summary": "duorc: six parquet files, one per split", + "description": "Try with /parquet?dataset=duorc", "value": { "parquet_files": [ { @@ -2856,7 +3081,7 @@ "split": "test", "url": "https://huggingface.co/datasets/duorc/resolve/refs%2Fconvert%2Fparquet/ParaphraseRC/duorc-test.parquet", "filename": "duorc-test.parquet", - "size": 6136590 + "size": 6136591 }, { "dataset": "duorc", @@ -2864,7 +3089,7 @@ "split": "train", "url": "https://huggingface.co/datasets/duorc/resolve/refs%2Fconvert%2Fparquet/ParaphraseRC/duorc-train.parquet", "filename": "duorc-train.parquet", - "size": 26005667 + "size": 26005668 }, { "dataset": "duorc", @@ -2872,7 +3097,7 @@ "split": "validation", "url": "https://huggingface.co/datasets/duorc/resolve/refs%2Fconvert%2Fparquet/ParaphraseRC/duorc-validation.parquet", "filename": "duorc-validation.parquet", - "size": 5566867 + "size": 5566868 }, { "dataset": "duorc", @@ -2880,7 +3105,7 @@ "split": "test", "url": "https://huggingface.co/datasets/duorc/resolve/refs%2Fconvert%2Fparquet/SelfRC/duorc-test.parquet", "filename": "duorc-test.parquet", - "size": 3035735 + "size": 3035736 }, { "dataset": "duorc", @@ -2888,7 +3113,7 @@ "split": "train", "url": "https://huggingface.co/datasets/duorc/resolve/refs%2Fconvert%2Fparquet/SelfRC/duorc-train.parquet", "filename": "duorc-train.parquet", - "size": 14851719 + "size": 14851720 }, { "dataset": "duorc", @@ -2896,7 +3121,7 @@ "split": "validation", "url": 
"https://huggingface.co/datasets/duorc/resolve/refs%2Fconvert%2Fparquet/SelfRC/duorc-validation.parquet", "filename": "duorc-validation.parquet", - "size": 3114389 + "size": 3114390 } ], "pending": [], @@ -2906,6 +3131,7 @@ }, "duorc with ParaphraseRC config": { "summary": "duorc: three parquet files for ParaphraseRC, one per split", + "description": "Try with /parquet?dataset=duorc&config=ParaphraseRC", "value": { "parquet_files": [ { @@ -2914,7 +3140,7 @@ "split": "test", "url": "https://huggingface.co/datasets/duorc/resolve/refs%2Fconvert%2Fparquet/ParaphraseRC/duorc-test.parquet", "filename": "duorc-test.parquet", - "size": 6136590 + "size": 6136591 }, { "dataset": "duorc", @@ -2922,7 +3148,7 @@ "split": "train", "url": "https://huggingface.co/datasets/duorc/resolve/refs%2Fconvert%2Fparquet/ParaphraseRC/duorc-train.parquet", "filename": "duorc-train.parquet", - "size": 26005667 + "size": 26005668 }, { "dataset": "duorc", @@ -2930,49 +3156,27 @@ "split": "validation", "url": "https://huggingface.co/datasets/duorc/resolve/refs%2Fconvert%2Fparquet/ParaphraseRC/duorc-validation.parquet", "filename": "duorc-validation.parquet", - "size": 5566867 + "size": 5566868 } ], - "pending": [], - "failed": [], - "partial": false, "features": { - "plot_id": { - "dtype": "string", - "_type": "Value" - }, - "plot": { - "dtype": "string", - "_type": "Value" - }, - "title": { - "dtype": "string", - "_type": "Value" - }, - "question_id": { - "dtype": "string", - "_type": "Value" - }, - "question": { - "dtype": "string", - "_type": "Value" - }, + "plot_id": { "dtype": "string", "_type": "Value" }, + "plot": { "dtype": "string", "_type": "Value" }, + "title": { "dtype": "string", "_type": "Value" }, + "question_id": { "dtype": "string", "_type": "Value" }, + "question": { "dtype": "string", "_type": "Value" }, "answers": { - "feature": { - "dtype": "string", - "_type": "Value" - }, + "feature": { "dtype": "string", "_type": "Value" }, "_type": "Sequence" }, - "no_answer": { - 
"dtype": "bool", - "_type": "Value" - } - } + "no_answer": { "dtype": "bool", "_type": "Value" } + }, + "partial": false } }, - "sharded": { - "summary": "alexandrainst/da-wit: the parquet file for the train split is partitioned into 9 shards", + "sharded parquet files": { + "summary": "alexandrainst/da-wit: the parquet file for the train split is partitioned into 17 shards", + "description": "Try with /parquet?dataset=alexandrainst/da-wit", "value": { "parquet_files": [ { @@ -3132,13 +3336,39 @@ "failed": [], "partial": false } + }, + "dataset where no parquet file could be created": { + "summary": "When the parquet files cannot be created for a configuration, it's listed in 'failed'.", + "description": "Try with /parquet?dataset=atomic", + "value": { + "parquet_files": [], + "pending": [], + "failed": [ + { + "kind": "config-info", + "dataset": "atomic", + "config": "atomic", + "split": null + } + ], + "partial": false + } } } } } }, "401": { - "description": "If the external authentication step on the Hugging Face Hub failed, and no authentication mechanism has been provided. Retry with authentication.", + "$ref": "#/components/responses/Common401" + }, + "404": { + "$ref": "#/components/responses/DatasetConfig404" + }, + "422": { + "$ref": "#/components/responses/Dataset422" + }, + "500": { + "description": "The server crashed, the response still hasn't been generated (the process is asynchronous), or the response couldn't be generated successfully due to an error in the dataset itself. The client can retry after a time, in particular in the case of the response still being processed. 
If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" @@ -3147,7 +3377,7 @@ "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { - "$ref": "#/components/headers/X-Error-Code-splits-401" + "$ref": "#/components/headers/X-Error-Code-500-common" } }, "content": { @@ -3156,30 +3386,95 @@ "$ref": "#/components/schemas/CustomError" }, "examples": { - "inexistent-dataset": { - "summary": "The dataset does not exist.", + "error in the dataset itself": { + "summary": "An error while processing the dataset prevents the response to be created.", + "description": "Try with /parquet?dataset=atomic&config=atomic", "value": { - "error": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication." + "error": "Couldn't get the size of external files in `_split_generators` because a request failed:\n404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\nPlease consider moving your data files in this dataset repository instead (e.g. 
inside a data/ folder).", + "cause_exception": "HTTPError", + "cause_message": "404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz", + "cause_traceback": [ + "Traceback (most recent call last):\n", + " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 506, in raise_if_too_big_from_external_data_files\n for i, size in enumerate(pool.imap_unordered(get_size, ext_data_files)):\n", + " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 870, in next\n raise value\n", + " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 125, in worker\n result = (True, func(*args, **kwds))\n", + " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 402, in _request_size\n response.raise_for_status()\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/requests/models.py\", line 1021, in raise_for_status\n raise HTTPError(http_error_msg, response=self)\n", + "requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\n" + ] } }, - "gated-dataset": { - "summary": "The dataset is gated.", - "value": { - "error": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication." - } + "response not ready": { + "$ref": "#/components/examples/ResponseNotReadyError" }, - "private-dataset": { - "summary": "The dataset is private.", + "unexpected error": { + "$ref": "#/components/examples/UnexpectedJsonError" + } + } + }, + "text/plain": { + "schema": { + "$ref": "#/components/schemas/ServerErrorResponse" + }, + "examples": { + "internal server error": { + "$ref": "#/components/examples/UnexpectedTextError" + } + } + } + } + } + } + } + }, + "/valid": { + "get": { + "summary": "Valid datasets", + "description": "The lists of the Hub datasets that work without an error, by type. 
It lists the datasets that have a Dataset Viewer (i.e. have been converted to Parquet format, and can be paginated) and the datasets that only have the Dataset Preview (the first 100 rows).", + "externalDocs": { + "description": "See Valid datasets (Hub docs)", + "url": "https://huggingface.co/docs/datasets-server/valid" + }, + "operationId": "listValidDatasets", + "parameters": [], + "responses": { + "200": { + "description": "The valid datasets.", + "headers": { + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, + "Access-Control-Allow-Origin": { + "$ref": "#/components/headers/Access-Control-Allow-Origin" + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ValidResponse" + }, + "examples": { + "an example of the response format (only kept the first values in each array for brevity)": { + "summary": "list of datasets", + "description": "Try with /valid", "value": { - "error": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication." + "preview": [ + "0n1xus/codexglue", + "0x7194633/rupile", + "AHussain0418/day2_data" + ], + "viewer": [ + "0n1xus/pytorrent-standalone", + "51la5/keyword-extraction" + ] } } } } } }, - "404": { - "description": "If the repository to download from cannot be found. 
This may be because it doesn't exist, or because it is set to `private` and you do not have access.", + "500": { + "description": "The server crashed.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" @@ -3188,7 +3483,7 @@ "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { - "$ref": "#/components/headers/X-Error-Code-splits-404" + "$ref": "#/components/headers/X-Error-Code-500-valid" } }, "content": { @@ -3197,30 +3492,132 @@ "$ref": "#/components/schemas/CustomError" }, "examples": { - "inexistent-dataset": { - "summary": "The dataset does not exist, while authentication was provided in the request.", + "unexpected error": { + "$ref": "#/components/examples/UnexpectedJsonError" + } + } + }, + "text/plain": { + "schema": { + "$ref": "#/components/schemas/ServerErrorResponse" + }, + "examples": { + "internal server error": { + "$ref": "#/components/examples/UnexpectedTextError" + } + } + } + } + } + } + } + }, + "/is-valid": { + "get": { + "summary": "Check if a dataset is valid", + "description": "Returns the capabilities of the dataset: show a preview of the 100 first rows, show the viewer for all the rows, search the rows. 
Use the optional config and split parameters to filter the response.", + "externalDocs": { + "description": "See Valid datasets (Hub docs)", + "url": "https://huggingface.co/docs/datasets-server/valid" + }, + "operationId": "isValidDataset", + "security": [ + {}, + { + "HuggingFaceCookie": [] + }, + { + "HuggingFaceToken": [] + } + ], + "parameters": [ + { + "$ref": "#/components/parameters/RequiredDataset" + }, + { + "$ref": "#/components/parameters/OptionalConfig" + }, + { + "$ref": "#/components/parameters/OptionalSplit" + } + ], + "responses": { + "200": { + "description": "The capabilities of the dataset.", + "headers": { + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, + "Access-Control-Allow-Origin": { + "$ref": "#/components/headers/Access-Control-Allow-Origin" + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/IsValidResponse" + }, + "examples": { + "all the capabilities": { + "summary": "valid dataset", + "description": "Try with /is-valid?dataset=glue", + "value": { + "preview": true, + "viewer": true, + "search": true + } + }, + "only preview": { + "summary": "dataset with only preview", + "description": "Try with /is-valid?dataset=ehartford/dolphin", + "value": { + "preview": true, + "viewer": false, + "search": false + } + }, + "no capabilities": { + "summary": "dataset with no capabilities", + "description": "Try with /is-valid?dataset=atomic", "value": { - "error": "The dataset does not exist, or is not accessible with the current credentials (private or gated). Please check the spelling of the dataset name or retry with other authentication credentials." 
+ "preview": false, + "viewer": false, + "search": false } }, - "gated-dataset": { - "summary": "The dataset is private, while authentication was provided in the request.", + "all the capabilities, for a config": { + "summary": "valid config", + "description": "Try with /is-valid?dataset=glue&config=ax", "value": { - "error": "The dataset does not exist, or is not accessible with the current credentials (private or gated). Please check the spelling of the dataset name or retry with other authentication credentials." + "preview": true, + "viewer": true, + "search": true } }, - "private-dataset": { - "summary": "The dataset is private, while authentication was provided in the request.", + "all the capabilities, for a split": { + "summary": "valid split", + "description": "Try with /is-valid?dataset=glue&config=ax&split=test", "value": { - "error": "The dataset does not exist, or is not accessible with the current credentials (private or gated). Please check the spelling of the dataset name or retry with other authentication credentials." 
+ "preview": true, + "viewer": true, + "search": true } } } } } }, + "401": { + "$ref": "#/components/responses/Common401" + }, + "404": { + "$ref": "#/components/responses/Dataset404" + }, "422": { - "description": "The `dataset` parameter has not been provided.", + "$ref": "#/components/responses/Dataset422" + }, + "500": { + "description": "The server crashed.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" @@ -3229,7 +3626,7 @@ "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { - "$ref": "#/components/headers/X-Error-Code-splits-422" + "$ref": "#/components/headers/X-Error-Code-500-is-valid" } }, "content": { @@ -3238,18 +3635,235 @@ "$ref": "#/components/schemas/CustomError" }, "examples": { - "missing-parameter": { - "summary": "The dataset parameter is missing.", - "value": { "error": "Parameter 'dataset' is required" } + "unexpected error": { + "$ref": "#/components/examples/UnexpectedJsonError" + } + } + }, + "text/plain": { + "schema": { + "$ref": "#/components/schemas/ServerErrorResponse" + }, + "examples": { + "internal server error": { + "$ref": "#/components/examples/UnexpectedTextError" + } + } + } + } + } + } + } + }, + "/info": { + "get": { + "summary": "Get the metadata of a dataset.", + "description": "Returns the metadata of the dataset: description, homepage, features, etc. 
Use the optional config parameter to filter the response.", + "externalDocs": { + "description": "The response is a dump of the DatasetInfo object from the datasets library", + "url": "https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.DatasetInfo" + }, + "operationId": "getInfo", + "security": [ + {}, + { + "HuggingFaceCookie": [] + }, + { + "HuggingFaceToken": [] + } + ], + "parameters": [ + { + "$ref": "#/components/parameters/RequiredDataset" + }, + { + "$ref": "#/components/parameters/OptionalConfig" + } + ], + "responses": { + "200": { + "description": "The metadata of the dataset.", + "headers": { + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, + "Access-Control-Allow-Origin": { + "$ref": "#/components/headers/Access-Control-Allow-Origin" + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InfoResponse" + }, + "examples": { + "dataset metadata": { + "summary": "metadata of a dataset. It's an object, with one key per config", + "description": "Try with /info?dataset=mnist", + "value": { + "dataset_info": { + "mnist": { + "description": "The MNIST dataset consists of 70,000 28x28 black-and-white images in 10 classes (one for each digits), with 7,000\nimages per class. There are 60,000 training images and 10,000 test images.\n", + "citation": "@article{lecun2010mnist,\n title={MNIST handwritten digit database},\n author={LeCun, Yann and Cortes, Corinna and Burges, CJ},\n journal={ATT Labs [Online]. 
Available: http://yann.lecun.com/exdb/mnist},\n volume={2},\n year={2010}\n}\n", + "homepage": "http://yann.lecun.com/exdb/mnist/", + "license": "", + "features": { + "image": { "_type": "Image" }, + "label": { + "names": [ + "0", + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9" + ], + "_type": "ClassLabel" + } + }, + "supervised_keys": { + "input": "image", + "output": "label" + }, + "task_templates": [ + { + "task": "image-classification", + "label_column": "label" + } + ], + "builder_name": "mnist", + "config_name": "mnist", + "version": { + "version_str": "1.0.0", + "major": 1, + "minor": 0, + "patch": 0 + }, + "splits": { + "train": { + "name": "train", + "num_bytes": 17471100, + "num_examples": 60000, + "dataset_name": "mnist" + }, + "test": { + "name": "test", + "num_bytes": 2916482, + "num_examples": 10000, + "dataset_name": "mnist" + } + }, + "download_checksums": { + "https://storage.googleapis.com/cvdf-datasets/mnist/train-images-idx3-ubyte.gz": { + "num_bytes": 9912422, + "checksum": null + }, + "https://storage.googleapis.com/cvdf-datasets/mnist/train-labels-idx1-ubyte.gz": { + "num_bytes": 28881, + "checksum": null + }, + "https://storage.googleapis.com/cvdf-datasets/mnist/t10k-images-idx3-ubyte.gz": { + "num_bytes": 1648877, + "checksum": null + }, + "https://storage.googleapis.com/cvdf-datasets/mnist/t10k-labels-idx1-ubyte.gz": { + "num_bytes": 4542, + "checksum": null + } + }, + "download_size": 11594722, + "dataset_size": 20387582, + "size_in_bytes": 31982304 + } + }, + "pending": [], + "failed": [], + "partial": false + } }, - "empty-parameter": { - "summary": "The dataset parameter is empty (?dataset=).", - "value": { "error": "Parameter 'dataset' is required" } + "config metadata": { + "summary": "metadata for a dataset config", + "description": "Try with /info?dataset=glue&config=ax", + "value": { + "dataset_info": { + "description": "GLUE, the General Language Understanding Evaluation benchmark\n(https://gluebenchmark.com/) is a 
collection of resources for training,\nevaluating, and analyzing natural language understanding systems.\n\n", + "citation": "\n@inproceedings{wang2019glue,\n title={{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding},\n author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R.},\n note={In the Proceedings of ICLR.},\n year={2019}\n}\n", + "homepage": "https://gluebenchmark.com/diagnostics", + "license": "", + "features": { + "premise": { "dtype": "string", "_type": "Value" }, + "hypothesis": { "dtype": "string", "_type": "Value" }, + "label": { + "names": ["entailment", "neutral", "contradiction"], + "_type": "ClassLabel" + }, + "idx": { "dtype": "int32", "_type": "Value" } + }, + "builder_name": "glue", + "config_name": "ax", + "version": { + "version_str": "1.0.0", + "description": "", + "major": 1, + "minor": 0, + "patch": 0 + }, + "splits": { + "test": { + "name": "test", + "num_bytes": 237694, + "num_examples": 1104, + "dataset_name": "glue" + } + }, + "download_checksums": { + "https://dl.fbaipublicfiles.com/glue/data/AX.tsv": { + "num_bytes": 222257, + "checksum": null + } + }, + "download_size": 222257, + "dataset_size": 237694, + "size_in_bytes": 459951 + }, + "partial": false + } + }, + "dataset metadata with failed configs": { + "summary": "metadata of a dataset which has failed configs. 
The failed configs are listed in 'failed'.", + "description": "Try with /info?dataset=atomic", + "value": { + "dataset_info": {}, + "pending": [], + "failed": [ + { + "kind": "config-info", + "dataset": "atomic", + "config": "atomic", + "split": null + } + ], + "partial": false + } } } } } }, + "401": { + "$ref": "#/components/responses/Common401" + }, + "404": { + "$ref": "#/components/responses/DatasetConfig404" + }, + "422": { + "$ref": "#/components/responses/Dataset422" + }, "500": { "description": "The server crashed, the response still hasn't been generated (the process is asynchronous), or the response couldn't be generated successfully due to an error in the dataset itself. The client can retry after a time, in particular in the case of the response still being processed. If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.", "headers": { @@ -3260,7 +3874,7 @@ "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { - "$ref": "#/components/headers/X-Error-Code-splits-500" + "$ref": "#/components/headers/X-Error-Code-500-common" } }, "content": { @@ -3269,17 +3883,29 @@ "$ref": "#/components/schemas/CustomError" }, "examples": { - "not-ready": { - "summary": "the response is not ready yet.", + "error in the dataset itself": { + "summary": "An error while processing the dataset prevents the response to be created.", + "description": "Try with /info?dataset=atomic&config=atomic", "value": { - "error": "The server is busier than usual and the response is not ready yet. Please retry later." + "error": "Couldn't get the size of external files in `_split_generators` because a request failed:\n404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\nPlease consider moving your data files in this dataset repository instead (e.g. 
inside a data/ folder).", + "cause_exception": "HTTPError", + "cause_message": "404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz", + "cause_traceback": [ + "Traceback (most recent call last):\n", + " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 506, in raise_if_too_big_from_external_data_files\n for i, size in enumerate(pool.imap_unordered(get_size, ext_data_files)):\n", + " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 870, in next\n raise value\n", + " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 125, in worker\n result = (True, func(*args, **kwds))\n", + " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 402, in _request_size\n response.raise_for_status()\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/requests/models.py\", line 1021, in raise_for_status\n raise HTTPError(http_error_msg, response=self)\n", + "requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\n" + ] } }, - "internal": { - "summary": "internal error", - "value": { - "error": "Unexpected error." - } + "response not ready": { + "$ref": "#/components/examples/ResponseNotReadyError" + }, + "unexpected error": { + "$ref": "#/components/examples/UnexpectedJsonError" } } }, @@ -3288,11 +3914,8 @@ "$ref": "#/components/schemas/ServerErrorResponse" }, "examples": { - "internal": { - "summary": "internal error", - "value": { - "error": "Internal Server Error" - } + "internal server error": { + "$ref": "#/components/examples/UnexpectedTextError" } } } @@ -3301,19 +3924,35 @@ } } }, - "/valid": { + "/size": { "get": { - "summary": "Valid datasets", - "description": "The lists of the Hub datasets that work without an error, by type. It lists the datasets that have a Dataset Viewer (i.e. 
have been converted to Parquet format, and can be paginated) and the datasets that only have the Dataset Preview (the first 100 rows).", + "summary": "Get the size of a dataset.", + "description": "Returns the size (number of rows, storage) of the dataset. Use the optional config parameter to filter the response.", "externalDocs": { - "description": "See Valid datasets (Hub docs)", - "url": "https://huggingface.co/docs/datasets-server/valid" + "description": "See size (Hub docs). The doc is still missing for the endpoint, see https://github.com/huggingface/datasets-server/issues/1664.", + "url": "https://huggingface.co/docs/datasets-server/" }, - "operationId": "listValidDatasets", - "parameters": [], + "operationId": "getSize", + "security": [ + {}, + { + "HuggingFaceCookie": [] + }, + { + "HuggingFaceToken": [] + } + ], + "parameters": [ + { + "$ref": "#/components/parameters/RequiredDataset" + }, + { + "$ref": "#/components/parameters/OptionalConfig" + } + ], "responses": { "200": { - "description": "The valid datasets.", + "description": "The size of the dataset.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" @@ -3325,29 +3964,129 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ValidResponse" + "$ref": "#/components/schemas/SizeResponse" }, "examples": { - "valid": { - "summary": "list of datasets", + "dataset size": { + "summary": "size of a dataset.", + "description": "Try with /size?dataset=mnist", "value": { - "preview": [ - "0n1xus/codexglue", - "0x7194633/rupile", - "AHussain0418/day2_data" + "size": { + "dataset": { + "dataset": "mnist", + "num_bytes_original_files": 11594722, + "num_bytes_parquet_files": 18157506, + "num_bytes_memory": 20387582, + "num_rows": 70000 + }, + "configs": [ + { + "dataset": "mnist", + "config": "mnist", + "num_bytes_original_files": 11594722, + "num_bytes_parquet_files": 18157506, + "num_bytes_memory": 20387582, + "num_rows": 70000, + "num_columns": 2 + } + ], 
+ "splits": [ + { + "dataset": "mnist", + "config": "mnist", + "split": "train", + "num_bytes_parquet_files": 15561616, + "num_bytes_memory": 17471100, + "num_rows": 60000, + "num_columns": 2 + }, + { + "dataset": "mnist", + "config": "mnist", + "split": "test", + "num_bytes_parquet_files": 2595890, + "num_bytes_memory": 2916482, + "num_rows": 10000, + "num_columns": 2 + } + ] + }, + "pending": [], + "failed": [], + "partial": false + } + }, + "config size": { + "summary": "size of a dataset config", + "description": "Try with /size?dataset=glue&config=ax", + "value": { + "size": { + "config": { + "dataset": "glue", + "config": "ax", + "num_bytes_original_files": 222257, + "num_bytes_parquet_files": 80767, + "num_bytes_memory": 237694, + "num_rows": 1104, + "num_columns": 4 + }, + "splits": [ + { + "dataset": "glue", + "config": "ax", + "split": "test", + "num_bytes_parquet_files": 80767, + "num_bytes_memory": 237694, + "num_rows": 1104, + "num_columns": 4 + } + ] + }, + "partial": false + } + }, + "dataset size with failed configs": { + "summary": "size of a dataset which has failed configs. 
The failed configs are listed in 'failed'.", + "description": "Try with /size?dataset=atomic", + "value": { + "size": { + "dataset": { + "dataset": "atomic", + "num_bytes_original_files": 0, + "num_bytes_parquet_files": 0, + "num_bytes_memory": 0, + "num_rows": 0 + }, + "configs": [], + "splits": [] + }, + "pending": [], + "failed": [ + { + "kind": "config-size", + "dataset": "atomic", + "config": "atomic", + "split": null + } ], - "viewer": [ - "0n1xus/pytorrent-standalone", - "51la5/keyword-extraction" - ] + "partial": false } } } } } }, + "401": { + "$ref": "#/components/responses/Common401" + }, + "404": { + "$ref": "#/components/responses/DatasetConfig404" + }, + "422": { + "$ref": "#/components/responses/Dataset422" + }, "500": { - "description": "The server crashed.", + "description": "The server crashed, the response still hasn't been generated (the process is asynchronous), or the response couldn't be generated successfully due to an error in the dataset itself. The client can retry after a time, in particular in the case of the response still being processed. If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" @@ -3356,7 +4095,7 @@ "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { - "$ref": "#/components/headers/X-Error-Code-valid-500" + "$ref": "#/components/headers/X-Error-Code-500-common" } }, "content": { @@ -3365,11 +4104,29 @@ "$ref": "#/components/schemas/CustomError" }, "examples": { - "internal": { - "summary": "internal error", + "error in the dataset itself": { + "summary": "An error while processing the dataset prevents the response to be created.", + "description": "Try with /size?dataset=atomic&config=atomic", "value": { - "error": "Unexpected error." 
+ "error": "Couldn't get the size of external files in `_split_generators` because a request failed:\n404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\nPlease consider moving your data files in this dataset repository instead (e.g. inside a data/ folder).", + "cause_exception": "HTTPError", + "cause_message": "404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz", + "cause_traceback": [ + "Traceback (most recent call last):\n", + " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 506, in raise_if_too_big_from_external_data_files\n for i, size in enumerate(pool.imap_unordered(get_size, ext_data_files)):\n", + " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 870, in next\n raise value\n", + " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 125, in worker\n result = (True, func(*args, **kwds))\n", + " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 402, in _request_size\n response.raise_for_status()\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/requests/models.py\", line 1021, in raise_for_status\n raise HTTPError(http_error_msg, response=self)\n", + "requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\n" + ] } + }, + "response not ready": { + "$ref": "#/components/examples/ResponseNotReadyError" + }, + "unexpected error": { + "$ref": "#/components/examples/UnexpectedJsonError" } } }, @@ -3378,11 +4135,8 @@ "$ref": "#/components/schemas/ServerErrorResponse" }, "examples": { - "internal": { - "summary": "internal error", - "value": { - "error": "Internal Server Error" - } + "internal server error": { + "$ref": "#/components/examples/UnexpectedTextError" } } } @@ -3391,15 +4145,15 @@ } } }, - "/is-valid": { + "/opt-in-out-urls": { "get": { - "summary": "Check if a dataset is valid", - "description": 
"Check if a dataset works without an error (for /splits and /first-rows).", + "summary": "Get the number of opted-in and opted-out image URLs in a dataset.", + "description": "Based on the API of spawning.ai, returns the number of image URLs that have been opted-in and opted-out. Use the optional config and splits parameters to filter the response. Only a sample of the rows is scanned, the first 100K rows at the moment.", "externalDocs": { - "description": "See Valid datasets (Hub docs)", - "url": "https://huggingface.co/docs/datasets-server/valid" + "description": "See spawning.io (Hub docs). The doc is still missing for the endpoint, see https://github.com/huggingface/datasets-server/issues/1664.", + "url": "https://huggingface.co/docs/datasets-server/" }, - "operationId": "isValidDataset", + "operationId": "getOptInOutUrls", "security": [ {}, { @@ -3411,23 +4165,18 @@ ], "parameters": [ { - "name": "dataset", - "in": "query", - "description": "The identifier of the dataset on the Hub.", - "required": true, - "schema": { "type": "string" }, - "examples": { - "glue": { "summary": "a canonical dataset", "value": "glue" }, - "Helsinki-NLP/tatoeba_mt": { - "summary": "a namespaced dataset", - "value": "Helsinki-NLP/tatoeba_mt" - } - } + "$ref": "#/components/parameters/RequiredDataset" + }, + { + "$ref": "#/components/parameters/OptionalConfig" + }, + { + "$ref": "#/components/parameters/OptionalSplit" } ], "responses": { "200": { - "description": "The valid datasets.", + "description": "The number of opted-in and opted-out image URLS in the dataset.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" @@ -3439,28 +4188,59 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/IsValidResponse" + "$ref": "#/components/schemas/OptInOutUrlsCountResponse" }, "examples": { - "valid": { - "summary": "valid dataset", + "number of URLS for a dataset": { + "summary": "number of URLs for a dataset.", + "description": "Try 
with /opt-in-out-urls?dataset=conceptual_captions", "value": { - "viewer": true, - "preview": true + "urls_columns": ["image_url"], + "has_urls_columns": true, + "num_opt_in_urls": 0, + "num_opt_out_urls": 54760, + "num_scanned_rows": 215840, + "num_urls": 215840, + "full_scan": false } }, - "preview": { - "summary": "dataset with only preview", + "number of URLS for a config": { + "summary": "number of URLs for a config.", + "description": "Try with /opt-in-out-urls?dataset=conceptual_captions&config=labeled", "value": { - "viewer": false, - "preview": true + "urls_columns": ["image_url"], + "has_urls_columns": true, + "num_opt_in_urls": 0, + "num_opt_out_urls": 16579, + "num_scanned_rows": 100000, + "num_urls": 100000, + "full_scan": false } }, - "invalid": { - "summary": "invalid dataset", + "number of URLS for a split": { + "summary": "number of URLs for a split.", + "description": "Try with /opt-in-out-urls?dataset=conceptual_captions&config=labeled&split=train", "value": { - "viewer": false, - "preview": false + "has_urls_columns": true, + "num_opt_in_urls": 0, + "num_opt_out_urls": 16579, + "num_scanned_rows": 100000, + "num_urls": 100000, + "urls_columns": ["image_url"], + "full_scan": false + } + }, + "dataset that has no image URLs columns": { + "summary": "no image URLs columns: values are zero.", + "description": "Try with /opt-in-out-urls?dataset=mnist", + "value": { + "urls_columns": [], + "has_urls_columns": false, + "num_opt_in_urls": 0, + "num_opt_out_urls": 0, + "num_scanned_rows": 0, + "num_urls": 0, + "full_scan": false } } } @@ -3468,7 +4248,16 @@ } }, "401": { - "description": "If the external authentication step on the Hugging Face Hub failed, and no authentication mechanism has been provided. 
Retry with authentication.", + "$ref": "#/components/responses/Common401" + }, + "404": { + "$ref": "#/components/responses/DatasetConfigSplit404" + }, + "422": { + "$ref": "#/components/responses/Dataset422" + }, + "500": { + "description": "The server crashed, the response still hasn't been generated (the process is asynchronous), or the response couldn't be generated successfully due to an error in the dataset itself. The client can retry after a time, in particular in the case of the response still being processed. If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" @@ -3477,7 +4266,7 @@ "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { - "$ref": "#/components/headers/X-Error-Code-is-valid-401" + "$ref": "#/components/headers/X-Error-Code-500" } }, "content": { @@ -3486,102 +4275,391 @@ "$ref": "#/components/schemas/CustomError" }, "examples": { - "inexistent-dataset": { - "summary": "The dataset does not exist.", - "value": { - "error": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication." - } - }, - "gated-dataset": { - "summary": "The dataset is gated.", - "value": { - "error": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication." - } + "response not ready": { + "$ref": "#/components/examples/ResponseNotReadyError" }, - "private-dataset": { - "summary": "The dataset is private.", - "value": { - "error": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication." 
- } + "unexpected error": { + "$ref": "#/components/examples/UnexpectedJsonError" } } - } - } - }, - "404": { - "description": "If the dataset cannot be found. This may be because it doesn't exist, or because it is set to `private` and you do not have access.", - "headers": { - "Cache-Control": { - "$ref": "#/components/headers/Cache-Control" - }, - "Access-Control-Allow-Origin": { - "$ref": "#/components/headers/Access-Control-Allow-Origin" }, - "X-Error-Code": { - "$ref": "#/components/headers/X-Error-Code-is-valid-404" - } - }, - "content": { - "application/json": { + "text/plain": { "schema": { - "$ref": "#/components/schemas/CustomError" + "$ref": "#/components/schemas/ServerErrorResponse" }, "examples": { - "inexistent-dataset": { - "summary": "The dataset does not exist, while authentication was provided in the request.", - "value": { - "error": "The dataset does not exist, or is not accessible with the current credentials (private or gated). Please check the spelling of the dataset name or retry with other authentication credentials." - } - }, - "gated-dataset": { - "summary": "The dataset is gated, while authentication was provided in the request.", - "value": { - "error": "The dataset does not exist, or is not accessible with the current credentials (private or gated). Please check the spelling of the dataset name or retry with other authentication credentials." - } - }, - "private-dataset": { - "summary": "The dataset is private, while authentication was provided in the request.", - "value": { - "error": "The dataset does not exist, or is not accessible with the current credentials (private or gated). Please check the spelling of the dataset name or retry with other authentication credentials." 
- } + "internal server error": { + "$ref": "#/components/examples/UnexpectedTextError" } } } } + } + } + } + }, + "/statistics": { + "get": { + "summary": "Descriptive statistics of a split's columns", + "description": "Returns descriptive statistics, such as min, max, average, histogram, of the columns of a split.", + "externalDocs": { + "description": "See statistics (Hub docs). The doc is still missing for the endpoint, see https://github.com/huggingface/datasets-server/issues/1664.", + "url": "https://huggingface.co/docs/datasets-server/" + }, + "operationId": "getStatistics", + "security": [ + {}, + { + "HuggingFaceCookie": [] }, - "422": { - "description": "The `dataset` parameter has not been provided.", + { + "HuggingFaceToken": [] + } + ], + "parameters": [ + { + "$ref": "#/components/parameters/RequiredDataset" + }, + { + "$ref": "#/components/parameters/RequiredConfig" + }, + { + "$ref": "#/components/parameters/RequiredSplit" + } + ], + "responses": { + "200": { + "description": "The descriptive statistics for the columns of the split.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" - }, - "X-Error-Code": { - "$ref": "#/components/headers/X-Error-Code-is-valid-422" } }, "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/CustomError" + "$ref": "#/components/schemas/StatisticsResponse" }, "examples": { - "missing-parameter": { - "summary": "The dataset parameter is missing.", - "value": { "error": "Parameter 'dataset' is required" } + "A split (mstz/wine) with numeric columns": { + "summary": "Statistics on numeric columns.", + "description": "Try it at https://datasets-server.huggingface.co/statistics?dataset=mstz/wine&config=wine&split=train.", + "value": { + "num_examples": 6497, + "statistics": [ + { + "column_name": "alcohol", + "column_type": "float", + "column_statistics": { + "nan_count": 0, + 
"nan_proportion": 0.0, + "min": 8.0, + "max": 14.9, + "mean": 10.4918, + "median": 10.3, + "std": 1.19271, + "histogram": { + "hist": [ + 40, 1133, 1662, 1156, 1092, 628, 569, 175, 41, 1 + ], + "bin_edges": [ + 8.0, 8.69, 9.38, 10.07, 10.76, 11.45, 12.14, + 12.83, 13.52, 14.21, 14.9 + ] + } + } + }, + { + "column_name": "chlorides", + "column_type": "float", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "min": 0.009, + "max": 0.611, + "mean": 0.05603, + "median": 0.047, + "std": 0.03503, + "histogram": { + "hist": [5061, 1279, 92, 34, 8, 9, 10, 2, 0, 2], + "bin_edges": [ + 0.009, 0.0692, 0.1294, 0.1896, 0.2498, 0.31, + 0.3702, 0.4304, 0.4906, 0.5508, 0.611 + ] + } + } + }, + { + "column_name": "citric_acid", + "column_type": "float", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "min": 0.0, + "max": 1.66, + "mean": 0.31863, + "median": 0.31, + "std": 0.14532, + "histogram": { + "hist": [ + 766, 3113, 2059, 420, 126, 5, 6, 1, 0, 1 + ], + "bin_edges": [ + 0.0, 0.166, 0.332, 0.498, 0.664, 0.83, 0.996, + 1.162, 1.328, 1.494, 1.66 + ] + } + } + }, + { + "column_name": "density", + "column_type": "float", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "min": 0.98711, + "max": 1.03898, + "mean": 0.9947, + "median": 0.99489, + "std": 0.003, + "histogram": { + "hist": [1599, 3645, 1241, 9, 2, 0, 0, 0, 0, 1], + "bin_edges": [ + 0.98711, 0.9923, 0.99748, 1.00267, 1.00786, + 1.01304, 1.01823, 1.02342, 1.02861, 1.03379, + 1.03898 + ] + } + } + }, + { + "column_name": "fixed_acidity", + "column_type": "float", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "min": 3.8, + "max": 15.9, + "mean": 7.21531, + "median": 7.0, + "std": 1.29643, + "histogram": { + "hist": [ + 63, 1151, 3248, 1339, 382, 177, 82, 41, 7, 7 + ], + "bin_edges": [ + 3.8, 5.01, 6.22, 7.43, 8.64, 9.85, 11.06, 12.27, + 13.48, 14.69, 15.9 + ] + } + } + }, + { + "column_name": "free_sulfur_dioxide", + "column_type": 
"float", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "min": 1.0, + "max": 289.0, + "mean": 30.52532, + "median": 29.0, + "std": 17.7494, + "histogram": { + "hist": [3392, 2676, 401, 20, 6, 1, 0, 0, 0, 1], + "bin_edges": [ + 1.0, 29.8, 58.6, 87.4, 116.2, 145.0, 173.8, + 202.6, 231.4, 260.2, 289.0 + ] + } + } + }, + { + "column_name": "is_red", + "column_type": "class_label", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "n_unique": 2, + "frequencies": { "red": 1599, "white": 4898 } + } + }, + { + "column_name": "pH", + "column_type": "float", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "min": 2.72, + "max": 4.01, + "mean": 3.2185, + "median": 3.21, + "std": 0.16079, + "histogram": { + "hist": [ + 16, 334, 1233, 2111, 1663, 802, 263, 59, 12, 4 + ], + "bin_edges": [ + 2.72, 2.849, 2.978, 3.107, 3.236, 3.365, 3.494, + 3.623, 3.752, 3.881, 4.01 + ] + } + } + }, + { + "column_name": "quality", + "column_type": "int", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "min": 3, + "max": 9, + "mean": 5.81838, + "median": 6.0, + "std": 0.87326, + "histogram": { + "hist": [30, 216, 2138, 2836, 1079, 193, 5], + "bin_edges": [3, 4, 5, 6, 7, 8, 9, 9] + } + } + }, + { + "column_name": "residual_sugar", + "column_type": "float", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "min": 0.6, + "max": 65.8, + "mean": 5.44324, + "median": 3.0, + "std": 4.7578, + "histogram": { + "hist": [4551, 1396, 533, 14, 2, 0, 0, 0, 0, 1], + "bin_edges": [ + 0.6, 7.12, 13.64, 20.16, 26.68, 33.2, 39.72, + 46.24, 52.76, 59.28, 65.8 + ] + } + } + }, + { + "column_name": "sulphates", + "column_type": "float", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "min": 0.22, + "max": 2.0, + "mean": 0.53127, + "median": 0.51, + "std": 0.14881, + "histogram": { + "hist": [ + 1023, 3451, 1540, 382, 66, 21, 6, 4, 0, 4 + ], + "bin_edges": [ + 0.22, 0.398, 0.576, 0.754, 0.932, 
1.11, 1.288, + 1.466, 1.644, 1.822, 2.0 + ] + } + } + }, + { + "column_name": "total_sulfur_dioxide", + "column_type": "float", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "min": 6.0, + "max": 440.0, + "mean": 115.74457, + "median": 118.0, + "std": 56.52185, + "histogram": { + "hist": [ + 1088, 979, 2049, 1514, 721, 134, 8, 2, 1, 1 + ], + "bin_edges": [ + 6.0, 49.4, 92.8, 136.2, 179.6, 223.0, 266.4, + 309.8, 353.2, 396.6, 440.0 + ] + } + } + }, + { + "column_name": "volatile_acidity", + "column_type": "float", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "min": 0.08, + "max": 1.58, + "mean": 0.33967, + "median": 0.29, + "std": 0.16464, + "histogram": { + "hist": [ + 1580, 3002, 996, 606, 214, 70, 22, 4, 2, 1 + ], + "bin_edges": [ + 0.08, 0.23, 0.38, 0.53, 0.68, 0.83, 0.98, 1.13, + 1.28, 1.43, 1.58 + ] + } + } + } + ] + } }, - "empty-parameter": { - "summary": "The dataset parameter is empty (?dataset=).", - "value": { "error": "Parameter 'dataset' is required" } + "A split (mnist) with a label column": { + "summary": "Statistics on a class label column. 
The image column is not processed.", + "description": "Try it at https://datasets-server.huggingface.co/statistics?dataset=mnist&config=mnist&split=train.", + "value": { + "num_examples": 60000, + "statistics": [ + { + "column_name": "label", + "column_type": "class_label", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "n_unique": 10, + "frequencies": { + "0": 5923, + "1": 6742, + "2": 5958, + "3": 6131, + "4": 5842, + "5": 5421, + "6": 5918, + "7": 6265, + "8": 5851, + "9": 5949 + } + } + } + ] + } } } } } }, + "401": { + "$ref": "#/components/responses/Common401" + }, + "404": { + "$ref": "#/components/responses/DatasetConfigSplit404" + }, + "422": { + "$ref": "#/components/responses/DatasetConfigSplit422" + }, "500": { - "description": "The server crashed.", + "description": "The server crashed, the response still hasn't been generated (the process is asynchronous), or the response couldn't be generated successfully due to an error in the dataset itself. The client can retry after a time, in particular in the case of the response still being processed. If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" @@ -3590,7 +4668,7 @@ "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { - "$ref": "#/components/headers/X-Error-Code-is-valid-500" + "$ref": "#/components/headers/X-Error-Code-500-common" } }, "content": { @@ -3599,11 +4677,29 @@ "$ref": "#/components/schemas/CustomError" }, "examples": { - "internal": { - "summary": "internal error", + "error in the dataset itself": { + "summary": "An error while processing the dataset prevents the response from being created.", + "description": "Try with /statistics?dataset=atomic&config=atomic&split=train", "value": { - "error": "Unexpected error."
+ "error": "Couldn't get the size of external files in `_split_generators` because a request failed:\n404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\nPlease consider moving your data files in this dataset repository instead (e.g. inside a data/ folder).", + "cause_exception": "HTTPError", + "cause_message": "404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz", + "cause_traceback": [ + "Traceback (most recent call last):\n", + " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 497, in _is_too_big_from_external_data_files\n for i, size in enumerate(pool.imap_unordered(get_size, ext_data_files)):\n", + " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 870, in next\n raise value\n", + " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 125, in worker\n result = (True, func(*args, **kwds))\n", + " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 396, in _request_size\n response.raise_for_status()\n", + " File \"/src/services/worker/.venv/lib/python3.9/site-packages/requests/models.py\", line 1021, in raise_for_status\n raise HTTPError(http_error_msg, response=self)\n", + "requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\n" + ] } + }, + "response not ready": { + "$ref": "#/components/examples/ResponseNotReadyError" + }, + "unexpected error": { + "$ref": "#/components/examples/UnexpectedJsonError" } } }, @@ -3612,11 +4708,8 @@ "$ref": "#/components/schemas/ServerErrorResponse" }, "examples": { - "internal": { - "summary": "internal error", - "value": { - "error": "Internal Server Error" - } + "internal server error": { + "$ref": "#/components/examples/UnexpectedTextError" } } } diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index 5cb38fb339..e237dfe3b7 100644 --- 
a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -139,35 +139,35 @@ def __init__( class CacheDirectoryNotInitializedError(CacheableError): - """Raised when the cache directory has not been initialized before job compute.""" + """The cache directory has not been initialized before job compute.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "CacheDirectoryNotInitializedError", cause, True) class ConfigNamesError(CacheableError): - """Raised when the config names could not be fetched.""" + """The config names could not be fetched.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "ConfigNamesError", cause, True) class CreateCommitError(CacheableError): - """Raised when a commit could not be created on the Hub.""" + """A commit could not be created on the Hub.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "CreateCommitError", cause, False) class DatasetInBlockListError(CacheableError): - """Raised when the dataset is in the list of blocked datasets.""" + """The dataset is in the list of blocked datasets.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "DatasetInBlockListError", cause, False) class DatasetInfoHubRequestError(CacheableError): - """Raised when the request to the Hub's dataset-info endpoint times out.""" + """The request to the Hub's dataset-info endpoint times out.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__( @@ -180,21 +180,21 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): class DatasetManualDownloadError(CacheableError): - """Raised when the dataset requires manual download.""" + """The dataset 
requires manual download.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "DatasetManualDownloadError", cause, True) class DatasetModuleNotInstalledError(CacheableError): - """Raised when the dataset tries to import a module that is not installed.""" + """The dataset tries to import a module that is not installed.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "DatasetModuleNotInstalledError", cause, True) class DatasetNotFoundError(CacheableError): - """Raised when the dataset does not exist.""" + """The dataset does not exist.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__( @@ -207,42 +207,42 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): class DatasetRevisionEmptyError(CacheableError): - """Raised when the current git revision (branch, commit) could not be obtained.""" + """The current git revision (branch, commit) could not be obtained.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "DatasetRevisionEmptyError", cause, False) class DatasetRevisionNotFoundError(CacheableError): - """Raised when the revision of a dataset repository does not exist.""" + """The revision of a dataset repository does not exist.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_FOUND, "DatasetRevisionNotFoundError", cause, False) class DatasetWithTooManyConfigsError(CacheableError): - """Raised when the number of configs of a dataset exceeded the limit.""" + """The number of configs of a dataset exceeded the limit.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "DatasetWithTooManyConfigsError", cause, 
True) class DatasetWithTooManyParquetFilesError(CacheableError): - """Raised when the number of parquet files of a dataset is too big.""" + """The number of parquet files of a dataset is too big.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "DatasetWithTooManyParquetFilesError", cause, True) class DuckDBIndexFileNotFoundError(CacheableError): - """Raised when no duckdb index file was found for split.""" + """No duckdb index file was found for split.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "DuckDBIndexFileNotFoundError", cause, False) class DisabledViewerError(CacheableError): - """Raised when the dataset viewer is disabled.""" + """The dataset viewer is disabled.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__( @@ -255,70 +255,70 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): class EmptyDatasetError(CacheableError): - """Raised when the dataset has no data.""" + """The dataset has no data.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "EmptyDatasetError", cause, True) class ExternalFilesSizeRequestConnectionError(CacheableError): - """Raised when we failed to get the size of the external files.""" + """We failed to get the size of the external files.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "ExternalFilesSizeRequestConnectionError", cause, True) class ExternalFilesSizeRequestError(CacheableError): - """Raised when we failed to get the size of the external files.""" + """We failed to get the size of the external files.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, 
HTTPStatus.NOT_IMPLEMENTED, "ExternalFilesSizeRequestError", cause, True) class ExternalFilesSizeRequestHTTPError(CacheableError): - """Raised when we failed to get the size of the external files.""" + """We failed to get the size of the external files.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "ExternalFilesSizeRequestHTTPError", cause, True) class ExternalFilesSizeRequestTimeoutError(CacheableError): - """Raised when we failed to get the size of the external files.""" + """We failed to get the size of the external files.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "ExternalFilesSizeRequestTimeoutError", cause, True) class ExternalServerError(CacheableError): - """Raised when the spawning.ai server is not responding.""" + """The spawning.ai server is not responding.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "ExternalServerError", cause, False) class FeaturesError(CacheableError): - """Raised when the features could not be fetched.""" + """The features could not be fetched.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "FeaturesError", cause, True) class FileSystemError(CacheableError): - """Raised when an error happen reading from File System.""" + """An error happened while reading from the file system.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "FileSystemError", cause, False) class InfoError(CacheableError): - """Raised when the info could not be fetched.""" + """The info could not be fetched.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR,
"InfoError", cause, True) class JobManagerCrashedError(CacheableError): - """Raised when the job runner crashed and the job became a zombie.""" + """The job runner crashed and the job became a zombie.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__( @@ -331,7 +331,7 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): class JobManagerExceededMaximumDurationError(CacheableError): - """Raised when the job runner was killed because the job exceeded the maximum duration.""" + """The job runner was killed because the job exceeded the maximum duration.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__( @@ -344,42 +344,42 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): class LockedDatasetTimeoutError(CacheableError): - """Raised when a dataset is locked by another job.""" + """A dataset is locked by another job.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "LockedDatasetTimeoutError", cause, True) class MissingSpawningTokenError(CacheableError): - """Raised when the spawning.ai token is not set.""" + """The spawning.ai token is not set.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "MissingSpawningTokenError", cause, False) class NormalRowsError(CacheableError): - """Raised when the rows could not be fetched in normal mode.""" + """The rows could not be fetched in normal mode.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "NormalRowsError", cause, True) class NoIndexableColumnsError(CacheableError): - """Raised when split does not have string columns to index.""" + """The split does not have string columns to index.""" def __init__(self, message: str, cause: 
Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "NoIndexableColumnsError", cause, True) class NoSupportedFeaturesError(CacheableError): - """Raised when dataset does not have any features which types are supported by a worker's processing pipeline.""" + """The dataset does not have any features whose types are supported by a worker's processing pipeline.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, "NoSupportedFeaturesError", cause, True) class ParameterMissingError(CacheableError): - """Raised when request is missing some parameter.""" + """The request is missing some parameter.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__( @@ -392,28 +392,28 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): class ParquetResponseEmptyError(CacheableError): - """Raised when no parquet files were found for split.""" + """No parquet files were found for split.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "ParquetResponseEmptyError", cause, False) class PreviousStepFormatError(CacheableError): - """Raised when the content of the previous step has not the expected format.""" + """The content of the previous step does not have the expected format.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "PreviousStepFormatError", cause, False) class PreviousStepStatusError(CacheableError): - """Raised when the previous step gave an error.
The job should not have been created.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "PreviousStepStatusError", cause, False) class ResponseAlreadyComputedError(CacheableError): - """Raised when response has been already computed by another job runner.""" + """The response has been already computed by another job runner.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__( @@ -426,28 +426,28 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): class RowsPostProcessingError(CacheableError): - """Raised when the rows could not be post-processed successfully.""" + """The rows could not be post-processed successfully.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "RowsPostProcessingError", cause, False) class SplitsNamesError(CacheableError): - """Raised when the split names could not be fetched.""" + """The split names could not be fetched.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "SplitsNamesError", cause, True) class SplitNamesFromStreamingError(CacheableError): - """Raised when the split names could not be fetched.""" + """The split names could not be fetched.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "SplitNamesFromStreamingError", cause, True) class SplitNotFoundError(CacheableError): - """Raised when the split does not exist.""" + """The split does not exist.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__( @@ -460,21 +460,21 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): class SplitWithTooBigParquetError(CacheableError): - """Raised when the split parquet size (sum 
of parquet sizes given) is too big.""" + """The split parquet size (sum of parquet sizes given) is too big.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "SplitWithTooBigParquetError", cause, False) class StreamingRowsError(CacheableError): - """Raised when the rows could not be fetched in streaming mode.""" + """The rows could not be fetched in streaming mode.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "StreamingRowsError", cause, True) class TooBigContentError(CacheableError): - """Raised when content size in bytes is bigger than the supported value.""" + """The content size in bytes is bigger than the supported value.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__( @@ -487,14 +487,14 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): class TooManyColumnsError(CacheableError): - """Raised when the dataset exceeded the max number of columns.""" + """The dataset exceeded the max number of columns.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "TooManyColumnsError", cause, True) class UnexpectedError(CacheableError): - """Raised when the job runner raised an unexpected error.""" + """The job runner raised an unexpected error.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__( @@ -508,14 +508,14 @@ def __init__(self, message: str, cause: Optional[BaseException] = None): class UnsupportedExternalFilesError(CacheableError): - """Raised when we failed to get the size of the external files.""" + """We failed to get the size of the external files.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.NOT_IMPLEMENTED, 
"UnsupportedExternalFilesError", cause, True) class StatisticsComputationError(CacheableError): - """Raised in case of unexpected behaviour / errors during statistics computations.""" + """An unexpected behavior or error occurred during statistics computations.""" def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__(message, HTTPStatus.INTERNAL_SERVER_ERROR, "ComputationError", cause, True) diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py index 30f21d0270..0f1711d757 100644 --- a/libs/libcommon/src/libcommon/utils.py +++ b/libs/libcommon/src/libcommon/utils.py @@ -76,24 +76,24 @@ class SplitHubFile(TypedDict): size: int +Row = Dict[str, Any] + + class RowItem(TypedDict): row_idx: int - row: Mapping[str, Any] + row: Row truncated_cells: List[str] -Row = Dict[str, Any] - - class FeatureItem(TypedDict): feature_idx: int name: str - type: Row + type: Dict[str, Any] class PaginatedResponse(TypedDict): features: List[FeatureItem] - rows: Any + rows: List[RowItem] num_total_rows: int diff --git a/services/rows/src/rows/routes/rows.py b/services/rows/src/rows/routes/rows.py index 30a83e9560..d354effef1 100644 --- a/services/rows/src/rows/routes/rows.py +++ b/services/rows/src/rows/routes/rows.py @@ -29,6 +29,7 @@ from libcommon.prometheus import StepProfiler from libcommon.simple_cache import CachedArtifactError from libcommon.storage import StrPath +from libcommon.utils import PaginatedResponse from libcommon.viewer_utils.asset import update_last_modified_date_of_rows_in_assets_dir from libcommon.viewer_utils.features import to_features_list from starlette.requests import Request @@ -57,14 +58,14 @@ def create_response( features: Features, unsupported_columns: List[str], num_total_rows: int, -) -> Any: +) -> PaginatedResponse: if set(pa_table.column_names).intersection(set(unsupported_columns)): raise RuntimeError( "The pyarrow table contains unsupported columns. 
They should have been ignored in the row group reader." ) - return { - "features": to_features_list(features), - "rows": to_rows_list( + return PaginatedResponse( + features=to_features_list(features), + rows=to_rows_list( pa_table, dataset, config, @@ -75,8 +76,8 @@ def create_response( features, unsupported_columns, ), - "num_total_rows": num_total_rows, - } + num_total_rows=num_total_rows, + ) def create_rows_endpoint( diff --git a/services/worker/src/worker/dtos.py b/services/worker/src/worker/dtos.py index d5fd3bde5f..43b98e7bcb 100644 --- a/services/worker/src/worker/dtos.py +++ b/services/worker/src/worker/dtos.py @@ -2,7 +2,7 @@ # Copyright 2023 The HuggingFace Authors. from dataclasses import dataclass, field -from typing import Any, Dict, List, Mapping, Optional, TypedDict +from typing import Any, Dict, List, Mapping, Optional, TypedDict, Union from libcommon.utils import FeatureItem, Row, RowItem, SplitHubFile @@ -40,25 +40,36 @@ class SplitItem(ConfigItem): split: Optional[str] +class FullConfigItem(DatasetItem): + config: str + + +class FullSplitItem(FullConfigItem): + split: str + + class SplitsList(TypedDict): - splits: List[SplitItem] + splits: List[FullSplitItem] -class FailedConfigItem(ConfigItem): +class FailedConfigItem(FullConfigItem): error: Mapping[str, Any] class DatasetSplitNamesResponse(TypedDict): - splits: List[SplitItem] - pending: List[ConfigItem] + splits: List[FullSplitItem] + pending: List[FullConfigItem] failed: List[FailedConfigItem] -class PreviousJob(SplitItem): +class PreviousJob(TypedDict): + dataset: str + config: Optional[str] + split: Optional[Union[str, None]] kind: str -class SplitFirstRowsResponse(SplitItem): +class SplitFirstRowsResponse(FullSplitItem): features: List[FeatureItem] rows: List[RowItem] @@ -76,7 +87,7 @@ class OptInOutUrlsCountResponse(TypedDict): num_urls: int num_scanned_rows: int has_urls_columns: bool - full_scan: Optional[bool] + full_scan: bool class 
OptInOutUrlsScanResponse(OptInOutUrlsCountResponse): @@ -136,7 +147,10 @@ class ConfigSize(TypedDict): num_columns: int -class SplitSize(SplitItem): +class SplitSize(TypedDict): + dataset: str + config: str + split: str num_bytes_parquet_files: int num_bytes_memory: int num_rows: int From a50a37b012b2f0a467fc0b42204394dee467efbe Mon Sep 17 00:00:00 2001 From: Test User Date: Thu, 10 Aug 2023 23:25:53 +0000 Subject: [PATCH 02/10] =?UTF-8?q?fix:=20=F0=9F=90=9B=20fix=20type?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/source/openapi.json | 2 +- services/rows/src/rows/routes/rows.py | 2 +- services/worker/src/worker/dtos.py | 2 +- .../job_runners/config/split_names_from_info.py | 4 ++-- .../job_runners/config/split_names_from_streaming.py | 4 ++-- .../src/worker/job_runners/dataset/split_names.py | 12 ++++++------ 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/source/openapi.json b/docs/source/openapi.json index f270e0d02d..19c962a0ac 100644 --- a/docs/source/openapi.json +++ b/docs/source/openapi.json @@ -1042,7 +1042,7 @@ "type": "boolean" }, "full_scan": { - "type": "boolean" + "type": ["boolean", "null"] } } }, diff --git a/services/rows/src/rows/routes/rows.py b/services/rows/src/rows/routes/rows.py index d354effef1..9d87192201 100644 --- a/services/rows/src/rows/routes/rows.py +++ b/services/rows/src/rows/routes/rows.py @@ -3,7 +3,7 @@ import logging import random -from typing import Any, List, Literal, Optional, Union +from typing import List, Literal, Optional, Union import pyarrow as pa from datasets import Audio, Features, Value diff --git a/services/worker/src/worker/dtos.py b/services/worker/src/worker/dtos.py index 43b98e7bcb..3e23217a33 100644 --- a/services/worker/src/worker/dtos.py +++ b/services/worker/src/worker/dtos.py @@ -87,7 +87,7 @@ class OptInOutUrlsCountResponse(TypedDict): num_urls: int num_scanned_rows: int has_urls_columns: bool - full_scan: bool + full_scan: 
Union[bool, None] class OptInOutUrlsScanResponse(OptInOutUrlsCountResponse): diff --git a/services/worker/src/worker/job_runners/config/split_names_from_info.py b/services/worker/src/worker/job_runners/config/split_names_from_info.py index 6b97ce14b8..aa71722545 100644 --- a/services/worker/src/worker/job_runners/config/split_names_from_info.py +++ b/services/worker/src/worker/job_runners/config/split_names_from_info.py @@ -11,7 +11,7 @@ from libcommon.exceptions import PreviousStepFormatError from libcommon.simple_cache import get_previous_step_or_raise -from worker.dtos import CompleteJobResult, JobRunnerInfo, SplitItem, SplitsList +from worker.dtos import CompleteJobResult, FullSplitItem, JobRunnerInfo, SplitsList from worker.job_runners.config.config_job_runner import ConfigJobRunner @@ -45,7 +45,7 @@ def compute_split_names_from_info_response(dataset: str, config: str) -> SplitsL except Exception as e: raise PreviousStepFormatError("Previous step 'config-info' did not return the expected content.") from e - split_name_items: List[SplitItem] = [ + split_name_items: List[FullSplitItem] = [ {"dataset": dataset, "config": config, "split": str(split)} for split in splits_content ] diff --git a/services/worker/src/worker/job_runners/config/split_names_from_streaming.py b/services/worker/src/worker/job_runners/config/split_names_from_streaming.py index 174e182a1f..da52d7f8d4 100644 --- a/services/worker/src/worker/job_runners/config/split_names_from_streaming.py +++ b/services/worker/src/worker/job_runners/config/split_names_from_streaming.py @@ -17,7 +17,7 @@ SplitNamesFromStreamingError, ) -from worker.dtos import CompleteJobResult, JobRunnerInfo, SplitItem, SplitsList +from worker.dtos import CompleteJobResult, FullSplitItem, JobRunnerInfo, SplitsList from worker.job_runners.config.config_job_runner import ConfigJobRunnerWithDatasetsCache @@ -59,7 +59,7 @@ def compute_split_names_from_streaming_response( """ logging.info(f"get split names for dataset={dataset}, 
config={config}") try: - split_name_items: List[SplitItem] = [ + split_name_items: List[FullSplitItem] = [ {"dataset": dataset, "config": config, "split": str(split)} for split in get_dataset_split_names(path=dataset, config_name=config, token=hf_token) ] diff --git a/services/worker/src/worker/job_runners/dataset/split_names.py b/services/worker/src/worker/job_runners/dataset/split_names.py index 06aa92227c..32a474ebfa 100644 --- a/services/worker/src/worker/job_runners/dataset/split_names.py +++ b/services/worker/src/worker/job_runners/dataset/split_names.py @@ -10,11 +10,11 @@ from libcommon.simple_cache import get_best_response, get_previous_step_or_raise from worker.dtos import ( - ConfigItem, + FullConfigItem, DatasetSplitNamesResponse, FailedConfigItem, JobResult, - SplitItem, + FullSplitItem, ) from worker.job_runners.dataset.dataset_job_runner import DatasetJobRunner @@ -48,8 +48,8 @@ def compute_dataset_split_names_response(dataset: str) -> Tuple[DatasetSplitName split_names_cache_kinds = ["config-split-names-from-info", "config-split-names-from-streaming"] try: - splits: List[SplitItem] = [] - pending: List[ConfigItem] = [] + splits: List[FullSplitItem] = [] + pending: List[FullConfigItem] = [] failed: List[FailedConfigItem] = [] total = 0 for config in config_names: @@ -60,7 +60,7 @@ def compute_dataset_split_names_response(dataset: str) -> Tuple[DatasetSplitName "No response (successful or erroneous) found in cache for the previous steps" f" '{split_names_cache_kinds}' for this dataset." 
) - pending.append(ConfigItem({"dataset": dataset, "config": config})) + pending.append(FullConfigItem({"dataset": dataset, "config": config})) continue if best_response.response["http_status"] != HTTPStatus.OK: logging.debug(f"No successful response found in the previous steps {split_names_cache_kinds}.") @@ -76,7 +76,7 @@ def compute_dataset_split_names_response(dataset: str) -> Tuple[DatasetSplitName continue splits.extend( [ - SplitItem({"dataset": dataset, "config": config, "split": split_content["split"]}) + FullSplitItem({"dataset": dataset, "config": config, "split": split_content["split"]}) for split_content in best_response.response["content"]["splits"] ] ) From f7b5cab3f73b33aa999340baa0d52dff8817ad4e Mon Sep 17 00:00:00 2001 From: Test User Date: Thu, 10 Aug 2023 23:34:55 +0000 Subject: [PATCH 03/10] =?UTF-8?q?style:=20=F0=9F=92=84=20fix=20style?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- services/worker/src/worker/job_runners/dataset/split_names.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/worker/src/worker/job_runners/dataset/split_names.py b/services/worker/src/worker/job_runners/dataset/split_names.py index 32a474ebfa..aca1a59c02 100644 --- a/services/worker/src/worker/job_runners/dataset/split_names.py +++ b/services/worker/src/worker/job_runners/dataset/split_names.py @@ -10,11 +10,11 @@ from libcommon.simple_cache import get_best_response, get_previous_step_or_raise from worker.dtos import ( - FullConfigItem, DatasetSplitNamesResponse, FailedConfigItem, - JobResult, + FullConfigItem, FullSplitItem, + JobResult, ) from worker.job_runners.dataset.dataset_job_runner import DatasetJobRunner From b544b7e244e63dfd26c8aa6d7b7439a914b78441 Mon Sep 17 00:00:00 2001 From: Test User Date: Fri, 11 Aug 2023 15:03:28 +0000 Subject: [PATCH 04/10] =?UTF-8?q?fix:=20=F0=9F=90=9B=20the=20client=20shou?= =?UTF-8?q?ld=20be=20able=20to=20fix=204xx=20errors?= MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit here: the error is on the server, so: changing it to 500. by the way, we don't have any error of this kind in the cache, which is good since it would reveal a bug in the code. --- libs/libcommon/src/libcommon/exceptions.py | 2 +- .../worker/tests/job_runners/config/test_config_job_runner.py | 1 - .../worker/tests/job_runners/dataset/test_dataset_job_runner.py | 1 - .../worker/tests/job_runners/split/test_split_job_runner.py | 1 - 4 files changed, 1 insertion(+), 4 deletions(-) diff --git a/libs/libcommon/src/libcommon/exceptions.py b/libs/libcommon/src/libcommon/exceptions.py index e237dfe3b7..fd4c17e955 100644 --- a/libs/libcommon/src/libcommon/exceptions.py +++ b/libs/libcommon/src/libcommon/exceptions.py @@ -384,7 +384,7 @@ class ParameterMissingError(CacheableError): def __init__(self, message: str, cause: Optional[BaseException] = None): super().__init__( message=message, - status_code=HTTPStatus.BAD_REQUEST, + status_code=HTTPStatus.INTERNAL_SERVER_ERROR, code="ParameterMissingError", cause=cause, disclose_cause=False, diff --git a/services/worker/tests/job_runners/config/test_config_job_runner.py b/services/worker/tests/job_runners/config/test_config_job_runner.py index 77a9ebecb7..c14ed92cd0 100644 --- a/services/worker/tests/job_runners/config/test_config_job_runner.py +++ b/services/worker/tests/job_runners/config/test_config_job_runner.py @@ -45,7 +45,6 @@ def test_failed_creation(test_processing_step: ProcessingStep, app_config: AppCo app_config=app_config, ) assert exc_info.value.code == "ParameterMissingError" - assert exc_info.value.status_code == HTTPStatus.BAD_REQUEST def test_success_creation(test_processing_step: ProcessingStep, app_config: AppConfig) -> None: diff --git a/services/worker/tests/job_runners/dataset/test_dataset_job_runner.py b/services/worker/tests/job_runners/dataset/test_dataset_job_runner.py index 4fe427c9e5..5ef3aee93b 100644 --- 
a/services/worker/tests/job_runners/dataset/test_dataset_job_runner.py +++ b/services/worker/tests/job_runners/dataset/test_dataset_job_runner.py @@ -46,7 +46,6 @@ def test_failed_creation(test_processing_step: ProcessingStep, app_config: AppCo app_config=app_config, ) assert exc_info.value.code == "ParameterMissingError" - assert exc_info.value.status_code == HTTPStatus.BAD_REQUEST def test_success_creation(test_processing_step: ProcessingStep, app_config: AppConfig) -> None: diff --git a/services/worker/tests/job_runners/split/test_split_job_runner.py b/services/worker/tests/job_runners/split/test_split_job_runner.py index 2afde62345..1489f499db 100644 --- a/services/worker/tests/job_runners/split/test_split_job_runner.py +++ b/services/worker/tests/job_runners/split/test_split_job_runner.py @@ -46,7 +46,6 @@ def test_failed_creation(test_processing_step: ProcessingStep, app_config: AppCo app_config=app_config, ) assert exc_info.value.code == "ParameterMissingError" - assert exc_info.value.status_code == HTTPStatus.BAD_REQUEST def test_success_creation(test_processing_step: ProcessingStep, app_config: AppConfig) -> None: From 36178f6bef38710170612e4aa3aec2d7060a2385 Mon Sep 17 00:00:00 2001 From: Test User Date: Fri, 11 Aug 2023 16:11:49 +0000 Subject: [PATCH 05/10] =?UTF-8?q?feat:=20=F0=9F=8E=B8=20document=20501?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/source/openapi.json | 278 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 274 insertions(+), 4 deletions(-) diff --git a/docs/source/openapi.json b/docs/source/openapi.json index 19c962a0ac..7a047b1d5d 100644 --- a/docs/source/openapi.json +++ b/docs/source/openapi.json @@ -119,7 +119,7 @@ { "$ref": "#/components/schemas/X-Error-Code-UnexpectedError" } ] }, - "required": true + "required": false }, "X-Error-Code-500-common": { "description": "A string that identifies the underlying error for 500 on /parquet. 
It's marked as required: false because the header can be missing on text-plain response.", @@ -134,7 +134,7 @@ } ] }, - "required": true + "required": false }, "X-Error-Code-500-rows": { "description": "A string that identifies the underlying error for 500 on /rows. It's marked as required: false because the header can be missing on text-plain response.", @@ -146,7 +146,7 @@ } ] }, - "required": true + "required": false }, "X-Error-Code-500-search": { "description": "A string that identifies the underlying error for 500 on /search. It's marked as required: false because the header can be missing on text-plain response.", @@ -158,7 +158,7 @@ } ] }, - "required": true + "required": false }, "X-Error-Code-500-valid": { "description": "A string that identifies the underlying error for 500 on /valid. It's marked as required: false because the header can be missing on text-plain response.", @@ -167,6 +167,20 @@ { "$ref": "#/components/schemas/X-Error-Code-UnexpectedError" } ] }, + "required": false + }, + "X-Error-Code-501": { + "description": "A string that identifies the underlying error for 501.", + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/X-Error-Code-DatasetInBlockListError" + }, + { + "$ref": "#/components/schemas/X-Error-Code-DatasetWithTooManyConfigsError" + } + ] + }, "required": true } }, @@ -1165,6 +1179,16 @@ } } }, + "X-Error-Code-DatasetInBlockListError": { + "type": "string", + "const": "DatasetInBlockListError", + "description": "The dataset is in the list of blocked datasets." + }, + "X-Error-Code-DatasetWithTooManyConfigsError": { + "type": "string", + "const": "DatasetWithTooManyConfigsError", + "description": "The number of configs of a dataset exceeded the limit." 
+ }, "X-Error-Code-ExternalAuthenticatedError": { "type": "string", "const": "ExternalAuthenticatedError", @@ -1781,6 +1805,36 @@ } } } + }, + "501": { + "description": "The server does not implement the feature.", + "headers": { + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, + "Access-Control-Allow-Origin": { + "$ref": "#/components/headers/Access-Control-Allow-Origin" + }, + "X-Error-Code": { + "$ref": "#/components/headers/X-Error-Code-501" + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CustomError" + }, + "examples": { + "too many configs in the dataset": { + "summary": "The dataset has too many configs. The server does not support more than 3,000 configs.", + "description": "Try with dataset=facebook/flores", + "value": { + "error": "The maximum number of configs allowed is 3000, dataset has 41617 configs." + } + } + } + } + } } }, "requestBody": { @@ -2282,6 +2336,28 @@ } } } + }, + "501": { + "description": "The server does not implement the feature.", + "headers": { + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, + "Access-Control-Allow-Origin": { + "$ref": "#/components/headers/Access-Control-Allow-Origin" + }, + "X-Error-Code": { + "$ref": "#/components/headers/X-Error-Code-501" + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CustomError" + }, + "examples": {} + } + } } } } @@ -2727,6 +2803,28 @@ } } } + }, + "501": { + "description": "The server does not implement the feature.", + "headers": { + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, + "Access-Control-Allow-Origin": { + "$ref": "#/components/headers/Access-Control-Allow-Origin" + }, + "X-Error-Code": { + "$ref": "#/components/headers/X-Error-Code-501" + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CustomError" + }, + "examples": {} + } + } } } } @@ -3023,6 +3121,36 @@ } } } + }, + 
"501": { + "description": "The server does not implement the feature.", + "headers": { + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, + "Access-Control-Allow-Origin": { + "$ref": "#/components/headers/Access-Control-Allow-Origin" + }, + "X-Error-Code": { + "$ref": "#/components/headers/X-Error-Code-501" + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CustomError" + }, + "examples": { + "blocked dataset": { + "summary": "The dataset is blocked manually on the server.", + "description": "Try with /search?dataset=echarlaix/vqa-lxmert&config=vqa&split=validation&query=test", + "value": { + "error": "The parquet conversion has been disabled for this dataset for now. Please open an issue in https://github.com/huggingface/datasets-server if you want this dataset to be supported." + } + } + } + } + } } } } @@ -3423,6 +3551,36 @@ } } } + }, + "501": { + "description": "The server does not implement the feature.", + "headers": { + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, + "Access-Control-Allow-Origin": { + "$ref": "#/components/headers/Access-Control-Allow-Origin" + }, + "X-Error-Code": { + "$ref": "#/components/headers/X-Error-Code-501" + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CustomError" + }, + "examples": { + "blocked dataset": { + "summary": "The dataset is blocked manually on the server.", + "description": "Try with /parquet?dataset=echarlaix/vqa-lxmert&config=vqa", + "value": { + "error": "The parquet conversion has been disabled for this dataset for now. Please open an issue in https://github.com/huggingface/datasets-server if you want this dataset to be supported." 
+ } + } + } + } + } } } } @@ -3920,6 +4078,36 @@ } } } + }, + "501": { + "description": "The server does not implement the feature.", + "headers": { + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, + "Access-Control-Allow-Origin": { + "$ref": "#/components/headers/Access-Control-Allow-Origin" + }, + "X-Error-Code": { + "$ref": "#/components/headers/X-Error-Code-501" + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CustomError" + }, + "examples": { + "blocked dataset": { + "summary": "The dataset is blocked manually on the server.", + "description": "Try with /info?dataset=echarlaix/vqa-lxmert&config=vqa", + "value": { + "error": "The parquet conversion has been disabled for this dataset for now. Please open an issue in https://github.com/huggingface/datasets-server if you want this dataset to be supported." + } + } + } + } + } } } } @@ -4141,6 +4329,36 @@ } } } + }, + "501": { + "description": "The server does not implement the feature.", + "headers": { + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, + "Access-Control-Allow-Origin": { + "$ref": "#/components/headers/Access-Control-Allow-Origin" + }, + "X-Error-Code": { + "$ref": "#/components/headers/X-Error-Code-501" + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CustomError" + }, + "examples": { + "blocked dataset": { + "summary": "The dataset is blocked manually on the server.", + "description": "Try with /size?dataset=echarlaix/vqa-lxmert&config=vqa", + "value": { + "error": "The parquet conversion has been disabled for this dataset for now. Please open an issue in https://github.com/huggingface/datasets-server if you want this dataset to be supported." 
+ } + } + } + } + } } } } @@ -4294,6 +4512,28 @@ } } } + }, + "501": { + "description": "The server does not implement the feature.", + "headers": { + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, + "Access-Control-Allow-Origin": { + "$ref": "#/components/headers/Access-Control-Allow-Origin" + }, + "X-Error-Code": { + "$ref": "#/components/headers/X-Error-Code-501" + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CustomError" + }, + "examples": {} + } + } } } } @@ -4714,6 +4954,36 @@ } } } + }, + "501": { + "description": "The server does not implement the feature.", + "headers": { + "Cache-Control": { + "$ref": "#/components/headers/Cache-Control" + }, + "Access-Control-Allow-Origin": { + "$ref": "#/components/headers/Access-Control-Allow-Origin" + }, + "X-Error-Code": { + "$ref": "#/components/headers/X-Error-Code-501" + } + }, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CustomError" + }, + "examples": { + "blocked dataset": { + "summary": "The dataset is blocked manually on the server.", + "description": "Try with /statistics?dataset=echarlaix/vqa-lxmert&config=vqa&split=validation", + "value": { + "error": "The parquet conversion has been disabled for this dataset for now. Please open an issue in https://github.com/huggingface/datasets-server if you want this dataset to be supported." 
+ } + } + } + } + } } } } From 46d267e88b3036453cca84a29bd0b5b9945780b8 Mon Sep 17 00:00:00 2001 From: Test User Date: Fri, 11 Aug 2023 16:21:42 +0000 Subject: [PATCH 06/10] =?UTF-8?q?fix:=20=F0=9F=90=9B=20improve=20coherency?= =?UTF-8?q?=20in=20examples=20description?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit always give a URL --- docs/source/openapi.json | 92 ++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/docs/source/openapi.json b/docs/source/openapi.json index 7a047b1d5d..c426cc0620 100644 --- a/docs/source/openapi.json +++ b/docs/source/openapi.json @@ -1684,7 +1684,7 @@ "examples": { "all splits in a dataset": { "summary": "duorc: two configs, six splits", - "description": "try it at https://datasets-server.huggingface.co/splits?dataset=duorc.", + "description": "Try with https://datasets-server.huggingface.co/splits?dataset=duorc.", "value": { "splits": [ { @@ -1724,7 +1724,7 @@ }, "splits for a single config": { "summary": "emotion has two configs. Setting config=unsplit only returns the splits for this config.", - "description": "try it at https://datasets-server.huggingface.co/splits?dataset=emotion&config=unsplit.", + "description": "Try with https://datasets-server.huggingface.co/splits?dataset=emotion&config=unsplit.", "value": { "splits": [ { @@ -1769,7 +1769,7 @@ "examples": { "error in the dataset itself": { "summary": "The dataset is empty, or a file is missing, or some other error that prevents the response to be created.", - "description": "Try with dataset=severo/empty", + "description": "Try with https://datasets-server.huggingface.co/splits?dataset=severo/empty", "value": { "error": "The dataset is empty.", "cause_exception": "EmptyDatasetError", @@ -1827,7 +1827,7 @@ "examples": { "too many configs in the dataset": { "summary": "The dataset has too many configs. 
The server does not support more than 3,000 configs.", - "description": "Try with dataset=facebook/flores", + "description": "Try with https://datasets-server.huggingface.co/splits?dataset=facebook/flores", "value": { "error": "The maximum number of configs allowed is 3000, dataset has 41617 configs." } @@ -1890,7 +1890,7 @@ "examples": { "A simple dataset (imdb) with text and label": { "summary": "Text, and label column. Only 3 rows are shown for brevity.", - "description": "Try it at https://datasets-server.huggingface.co/first-rows?dataset=imdb&config=plain_text&split=train.", + "description": "Try with https://datasets-server.huggingface.co/first-rows?dataset=imdb&config=plain_text&split=train.", "value": { "dataset": "imdb", "config": "plain_text", @@ -1943,7 +1943,7 @@ }, "Truncated cells": { "summary": "Truncated cells due to the response size (has a timestamp column). Only 3 rows are shown for brevity.", - "description": "Try it at https://datasets-server.huggingface.co/first-rows?dataset=ett&config=m2&split=test.", + "description": "Try with https://datasets-server.huggingface.co/first-rows?dataset=ett&config=m2&split=test.", "value": { "dataset": "ett", "config": "m2", @@ -2041,7 +2041,7 @@ }, "Image column": { "summary": "A column with images. Only 3 rows are shown for brevity.", - "description": "Try it at https://datasets-server.huggingface.co/first-rows?dataset=huggan/horse2zebra&config=huggan--horse2zebra-aligned&split=train.", + "description": "Try with https://datasets-server.huggingface.co/first-rows?dataset=huggan/horse2zebra&config=huggan--horse2zebra-aligned&split=train.", "value": { "dataset": "huggan/horse2zebra", "config": "huggan--horse2zebra-aligned", @@ -2116,7 +2116,7 @@ }, "Audio column": { "summary": "A column with audio files. 
Only 3 rows are shown for brevity.", - "description": "Try it at https://datasets-server.huggingface.co/first-rows?dataset=asapp%2Fslue&config=voxceleb&split=train.", + "description": "Try with https://datasets-server.huggingface.co/first-rows?dataset=asapp%2Fslue&config=voxceleb&split=train.", "value": { "dataset": "asapp/slue", "config": "voxceleb", @@ -2283,7 +2283,7 @@ "examples": { "error in the dataset itself": { "summary": "An error while processing the dataset prevents the response to be created.", - "description": "Try with /first-rows?dataset=atomic&config=atomic&split=train", + "description": "Try with https://datasets-server.huggingface.co/first-rows?dataset=atomic&config=atomic&split=train", "value": { "error": "Cannot load the dataset split (in streaming mode) to extract the first rows.", "cause_exception": "FileNotFoundError", @@ -2447,7 +2447,7 @@ "examples": { "A slice of a simple dataset (imdb)": { "summary": "Get a slice of length 3 from row 234 (offset=234&length=3).", - "description": "Try it at https://datasets-server.huggingface.co/rows?dataset=imdb&config=plain_text&split=train&offset=234&length=3.", + "description": "Try with https://datasets-server.huggingface.co/rows?dataset=imdb&config=plain_text&split=train&offset=234&length=3.", "value": { "features": [ { @@ -2495,7 +2495,7 @@ }, "A slice of an image dataset (huggan/horse2zebra)": { "summary": "Get a slice of length 3 from row 234 (offset=234&length=3).", - "description": "Try it at https://datasets-server.huggingface.co/rows?dataset=huggan/horse2zebra&config=huggan--horse2zebra-aligned&split=train&offset=234&length=3.", + "description": "Try with https://datasets-server.huggingface.co/rows?dataset=huggan/horse2zebra&config=huggan--horse2zebra-aligned&split=train&offset=234&length=3.", "value": { "features": [ { @@ -2564,7 +2564,7 @@ }, "Audio is not supported at the moment (example: asapp/slue)": { "summary": "Get a slice of length 3 from row 234 (offset=234&length=3). 
The audio column is 'null'", - "description": "Try it at https://datasets-server.huggingface.co/rows?dataset=asapp/slue&config=voxceleb&split=train&offset=234&length=3.", + "description": "Try with https://datasets-server.huggingface.co/rows?dataset=asapp/slue&config=voxceleb&split=train&offset=234&length=3.", "value": { "features": [ { @@ -2690,7 +2690,7 @@ }, "error in the dataset itself": { "summary": "An error while processing the dataset prevents the response to be created.", - "description": "Try with /rows?dataset=atomic&config=atomic&split=train. It's a bug, it should be a 500 error, see https://github.com/huggingface/datasets-server/issues/1661.", + "description": "Try with https://datasets-server.huggingface.co/rows?dataset=atomic&config=atomic&split=train. It's a bug, it should be a 500 error, see https://github.com/huggingface/datasets-server/issues/1661.", "value": { "error": "Not found." } } } @@ -2929,7 +2929,7 @@ "examples": { "A slice of a simple dataset (imdb)": { "summary": "The first 3 rows that match the 'dog' search query (query=dog&length=3).", - "description": "Try it at https://datasets-server.huggingface.co/search?dataset=imdb&config=plain_text&split=train&query=dog&length=3.", + "description": "Try with https://datasets-server.huggingface.co/search?dataset=imdb&config=plain_text&split=train&query=dog&length=3.", "value": { "features": [ { @@ -3086,7 +3086,7 @@ "examples": { "error in the dataset itself": { "summary": "An error while processing the dataset prevents the response to be created.", - "description": "Try with /search?dataset=atomic&config=atomic&split=train&query=dog", + "description": "Try with https://datasets-server.huggingface.co/search?dataset=atomic&config=atomic&split=train&query=dog", "value": { "error": "Couldn't get the size of external files in `_split_generators` because a request failed:\n404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\nPlease consider moving your data files 
in this dataset repository instead (e.g. inside a data/ folder).", "cause_exception": "HTTPError", @@ -3143,7 +3143,7 @@ "examples": { "blocked dataset": { "summary": "The dataset is blocked manually on the server.", - "description": "Try with /search?dataset=echarlaix/vqa-lxmert&config=vqa&split=validation&query=test", + "description": "Try with https://datasets-server.huggingface.co/search?dataset=echarlaix/vqa-lxmert&config=vqa&split=validation&query=test", "value": { "error": "The parquet conversion has been disabled for this dataset for now. Please open an issue in https://github.com/huggingface/datasets-server if you want this dataset to be supported." } @@ -3200,7 +3200,7 @@ "examples": { "duorc": { "summary": "duorc: six parquet files, one per split", - "description": "Try with /parquet?dataset=duorc", + "description": "Try with https://datasets-server.huggingface.co/parquet?dataset=duorc", "value": { "parquet_files": [ { @@ -3259,7 +3259,7 @@ }, "duorc with ParaphraseRC config": { "summary": "duorc: three parquet files for ParaphraseRC, one per split", - "description": "Try with /parquet?dataset=duorc&config=ParaphraseRC", + "description": "Try with https://datasets-server.huggingface.co/parquet?dataset=duorc&config=ParaphraseRC", "value": { "parquet_files": [ { @@ -3304,7 +3304,7 @@ }, "sharded parquet files": { "summary": "alexandrainst/da-wit: the parquet file for the train split is partitioned into 17 shards", - "description": "Try with /parquet?dataset=alexandrainst/da-wit", + "description": "Try with https://datasets-server.huggingface.co/parquet?dataset=alexandrainst/da-wit", "value": { "parquet_files": [ { @@ -3467,7 +3467,7 @@ }, "dataset where no parquet file could be created": { "summary": "When the parquet files cannot be created for a configuration, it's listed in 'failed'.", - "description": "Try with /parquet?dataset=atomic", + "description": "Try with https://datasets-server.huggingface.co/parquet?dataset=atomic", "value": { 
"parquet_files": [], "pending": [], @@ -3516,7 +3516,7 @@ "examples": { "error in the dataset itself": { "summary": "An error while processing the dataset prevents the response to be created.", - "description": "Try with /parquet?dataset=atomic&config=atomic", + "description": "Try with https://datasets-server.huggingface.co/parquet?dataset=atomic&config=atomic", "value": { "error": "Couldn't get the size of external files in `_split_generators` because a request failed:\n404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\nPlease consider moving your data files in this dataset repository instead (e.g. inside a data/ folder).", "cause_exception": "HTTPError", @@ -3573,7 +3573,7 @@ "examples": { "blocked dataset": { "summary": "The dataset is blocked manually on the server.", - "description": "Try with /parquet?dataset=echarlaix/vqa-lxmert&config=vqa", + "description": "Try with https://datasets-server.huggingface.co/parquet?dataset=echarlaix/vqa-lxmert&config=vqa", "value": { "error": "The parquet conversion has been disabled for this dataset for now. Please open an issue in https://github.com/huggingface/datasets-server if you want this dataset to be supported." 
} @@ -3614,7 +3614,7 @@ "examples": { "an example of the response format (only kept the first values in each array for brevity)": { "summary": "list of datasets", - "description": "Try with /valid", + "description": "Try with https://datasets-server.huggingface.co/valid", "value": { "preview": [ "0n1xus/codexglue", @@ -3718,7 +3718,7 @@ "examples": { "all the capabilities": { "summary": "valid dataset", - "description": "Try with /is-valid?dataset=glue", + "description": "Try with https://datasets-server.huggingface.co/is-valid?dataset=glue", "value": { "preview": true, "viewer": true, @@ -3727,7 +3727,7 @@ }, "only preview": { "summary": "dataset with only preview", - "description": "Try with /is-valid?dataset=ehartford/dolphin", + "description": "Try with https://datasets-server.huggingface.co/is-valid?dataset=ehartford/dolphin", "value": { "preview": true, "viewer": false, @@ -3736,7 +3736,7 @@ }, "no capabilities": { "summary": "dataset with no capabilities", - "description": "Try with /is-valid?dataset=atomic", + "description": "Try with https://datasets-server.huggingface.co/is-valid?dataset=atomic", "value": { "preview": false, "viewer": false, @@ -3745,7 +3745,7 @@ }, "all the capabilities, for a config": { "summary": "valid config", - "description": "Try with /is-valid?dataset=glue&config=ax", + "description": "Try with https://datasets-server.huggingface.co/is-valid?dataset=glue&config=ax", "value": { "preview": true, "viewer": true, @@ -3754,7 +3754,7 @@ }, "all the capabilities, for a split": { "summary": "valid split", - "description": "Try with /is-valid?dataset=glue&config=ax&split=test", + "description": "Try with https://datasets-server.huggingface.co/is-valid?dataset=glue&config=ax&split=test", "value": { "preview": true, "viewer": true, @@ -3858,7 +3858,7 @@ "examples": { "dataset metadata": { "summary": "metadata of a dataset. 
It's an object, with one key per config", - "description": "Try with /info?dataset=mnist", + "description": "Try with https://datasets-server.huggingface.co/info?dataset=mnist", "value": { "dataset_info": { "mnist": { @@ -3946,7 +3946,7 @@ }, "config metadata": { "summary": "metadata for a dataset config", - "description": "Try with /info?dataset=glue&config=ax", + "description": "Try with https://datasets-server.huggingface.co/info?dataset=glue&config=ax", "value": { "dataset_info": { "description": "GLUE, the General Language Understanding Evaluation benchmark\n(https://gluebenchmark.com/) is a collection of resources for training,\nevaluating, and analyzing natural language understanding systems.\n\n", @@ -3994,7 +3994,7 @@ }, "dataset metadata with failed configs": { "summary": "metadata of a dataset which has failed configs. The failed configs are listed in 'failed'.", - "description": "Try with /info?dataset=atomic", + "description": "Try with https://datasets-server.huggingface.co/info?dataset=atomic", "value": { "dataset_info": {}, "pending": [], @@ -4043,7 +4043,7 @@ "examples": { "error in the dataset itself": { "summary": "An error while processing the dataset prevents the response to be created.", - "description": "Try with /info?dataset=atomic&config=atomic", + "description": "Try with https://datasets-server.huggingface.co/info?dataset=atomic&config=atomic", "value": { "error": "Couldn't get the size of external files in `_split_generators` because a request failed:\n404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\nPlease consider moving your data files in this dataset repository instead (e.g. 
inside a data/ folder).", "cause_exception": "HTTPError", @@ -4100,7 +4100,7 @@ "examples": { "blocked dataset": { "summary": "The dataset is blocked manually on the server.", - "description": "Try with /info?dataset=echarlaix/vqa-lxmert&config=vqa", + "description": "Try with https://datasets-server.huggingface.co/info?dataset=echarlaix/vqa-lxmert&config=vqa", "value": { "error": "The parquet conversion has been disabled for this dataset for now. Please open an issue in https://github.com/huggingface/datasets-server if you want this dataset to be supported." } @@ -4157,7 +4157,7 @@ "examples": { "dataset size": { "summary": "size of a dataset.", - "description": "Try with /size?dataset=mnist", + "description": "Try with https://datasets-server.huggingface.co/size?dataset=mnist", "value": { "size": { "dataset": { @@ -4206,7 +4206,7 @@ }, "config size": { "summary": "size of a dataset config", - "description": "Try with /size?dataset=glue&config=ax", + "description": "Try with https://datasets-server.huggingface.co/size?dataset=glue&config=ax", "value": { "size": { "config": { @@ -4235,7 +4235,7 @@ }, "dataset size with failed configs": { "summary": "size of a dataset which has failed configs. 
The failed configs are listed in 'failed'.", - "description": "Try with /size?dataset=atomic", + "description": "Try with https://datasets-server.huggingface.co/size?dataset=atomic", "value": { "size": { "dataset": { @@ -4294,7 +4294,7 @@ "examples": { "error in the dataset itself": { "summary": "An error while processing the dataset prevents the response to be created.", - "description": "Try with /size?dataset=atomic&config=atomic", + "description": "Try with https://datasets-server.huggingface.co/size?dataset=atomic&config=atomic", "value": { "error": "Couldn't get the size of external files in `_split_generators` because a request failed:\n404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\nPlease consider moving your data files in this dataset repository instead (e.g. inside a data/ folder).", "cause_exception": "HTTPError", @@ -4351,7 +4351,7 @@ "examples": { "blocked dataset": { "summary": "The dataset is blocked manually on the server.", - "description": "Try with /size?dataset=echarlaix/vqa-lxmert&config=vqa", + "description": "Try with https://datasets-server.huggingface.co/size?dataset=echarlaix/vqa-lxmert&config=vqa", "value": { "error": "The parquet conversion has been disabled for this dataset for now. Please open an issue in https://github.com/huggingface/datasets-server if you want this dataset to be supported." 
} @@ -4411,7 +4411,7 @@ "examples": { "number of URLS for a dataset": { "summary": "number of URLs for a dataset.", - "description": "Try with /opt-in-out-urls?dataset=conceptual_captions", + "description": "Try with https://datasets-server.huggingface.co/opt-in-out-urls?dataset=conceptual_captions", "value": { "urls_columns": ["image_url"], "has_urls_columns": true, @@ -4424,7 +4424,7 @@ }, "number of URLS for a config": { "summary": "number of URLs for a config.", - "description": "Try with /opt-in-out-urls?dataset=conceptual_captions&config=labeled", + "description": "Try with https://datasets-server.huggingface.co/opt-in-out-urls?dataset=conceptual_captions&config=labeled", "value": { "urls_columns": ["image_url"], "has_urls_columns": true, @@ -4437,7 +4437,7 @@ }, "number of URLS for a split": { "summary": "number of URLs for a split.", - "description": "Try with /opt-in-out-urls?dataset=conceptual_captions&config=labeled&split=train", + "description": "Try with https://datasets-server.huggingface.co/opt-in-out-urls?dataset=conceptual_captions&config=labeled&split=train", "value": { "has_urls_columns": true, "num_opt_in_urls": 0, @@ -4450,7 +4450,7 @@ }, "dataset that has no image URLs columns": { "summary": "no image URLs columns: values are zero.", - "description": "Try with /opt-in-out-urls?dataset=mnist", + "description": "Try with https://datasets-server.huggingface.co/opt-in-out-urls?dataset=mnist", "value": { "urls_columns": [], "has_urls_columns": false, @@ -4586,7 +4586,7 @@ "examples": { "A split (mstz/wine) with numeric columns": { "summary": "Statistics on numeric columns.", - "description": "Try it at https://datasets-server.huggingface.co/statistics?dataset=mstz/wine&config=wine&split=train.", + "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=mstz/wine&config=wine&split=train.", "value": { "num_examples": 6497, "statistics": [ @@ -4857,7 +4857,7 @@ }, "A split (mnist) with a label column": { "summary": 
"Statistics on a class label column. The image column is not processed.", - "description": "Try it at https://datasets-server.huggingface.co/statistics?dataset=mnist&config=mnist&split=train.", + "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=mnist&config=mnist&split=train.", "value": { "num_examples": 60000, "statistics": [ @@ -4919,7 +4919,7 @@ "examples": { "error in the dataset itself": { "summary": "An error while processing the dataset prevents the response to be created.", - "description": "Try with /statistics?dataset=atomic&config=atomic&split=train", + "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=atomic&config=atomic&split=train", "value": { "error": "Couldn't get the size of external files in `_split_generators` because a request failed:\n404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\nPlease consider moving your data files in this dataset repository instead (e.g. inside a data/ folder).", "cause_exception": "HTTPError", @@ -4976,7 +4976,7 @@ "examples": { "blocked dataset": { "summary": "The dataset is blocked manually on the server.", - "description": "Try with /statistics?dataset=echarlaix/vqa-lxmert&config=vqa&split=validation", + "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=echarlaix/vqa-lxmert&config=vqa&split=validation", "value": { "error": "The parquet conversion has been disabled for this dataset for now. Please open an issue in https://github.com/huggingface/datasets-server if you want this dataset to be supported." 
} From 28aeaebca9110e9be92533359475bfff11307ebb Mon Sep 17 00:00:00 2001 From: Test User Date: Fri, 11 Aug 2023 16:41:41 +0000 Subject: [PATCH 07/10] =?UTF-8?q?docs:=20=E2=9C=8F=EF=B8=8F=20move=20opena?= =?UTF-8?q?pi=20to=20the=20docs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- chart/values.yaml | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/chart/values.yaml b/chart/values.yaml index fbffde0b26..5e2bd89b04 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -71,7 +71,6 @@ images: repository: datasets-server-services-worker tag: sha-fb3399a - common: # URL of the HuggingFace Hub hfEndpoint: "" @@ -87,7 +86,7 @@ secrets: enabled: false secretName: "" secretStoreName: "" - parameters: { } + parameters: {} mongoUrl: fromSecret: false secretName: "mongo-url" @@ -211,13 +210,13 @@ optInOutUrlsScan: columnsMaxNumber: 10 # the max number of columns to scan maxConcurrentRequestsNumber: 10 - # the max concurrent request number + # the max concurrent request number maxRequestsPerSecond: 20 # the max number of request allowed to process in parallel per second rowsMaxNumber: 1_000 # the max number of rows to scan urlsNumberPerBatch: 1_000 - # the number of grouped urls to be send in every request to spawning + # the number of grouped urls to be send in every request to spawning spawningUrl: "https://opts-api.spawningaiapi.com/api/v2/query/urls" # the URL for spawning requests @@ -245,7 +244,6 @@ cachedAssets: # When cleaning the cached assets directory: maximum number of rows to discard. maxCleanedRowsNumber: 10000 - parquetMetadata: # Directory on the shared storage (parquet metadata files used for random access in /rows) storageDirectory: "/storage/parquet-metadata" @@ -263,7 +261,7 @@ duckDBIndex: urlTemplate: "/datasets/%s/resolve/%s/%s" # the maximum size of the split parquets. 
maxParquetSizeBytes: "100_000_000" - # the time interval at which a downloaded index will be considered as expired and will be deleted + # the time interval at which a downloaded index will be considered as expired and will be deleted expiredTimeIntervalSeconds: 600 descriptiveStatistics: @@ -324,7 +322,6 @@ backfill: cpu: 0 tolerations: [] - deleteIndexes: enabled: true log: @@ -341,7 +338,6 @@ deleteIndexes: cpu: 0 tolerations: [] - queueMetricsCollector: enabled: true action: "collect-queue-metrics" @@ -355,7 +351,6 @@ queueMetricsCollector: cpu: 0 tolerations: [] - cacheMetricsCollector: enabled: true action: "collect-cache-metrics" @@ -542,8 +537,7 @@ search: tolerations: [] workers: - - - # name of the deployment + - # name of the deployment deployName: "all" # max difficulty of the jobs that this worker will process workerDifficultyMax: 100 From 729b4b1aa3fb99677ca4162ed3478c8c6050cd9a Mon Sep 17 00:00:00 2001 From: Test User Date: Fri, 11 Aug 2023 17:51:31 +0000 Subject: [PATCH 08/10] =?UTF-8?q?style:=20=F0=9F=92=84=20remove=20unused?= =?UTF-8?q?=20imports?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../worker/tests/job_runners/config/test_config_job_runner.py | 2 -- .../worker/tests/job_runners/dataset/test_dataset_job_runner.py | 2 -- .../worker/tests/job_runners/split/test_split_job_runner.py | 2 -- 3 files changed, 6 deletions(-) diff --git a/services/worker/tests/job_runners/config/test_config_job_runner.py b/services/worker/tests/job_runners/config/test_config_job_runner.py index c14ed92cd0..57f60cd18a 100644 --- a/services/worker/tests/job_runners/config/test_config_job_runner.py +++ b/services/worker/tests/job_runners/config/test_config_job_runner.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright 2023 The HuggingFace Authors. 
-from http import HTTPStatus - import pytest from libcommon.exceptions import CustomError from libcommon.processing_graph import ProcessingStep diff --git a/services/worker/tests/job_runners/dataset/test_dataset_job_runner.py b/services/worker/tests/job_runners/dataset/test_dataset_job_runner.py index 5ef3aee93b..a34f747b22 100644 --- a/services/worker/tests/job_runners/dataset/test_dataset_job_runner.py +++ b/services/worker/tests/job_runners/dataset/test_dataset_job_runner.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright 2023 The HuggingFace Authors. -from http import HTTPStatus - import pytest from libcommon.exceptions import CustomError from libcommon.processing_graph import ProcessingStep diff --git a/services/worker/tests/job_runners/split/test_split_job_runner.py b/services/worker/tests/job_runners/split/test_split_job_runner.py index 1489f499db..e6d6c4eab4 100644 --- a/services/worker/tests/job_runners/split/test_split_job_runner.py +++ b/services/worker/tests/job_runners/split/test_split_job_runner.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright 2023 The HuggingFace Authors. 
-from http import HTTPStatus - import pytest from libcommon.exceptions import CustomError from libcommon.processing_graph import ProcessingStep From 0a36b4626d32a43708954cc0940102b9b16244b8 Mon Sep 17 00:00:00 2001 From: Test User Date: Fri, 11 Aug 2023 19:09:03 +0000 Subject: [PATCH 09/10] =?UTF-8?q?test:=20=F0=9F=92=8D=20fix=20e2e=20that?= =?UTF-8?q?=20use=20openapi=20(contract=20testing)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- e2e/tests/test_12_splits.py | 34 ++++++++++++++++++--------------- e2e/tests/test_13_first_rows.py | 26 +++++++++++-------------- e2e/tests/utils.py | 29 +++++++++++++++++++++++----- 3 files changed, 54 insertions(+), 35 deletions(-) diff --git a/e2e/tests/test_12_splits.py b/e2e/tests/test_12_splits.py index 26017ae784..504c560ead 100644 --- a/e2e/tests/test_12_splits.py +++ b/e2e/tests/test_12_splits.py @@ -7,47 +7,51 @@ @pytest.mark.parametrize( - "status,name,dataset,error_code", + "status,name,dataset,config,error_code", [ - # (200, "duorc", "duorc", None), - # (200, "emotion", "emotion", None), + # (200, "all splits in a dataset", "duorc", None, None), + # (200, "splits for a single config", "emotion", "unsplit", None) ( 401, - "inexistent-dataset", + "inexistent dataset, and not authenticated", "severo/inexistent-dataset", + None, "ExternalUnauthenticatedError", ), # ( # 401, # "gated-dataset", - # "severo/dummy_gated", + # "severo/dummy_gated", None, # "ExternalUnauthenticatedError", # ), # ( # 401, # "private-dataset", - # "severo/dummy_private", + # "severo/dummy_private", None, # "ExternalUnauthenticatedError", # ), - (422, "empty-parameter", "", "MissingRequiredParameter"), - (422, "missing-parameter", None, "MissingRequiredParameter"), - # (500, "SplitsNotFoundError", "natural_questions", "SplitsNamesError"), - # (500, "FileNotFoundError", "akhaliq/test", "SplitsNamesError"), - # (500, "not-ready", "severo/fix-401", "SplitsResponseNotReady"), + (422, "missing dataset 
parameter", "", None, "MissingRequiredParameter"), + (422, "empty dataset parameter", None, None, "MissingRequiredParameter"), + # (500, "SplitsNotFoundError", "natural_questions", None, "SplitsNamesError"), + # (500, "FileNotFoundError", "akhaliq/test", None, "SplitsNamesError"), + # (500, "not-ready", "severo/fix-401", None, "SplitsResponseNotReady"), # not tested: 'internal_error' ], ) -def test_splits_using_openapi(status: int, name: str, dataset: str, error_code: str) -> None: +def test_splits_using_openapi(status: int, name: str, dataset: str, config: str, error_code: str) -> None: body = get_openapi_body_example("/splits", status, name) + config_query = f"&config={config}" if config else "" - if name == "empty-parameter": + if name == "empty dataset parameter": r_splits = poll("/splits?dataset=", error_field="error") - elif name == "missing-parameter": + elif name == "missing dataset parameter": r_splits = poll("/splits", error_field="error") else: post_refresh(dataset) # poll the endpoint before the worker had the chance to process it - r_splits = get(f"/splits?dataset={dataset}") if name == "not-ready" else poll_splits(dataset) + r_splits = ( + get(f"/splits?dataset={dataset}{config_query}") if name == "not-ready" else poll_splits(dataset, config) + ) assert r_splits.status_code == status, f"{r_splits.status_code} - {r_splits.text}" assert r_splits.json() == body, r_splits.text diff --git a/e2e/tests/test_13_first_rows.py b/e2e/tests/test_13_first_rows.py index b59eb8a70a..dd96bbfe40 100644 --- a/e2e/tests/test_13_first_rows.py +++ b/e2e/tests/test_13_first_rows.py @@ -27,18 +27,18 @@ def prepare_json(response: requests.Response) -> Any: [ ( 401, - "inexistent-dataset", + "inexistent dataset, and not authenticated", "severo/inexistent-dataset", "plain_text", "train", "ExternalUnauthenticatedError", ), - (422, "missing-dataset", None, "plain_text", "train", "MissingRequiredParameter"), - (422, "missing-config", "imdb", None, "train", 
"MissingRequiredParameter"), - (422, "missing-split", "imdb", "plain_text", None, "MissingRequiredParameter"), - (422, "empty-dataset", "", "plain_text", "train", "MissingRequiredParameter"), - (422, "empty-config", "imdb", "", "train", "MissingRequiredParameter"), - (422, "empty-split", "imdb", "plain_text", "", "MissingRequiredParameter"), + (422, "missing required parameter", None, "plain_text", "train", "MissingRequiredParameter"), + (422, "missing required parameter", "imdb", None, "train", "MissingRequiredParameter"), + (422, "missing required parameter", "imdb", "plain_text", None, "MissingRequiredParameter"), + (422, "empty required parameter", "", "plain_text", "train", "MissingRequiredParameter"), + (422, "empty required parameter", "imdb", "", "train", "MissingRequiredParameter"), + (422, "empty required parameter", "imdb", "plain_text", "", "MissingRequiredParameter"), ], ) def test_first_rows(status: int, name: str, dataset: str, config: str, split: str, error_code: str) -> None: @@ -46,9 +46,9 @@ def test_first_rows(status: int, name: str, dataset: str, config: str, split: st # the logic here is a bit convoluted, because we have no way to refresh a split, we have to refresh the whole # dataset and depend on the result of /splits - if name.startswith("empty-"): + if name == "empty required parameter": r_rows = poll(f"/first-rows?dataset={dataset}&config={config}&split={split}", error_field="error") - elif name.startswith("missing-"): + elif name == "missing required parameter": d = f"dataset={dataset}" if dataset is not None else "" c = f"config={config}" if config is not None else "" s = f"split={split}" if split is not None else "" @@ -56,12 +56,8 @@ def test_first_rows(status: int, name: str, dataset: str, config: str, split: st r_rows = poll(f"/first-rows?{params}", error_field="error") else: post_refresh(dataset) - poll_splits(dataset) - if name == "not-ready": - # poll the endpoint before the worker had the chance to process it - r_rows = 
get(f"/first-rows?dataset={dataset}&config={config}&split={split}") - else: - r_rows = poll_first_rows(dataset, config, split) + poll_splits(dataset, config) + r_rows = poll_first_rows(dataset, config, split) assert r_rows.status_code == status, f"{r_rows.status_code} - {r_rows.text}" assert prepare_json(r_rows) == body, r_rows.text diff --git a/e2e/tests/utils.py b/e2e/tests/utils.py index 22c1ef2990..15e9b6a9db 100644 --- a/e2e/tests/utils.py +++ b/e2e/tests/utils.py @@ -79,8 +79,9 @@ def poll_parquet(dataset: str, headers: Optional[Headers] = None) -> Response: return poll(f"/parquet?dataset={dataset}", error_field="error", headers=headers) -def poll_splits(dataset: str, headers: Optional[Headers] = None) -> Response: - return poll(f"/splits?dataset={dataset}", error_field="error", headers=headers) +def poll_splits(dataset: str, config: Optional[str], headers: Optional[Headers] = None) -> Response: + config_query = f"&config={config}" if config else "" + return poll(f"/splits?dataset={dataset}{config_query}", error_field="error", headers=headers) def poll_first_rows(dataset: str, config: str, split: str, headers: Optional[Headers] = None) -> Response: @@ -92,9 +93,27 @@ def get_openapi_body_example(path: str, status: int, example_name: str) -> Any: openapi_filename = root / "docs" / "source" / "openapi.json" with open(openapi_filename) as json_file: openapi = json.load(json_file) - return openapi["paths"][path]["get"]["responses"][str(status)]["content"]["application/json"]["examples"][ - example_name - ]["value"] + steps = [ + "paths", + path, + "get", + "responses", + str(status), + "content", + "application/json", + "examples", + example_name, + "value", + ] + result = openapi + for step in steps: + if "$ref" in result: + new_steps = result["$ref"].split("/")[1:] + result = openapi + for new_step in new_steps: + result = result[new_step] + result = result[step] + return result def get_default_config_split() -> Tuple[str, str]: From 
f67c74bcab0ac05beedadb3ebccaa99b2ec26006 Mon Sep 17 00:00:00 2001 From: Test User Date: Fri, 11 Aug 2023 19:15:04 +0000 Subject: [PATCH 10/10] =?UTF-8?q?style:=20=F0=9F=92=84=20remove=20unused?= =?UTF-8?q?=20import?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- e2e/tests/test_13_first_rows.py | 1 - 1 file changed, 1 deletion(-) diff --git a/e2e/tests/test_13_first_rows.py b/e2e/tests/test_13_first_rows.py index dd96bbfe40..beda30f853 100644 --- a/e2e/tests/test_13_first_rows.py +++ b/e2e/tests/test_13_first_rows.py @@ -9,7 +9,6 @@ from .utils import ( URL, - get, get_openapi_body_example, poll, poll_first_rows,