Replace canonical datasets with community ones in the docs/tests (#2579)

* replace glue with nyu-mll/glue (or mnist in one case) * remove unused pytest marks (somewhat unrelated to the PR, sorry) * fix ibm/duorc * fix assets URL * add statistics field in docs (from #2577) * replace emotion and c4 with their moved new name * replace squad with its moved version * rename datasets in openapi spec + add missing links to docs * fix test
huggingface · Mar 13, 2024 · f45af9c · f45af9c
1 parent db70a71
commit f45af9c
Show file tree

Hide file tree

Showing 25 changed files with 195 additions and 215 deletions.
diff --git a/docs/source/filter.md b/docs/source/filter.md
@@ -12,7 +12,7 @@ This guide shows you how to use Datasets Server's `/filter` endpoint to filter r
 Feel free to also try it out with [ReDoc](https://redocly.github.io/redoc/?url=https://datasets-server.huggingface.co/openapi.json#operation/filterRows).
 
 The `/filter` endpoint accepts the following query parameters:
-- `dataset`: the dataset name, for example `glue` or `mozilla-foundation/common_voice_10_0`
+- `dataset`: the dataset name, for example `nyu-mll/glue` or `mozilla-foundation/common_voice_10_0`
 - `config`: the configuration name, for example `cola`
 - `split`: the split name, for example `train`
 - `where`: the filter condition
@@ -88,7 +88,7 @@ The endpoint response is a JSON containing two keys (same format as [`/rows`](./
 
 The rows are ordered by the row index.
 
-For example, here are the `features` and the slice 150-151 of matching `rows` of the `ibm.duorc`/`SelfRC` train split for the `where` condition `no_answer=true`:
+For example, here are the `features` and the slice 150-151 of matching `rows` of the `ibm/duorc`/`SelfRC` train split for the `where` condition `no_answer=true`:
 
 ```json
 {
@@ -197,4 +197,4 @@ For example, here are the `features` and the slice 150-151 of matching `rows` of
 
 If the result has `partial: true` it means that the filtering couldn't be run on the full dataset because it's too big.
 
-Indeed, the indexing for `/filter` can be partial if the dataset is bigger than 5GB. In that case, it only uses the first 5GB.
+Indeed, the indexing for `/filter` can be partial if the dataset is bigger than 5GB. In that case, it only uses the first 5GB.
diff --git a/docs/source/first_rows.md b/docs/source/first_rows.md
@@ -8,7 +8,7 @@ This guide shows you how to use Datasets Server's `/first-rows` endpoint to prev
 
 The `/first-rows` endpoint accepts three query parameters:
 
-- `dataset`: the dataset name, for example `glue` or `mozilla-foundation/common_voice_10_0`
+- `dataset`: the dataset name, for example `nyu-mll/glue` or `mozilla-foundation/common_voice_10_0`
 - `config`: the configuration name, for example `cola`
 - `split`: the split name, for example `train`
 

diff --git a/docs/source/info.md b/docs/source/info.md
@@ -51,76 +51,50 @@ The endpoint response is a JSON with the `dataset_info` key. Its structure and c
 
 ```json
 {
-   "dataset_info":{
-      "description":"",
-      "citation":"",
-      "homepage":"",
-      "license":"",
-      "features":{
-         "plot_id":{
-            "dtype":"string",
-            "_type":"Value"
-         },
-         "plot":{
-            "dtype":"string",
-            "_type":"Value"
-         },
-         "title":{
-            "dtype":"string",
-            "_type":"Value"
-         },
-         "question_id":{
-            "dtype":"string",
-            "_type":"Value"
-         },
-         "question":{
-            "dtype":"string",
-            "_type":"Value"
-         },
-         "answers":{
-            "feature":{
-               "dtype":"string",
-               "_type":"Value"
-            },
-            "_type":"Sequence"
-         },
-         "no_answer":{
-            "dtype":"bool",
-            "_type":"Value"
-         }
+  "dataset_info": {
+    "description": "",
+    "citation": "",
+    "homepage": "",
+    "license": "",
+    "features": {
+      "plot_id": { "dtype": "string", "_type": "Value" },
+      "plot": { "dtype": "string", "_type": "Value" },
+      "title": { "dtype": "string", "_type": "Value" },
+      "question_id": { "dtype": "string", "_type": "Value" },
+      "question": { "dtype": "string", "_type": "Value" },
+      "answers": {
+        "feature": { "dtype": "string", "_type": "Value" },
+        "_type": "Sequence"
       },
-      "builder_name":"parquet",
-      "dataset_name":"duorc",
-      "config_name":"SelfRC",
-      "version":{
-         "version_str":"0.0.0",
-         "major":0,
-         "minor":0,
-         "patch":0
+      "no_answer": { "dtype": "bool", "_type": "Value" }
+    },
+    "builder_name": "parquet",
+    "dataset_name": "duorc",
+    "config_name": "SelfRC",
+    "version": { "version_str": "0.0.0", "major": 0, "minor": 0, "patch": 0 },
+    "splits": {
+      "train": {
+        "name": "train",
+        "num_bytes": 248966361,
+        "num_examples": 60721,
+        "dataset_name": null
       },
-      "splits":{
-         "train":{
-            "name":"train",
-            "num_bytes":248966361,
-            "num_examples":60721,
-            "dataset_name":null
-         },
-         "validation":{
-            "name":"validation",
-            "num_bytes":56359392,
-            "num_examples":12961,
-            "dataset_name":null
-         },
-         "test":{
-            "name":"test",
-            "num_bytes":51022318,
-            "num_examples":12559,
-            "dataset_name":null
-         }
+      "validation": {
+        "name": "validation",
+        "num_bytes": 56359392,
+        "num_examples": 12961,
+        "dataset_name": null
       },
-      "download_size":21001846,
-      "dataset_size":356348071
-   },
-   "partial":false
+      "test": {
+        "name": "test",
+        "num_bytes": 51022318,
+        "num_examples": 12559,
+        "dataset_name": null
+      }
+    },
+    "download_size": 21001846,
+    "dataset_size": 356348071
+  },
+  "partial": false
 }
-```
+```