[ES] Added content for issue #1
da115115 committed Mar 21, 2020
1 parent 488c393 commit 68e9f68
Showing 4 changed files with 175 additions and 0 deletions.
72 changes: 72 additions & 0 deletions README.md
@@ -38,8 +38,23 @@ not for the data itself.
+ [Quality Assurance samples](https://github.com/service-delivery-quality/quality-assurance)
* [Geonames' QA dashboard](http://qa.geonames.org/qa/)
* [Quality Assurance (QA) images on Docker Cloud](https://cloud.docker.com/u/opentraveldata/repository/docker/opentraveldata/quality-assurance)
* [Induction on monitoring with Elasticsearch](https://github.com/infra-helpers/induction-monitoring)
* [How to set up a Python virtual environment](https://github.com/machine-learning-helpers/induction-python/tree/master/installation/virtual-env)

## ElasticSearch (ES)
* [ElasticSearch](https://elastic.co) stacks:
+ [EFK (ElasticSearch, Fluentd, Kibana)](https://docs.fluentd.org/v/0.12/articles/docker-logging-efk-compose)
+ [Kibana](https://www.elastic.co/products/kibana)
+ [Fluentd](https://www.fluentd.org/)
* [Elasticsearch geo-point](https://www.elastic.co/guide/en/elasticsearch/reference/current/geo-point.html)

### Ingest processors
* Main: https://www.elastic.co/guide/en/elasticsearch/reference/current/ingest-processors.html
* [Grok processor](https://www.elastic.co/guide/en/elasticsearch/reference/current/grok-processor.html)
* [CSV processor](https://www.elastic.co/guide/en/elasticsearch/reference/current/csv-processor.html)
* [Date processor](https://www.elastic.co/guide/en/elasticsearch/reference/current/date-processor.html)
* [Script processor](https://www.elastic.co/guide/en/elasticsearch/reference/current/script-processor.html)
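
Ingest processors run in order within a pipeline definition. As a minimal sketch (the `raw`, `iata_code`, `distance` and `timestamp` field names here are illustrative, not taken from the repository), a pipeline chaining the `csv`, `remove` and `date` processors listed above is just an ordered JSON document:

```python
import json

# Illustrative pipeline: parse a '^'-separated record with the csv
# processor, drop the raw field, then normalise the timestamp.
# Each processor runs in order on the ingested document.
pipeline = {
    "description": "Parse a '^'-separated QA record, then its timestamp",
    "processors": [
        {"csv": {"field": "raw", "separator": "^",
                 "target_fields": ["iata_code", "distance"]}},
        {"remove": {"field": "raw"}},
        {"date": {"field": "timestamp", "target_field": "timestamp",
                  "formats": ["yyyy-MM-dd HH:mm:ss"],
                  "timezone": "Europe/Paris"}},
    ],
}

# This serialised body is what would be PUT to /_ingest/pipeline/<name>.
body = json.dumps(pipeline, indent=2)
```

The same three-processor shape is used by the simulation file shipped in this commit (`elastic/optd-qa-index-sim-por-optd-geo-diff.json`).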

# Quick starter

## Through a pre-built Docker image
@@ -110,6 +125,63 @@ $ make checkers
$ pipenv run checkers/check-por-cmp-optd-unlc.py
```

## Elasticsearch

* Simulate the targeted pipeline:
```bash
$ curl -XPOST "http://localhost:9200/_ingest/pipeline/_simulate" -H "Content-Type: application/json" --data "@elastic/optd-qa-index-sim-por-optd-geo-diff.json" | jq
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
100 1435 100 496 100 939 62000 114k --:--:-- --:--:-- --:--:-- 175k
```
```json
{
"docs": [
{
"doc": {
"_index": "subway_info",
"_type": "_doc",
"_id": "AVvJZVQEBr2flFKzrrkr",
"_source": {
"iata_code": "DOH",
"optd_coord": {
"lon": "51.565056",
"lat": "25.261125"
},
"distance": "4.368154282573759",
"weighted_distance": "20197.72392862065",
"location_type": "C",
"geoname_id": "290030",
"country_code": "QA",
"page_rank": "0.4622857726179021",
"geo_coord": {
"lon": "51.53096",
"lat": "25.28545"
},
"adm1_code": "01",
"timestamp": "2020-03-20T15:12:23.000+01:00"
},
"_ingest": {
"timestamp": "2020-03-20T23:26:02.29742Z"
}
}
}
]
}
```
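
The `csv` processor's behaviour can be reproduced locally, which helps when debugging the order of `target_fields`. The following standalone Python sketch (the function name is illustrative, not part of the repository) splits the `^`-separated record and nests the dotted `optd_coord.*` / `geo_coord.*` fields into sub-objects, the way Elasticsearch does:

```python
# Field list, in the same order as the target_fields of the csv processor
# in elastic/optd-qa-index-sim-por-optd-geo-diff.json.
TARGET_FIELDS = [
    "iata_code", "geoname_id", "location_type", "country_code",
    "adm1_code", "page_rank", "optd_coord.lat", "optd_coord.lon",
    "distance", "geo_coord.lat", "geo_coord.lon", "weighted_distance",
]

def parse_optd_qa(record: str, separator: str = "^") -> dict:
    """Split the record and nest dotted field names into sub-objects."""
    doc = {}
    for field, value in zip(TARGET_FIELDS, record.split(separator)):
        if "." in field:
            parent, child = field.split(".", 1)
            doc.setdefault(parent, {})[child] = value
        else:
            doc[field] = value
    return doc

sample = ("DOH^290030^C^QA^01^0.4622857726179021^25.261125^51.565056"
          "^4.368154282573759^25.28545^51.53096^20197.72392862065")
doc = parse_optd_qa(sample)
print(doc["iata_code"], doc["optd_coord"])
# → DOH {'lat': '25.261125', 'lon': '51.565056'}
```

The resulting `doc` matches the `_source` of the simulated document shown above.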

### Todo
* [Issue #1](https://github.com/opentraveldata/quality-assurance/issues/1)

As of March 2020, the resulting CSV data files have various formats. Dumping
the corresponding content into Elasticsearch (ES) would require almost
one index per CSV file type, which would somewhat defeat the purpose of
using ES. Rather, it seems better to merge all the CSV file types into a
single format, allowing a single ES index to hold them all. Every CSV file
will then be tagged with its respective checking intent, making search and
time-series analysis much easier.
So, the next step is to merge all the formats of the CSV files.
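
That merging step could be sketched as follows. The file names and the `check_type` tag below are hypothetical, not the actual repository layout: each checker's CSV rows are rewritten into one shared schema and tagged with the checking intent, so that a single ES index can hold them all:

```python
import csv
import io

# Hypothetical mapping from a checker's output file to its checking intent.
CHECK_TYPES = {
    "results/optd-qa-por-optd-geo-diff.csv": "por-geo-diff",
    "results/optd-qa-por-big-city-around.csv": "big-city-around",
}

def tag_rows(csv_text: str, check_type: str, separator: str = "^"):
    """Yield each CSV row as a dict, tagged with its checking intent."""
    reader = csv.DictReader(io.StringIO(csv_text), delimiter=separator)
    for row in reader:
        row["check_type"] = check_type
        yield row

# Illustrative two-row file with a '^'-separated header line.
sample = "iata_code^distance\nDOH^4.368\nNCE^0.012\n"
rows = list(tag_rows(sample, "por-geo-diff"))
print(rows[0])
# → {'iata_code': 'DOH', 'distance': '4.368', 'check_type': 'por-geo-diff'}
```

Each emitted dict could then be bulk-indexed into the single ES index, with `check_type` available as a filter for search and time-series analysis.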

# Checks

## Points of Reference (POR)
29 changes: 29 additions & 0 deletions elastic/optd-qa-index-por-big-city-around.json
@@ -0,0 +1,29 @@
{
"mappings": {
"properties": {
"por_code": {
"type": "keyword"
},
"por_page_rank": {
"type": "double"
},
"por_feat_code": {
"type": "keyword"
},
"por_geo_id": {
"type": "long"
},
"city_code": {
"type": "keyword"
},
"city_page_rank": {
"type": "double"
},
"city_code_list": {
"type": "keyword"
},
"distance": {
"type": "double"
}
}
}
}
36 changes: 36 additions & 0 deletions elastic/optd-qa-index-por-optd-geo-diff.json
@@ -0,0 +1,36 @@
{
"mappings": {
"properties": {
"iata_code": {
"type": "keyword"
},
"geoname_id": {
"type": "long"
},
"location_type": {
"type": "keyword"
},
"country_code": {
"type": "keyword"
},
"adm1_code": {
"type": "keyword"
},
"page_rank": {
"type": "double"
},
"optd_coord": {
"type": "geo_point"
},
"geo_coord": {
"type": "geo_point"
},
"distance": {
"type": "double"
},
"weighted_distance": {
"type": "double"
}
}
}
}
38 changes: 38 additions & 0 deletions elastic/optd-qa-index-sim-por-optd-geo-diff.json
@@ -0,0 +1,38 @@
{
"pipeline": {
"description": "Parsing the OPTD QA CSV",
"processors": [
{
"csv": {
"field": "optd-qa",
"separator": "^",
"target_fields": ["iata_code", "geoname_id", "location_type", "country_code", "adm1_code", "page_rank", "optd_coord.lat", "optd_coord.lon", "distance", "geo_coord.lat", "geo_coord.lon", "weighted_distance"]
}
},
{
"remove": {
"field": "optd-qa"
}
},
{
"date" : {
"field" : "timestamp",
"target_field" : "timestamp",
"formats" : ["yyyy-MM-dd HH:mm:ss"],
"timezone" : "Europe/Paris"
}
}
]
},
"docs": [
{
"_index": "subway_info",
"_id": "AVvJZVQEBr2flFKzrrkr",
"_score": 1,
"_source": {
"timestamp": "2020-03-20 15:12:23",
"optd-qa": "DOH^290030^C^QA^01^0.4622857726179021^25.261125^51.565056^4.368154282573759^25.28545^51.53096^20197.72392862065"
}
}
]
}
