diff --git a/docs/apidocs/bullet-bql/1.0.0/index.html b/docs/apidocs/bullet-bql/1.0.0/index.html new file mode 100644 index 00000000..210ebe8a --- /dev/null +++ b/docs/apidocs/bullet-bql/1.0.0/index.html @@ -0,0 +1 @@ +Replace me with the real documentation. diff --git a/docs/apidocs/bullet-bql/1.1.0/index.html b/docs/apidocs/bullet-bql/1.1.0/index.html new file mode 100644 index 00000000..210ebe8a --- /dev/null +++ b/docs/apidocs/bullet-bql/1.1.0/index.html @@ -0,0 +1 @@ +Replace me with the real documentation. diff --git a/docs/apidocs/bullet-core/1.0.0/index.html b/docs/apidocs/bullet-core/1.0.0/index.html new file mode 100644 index 00000000..210ebe8a --- /dev/null +++ b/docs/apidocs/bullet-core/1.0.0/index.html @@ -0,0 +1 @@ +Replace me with the real documentation. diff --git a/docs/apidocs/bullet-core/1.1.0/index.html b/docs/apidocs/bullet-core/1.1.0/index.html new file mode 100644 index 00000000..210ebe8a --- /dev/null +++ b/docs/apidocs/bullet-core/1.1.0/index.html @@ -0,0 +1 @@ +Replace me with the real documentation. diff --git a/docs/apidocs/bullet-core/1.2.0/index.html b/docs/apidocs/bullet-core/1.2.0/index.html new file mode 100644 index 00000000..210ebe8a --- /dev/null +++ b/docs/apidocs/bullet-core/1.2.0/index.html @@ -0,0 +1 @@ +Replace me with the real documentation. diff --git a/docs/apidocs/bullet-dsl/1.0.0/index.html b/docs/apidocs/bullet-dsl/1.0.0/index.html new file mode 100644 index 00000000..210ebe8a --- /dev/null +++ b/docs/apidocs/bullet-dsl/1.0.0/index.html @@ -0,0 +1 @@ +Replace me with the real documentation. diff --git a/docs/apidocs/bullet-dsl/1.0.1/index.html b/docs/apidocs/bullet-dsl/1.0.1/index.html new file mode 100644 index 00000000..210ebe8a --- /dev/null +++ b/docs/apidocs/bullet-dsl/1.0.1/index.html @@ -0,0 +1 @@ +Replace me with the real documentation. diff --git a/docs/apidocs/bullet-dsl/1.1.0/index.html b/docs/apidocs/bullet-dsl/1.1.0/index.html new file mode 100644 index 00000000..210ebe8a --- /dev/null +++ b/docs/apidocs/bullet-dsl/1.1.0/index.html @@ -0,0 +1 @@ +Replace me with the real documentation. diff --git a/docs/apidocs/bullet-kafka/1.0.0/index.html b/docs/apidocs/bullet-kafka/1.0.0/index.html new file mode 100644 index 00000000..210ebe8a --- /dev/null +++ b/docs/apidocs/bullet-kafka/1.0.0/index.html @@ -0,0 +1 @@ +Replace me with the real documentation. diff --git a/docs/apidocs/bullet-kafka/1.0.1/index.html b/docs/apidocs/bullet-kafka/1.0.1/index.html new file mode 100644 index 00000000..210ebe8a --- /dev/null +++ b/docs/apidocs/bullet-kafka/1.0.1/index.html @@ -0,0 +1 @@ +Replace me with the real documentation. diff --git a/docs/apidocs/bullet-pulsar/1.0.0/index.html b/docs/apidocs/bullet-pulsar/1.0.0/index.html new file mode 100644 index 00000000..210ebe8a --- /dev/null +++ b/docs/apidocs/bullet-pulsar/1.0.0/index.html @@ -0,0 +1 @@ +Replace me with the real documentation. diff --git a/docs/apidocs/bullet-record/1.0.0/index.html b/docs/apidocs/bullet-record/1.0.0/index.html new file mode 100644 index 00000000..210ebe8a --- /dev/null +++ b/docs/apidocs/bullet-record/1.0.0/index.html @@ -0,0 +1 @@ +Replace me with the real documentation. diff --git a/docs/apidocs/bullet-record/1.1.0/index.html b/docs/apidocs/bullet-record/1.1.0/index.html new file mode 100644 index 00000000..210ebe8a --- /dev/null +++ b/docs/apidocs/bullet-record/1.1.0/index.html @@ -0,0 +1 @@ +Replace me with the real documentation. 
diff --git a/docs/apidocs/bullet-service/1.0.0/index.html b/docs/apidocs/bullet-service/1.0.0/index.html new file mode 100644 index 00000000..210ebe8a --- /dev/null +++ b/docs/apidocs/bullet-service/1.0.0/index.html @@ -0,0 +1 @@ +Replace me with the real documentation. diff --git a/docs/apidocs/bullet-spark/1.0.0/index.html b/docs/apidocs/bullet-spark/1.0.0/index.html new file mode 100644 index 00000000..210ebe8a --- /dev/null +++ b/docs/apidocs/bullet-spark/1.0.0/index.html @@ -0,0 +1 @@ +Replace me with the real documentation. diff --git a/docs/apidocs/bullet-storm/1.0.0/index.html b/docs/apidocs/bullet-storm/1.0.0/index.html new file mode 100644 index 00000000..210ebe8a --- /dev/null +++ b/docs/apidocs/bullet-storm/1.0.0/index.html @@ -0,0 +1 @@ +Replace me with the real documentation. diff --git a/docs/backend/dsl.md b/docs/backend/dsl.md index c5b5a488..f583bf31 100644 --- a/docs/backend/dsl.md +++ b/docs/backend/dsl.md @@ -5,17 +5,14 @@ Bullet DSL is a configuration-based DSL that allows users to plug their data int To support this, Bullet DSL provides two major components. The first is for reading data from a pluggable data source (the *connectors* for talking to various data sources), and the second is for converting data (the *converters* for understanding your data formats) into [BulletRecords](ingestion.md). By enabling Bullet DSL in the Backend and configuring Bullet DSL, your backend will use the two components to read from the configured data source and convert the data into BulletRecords, without you having to write any code. -The three interfaces that the DSL uses are: +There is also an optional minor component that acts as the glue between the connectors and the converters. These are the *deserializers*. They exist if the data coming out of connector is of a format that cannot be understood by a converter. Typically, this happens for serialized data that needs to be deserialized first before a converter can understand it. -1. The **BulletConnector** : Bullet DSL's reading component -2. The **BulletRecordConverter** : Bullet DSL's converting component -3. The **Bullet Backend** : The implementation of Bullet on a Stream Processor - -There is also an optional BulletDeserializer component that sits between the Connector and the Converter to deserialize the data. +The four interfaces that the DSL uses are: -!!!note - - For the Backend, please refer to the DSL-specific Bullet Storm setup [here](storm-setup.md#using-bullet-dsl). (Currently, only Bullet Storm supports Bullet DSL.) +1. The **BulletConnector** : Bullet DSL's reading component +2. The **BulletDeserializer** : Bullet DSL's optional deserializing component +3. The **BulletRecordConverter** : Bullet DSL's converting component +4. The **Bullet Backend** : The implementation of Bullet on a Stream Processor ## BulletConnector @@ -135,6 +132,10 @@ bullet.dsl.converter.pojo.class.name: "com.your.package.YourPOJO" The MapBulletRecordConverter is used to convert Java Maps of Objects into BulletRecords. Without a schema, it simply inserts every entry in the Map into a BulletRecord without any type-checking. If the Map contains objects that are not types supported by the BulletRecord, you might have issues when serializing the record. +### JSONBulletRecordConverter + +The JSONBulletRecordConverter is used to convert String JSON representations of records into BulletRecords. 
Without a schema, it simply inserts every entry in the JSON object into a BulletRecord without any type-checking and it only uses the Double type for all numeric values (since it is unable to guess whether records might need a wider type). You should use a schema and mention the appropriate types if you want more specific numeric types for the fields in your record. If the JSON contains objects that are not types supported by the BulletRecord, you might have issues when serializing the record. + ### AvroBulletRecordConverter The AvroBulletRecordConverter is used to convert Avro records into BulletRecords. Without a schema, it inserts every field into a BulletRecord without any type-checking. With a schema, you get type-checking, and you can also specify a RECORD field, and the converter will accept Avro Records in addition to Maps, flattening them into the BulletRecord. @@ -146,7 +147,6 @@ The schema consists of a list of fields each described by a name, reference, typ 1. `name` : The name of the field in the BulletRecord 2. `reference` : The field to extract from the to-be-converted object 3. `type` : The type of the field -4. `subtype` : The subtype of any nested fields in this field (if any) When using the schema: @@ -154,8 +154,7 @@ When using the schema: 1. The `name` of the field in the schema will be the name of the field in the BulletRecord. 2. The `reference` of the field in the schema is the field/value to be extracted from an object when it is converted to a BulletRecord. 3. If the `reference` is null, it is assumed that the `name` and the `reference` are the same. -4. The `type` must be specified and will be used for type-checking. -5. The `subtype` must be specified for certain `type` values (`LIST`, `LISTOFMAP`, `MAP`, or `MAPOFMAP`). Otherwise, it must be null. +4. The `type` must be specified and can be used for type-checking. If you provide a schema and set the `bullet.dsl.converter.schema.type.check.enable` setting, then the converter will validate that the types in the source data matches the given type here. Otherwise, the type provided will be assumed. This is useful when initially using the DSL and you are not sure of the types. #### Types @@ -165,24 +164,34 @@ When using the schema: 4. FLOAT 5. DOUBLE 6. STRING -7. LIST -8. LISTOFMAP -9. MAP -10. MAPOFMAP -11. RECORD - -#### Subtypes - -1. BOOLEAN -2. INTEGER -3. LONG -4. FLOAT -5. DOUBLE -6. STRING - -!!!note "RECORD" - - For RECORD type, you should normally reference a Map. For each key-value pair in the Map, a field will be inserted into the BulletRecord. Hence, the name in a RECORD field is left empty. +7. BOOLEAN_MAP +8. INTEGER_MAP +9. LONG_MAP +10. FLOAT_MAP +11. DOUBLE_MAP +12. STRING_MAP +13. BOOLEAN_MAP_MAP +14. INTEGER_MAP_MAP +15. LONG_MAP_MAP +16. FLOAT_MAP_MAP +17. DOUBLE_MAP_MAP +18. STRING_MAP_MAP +19. BOOLEAN_LIST +20. INTEGER_LIST +21. LONG_LIST +22. FLOAT_LIST +23. DOUBLE_LIST +24. STRING_LIST +25. BOOLEAN_MAP_LIST +26. INTEGER_MAP_LIST +27. LONG_MAP_LIST +28. FLOAT_MAP_LIST +29. DOUBLE_MAP_LIST +30. STRING_MAP_LIST + +!!!note "Special Type for a RECORD" + + There is a special case where if you omit the `type` and the `name` for an entry in the schema, the reference is assumed to be a map containing arbitrary fields with types in the list above. You can use this if you have a map field that contains various objects with one or more types in the list above and want to flatten that map out into the target record using the respective types of each field in the map. 
The names of the fields in the map will be used as the top-level names in the resulting record. #### Example Schema @@ -195,13 +204,11 @@ When using the schema: }, { "name": "myBoolMap", - "type": "MAP", - "subtype": "BOOLEAN" + "type": "BOOLEAN_MAP" }, { "name": "myLongMapMap", - "type": "MAPOFMAP", - "subtype": "LONG" + "type": "LONG_MAP_MAP" }, { "name": "myIntFromSomeMap", @@ -217,10 +224,9 @@ When using the schema: "name": "myIntFromSomeNestedMapsAndLists", "reference": "someMap.nestedMap.nestedList.0", "type": "INTEGER" - }, + }, { - "reference" : "someMap", - "type": "RECORD" + "reference" : "someMap" } ] } @@ -228,7 +234,7 @@ When using the schema: ## BulletDeserializer -BulletDeserializer is an abstract Java class that can be implemented to deserialize/transform output from BulletConnector to input for BulletRecordConverter. It is an *optional* component and whether it's necessary or not depends on the output of your data sources. For example, if your KafkaConnector outputs byte arrays that are actually Java-serialized Maps, and you're using a MapBulletRecordConverter, you would use the JavaDeserializer, which would deserialize byte arrays into Java Maps for the converter. +BulletDeserializer is an abstract Java class that can be implemented to deserialize/transform output from BulletConnector to input for BulletRecordConverter. It is an *optional* component and whether it's necessary or not depends on the output of your data sources. If one is not needed, the `IdentityDeserializer` can be used. For example, if your KafkaConnector outputs byte arrays that are actually Java-serialized Maps, and you're using a MapBulletRecordConverter, you would use the JavaDeserializer, which would deserialize byte arrays into Java Maps for the converter. Currently, we support two BulletDeserializer implementations: diff --git a/docs/backend/ingestion.md b/docs/backend/ingestion.md index 6368bd7e..80a59e77 100644 --- a/docs/backend/ingestion.md +++ b/docs/backend/ingestion.md @@ -33,6 +33,7 @@ Data placed into a Bullet Record is strongly typed. We support these types curre 1. Map of Strings to any of the [Primitives](#primitives) 2. Map of Strings to any Map in 1 +3. List of any of the [Primitives](#primitives) 3. List of any Map in 1 With these types, it is unlikely you would have data that cannot be represented as Bullet Record but if you do, please let us know and we are more than willing to accommodate. diff --git a/docs/backend/spark-setup.md b/docs/backend/spark-setup.md index 8a725b79..69e1f4e6 100644 --- a/docs/backend/spark-setup.md +++ b/docs/backend/spark-setup.md @@ -12,7 +12,7 @@ Download the Bullet Spark standalone jar from [JCenter](http://jcenter.bintray.c If you are using Bullet Kafka as pluggable PubSub, you can download the fat jar from [JCenter](http://jcenter.bintray.com/com/yahoo/bullet/bullet-kafka/). Otherwise, you need to plug in your own PubSub jar or use the RESTPubSub built-into bullet-core and turned on in the API. -To use Bullet Spark, you need to implement your own [Data Producer Trait](https://github.com/bullet-db/bullet-spark/blob/master/src/main/scala/com/yahoo/bullet/spark/DataProducer.scala) with a JVM based project. You have two ways to implement it as described in the [Spark Architecture](spark-architecture.md#data-processing) section. You include the Bullet artifact and Spark dependencies in your pom.xml or other equivalent build tools. The artifacts are available through JCenter. 
Here is an example if you use Scala and Maven: +To use Bullet Spark, you need to implement your own [Data Producer Trait](https://github.com/bullet-db/bullet-spark/blob/master/src/main/scala/com/yahoo/bullet/spark/DataProducer.scala) with a JVM based project or you can use Bullet DSL (see below). If you choose to implement your own, you have two ways as described in the [Spark Architecture](spark-architecture.md#data-processing) section. You include the Bullet artifact and Spark dependencies in your pom.xml or other equivalent build tools. The artifacts are available through JCenter. Here is an example if you use Scala and Maven: ```xml @@ -65,9 +65,26 @@ To use Bullet Spark, you need to implement your own [Data Producer Trait](https: You can also add ```sources``` or ```javadoc``` if you want the sources or javadoc. +### Using Bullet DSL + +Instead of implementing your own Data Producer, you can also use the provided DSL receiver with [Bullet DSL](dsl.md). To do so, add the following settings to your YAML configuration: + +```yaml +# If true, enables the Bullet DSL data producer which can be configured to read from a custom data source. If enabled, +# the DSL data producer is used instead of the producer. +bullet.spark.dsl.data.producer.enable: true + +# If true, enables the deserializer between the Bullet DSL connector and converter components. Otherwise, this step is skipped. +bullet.spark.dsl.deserializer.enable: false +``` + +You may then use the appropriate DSL settings to point to the class names of the Connector and Converter you wish to use to read from your data source and convert it to BulletRecord instances. + +There is also a setting to enable [BulletDeserializer](dsl.md#bulletdeserializer), which is an optional component of Bullet DSL for deserializing data between reading and converting. + ## Launch -After you have implemented your own data producer and built a jar, you could launch your Bullet Spark application. Here is an example command for a [YARN cluster](https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/YARN.html). +After you have implemented your own data producer or used Bullet DSL and built a jar, you could launch your Bullet Spark application. Here is an example command for a [YARN cluster](https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/YARN.html). ```bash ./bin/spark-submit \ diff --git a/docs/backend/storm-architecture.md b/docs/backend/storm-architecture.md index 821ed258..1886687f 100644 --- a/docs/backend/storm-architecture.md +++ b/docs/backend/storm-architecture.md @@ -6,15 +6,21 @@ This section describes how the [Backend architecture](../index.md#backend) is im For Bullet on Storm, the Storm topology implements the backend piece from the full [Architecture](../index.md#architecture). The topology is implemented with the standard Storm spout and bolt components: -![Bullet Storm Topology](../img/topology-3.png) +![Bullet Storm Topology](../img/topology-4.svg) -The components in [Architecture](../index.md#architecture) have direct counterparts here. The Query spouts reading from the PubSub layer using plugged-in PubSub consumers make up the Request Processor. The Filter bolts and your plugin for your source of data (generally a spout but could be a topology) make up the Data Processor. The Join bolt, the Loop bolt and the Result bolt make up the Combiner. +The components in [Architecture](../index.md#architecture) have direct counterparts here. 
The Query spouts reading from the PubSub layer using plugged-in PubSub consumers make up the Request Processor. The Filter bolts and your plugin for your source of data (shown here plugged in with both the DSL spouts and DSL bolts from [Bullet DSL](dsl.md)) make up the Data Processor. The Join bolt, the Loop bolt and the Result bolt make up the Combiner. There are peripheral components such as the Loop bolts, Tick spouts or the Replay bolts to handle metadata management in the topology. -The red colored lines are the path for the queries that come in through the PubSub, the blue is for the data from your data source and the orange is for metadata and loop-back signals used internally by the backend. The pattern on the lines denote how the data (Storm tuples) is moved to the next component. Dashed indicates a broadcast (sent to all instances of the component), dotted indicates a key grouping (sent to a particular instance based on hashing on a particular field), and solid indicates a shuffle (randomly sent to an instance). +The red colored lines are the path for the queries that come in through the PubSub, the blue is for the data from your data source and the orange is for metadata and loop-back signals used internally by the backend. The purple lines highlight the most important components where queries and data intermix (the Filter and Join bolts). -!!! note "What's a Tick Spout?" +The pattern on the lines denotes how the data (Storm tuples) is moved to the next component. Dashed indicates a broadcast (sent to all instances of the component), dotted indicates a key grouping (sent to a particular instance based on hashing on a particular field), solid indicates a shuffle (randomly sent to an instance) and dashed-dotted indicates a custom grouping. - The Tick Spout component produces Storm tuples at predefined intervals to the Filter and Join Bolts. These tuples, called tick tuples, behave like CPU clock cycles for Bullet. Bullet performs all its system related activities on a tick. This includes purging stale queries, emitting left over data for queries, etc. We could have gone the route of having asynchronous threads that do the same thing but this was a far more simpler solution. The downside is that Bullet is as fast or as slow as its tick period, which can be configured on launch (defaults to ```100 ms```). In practice, this means that your time-based windows need to be at least twice as long as your tick period. +!!! note "What's a Replay?" + + The Replay and the [pluggable storage](storm-setup.md#storage) are *optional components* in Bullet on Storm. They exist to replay existing queries if you plugged in a storage layer into the [API](../ws/setup.md#storage). You would use this if you have long running queries and are not tolerant to losing queries for your use-case. Currently, we do not support storing intermediate results in the storage though. For instance, if you restart the topology but have storage and replay configured, you will recreate all the queries on startup but you will lose all intermediate results that were held in memory so far. We plan to add intermediate result storage as well soon! + +!!! note "What's a Tick spout?" + + The Tick spout component produces Storm tuples at predefined intervals to the Filter and Join bolts. These tuples, called tick tuples, behave like CPU clock cycles for Bullet. Bullet performs all its system related activities on a tick. This includes purging stale queries, emitting left over data for queries, etc. 
We could have gone the route of having asynchronous threads that do the same thing but this was a far simpler solution. The downside is that Bullet is as fast or as slow as its tick period, which can be configured on launch (defaults to ```100 ms```). In practice, this means that your time-based windows need to be at least twice as long as your tick period. As a practical example of how Bullet uses ticks: when the final data is emitted from the Filter bolts when the query has expired, the Join bolt receiving it waits for 3 (this is configurable) ticks after *its query* expires to collect all the last intermediate results from the Filter bolts. If the tick period is set as high as 5 s, this means that a query will take 3 * 5 or 15 s to get back after its expiry! Setting it to 1 s makes it 3 * 1 or 3 s. Similarly, intermediate windows are buffered (for certain kinds of windowed queries) to collect all results for that window before sending it back to the user. @@ -22,16 +28,21 @@ Bullet can accept arbitrary sources of data as long as they can be read from Storm. You can either: -1. Write a Storm spout that reads your data from where ever it is (Kafka etc) and [converts it to Bullet Records](ingestion.md). See [Quick Start](../quick-start/storm.md#storm-topology) for an example. +1. Write a Storm spout (or optionally a topology) that reads your data from wherever it is (Kafka etc) and [converts it to Bullet Records](ingestion.md). See [Quick Start](../quick-start/storm.md#storm-topology) for an example. 2. Hook up an existing topology that is doing something else directly to Bullet. You will still write and hook up a component that converts your data into Bullet Records in your existing topology. +3. Use [Bullet DSL](dsl.md) to configure a DSL spout (and optionally a DSL bolt) to use the DSL interfaces to automatically understand your data source with its data format and convert it to the interface Bullet uses without code. + +| | Option 1 | Option 2 | Option 3 | | ------------------------------------------------------------------------------------------- | -------- | -------- | -------- | | Write code to read from your data source and convert to Bullet records | Y | Y | N | | Write Storm spouts and/or bolts | Y | Y | N | | Saves a persistence/pubsub layer | N | Y | N | | Separate reading data from converting and allowing fan-out | N | Y | Y | | Full control over how data is read, processed and converted to Bullet records | Y | Y | N | -| | Pros | Cons | | -------- | -------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------ | | Option 1 | Very simple to get started. Just implement a spout | Need a storage layer that your spout pulls or some system has to push to your spouts | | Option 2 | Saves a persistence layer | Ties your topology to Bullet directly, making it affected by Storm Backpressure etc | | Option 2 | You can add bolts to do more processing on your data before sending it to Bullet | Increases the complexity of the topology | +Option 3 is generally flexible and is recommended. Having a code-less way to plug into Bullet is the fastest way to get started. We are adding new data sources and formats to the DSL so that we can support more ways to get your data into Bullet. If a connector or converter is not supported in DSL for your specific data source, you can also implement your own. 
It will save you from having to write Storm spouts or bolts and lets you reuse the Bullet DSL spout and/or bolt. -Your data is then emitted to the Filter bolt. If you have no queries in your system, the Filter bolt will promptly drop all Bullet Records and do absolutely nothing. If there are queries in the Filter bolt, the record is checked against the [filters](../index.md#filters) in each query and if it matches, it is processed by the query. Each query type can choose when to emit based on what window is configured for it. Depending on this, the matched record could be immediately emitted (if it is a RAW query or the intermediate aggregate if anything else) or it could be buffered till a specific time is reached (or the query has expired). +Regardless of how your data is read, it is then emitted to the Filter bolt. If you have no queries in your system, the Filter bolt will promptly drop all Bullet Records and do nothing. If there are queries in the Filter bolt, the record is checked against each query and if it matches, it is processed by the query. Each query type can choose when to emit its intermediate result based on what window is configured for it. Depending on this, the matched record could be immediately emitted (if it is a RAW query or the intermediate aggregate if anything else) or it could be buffered till a specific time is reached (or the query has expired). ### Request processing @@ -39,6 +50,11 @@ The Query spouts fetch Bullet queries through the PubSub layer using the Subscri The Query spout also key groups the query and additional query metadata to the Join bolts. This means that the query and the metadata will be end up at exactly one Join bolt. + +!!! note "Key grouping" + + Technically, Bullet uses a Custom grouping in Storm instead of Key grouping. The Custom grouping does the same thing as what the Key grouping would do in Storm. The reason why we use the Custom grouping is for Replay so that we can deterministically control the Join bolt component that will receive a query and reuse that information for certain aspects of replaying. This is just defensive programming in case Storm changes their key group algorithm to function differently. + ### Combining Since the data from the Query spout (query and metadata) and the data from all Filter bolts (intermediate results) is key grouped by the unique query id, only one particular Join bolt receives both the query and the intermediate results for a particular query. The Join bolt can then combine the intermediate results and produce a final result. This result is joined (hence the name) along with the metadata for the query and is shuffled to the Result bolts. This bolt then uses the particular Publisher from the plugged in PubSub layer and uses the metadata if it needs to and sends the results back through the PubSub layer to the requestor. @@ -49,14 +65,14 @@ Since the data from the Query spout (query and metadata) and the data from all F !!! note "Loop back" - We have not mentioned the loop components. These are mainly used to perform house-keeping within the topology. For instance, there is a Rate Limit concept in the Bullet core libraries that if violated in any instance of the query being executed, should cause the query to be killed. Wherever this error originates, it will trickle to the Loop bolt and be looped back through the PubSub, through the Query Spout and sent to all components that know about the query. These components will then kill the query as well. 
We call this a loop because strictly speaking, the topology is a Directed Acyclic Graph and we violate it by making a loop. These are also used to deliver external signals such as killing a query etc from the API or the UI. If you disable windows entirely, the Loop bolt will not be wired up when you launch your Bullet topology. + We have not mentioned the loop components or the replay bolts. These are mainly used to perform house-keeping within the topology or if you have configured storage/replay. For instance, there is a Rate Limit concept in the Bullet core libraries that if violated in any instance of the query being executed, should cause the query to be killed. Wherever this error originates, it will trickle to the Loop bolt and be looped back through the PubSub, through the Query spout and sent to all components that know about the query. These components will then kill the query as well. We call this a loop because strictly speaking, the topology is a Directed Acyclic Graph and we violate it by making a loop. These are also used to deliver external signals such as killing a query etc from the API or the UI. If you disable windows entirely, the Loop bolt will not be wired up when you launch your Bullet topology. ## Scalability The topology set up this way scales horizontally and has some nice properties: * If you want to scale for processing more data but the same amount of queries, you only need to scale the components that read your data (the spout reading the data or your custom topology) and the Filter bolts. - * If you want to scale for more queries but the same amount of data, you generally need to scale up the Filter Bolts. If you need it, you can scale the Query spouts, Join bolts, Loop bolts and Result bolts. You should ensure that your PubSub layer (if you're using the Storm DRPC PubSub layer, then this is the number of DRPC servers in your Storm cluster) can handle the volume of queries and results being sent through it. These components generally have low parallelisms compared to your data processing components since the data volume is generally much higher than your query volume, so this is generally not needed. + * If you want to scale for more queries but the same amount of data, you generally need to scale up the Filter bolts. If you need it, you can scale the Query spouts, Join bolts, Loop bolts and Result bolts. You should ensure that your PubSub layer (if you're using the Storm DRPC PubSub layer, then this is the number of DRPC servers in your Storm cluster) can handle the volume of queries and results being sent through it. These components generally have low parallelisms compared to your data processing components since the data volume is generally much higher than your query volume, so this is generally not needed. See [Scaling for more Queries](storm-performance.md#test-7-scaling-for-more-queries) and [Scaling for more Data](storm-performance.md#test-6-scaling-for-more-data) for more details. diff --git a/docs/backend/storm-performance.md b/docs/backend/storm-performance.md index eab8d918..142352f5 100644 --- a/docs/backend/storm-performance.md +++ b/docs/backend/storm-performance.md @@ -86,7 +86,7 @@ The parallelisms, CPU and memory settings for the components are listed below. ## Testing on Kafka 0.9.0.1 -For Tests 1 through 4, we read from a Kafka 0.9 cluster with the following configuration for the various Bullet components (unless specified). We use the single Spout model to read from the Kafka topic, partitioned into ```64``` partitions. 
+For Tests 1 through 4, we read from a Kafka 0.9 cluster with the following configuration for the various Bullet components (unless specified). We use the single spout model to read from the Kafka topic, partitioned into ```64``` partitions. ### Resource utilization @@ -94,12 +94,12 @@ For Tests 1 through 4, we read from a Kafka 0.9 cluster with the following confi | Component | Parallelism |CPU (cores) | On Heap Memory (MiB) | Off Heap Memory (MiB) |Total Memory (MiB) | | :------------------ | ----------: | ---------: | -------------------: | --------------------: | ----------------: | -| DataSource Spout |64 |64 |768.0 |192.0 |61440 | -| Filter Bolt |128 |128 |384.0 |192.0 |73728 | -| Join Bolt |2 |1 |384.0 |192.0 |1152 | -| DRPC Spout |2 |0.4 |128.0 |192.0 |640 | -| PrepareRequest Bolt |1 |0.2 |128.0 |192.0 |320 | -| ReturnResults Bolt |1 |0.2 |128.0 |192.0 |320 | +| DataSource spout |64 |64 |768.0 |192.0 |61440 | +| Filter bolt |128 |128 |384.0 |192.0 |73728 | +| Join bolt |2 |1 |384.0 |192.0 |1152 | +| DRPC spout |2 |0.4 |128.0 |192.0 |640 | +| PrepareRequest bolt |1 |0.2 |128.0 |192.0 |320 | +| ReturnResults bolt |1 |0.2 |128.0 |192.0 |320 | | IMetricsConsumer |1 |0.1 |128.0 |0 |128 | | Ackers |256 |25.6 |128.0 |0 |32768 | | **Total** |**455** |**219.5** | | |**170496** | @@ -124,19 +124,19 @@ storm jar ``` 1. The spout parallelism is 64 because it is going to read from a Kafka topic with 64 partitions (any more is meaningless since it cannot be split further). It reads and converts the data into Bullet Records. -2. We've fanned out from the spouts to the Filter Bolts by a ratio of 2. We may or may not need this. -3. We use ```topology.max.spout.pending=20000``` to limit the number of in-flight tuples there can be from a DataSource Spout instance and throttle it if too many queries are slowing down processing downstream. This is set pretty high to account for catch-up and skew in our Kafka partitions +2. We've fanned out from the spouts to the Filter bolts by a ratio of 2. We may or may not need this. +3. We use ```topology.max.spout.pending=20000``` to limit the number of in-flight tuples there can be from a DataSource spout instance and throttle it if too many queries are slowing down processing downstream. This is set pretty high to account for catch-up and skew in our Kafka partitions 4. We have set the max heap size for a worker to ```4 GiB``` since we do not want too large of a worker. If a component dies or a worker is killed by RAS, it will not affect too many other components. It also makes heap dumps etc manageable. 5. We set ```topology.worker.gc.childopts``` to use ```ParNewGC``` and ```CMS```. These are our cluster defaults but we are listing them here since this may not be true for all Storm clusters. We have also added the ```-XX:NewRatio=1``` to the defaults since most of our objects are short-lived and having a larger Young Generation reduces our Young Generation GC (ParNew) frequency. -6. We are using 256 acker tasks. There is acking from the DataSource Spout to the Filter Bolt and from the DRPCSpout and the PrepareRequestBolt, so about ~130 components will be acking. We could get away with using much less ackers as they are very light-weight. +6. We are using 256 acker tasks. There is acking from the DataSource spout to the Filter bolt and from the DRPCSpout and the PrepareRequestBolt, so about ~130 components will be acking. We could get away with using much less ackers as they are very light-weight. 
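To make the launch-time knobs above concrete, here is a minimal sketch of how those settings could be expressed as Storm topology configuration overrides. The keys are standard Storm settings; the values simply mirror this test setup and are not recommendations for every cluster.

```yaml
# Sketch of the Storm overrides discussed above; values mirror this test setup.
topology.max.spout.pending: 20000        # throttle in-flight tuples per spout instance
topology.worker.max.heap.size.mb: 4096   # keep workers at 4 GiB so failures stay contained
topology.acker.executors: 256            # acking only happens up to the Filter bolts
```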
## Test 1: Measuring the minimum latency of Bullet -We are [running this query](../ws/examples.md#simplest-query) in this test. This ```RAW``` query without any filters will serve to measure the intrinsic delay added by Bullet. The data record pulled out has a timestamp for when the record was emitted into Kafka, Bullet will inject the timestamp into the record when the Filter Bolt sends it on and the metadata collection logs timestamps for when the query was received and terminated. Using these, we can measure the end-to-end latency for getting one record through Bullet. +We are [running this query](../ws/examples.md#simplest-query) in this test. This ```RAW``` query without any filters will serve to measure the intrinsic delay added by Bullet. The data record pulled out has a timestamp for when the record was emitted into Kafka, Bullet will inject the timestamp into the record when the Filter bolt sends it on and the metadata collection logs timestamps for when the query was received and terminated. Using these, we can measure the end-to-end latency for getting one record through Bullet. ### Result -The following table shows the timestamps averaged by running **100** of these queries. The delays below are shown *relative* to the Query Received timestamp (when the query was received by Bullet at the Join Bolt). +The following table shows the timestamps averaged by running **100** of these queries. The delays below are shown *relative* to the Query Received timestamp (when the query was received by Bullet at the Join bolt).
@@ -147,7 +147,7 @@ The following table shows the timestamps averaged by running **100** of these qu | Query Received | 0 | | Query Finished | 1.66 | -The Bullet Filtered timestamp above is negative because the Filter Bolt received the query and emitted an arbitrary record ```2.16 ms``` before the Join Bolt received the query. The data was submitted into Kafka about ```710.75 ms``` before the query was received by Bullet and that difference is the processing time of Kafka and the time for our spouts to read the data into Bullet. +The Bullet Filtered timestamp above is negative because the Filter bolt received the query and emitted an arbitrary record ```2.16 ms``` before the Join bolt received the query. The data was submitted into Kafka about ```710.75 ms``` before the query was received by Bullet and that difference is the processing time of Kafka and the time for our spouts to read the data into Bullet. ### Conclusion @@ -157,7 +157,7 @@ Bullet adds a delay of a few ms - **```1.66 ms```** in the test above - to just The [last test](#test-1-measuring-the-minimum-latency-of-bullet) attempted to measure how long Bullet takes to pick out a record. Here we will measure how long it takes to find a record *that we generate*. This is the average of running **100** queries across a time interval of 30 minutes trying to filter for a record with a single unique value in a field [similar to this query](../ws/examples.md#simple-filtering). -We added a timestamp into the record when the record was initially read by the DataSource Spout. Using this and the Bullet Filtered timestamp and Query Finished timestamps, we can easily track the record through Bullet. +We added a timestamp into the record when the record was initially read by the DataSource spout. Using this and the Bullet Filtered timestamp and Query Finished timestamps, we can easily track the record through Bullet. Since we are looking at values in the data, the average data volume across this test was: ```Data: 76,000 R/s and 101 MiB/s``` @@ -179,7 +179,7 @@ Bullet received the record ```996.5 ms``` after the query was received. The dela ### Conclusion -We see that Bullet took on average ```1006.8 ms - 996.5 ms``` or **```10.3 ms```** from the time it saw the record first in DataSource Spout to finishing up the query and returning it in the Join Bolt. +We see that Bullet took on average ```1006.8 ms - 996.5 ms``` or **```10.3 ms```** from the time it saw the record first in DataSource spout to finishing up the query and returning it in the Join bolt. ## Test 3: Measuring the maximum number of parallel ```RAW``` queries @@ -187,7 +187,7 @@ This test runs a query similar to the [simple filtering query](../ws/examples.md ### What is meant by maximum? -We want to see how many of these queries we can have running simultaneously till the Filter Bolt is unable to process records from the spouts in time. If a Filter Bolt is unable to keep up with the rate of data produced by the spouts, our queries will not find all 10 records. Workers may start dying (killed by RAS for exceeding capacity) as well. We will be trying to find the number of queries in parallel that we can run without these happening. +We want to see how many of these queries we can have running simultaneously till the Filter bolt is unable to process records from the spouts in time. If a Filter bolt is unable to keep up with the rate of data produced by the spouts, our queries will not find all 10 records. Workers may start dying (killed by RAS for exceeding capacity) as well. 
We will be trying to find the number of queries in parallel that we can run without these happening. The average data volume across this test was: ```Data: 85,000 R/s and 126 MiB/s``` @@ -213,7 +213,7 @@ We ran a number of queries in parallel (you may have to use ```ulimit``` to chan ### Result -We were able to run 200 queries successfully but 300 and higher started causing our Filter Bolts to slow down. This slow down caused our spouts to be throttled and fall behind reading data. This caused the matching data to not show up in time during the queries. Some of our attempts would not return all the expected 10 records. +We were able to run 200 queries successfully but 300 and higher started causing our Filter bolts to slow down. This slow down caused our spouts to be throttled and fall behind reading data. This caused the matching data to not show up in time during the queries. Some of our attempts would not return all the expected 10 records. Using our metrics that were captured using our in-house metrics aggregation system (that our IMetricsConsumer publishes to), let's take a look at the CPU, Heap utilizations. @@ -221,8 +221,8 @@ Before you look at the figures: 1. All the figures below are for the same time interval. The X-axis represents time in ```1 minute``` intervals 2. [Figure 1](#figure-1-queries-running) shows the number of queries running for a time interval -3. The other figures show a metric across **all** the workers (JVMs) in the Storm topology, each running a mix of a components (spouts reading from Kafka, Filter Bolts etc) -4. The majority of the components (excluding ackers) are spouts reading from Kafka or Filter Bolts, so the figures can be taken to be primarily describing those workers +3. The other figures show a metric across **all** the workers (JVMs) in the Storm topology, each running a mix of a components (spouts reading from Kafka, Filter bolts etc) +4. The majority of the components (excluding ackers) are spouts reading from Kafka or Filter bolts, so the figures can be taken to be primarily describing those workers #### Figure 1. Queries running @@ -250,7 +250,7 @@ Before you look at the figures: !!! note "Garbage collection" - As we increase the number of queries sent into Bullet, more objects are created in the Filter and Join Bolts. These quickly fill up our heap and cause GCs. The zig-zags represents heaps being cleared after GC and filling back up quickly. Also, note that the CPU usage is directly related to the GC times. In other words, performance is pretty much directly correlated with the amount of GC we do. + As we increase the number of queries sent into Bullet, more objects are created in the Filter and Join bolts. These quickly fill up our heap and cause GCs. The zig-zags represents heaps being cleared after GC and filling back up quickly. Also, note that the CPU usage is directly related to the GC times. In other words, performance is pretty much directly correlated with the amount of GC we do. 
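Since performance here tracks garbage collection so closely, it may help to see the worker GC override mentioned in the launch notes in one place. This is a sketch of the ```ParNewGC```/```CMS``` setup with a larger young generation used in these tests, not a universal recommendation.

```yaml
# Sketch of the worker GC override used in these tests: ParNew + CMS with a larger
# young generation (NewRatio=1) to reduce ParNew frequency for short-lived objects.
topology.worker.gc.childopts: "-XX:+UseParNewGC -XX:+UseConcMarkSweepGC -XX:NewRatio=1"
```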
The following table summarizes these figures: @@ -291,12 +291,12 @@ Our resource utilization is now: | Component | Parallelism |CPU (cores) |On Heap Memory (MiB) | Off Heap Memory (MiB) | Total Memory (MiB) | | :------------------ | ----------: | ---------: | ------------------: | --------------------: | -----------------: | -| DataSource Spout |64 |64 |**1280.0** |192.0 | 94208 | -| Filter Bolt |128 |128 |**1024.0** |192.0 | 155648 | -| Join Bolt |2 |1 |384.0 |192.0 | 1152 | -| DRPC Spout |2 |0.4 |128.0 |192.0 | 640 | -| PrepareRequest Bolt |1 |0.2 |128.0 |192.0 | 320 | -| ReturnResults Bolt |1 |0.2 |128.0 |192.0 | 320 | +| DataSource spout |64 |64 |**1280.0** |192.0 | 94208 | +| Filter bolt |128 |128 |**1024.0** |192.0 | 155648 | +| Join bolt |2 |1 |384.0 |192.0 | 1152 | +| DRPC spout |2 |0.4 |128.0 |192.0 | 640 | +| PrepareRequest bolt |1 |0.2 |128.0 |192.0 | 320 | +| ReturnResults bolt |1 |0.2 |128.0 |192.0 | 320 | | IMetricsConsumer |1 |0.1 |128.0 |0 | 128 | | Ackers |256 |25.6 |128.0 |0 | 32768 | | **Total** |**455** |**219.5** | | | **285184** | @@ -349,7 +349,7 @@ With this change in heap usage, we could get to **```735```** of these queries s ## Testing on Kafka 0.10.0.1 -For this and subsequent tests, we upgraded our Kafka cluster to 0.10. We used the new Kafka consumer APIs to read *batches* of messages instead of a message at a time. We changed our DataSource Spout to read batches of messages (raw bytes) instead and added a DataSource Bolt that converts each batch message into Bullet records. Switching to this model let us be a lot more efficient in our data reading. +For this and subsequent tests, we upgraded our Kafka cluster to 0.10. We used the new Kafka consumer APIs to read *batches* of messages instead of a message at a time. We changed our DataSource spout to read batches of messages (raw bytes) instead and added a DataSource bolt that converts each batch message into Bullet records. Switching to this model let us be a lot more efficient in our data reading. To read more data, we will be trying to read a topic that is a superset of our data set so far and produces up to **13** times the number of records (maximum of 1.3 million records/sec) and **20** times the size of the data we were reading till now. This Kafka topic has **256** partitions. 
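As a rough sketch of what this batch-reading model implies on the consumer side, the Kafka 0.10 consumer could be configured along these lines. The property names are standard Kafka consumer configs; the broker address and group id are placeholders, and the 500-message cap matches the batch size mentioned later on this page.

```yaml
# Sketch of Kafka 0.10 consumer settings for the batch-reading model described above.
# Broker address and group id are placeholders; tune the values for your own cluster.
bootstrap.servers: "your-kafka-broker:9092"
group.id: "bullet-datasource-reader"
max.poll.records: 500      # each poll returns up to 500 messages, emitted as one batch tuple
enable.auto.commit: true
```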
@@ -361,13 +361,13 @@ Our average data volume across this test was: ```Data: 756,000 R/s and 3080 MiB/ | Component | Parallelism |CPU (cores) |On Heap Memory (MiB) |Off Heap Memory (MiB) | Total Memory (MiB) | | :------------------ | ----------: | ---------: | ------------------: | -------------------: | -----------------: | -| DataSource Spout |128 |128 |1024.0 |192.0 | 155648 | -| DataSource Bolt |256 |512 |2580.0 |192.0 | 709632 | -| Filter Bolt |512 |512 |1024.0 |192.0 | 622592 | -| Join Bolt |2 |1 |512.0 |192.0 | 1408 | -| DRPC Spout |2 |0.4 |128.0 |192.0 | 640 | -| PrepareRequest Bolt |1 |0.2 |128.0 |192.0 | 320 | -| ReturnResults Bolt |1 |0.2 |128.0 |192.0 | 320 | +| DataSource spout |128 |128 |1024.0 |192.0 | 155648 | +| DataSource bolt |256 |512 |2580.0 |192.0 | 709632 | +| Filter bolt |512 |512 |1024.0 |192.0 | 622592 | +| Join bolt |2 |1 |512.0 |192.0 | 1408 | +| DRPC spout |2 |0.4 |128.0 |192.0 | 640 | +| PrepareRequest bolt |1 |0.2 |128.0 |192.0 | 320 | +| ReturnResults bolt |1 |0.2 |128.0 |192.0 | 320 | | IMetricsConsumer |4 |0.4 |128.0 |0 | 512 | | Ackers |256 |25.6 |128.0 |0 | 32768 | | **Total** |**1162** |**1179.8** | | | **1523840** | @@ -393,7 +393,7 @@ We capped the GC threads to ```8``` and ```4```, which helps performance on our !!! note "Max Spout Pending is now 30 ?!" - We use ```topology.max.spout.pending``` as a way to throttle how fast we read from Kafka. There is no acking past the Filter Bolt. The maximum number of batch messages we read is ```500``` from Kafka. This makes our true max spout pending: ```500 * 30 = 15,000```. The tuple that is emitted from the spout is a large tuple that contains up to ```500``` records and we limit up to ```30``` of those to go unacked from any single spout before we throttle it. Since we have ```128``` spouts, we can have ```128 * 15,000``` messages unacked in the topology at any time at the most. + We use ```topology.max.spout.pending``` as a way to throttle how fast we read from Kafka. There is no acking past the Filter bolt. The maximum number of batch messages we read is ```500``` from Kafka. This makes our true max spout pending: ```500 * 30 = 15,000```. The tuple that is emitted from the spout is a large tuple that contains up to ```500``` records and we limit up to ```30``` of those to go unacked from any single spout before we throttle it. Since we have ```128``` spouts, we can have ```128 * 15,000``` messages unacked in the topology at any time at the most. ### Result @@ -432,13 +432,13 @@ hiccups in our collection mechanism (the collection granularity was not 5s as we ### Conclusion -We are trying to read a data source that could have```13``` times more records and ```20``` times more data volume. So we have roughly increased the parallelism of the components reading the data by 10x (```128 + 512 = 768``` cores to read and convert the data whereas previously we were using ```64``` cores). Once this is fixed and we can read the data comfortably using our DataSource Spouts and Bolts, we can scale the Filter Bolts and other components to accommodate for queries. We set our Filter Bolt parallelism (dominates the rest of the components) to ```512```. We need about ```25``` machines (5 times more than the previous of ```5```). +We are trying to read a data source that could have```13``` times more records and ```20``` times more data volume. 
So we have roughly increased the parallelism of the components reading the data by 10x (```128 + 512 = 768``` cores to read and convert the data whereas previously we were using ```64``` cores). Once this is fixed and we can read the data comfortably using our DataSource spouts and bolts, we can scale the Filter bolts and other components to accommodate for queries. We set our Filter bolt parallelism (dominates the rest of the components) to ```512```. We need about ```25``` machines (5 times more than the previous of ```5```). With this configuration, we were able to run **```680```** queries simultaneously before we hit the DRPC limit. Since DRPC is a shared resource for the cluster, this limit is slightly lower than the previously observed number possibly due to our test environment being multi-tenant and other topologies using the shared resource. !!! note "Measuring latency in Bullet" - So far, we have been using data being delayed long enough as a proxy for queries failing. [Bullet-Storm 0.4.3](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.4.3) adds an average latency metric computed in the Filter Bolts. For the next tests, we add a timestamp in the Data Source spouts when the record is read and this latency metric tells us exactly how long it takes for the record to be matched against a query and acked. By setting a limit for this latency, we can much more accurately measure acceptable performance. + So far, we have been using data being delayed long enough as a proxy for queries failing. [Bullet-Storm 0.4.3](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.4.3) adds an average latency metric computed in the Filter bolts. For the next tests, we add a timestamp in the Data Source spouts when the record is read and this latency metric tells us exactly how long it takes for the record to be matched against a query and acked. By setting a limit for this latency, we can much more accurately measure acceptable performance. ## Test 6: Scaling for More Data @@ -451,7 +451,7 @@ For this test, we'll establish how much resources we need to read various data v For reading the data, we have to first scale the DataSource spouts and bolts and then set the parallelism of the Filter bolts to support the minimum 400 queries we want at the data volume. We leave the rest of the components at their default values as seen in in [Test 5](#test-5-reading-more-data). -To get various data volumes, we read a large Kafka topic with (256 partitions) with over 1 million R/s and sample various percentages to get less data. The sampling is done in our DataSource Spouts. +To get various data volumes, we read a large Kafka topic with (256 partitions) with over 1 million R/s and sample various percentages to get less data. The sampling is done in our DataSource spouts. 
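As a sketch of what scaling the query-processing side looks like in configuration, the Filter bolt parallelism and sizing could be set along these lines for the smallest data volume tested below. The setting keys follow the ```bullet.topology.*``` naming used by Bullet on Storm but are assumptions here, so verify them against your bullet-storm defaults; the custom DataSource spout and bolt are sized in your own topology code.

```yaml
# Illustrative sketch only: keys are assumed from the bullet.topology.* naming convention.
# Values correspond to the smallest data volume tested below (roughly 307 MiB/s).
bullet.topology.filter.bolt.parallelism: 24
bullet.topology.filter.bolt.cpu.load: 100.0              # one core per Filter bolt
bullet.topology.filter.bolt.memory.on.heap.load: 1024.0  # 1 GiB heap per Filter bolt
```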
### Result @@ -462,29 +462,29 @@ The following table summarizes the results: | Data (MiB/s, R/s) | Component | Parallelism | CPU cores | On Heap (MiB) | Total CPU cores | Total Memory (MiB) | | :---------------- | :------------- | ----------: | --------: | ------------: | --------------: | -----------------: | |**307, 69700** | | | | |**98.3** |**123648** | -| |DataSource Spout|16 |0.5 |1024 | | | -| |DataSource Bolt |32 |2 |2048 | | | -| |Filter Bolt |24 |1 |1024 | | | +| |DataSource spout|16 |0.5 |1024 | | | +| |DataSource bolt |32 |2 |2048 | | | +| |Filter bolt |24 |1 |1024 | | | |**920, 216825** | | | | |**242.7** |**281856** | -| |DataSource Spout|32 |1 |1024 | | | -| |DataSource Bolt |72 |2 |2048 | | | -| |Filter Bolt |64 |1 |1024 | | | +| |DataSource spout|32 |1 |1024 | | | +| |DataSource bolt |72 |2 |2048 | | | +| |Filter bolt |64 |1 |1024 | | | |**1535, 374370** | | | | |**531.5** |**616192** | -| |DataSource Spout|64 |1 |1024 | | | -| |DataSource Bolt |160 |2 |2048 | | | -| |Filter Bolt |144 |1 |1024 | | | +| |DataSource spout|64 |1 |1024 | | | +| |DataSource bolt |160 |2 |2048 | | | +| |Filter bolt |144 |1 |1024 | | | |**2149, 524266** | | | | |**812.3** |**939264** | -| |DataSource Spout|72 |1 |1024 | | | -| |DataSource Bolt |256 |2 |2048 | | | -| |Filter Bolt |224 |1 |1024 | | | +| |DataSource spout|72 |1 |1024 | | | +| |DataSource bolt |256 |2 |2048 | | | +| |Filter bolt |224 |1 |1024 | | | |**3070, 724390** | | | | |**997.1** |**1321984** | -| |DataSource Spout|96 |1 |1024 | | | -| |DataSource Bolt |320 |2 |2580 | | | -| |Filter Bolt |256 |1 |1024 | | | +| |DataSource spout|96 |1 |1024 | | | +| |DataSource bolt |320 |2 |2580 | | | +| |Filter bolt |256 |1 |1024 | | | |**4024, 1004500** | | | | |**1189.4** |**1582208** | -| |DataSource Spout|96 |1 |1024 | | | -| |DataSource Bolt |384 |2 |2580 | | | -| |Filter Bolt |320 |1 |1024 | | | +| |DataSource spout|96 |1 |1024 | | | +| |DataSource bolt |384 |2 |2580 | | | +| |Filter bolt |320 |1 |1024 | | | The following figures graphs how the data volume relates to the total CPU and Memory needed. @@ -507,11 +507,11 @@ For this test, we'll establish how much resources we need to support more querie As our 3 server DRPC cluster currently does not let us do more than ```680 RAW``` queries, in this test, we will: -* Vary the number of Filter Bolts as they are the primary bottleneck for supporting more queries. -* To simplify things, we will only vary the **parallelism** and fix the CPU and memory of each Filter Bolt to ```1 Core``` and ```1 GiB Memory``` +* Vary the number of Filter bolts as they are the primary bottleneck for supporting more queries. +* To simplify things, we will only vary the **parallelism** and fix the CPU and memory of each Filter bolt to ```1 Core``` and ```1 GiB Memory``` * We will use ```RAW``` queries as the queries to scale -* Each ```RAW``` query will run for ```30 s``` and search for ```10``` records that we generate. The query will actually look for *11* records to force it to run for the full ```30 s```. This is because we want to stress the Filter Bolt as much as possible. As long as there is a query in the system, the Filter Bolt will deserialize and check every record that it processes -* We will measure the same filtering latency: the time taken from the record read in DataSource Spout to its emission in the Filter Bolt. We want the maximum latency to be less than ```200 ms``` +* Each ```RAW``` query will run for ```30 s``` and search for ```10``` records that we generate. 
The query will actually look for *11* records to force it to run for the full ```30 s```. This is because we want to stress the Filter bolt as much as possible. As long as there is a query in the system, the Filter bolt will deserialize and check every record that it processes +* We will measure the same filtering latency: the time taken from the record read in DataSource spout to its emission in the Filter bolt. We want the maximum latency to be less than ```200 ms``` ### Results @@ -519,7 +519,7 @@ The following table summarizes the results:
-| Filter Bolt Parallelism | Queries | Average Latency (ms) | Status | Topology CPU (cores) | Topology Memory (MiB) | +| Filter bolt Parallelism | Queries | Average Latency (ms) | Status | Topology CPU (cores) | Topology Memory (MiB) | | :---------------------- | ------: | -------------------: | -----: | -------------------: | --------------------: | |**4** | | | |**78.3** |**112256** | | |1 |7 |OK | | | @@ -575,7 +575,7 @@ The following table summarizes the results: | |600 |26 |OK | | | | |**650** |32 |OK | | | -The following figure summarizes the minimum number of CPU cores (which are also the number of Filter Bolts) needed to support the the maximum number of ```RAW``` queries with latency < 200 ms. +The following figure summarizes the minimum number of CPU cores (which are also the number of Filter bolts) needed to support the the maximum number of ```RAW``` queries with latency < 200 ms. #### Figure 15. CPU vs Max Concurrent Queries @@ -583,7 +583,7 @@ The following figure summarizes the minimum number of CPU cores (which are also This shows that the queries supported also scale pretty linearly. -You may have noticed how when latency starts to increase, it increases pretty rapidly. This suggests that there is a *knee* or *exponential* curve for latency. The following figure shows this in the graph of the latency for queries with ```20``` Filter Bolts. +You may have noticed how when latency starts to increase, it increases pretty rapidly. This suggests that there is a *knee* or *exponential* curve for latency. The following figure shows this in the graph of the latency for queries with ```20``` Filter bolts. #### Figure 16. Max Concurrent Queries vs Latency @@ -591,4 +591,4 @@ You may have noticed how when latency starts to increase, it increases pretty ra ### Conclusion -Since Filter Bolts tend to be the most CPU intensive of the query processing components, this test measured how scaling Filter Bolts affected the number of queries that can be supported. For the fixed data volume, this relationship is linear. +Since Filter bolts tend to be the most CPU intensive of the query processing components, this test measured how scaling Filter bolts affected the number of queries that can be supported. For the fixed data volume, this relationship is linear. diff --git a/docs/backend/storm-setup.md b/docs/backend/storm-setup.md index b29f4389..568a5c4a 100644 --- a/docs/backend/storm-setup.md +++ b/docs/backend/storm-setup.md @@ -8,12 +8,12 @@ Bullet is configured at run-time using settings defined in a file. Settings not ## Installation -To use Bullet, you need to implement a way to read from your data source and convert your data into Bullet Records (bullet-record is a transitive dependency for Bullet and can be found [in JCenter](ingestion.md#installing-the-record-directly). You have two options in how to get your data into Bullet: +To use Bullet, you need to implement a way to read from your data source and convert your data into Bullet Records (bullet-record is a transitive dependency for Bullet and can be found [in JCenter](ingestion.md#installing-the-record-directly). You have a couple of options in how to get your data into Bullet: -1. You can implement a Spout that reads from your data source and emits Bullet Record. This spout must have a constructor that takes a List of Strings. -2. You can pipe your existing Storm topology directly into Bullet. In other words, you convert the data you wish to be query-able through Bullet into Bullet Records from a bolt in your topology. +1. 
You can implement a spout (or even a topology) that reads from your data source and emits Bullet Records. You then write a main class that submits the topology with your topology wired in [using our submit method](https://github.com/bullet-db/bullet-storm/blob/master/src/main/java/com/yahoo/bullet/storm/StormUtils.java). +2. Use Bullet DSL to configure a spout (and optionally a bolt) that you provide in the settings to our main class. This will wire up your data source and data format to Bullet without you having to write code! -Option 1 is the simplest to start with and should accommodate most scenarios. See [Pros and Cons](storm-architecture.md#data-processing). +You can refer to the [Pros and Cons](storm-architecture.md#data-processing) of the various approaches to determine what works best for you. You need a JVM based project that implements one of the two options above. You include the Bullet artifact and Storm dependencies in your pom.xml or other dependency management system. The artifacts are available through JCenter, so you will need to add the repository. @@ -51,7 +51,7 @@ You can also add ```sources``` or ```javad If you are going to use the second option (directly pipe data into Bullet from your Storm topology), then you will need a main class that directly calls the submit method with your wired up topology and the name of the component that is going to emit Bullet Records in that wired up topology. The submit method can be found in [Topology.java](https://github.com/bullet-db/bullet-storm/blob/master/src/main/java/com/yahoo/bullet/Topology.java). The submit method submits the topology so it should be the last thing you do in your main. -If you are just implementing a Spout, see the [Launch](#launch) section below on how to use the main class in Bullet to create and submit your topology. +If you are just implementing a spout, see the [Launch](#launch) section below on how to use the main class in Bullet to create and submit your topology. Storm topologies are generally launched with "fat" jars (jar-with-dependencies), excluding storm itself: @@ -129,7 +129,7 @@ You can pass other arguments to Storm using the -c argument. The example above u ## Using Bullet DSL -Instead of implementing your own spout or Topology, you can also use the provided DSL spout (and optionally, DSL Bolt) with [Bullet DSL](dsl.md). To do so, add the following settings to your YAML configuration: +Instead of implementing your own spout or topology, you can also use the provided DSL spout (and optionally, the DSL bolt) with [Bullet DSL](dsl.md). To do so, add the following settings to your YAML configuration: ```yaml bullet.topology.dsl.spout.enable: true @@ -147,7 +147,7 @@ bullet.topology.dsl.bolt.memory.off.heap.load: bullet.topology.dsl.deserializer.enable: false ``` -If the DSL Bolt is enabled in addition to the spout (the spout is always required!), Storm will read your data in the spout and convert it in the bolt. Without the bolt, reading and converting are done entirely in the spout. If you wish to separate the two by enabling the DSL Bolt, you can lower per-worker latencies when data volume is large and scale them independently. +If the DSL bolt is enabled in addition to the spout (the spout is always required!), Storm will read your data in the spout and convert it in the bolt. Without the bolt, reading and converting are done entirely in the spout. Separating the two by enabling the DSL bolt lets you lower per-worker latencies when data volume is large and scale reading and converting independently. 
There is also a setting to enable [BulletDeserializer](dsl.md#bulletdeserializer), which is an optional component of Bullet DSL for deserializing data between reading and converting. @@ -178,3 +178,9 @@ storm jar bullet-storm-0.9.1.jar \ --bullet-conf ./bullet_settings.yaml \ --jars "bullet-dsl-0.1.2.jar,pulsar-client-2.2.1.jar,pulsar-client-schema-2.2.1.jar,protobuf-shaded-2.1.0-incubating.jar" ``` + +## Storage and Replay + +If you set up the [Storage layer in the Web Service](../ws/setup.md#storage-configuration), you can turn on the replaying feature in Bullet on Storm. This wires up the Replay bolts to the topology. This component keeps track of the queries in the backend and replays queries from the Storage layer upon restart or component failure. + +Currently, only queries are stored. In the future, the Storage module will also be used to store intermediate results in addition to queries, so that restarts or component failures do not lose data for queries that are executing. diff --git a/docs/img/overallarch-2.png b/docs/img/overallarch-2.png deleted file mode 100644 index d42fdca1..00000000 Binary files a/docs/img/overallarch-2.png and /dev/null differ diff --git a/docs/img/overallarch-3.svg b/docs/img/overallarch-3.svg new file mode 100644 index 00000000..96ffa99a --- /dev/null +++ b/docs/img/overallarch-3.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/img/topology-3.png b/docs/img/topology-3.png deleted file mode 100755 index 1fb05949..00000000 Binary files a/docs/img/topology-3.png and /dev/null differ diff --git a/docs/img/topology-4.svg b/docs/img/topology-4.svg new file mode 100644 index 00000000..1145fd2f --- /dev/null +++ b/docs/img/topology-4.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 5f9bd59e..d8270a1b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -15,7 +15,7 @@ * Filter raw data or aggregate data -* Can be run on storm or spark streaming +* Can be run on Storm or Spark Streaming * A look-forward query system - operates on data that arrive after the query is submitted @@ -60,11 +60,11 @@ To set up Bullet on a real data stream, you need: # Querying in Bullet -Bullet queries allow you to filter, project and aggregate data. You can also specify a window to get incremental results. Bullet lets you fetch raw (the individual data records) as well as aggregated data. +Bullet queries allow you to filter, project and aggregate data. You can also specify a window to get incremental results. We support essentially first-order SQL (no joins or nested queries). Bullet lets you fetch raw (the individual data records) as well as aggregated data. * See the [UI Usage section](ui/usage.md) for using the UI to build Bullet queries. This is the same UI you will build in the Quick Starts. -* See the API section ([BQL](ws/api-bql.md), or the more verbose, underlying query format - [JSON](ws/api-json.md)) for building Bullet API queries +* See the API section ([API](ws/api.md)) for building Bullet API queries * For examples using the API, see [Examples](ws/examples.md). These are actual albeit cleansed queries sourced from the instance at Yahoo. @@ -75,55 +75,7 @@ A Bullet query terminates and returns whatever has been collected so far when: 1. A maximum duration is reached. In other words, a query runs for a defined time window (which can be infinite). 2. A maximum number of records is reached (only applicable for queries that are fetching raw data records and not aggregating). 
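+For example, the following query (a sketch using the SSE endpoint from the Quick Starts; your Web Service host, port, and namespace may differ) stops after 30 seconds or as soon as 100 raw records have been collected, whichever happens first:
+
+```bash
+# Terminates at the 30 s STREAM duration or after 100 collected records, whichever comes first.
+curl -s -H 'Content-Type: text/plain' -X POST \
+     -d 'SELECT * FROM STREAM(30000, TIME) LIMIT 100;' \
+     http://localhost:9999/api/bullet/queries/sse-query
+```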
-## Filters - -Bullet supports two kinds of filters: - -| Filter Type | Meaning | -| ------------------ | ------- | -| Logical filter | Allow you to combine filter clauses (Logical or Relational) with logical operations like AND, OR and NOTs | -| Relational filters | Allow you to use comparison operations like equals, not equals, greater than, less than, regex like etc, on fields | - -## Projections - -Projections allow you to pull out only the fields needed and rename them when you are querying for raw data records. - -## Aggregations - -Aggregations allow you to perform some operation on the collected records. - -The current aggregation types that are supported are: - -| Aggregation | Meaning | -| -------------- | ------- | -| GROUP | The resulting output would be a record containing the result of an operation for each unique value combination in your specified fields | -| COUNT DISTINCT | Computes the number of distinct elements in the fields. (May be approximate) | -| LIMIT or RAW | The resulting output would be at most the number specified in size. | -| DISTRIBUTION | Computes distributions of the elements in the field. E.g. Find the median value or various percentile of a field, or get frequency or cumulative frequency distributions | -| TOP K | Returns the top K most frequently appearing values in the column | - -Currently we support ```GROUP``` aggregations with the following operations: - -| Operation | Meaning | -| -------------- | ------- | -| COUNT | Computes the number of the elements in the group | -| SUM | Computes the sum of the non-null values in the provided field for all elements in the group | -| MIN | Returns the minimum of the non-null values in the provided field for all the elements in the group | -| MAX | Returns the maximum of the non-null values in the provided field for all the elements in the group | -| AVG | Computes the average of the non-null values in the provided field for all the elements in the group | - -If you ```GROUP``` with no operation, you are performing a ```DISTINCT``` on the field(s). If you ```GROUP``` with no field(s), you are performing the operation(s) across all your data. - -## Post Aggregations - -Post Aggregations let you perform some operation before finalizing and returning the results to you. This is applied every time a result is returned to you (see below). The current operations supported are: - -| Post Aggregation | Meaning | -| ---------------- | ------- | -| ORDER BY | Orders your result by your specified fields in ascending or descending order | -| COMPUTATION | Specify an expression (can be nested expressions) [here](ws/api-json.md#expressions) to do math with or cast fields in your result | - -## Windows +## Windowing Windows in a Bullet query allow you to specify how often you'd like Bullet to return results. @@ -151,9 +103,9 @@ We also use Sketches as a way to control high cardinality grouping (group by a n ## End-to-End Architecture -![Overall Architecture](img/overallarch-2.png) +![Overall Architecture](img/overallarch-3.svg) -The image above shows how the various pieces of the Bullet interact at a high-level. All these layers are modular and pluggable. You can choose an implementation for the Backend and the PubSub (or create your own). The core of Bullet is abstracted into a [library](https://github.com/yahoo/bullet-core) that can be reused to implement the Backend, Web Service and PubSub layers in a platform agnostic manner. +The image above shows how the various pieces of the Bullet interact at a high-level. 
All these layers are modular and pluggable. Some, like the persistence layer for queries, are optional. You can choose an implementation for the Backend and the PubSub (or create your own). The core of Bullet is abstracted into a [library](https://github.com/yahoo/bullet-core) that can be reused to implement the Backend, Web Service and PubSub layers in a platform agnostic manner. --- diff --git a/docs/quick-start/spark.md b/docs/quick-start/spark.md index ae43bedb..350fbd76 100644 --- a/docs/quick-start/spark.md +++ b/docs/quick-start/spark.md @@ -37,7 +37,7 @@ mkdir -p $BULLET_HOME/pubsub mkdir -p $BULLET_HOME/service mkdir -p $BULLET_HOME/ui cd $BULLET_HOME -curl -LO https://github.com/bullet-db/bullet-db.github.io/releases/download/v0.6.1/examples_artifacts.tar.gz +curl -LO https://github.com/bullet-db/bullet-db.github.io/releases/download/v1.0.0/examples_artifacts.tar.gz tar -xzf examples_artifacts.tar.gz export BULLET_EXAMPLES=$BULLET_HOME/bullet-examples ``` @@ -50,10 +50,10 @@ For this instance of Bullet we will use the Kafka PubSub implementation found in ```bash cd $BULLET_HOME/pubsub -curl -Lo bullet-kafka.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-kafka/0.3.2/bullet-kafka-0.3.2-fat.jar -curl -LO https://archive.apache.org/dist/kafka/0.11.0.1/kafka_2.12-0.11.0.1.tgz -tar -xzf kafka_2.12-0.11.0.1.tgz -export KAFKA_DIR=$BULLET_HOME/pubsub/kafka_2.12-0.11.0.1 +curl -Lo bullet-kafka.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-kafka/1.0.1/bullet-kafka-1.0.1-fat.jar +curl -LO https://archive.apache.org/dist/kafka/2.3.1/kafka_2.12-2.3.1.tgz +tar -xzf kafka_2.12-2.3.1.tgz +export KAFKA_DIR=$BULLET_HOME/pubsub/kafka_2.12-2.3.1 ``` #### Step 3: Start Zookeeper @@ -81,22 +81,22 @@ $KAFKA_DIR/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication ### Setup Bullet Backend on Spark -We will run the bullet-spark backend using [Spark 2.2.1](https://spark.apache.org/releases/spark-release-2-2-1.html). +We will run the bullet-spark backend using [Spark 3.0.1](https://spark.apache.org/releases/spark-release-3-0-1.html). -#### Step 6: Install Spark 2.2.1 +#### Step 6: Install Spark 3.0.1 ```bash export BULLET_SPARK=$BULLET_HOME/backend/spark cd $BULLET_SPARK -curl -O https://archive.apache.org/dist/spark/spark-2.2.1/spark-2.2.1-bin-hadoop2.7.tgz -tar -xzf spark-2.2.1-bin-hadoop2.7.tgz +curl -O https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz +tar -xzf spark-3.0.1-bin-hadoop2.7.tgz ``` #### Step 7: Setup Bullet-Spark and Example Data Producer ```bash cp $BULLET_HOME/bullet-examples/backend/spark/* $BULLET_SPARK -curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/0.2.2/bullet-spark-0.2.2-standalone.jar +curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spark/1.0.0/bullet-spark-1.0.0-standalone.jar ``` #### Step 8: Launch the Bullet Spark Backend @@ -104,7 +104,7 @@ curl -Lo bullet-spark.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-spa Run this multi-line command (new lines are escaped): ```bash -$BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit \ +$BULLET_SPARK/spark-3.0.1-bin-hadoop2.7/bin/spark-submit \ --master local[10] \ --class com.yahoo.bullet.spark.BulletSparkStreamingMain \ --jars $BULLET_HOME/pubsub/bullet-kafka.jar,$BULLET_SPARK/bullet-spark-example.jar \ @@ -121,8 +121,9 @@ The Backend will usually be up and running usually within 5-10 seconds. 
Once it ```bash cd $BULLET_HOME/service -curl -Lo bullet-service.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-service/0.4.3/bullet-service-0.4.3-embedded.jar +curl -Lo bullet-service.jar http://jcenter.bintray.com/com/yahoo/bullet/bullet-service/1.0.0/bullet-service-1.0.0-embedded.jar cp $BULLET_EXAMPLES/web-service/example_kafka_pubsub_config.yaml $BULLET_HOME/service/ +cp $BULLET_EXAMPLES/web-service/example_query_config.yaml $BULLET_HOME/service/ cp $BULLET_EXAMPLES/web-service/example_columns.json $BULLET_HOME/service/ ``` @@ -133,6 +134,7 @@ Run this multi-line command (new lines are escaped): ```bash java -Dloader.path=$BULLET_HOME/pubsub/bullet-kafka.jar -jar bullet-service.jar \ --bullet.pubsub.config=$BULLET_HOME/service/example_kafka_pubsub_config.yaml \ + --bullet.query.config=${BULLET_HOME}/service/example_query_config.yaml \ --bullet.schema.file=$BULLET_HOME/service/example_columns.json \ --server.port=9999 \ --logging.path=. \ @@ -148,7 +150,7 @@ curl -s http://localhost:9999/api/bullet/columns ``` ```bash -curl -s -H 'Content-Type: text/plain' -X POST -d '{"aggregation": {"size": 1}}' http://localhost:9999/api/bullet/sse-query +curl -s -H 'Content-Type: text/plain' -X POST -d 'SELECT * FROM STREAM(2000, TIME) LIMIT 1;' http://localhost:9999/api/bullet/queries/sse-query ``` This query will return a result JSON containing a "records" field containing a single record, and a "meta" field with some meta information. @@ -165,17 +167,17 @@ You can also check the status of the Web Service by looking at the Web Service l ```bash cd $BULLET_HOME/ui -curl -s https://raw.githubusercontent.com/creationix/nvm/v0.33.1/install.sh | bash +curl -s https://raw.githubusercontent.com/creationix/nvm/v0.37.2/install.sh | bash source ~/.bashrc -nvm install v6.9.4 -nvm use v6.9.4 +nvm install v10.20.1 +nvm use v10.20.1 ``` #### Step 13: Install the Bullet UI ```bash -curl -LO https://github.com/bullet-db/bullet-ui/releases/download/v0.6.2/bullet-ui-v0.6.2.tar.gz -tar -xzf bullet-ui-v0.6.2.tar.gz +curl -LO https://github.com/bullet-db/bullet-ui/releases/download/v1.0.0/bullet-ui-v1.0.0.tar.gz +tar -xzf bullet-ui-v1.0.0.tar.gz cp $BULLET_EXAMPLES/ui/env-settings.json config/ ``` @@ -227,7 +229,7 @@ This section will go over the various custom pieces this example plugged into Bu The Spark Streaming application we ran was Bullet plugged in with a custom Receiver in our implementation of the Bullet Spark DataProducer trait. This Receiver and DataProducer are implemented in this [example project](https://github.com/bullet-db/bullet-db.github.io/blob/src/examples/spark/) and was already built for you when you [downloaded the examples](#step-1-setup-directories-and-examples). It does not read from any data source and just produces random, structured data. It also produces only up to a maximum number of records in a given period. Both this maximum and the length of a period are configured in the Receiver (at most 100 every 1 second). 
```bash -$BULLET_SPARK/spark-2.2.1-bin-hadoop2.7/bin/spark-submit \ +$BULLET_SPARK/spark-3.0.1-bin-hadoop2.7/bin/spark-submit \ --master local[10] \ --class com.yahoo.bullet.spark.BulletSparkStreamingMain \ --jars $BULLET_HOME/pubsub/bullet-kafka.jar,$BULLET_SPARK/bullet-spark-example.jar \ @@ -248,74 +250,74 @@ The settings defined by ```--bullet-spark-conf=$BULLET_SPARK/bullet_spark_kafka_ Let's look at the [custom Receiver code](https://github.com/bullet-db/bullet-db.github.io/blob/src/examples/spark/src/main/scala/com/yahoo/bullet/spark/examples/receiver/RandomReceiver.scala) that generates the data. ```scala - private def receive(): Unit = { - nextIntervalStart = System.currentTimeMillis() - while (!isStopped) { - val timeNow = System.currentTimeMillis() - // Only emit if we are still in the interval and haven't gone over our per period max - if (timeNow <= nextIntervalStart && generatedThisPeriod < maxPerPeriod) { - store(generateRecord()) - generatedThisPeriod += 1 - } - if (timeNow > nextIntervalStart) { - logger.info("Generated {} tuples out of {}", generatedThisPeriod, maxPerPeriod) - nextIntervalStart = timeNow + period - generatedThisPeriod = 0 - periodCount += 1 - } - // It is courteous to sleep for a short time. - try { - Thread.sleep(1) - } catch { - case e: InterruptedException => logger.error("Error: ", e) + private def receive(): Unit = { + nextIntervalStart = System.currentTimeMillis() + while (!isStopped) { + val timeNow = System.currentTimeMillis() + // Only emit if we are still in the interval and haven't gone over our per period max + if (timeNow <= nextIntervalStart && generatedThisPeriod < maxPerPeriod) { + store(generateRecord()) + generatedThisPeriod += 1 + } + if (timeNow > nextIntervalStart) { + logger.info("Generated {} tuples out of {}", generatedThisPeriod, maxPerPeriod) + nextIntervalStart = timeNow + period + generatedThisPeriod = 0 + periodCount += 1 + } + // It is courteous to sleep for a short time. + try { + Thread.sleep(1) + } catch { + case e: InterruptedException => logger.error("Error: ", e) + } } } - } ``` This method above emits the data. This method is wrapped in a thread that is called by the Spark framework. This function only emits at most the given maximum tuples per period. 
```scala - private def makeRandomMap: Map[java.lang.String, java.lang.String] = { - val randomMap = new HashMap[java.lang.String, java.lang.String](2) - randomMap.put(RandomReceiver.RANDOM_MAP_KEY_A, RandomReceiver.STRING_POOL(Random.nextInt(RandomReceiver.STRING_POOL.length))) - randomMap.put(RandomReceiver.RANDOM_MAP_KEY_B, RandomReceiver.STRING_POOL(Random.nextInt(RandomReceiver.STRING_POOL.length))) - randomMap - } + private def makeRandomMap: Map[java.lang.String, java.lang.String] = { + val randomMap = new HashMap[java.lang.String, java.lang.String](2) + randomMap.put(RandomReceiver.RANDOM_MAP_KEY_A, RandomReceiver.STRING_POOL(Random.nextInt(RandomReceiver.STRING_POOL.length))) + randomMap.put(RandomReceiver.RANDOM_MAP_KEY_B, RandomReceiver.STRING_POOL(Random.nextInt(RandomReceiver.STRING_POOL.length))) + randomMap + } - private def generateRecord(): BulletRecord = { - val record = new SimpleBulletRecord() - val uuid = UUID.randomUUID().toString - record.setString(RandomReceiver.STRING, uuid) - record.setLong(RandomReceiver.LONG, generatedThisPeriod) - record.setDouble(RandomReceiver.DOUBLE, Random.nextDouble()) - record.setDouble(RandomReceiver.GAUSSIAN, Random.nextGaussian()) - record.setString(RandomReceiver.TYPE, RandomReceiver.STRING_POOL(Random.nextInt(RandomReceiver.STRING_POOL.length))) - record.setLong(RandomReceiver.DURATION, System.nanoTime() % RandomReceiver.INTEGER_POOL(Random.nextInt(RandomReceiver.INTEGER_POOL.length))) - - // Don't use Scala Map and convert it by asJava when calling setxxxMap method in BulletRecord. - // It converts Scala Map to scala.collection.convert.Wrappers$MapWrapper which is not serializable in scala 2.11.x (https://issues.scala-lang.org/browse/SI-8911). - - record.setStringMap(RandomReceiver.SUBTYPES_MAP, makeRandomMap); - - val booleanMap = new HashMap[java.lang.String, java.lang.Boolean](4) - booleanMap.put(uuid.substring(0, 8), Random.nextBoolean()) - booleanMap.put(uuid.substring(9, 13), Random.nextBoolean()) - booleanMap.put(uuid.substring(14, 18), Random.nextBoolean()) - booleanMap.put(uuid.substring(19, 23), Random.nextBoolean()) - record.setBooleanMap(RandomReceiver.BOOLEAN_MAP, booleanMap) - - - val statsMap = new HashMap[java.lang.String, java.lang.Long](4) - statsMap.put(RandomReceiver.PERIOD_COUNT, periodCount) - statsMap.put(RandomReceiver.RECORD_NUMBER, periodCount * maxPerPeriod + generatedThisPeriod) - statsMap.put(RandomReceiver.NANO_TIME, System.nanoTime()) - statsMap.put(RandomReceiver.TIMESTAMP, System.nanoTime()) - record.setLongMap(RandomReceiver.STATS_MAP, statsMap) - - record.setListOfStringMap(RandomReceiver.LIST, asList(makeRandomMap, makeRandomMap)) - record - } + private def generateRecord(): BulletRecord[_ <: java.io.Serializable] = { + val record = new TypedSimpleBulletRecord() + val uuid = UUID.randomUUID().toString + record.setString(RandomReceiver.STRING, uuid) + record.setLong(RandomReceiver.LONG, generatedThisPeriod) + record.setDouble(RandomReceiver.DOUBLE, Random.nextDouble()) + record.setDouble(RandomReceiver.GAUSSIAN, Random.nextGaussian()) + record.setString(RandomReceiver.TYPE, RandomReceiver.STRING_POOL(Random.nextInt(RandomReceiver.STRING_POOL.length))) + record.setLong(RandomReceiver.DURATION, System.nanoTime() % RandomReceiver.INTEGER_POOL(Random.nextInt(RandomReceiver.INTEGER_POOL.length))) + + // Don't use Scala Map and convert it by asJava when calling setxxxMap method in BulletRecord. 
+ // It converts Scala Map to scala.collection.convert.Wrappers$MapWrapper which is not serializable in scala 2.11.x (https://issues.scala-lang.org/browse/SI-8911). + + record.setStringMap(RandomReceiver.SUBTYPES_MAP, makeRandomMap); + + val booleanMap = new HashMap[java.lang.String, java.lang.Boolean](4) + booleanMap.put(uuid.substring(0, 8), Random.nextBoolean()) + booleanMap.put(uuid.substring(9, 13), Random.nextBoolean()) + booleanMap.put(uuid.substring(14, 18), Random.nextBoolean()) + booleanMap.put(uuid.substring(19, 23), Random.nextBoolean()) + record.setBooleanMap(RandomReceiver.BOOLEAN_MAP, booleanMap) + + + val statsMap = new HashMap[java.lang.String, java.lang.Long](4) + statsMap.put(RandomReceiver.PERIOD_COUNT, periodCount) + statsMap.put(RandomReceiver.RECORD_NUMBER, periodCount * maxPerPeriod + generatedThisPeriod) + statsMap.put(RandomReceiver.NANO_TIME, System.nanoTime()) + statsMap.put(RandomReceiver.TIMESTAMP, System.nanoTime()) + record.setLongMap(RandomReceiver.STATS_MAP, statsMap) + + record.setListOfStringMap(RandomReceiver.LIST, asList(makeRandomMap, makeRandomMap)) + record + } ``` This ```generateRecord``` method generates some fields randomly and inserts them into a BulletRecord (simple). Note that the BulletRecord is typed and all data must be inserted with the proper types. @@ -342,7 +344,7 @@ class RandomProducer extends DataProducer { } ``` -If you put Bullet on your data, you will need to write a DataProducer (or a full on Spark DAG if your reading is complex), that reads from your data source and emits a DStream of BulletRecords with the fields you wish to be query-able similar to this example. +If you put Bullet on your data, you will need to write a DataProducer (or a full-on Spark DAG if your reading is complex) that reads from your data source and emits a DStream of BulletRecords with the fields you wish to be query-able, similar to this example. Alternatively, you can use [Bullet DSL to configure and plug in](../backend/spark-setup.md#using-bullet-dsl) a DSL based receiver that uses Bullet DSL's Connector -> Deserializer -> Converter system to read and convert your dataset without having to write code! 
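+As a rough operational sketch (assuming you download the Bullet DSL fat jar next to your other artifacts and author a DSL-enabled settings file; the jar name and settings file name below are placeholders, and the actual settings are described in the linked Spark setup section), only the shipped jars and the configuration change relative to Step 8:
+
+```bash
+# Illustrative only: ship the Bullet DSL fat jar (plus any client jars your chosen
+# connector needs) instead of the example producer jar, and point Bullet Spark at a
+# settings file that enables the DSL receiver.
+$BULLET_SPARK/spark-3.0.1-bin-hadoop2.7/bin/spark-submit \
+  --master local[10] \
+  --class com.yahoo.bullet.spark.BulletSparkStreamingMain \
+  --jars $BULLET_HOME/pubsub/bullet-kafka.jar,$BULLET_SPARK/bullet-dsl.jar \
+  $BULLET_SPARK/bullet-spark.jar \
+  --bullet-spark-conf=$BULLET_SPARK/bullet_spark_dsl_settings.yaml &> log.txt &
+```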
### PubSub @@ -410,8 +412,9 @@ Finally, we configured the UI with the custom environment specific settings file { "default": { "queryHost": "http://localhost:9999", - "queryNamespace": "api/bullet", + "queryNamespace": "api/bullet/queries", "queryPath": "ws-query", + "validationPath": "validate-query", "queryStompRequestChannel": "/server/request", "queryStompResponseChannel": "/client/response", "schemaHost": "http://localhost:9999", @@ -423,14 +426,14 @@ Finally, we configured the UI with the custom environment specific settings file } ], "bugLink": "https://github.com/bullet-db/bullet-ui/issues", - "modelVersion": 3, + "modelVersion": 4, "migrations": { "deletions": "query" }, "defaultValues": { "aggregationMaxSize": 1024, "rawMaxSize": 500, - "durationMaxSecs": 86400, + "durationMaxSecs": 9007199254740, "distributionNumberOfPoints": 11, "distributionQuantilePoints": "0, 0.25, 0.5, 0.75, 0.9, 1", "distributionQuantileStart": 0, @@ -438,7 +441,7 @@ Finally, we configured the UI with the custom environment specific settings file "distributionQuantileIncrement": 0.1, "windowEmitFrequencyMinSecs": 1, "everyForRecordBasedWindow": 1, - "everyForTimeBasedWindow": 2, + "everyForTimeBasedWindow": 2000, "sketches": { "countDistinctMaxEntries": 16384, "groupByMaxEntries": 512, diff --git a/docs/quick-start/storm.md b/docs/quick-start/storm.md index 69e8897e..45da0dde 100644 --- a/docs/quick-start/storm.md +++ b/docs/quick-start/storm.md @@ -4,15 +4,15 @@ This section gets you running a mock instance of Bullet to play around with. The At the end of this section, you will have: - * Setup the Bullet topology using a custom spout on [bullet-storm-0.8.5](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.8.5) - * Setup the [Web Service](../ws/setup.md) talking to the topology and serving a schema for your UI using [bullet-service-0.4.3](https://github.com/bullet-db/bullet-service/releases/tag/bullet-service-0.4.3) - * Setup the [REST PubSub](../pubsub/rest.md) talking to the topology and Web Service using [bullet-core-0.6.4](https://github.com/bullet-db/bullet-core/releases/tag/bullet-core-0.6.4). - * Setup the [UI](../ui/setup.md) talking to the Web Service using [bullet-ui-0.6.2](https://github.com/bullet-db/bullet-ui/releases/tag/v0.6.2) + * Setup the Bullet topology using a custom spout on [bullet-storm-1.0.0](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-1.0.0) + * Setup the [Web Service](../ws/setup.md) talking to the topology and serving a schema for your UI using [bullet-service-1.0.0](https://github.com/bullet-db/bullet-service/releases/tag/bullet-service-1.0.0) + * Setup the [REST PubSub](../pubsub/rest.md) talking to the topology and Web Service using [bullet-core-1.2.0](https://github.com/bullet-db/bullet-core/releases/tag/bullet-core-1.2.0). + * Setup the [UI](../ui/setup.md) talking to the Web Service using [bullet-ui-1.0.0](https://github.com/bullet-db/bullet-ui/releases/tag/v1.0.0) **Prerequisites** * You will need to be on an Unix-based system (Mac OS X, Ubuntu ...) with ```curl``` installed - * You will need [JDK 8](http://www.oracle.com/technetwork/java/javase/downloads/index.html) installed + * You will need [JDK 8+](http://www.oracle.com/technetwork/java/javase/downloads/index.html) installed * You will need enough CPU and RAM on your machine to run about 8-10 JVMs in ```server``` mode. You should have at least 2 GB free space on your disk. 
We will be setting up a Storm cluster with multiple components, an embedded Tomcat server and a Node server. ## Install Script @@ -43,18 +43,18 @@ mkdir -p $BULLET_HOME/backend/storm mkdir -p $BULLET_HOME/service mkdir -p $BULLET_HOME/ui cd $BULLET_HOME -curl -LO https://github.com/bullet-db/bullet-db.github.io/releases/download/v0.6.1/examples_artifacts.tar.gz +curl -LO https://github.com/bullet-db/bullet-db.github.io/releases/download/v1.0.0/examples_artifacts.tar.gz tar -xzf examples_artifacts.tar.gz export BULLET_EXAMPLES=$BULLET_HOME/bullet-examples ``` -#### Step 2: Install Storm 1.2 +#### Step 2: Install Storm 2.2 ```bash cd $BULLET_HOME/backend -curl -O http://apache.org/dist/storm/apache-storm-1.2.2/apache-storm-1.2.2.zip -unzip apache-storm-1.2.2.zip -export PATH=$(pwd)/apache-storm-1.2.2/bin/:$PATH +curl -O http://apache.org/dist/storm/apache-storm-2.2.0/apache-storm-2.2.0.zip +unzip apache-storm-2.2.0.zip +export PATH=$(pwd)/apache-storm-2.2.0/bin/:$PATH ``` #### Step 3: Launch Storm components @@ -78,7 +78,7 @@ Once everything is up without errors, visit [http://localhost:8080](http://local ### Setting up the example Bullet topology -Now that Storm is up and running, we can put Bullet on it. We will use an example Spout that runs on Bullet 0.8.3 on our Storm cluster. The source is available [here](https://github.com/bullet-db/bullet-db.github.io/blob/src/examples/storm). This was part of the artifact that you installed in Step 1. +Now that Storm is up and running, we can put Bullet on it. We will use an example spout that runs on Bullet 1.2.0 on our Storm cluster. The source is available [here](https://github.com/bullet-db/bullet-db.github.io/blob/src/examples/storm). This was part of the artifact that you installed in Step 1. #### Step 4: Setup the Storm example @@ -92,8 +92,6 @@ cp $BULLET_EXAMPLES/backend/storm/* $BULLET_HOME/backend/storm ```bullet.query.aggregation.raw.max.size: 500``` The max ```RAW``` records you can fetch is 500. - ```bullet.query.aggregation.max.size: 1024``` The max records you can fetch for any query is 1024. - ```bullet.query.aggregation.count.distinct.sketch.entries: 16384``` We can count 16384 unique values exactly. Approximates after. ```bullet.query.aggregation.group.sketch.entries: 1024``` The max unique groups can be 1024. Uniform sample after. @@ -153,23 +151,30 @@ curl -s -H 'Content-Type: text/plain' -X POST -d '{"aggregation": {"size": 1}}' curl -s http://localhost:9999/api/bullet/columns ``` +!!! note "Settings" + + Take a look at example_query_settings.yaml for the settings that are being overridden for this example. You can add or change the query settings (used by BQL when creating the query) by referring to [core Bullet settings in bullet_defaults.yaml](https://github.com/bullet-db/bullet-core/blob/master/src/main/resources/bullet_defaults.yaml). We have [customized these settings](https://github.com/bullet-db/bullet-db.github.io/blob/src/examples/web-service/example_query_settings.yaml): + + ```bullet.query.aggregation.max.size: 1024``` The max records you can fetch for any query is 1024. 
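+If you want to try a query against these settings from the command line, you can POST a BQL query to the Web Service's SSE endpoint (a sketch mirroring the query example in the Spark quick start; adjust the port if you changed it):
+
+```bash
+# Ask for a single raw record from a 2 second stream window via the SSE endpoint.
+curl -s -H 'Content-Type: text/plain' -X POST \
+     -d 'SELECT * FROM STREAM(2000, TIME) LIMIT 1;' \
+     http://localhost:9999/api/bullet/queries/sse-query
+```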
+ + ### Setting up the Bullet UI #### Step 8: Install Node ```bash -curl -s https://raw.githubusercontent.com/creationix/nvm/v0.33.1/install.sh | bash +curl -s https://raw.githubusercontent.com/creationix/nvm/v0.37.2/install.sh | bash source ~/.bashrc -nvm install v6.9.4 -nvm use v6.9.4 +nvm install v10.20.1 +nvm use v10.20.1 ``` #### Step 9: Install the Bullet UI ```bash cd $BULLET_HOME/ui -curl -LO https://github.com/bullet-db/bullet-ui/releases/download/v0.6.2/bullet-ui-v0.6.2.tar.gz -tar -xzf bullet-ui-v0.6.2.tar.gz +curl -LO https://github.com/bullet-db/bullet-ui/releases/download/v1.0.0/bullet-ui-v1.0.0.tar.gz +tar -xzf bullet-ui-v1.0.0.tar.gz cp $BULLET_EXAMPLES/ui/env-settings.json config/ ``` @@ -203,7 +208,7 @@ If you were performing the steps yourself, you can also manually cleanup **all t | -------------- | ---------------------------------------------------------------- | | UI | ```pkill -f [e]xpress-server.js``` | | Web Service | ```pkill -f [e]xample_rest_pubsub_config.yaml``` | -| Storm | ```pkill -f [a]pache-storm-1.2.2``` | +| Storm | ```pkill -f [a]pache-storm-2.2.0``` | | File System | ```rm -rf $BULLET_HOME /tmp/dev-storm-zookeeper``` | This does *not* delete ```$HOME/.nvm``` and some extra lines nvm may have added to your ```$HOME/{.profile, .bash_profile, .zshrc, .bashrc}```. @@ -228,7 +233,7 @@ storm jar bullet-storm-example-1.0-SNAPSHOT-jar-with-dependencies.jar \ ... ``` -This command launches the jar (an uber or "fat" jar) containing the custom spout code and all dependencies you copied in Step 5. We pass the name of your spout class with ```--bullet-spout com.yahoo.bullet.storm.examples.RandomSpout``` to the Bullet main class ```com.yahoo.bullet.Topology``` with two arguments ```--bullet-spout-arg 20``` and ```--bullet-spout-arg 101```. The first argument tells the Spout to generate at most 20 tuples (records) in a period and the second argument says a period is 101 ms long. +This command launches the jar (an uber or "fat" jar) containing the custom spout code and all dependencies you copied in Step 5. We pass the name of your spout class with ```--bullet-spout com.yahoo.bullet.storm.examples.RandomSpout``` to the Bullet main class ```com.yahoo.bullet.Topology``` with two arguments ```--bullet-spout-arg 20``` and ```--bullet-spout-arg 101```. The first argument tells the spout to generate at most 20 tuples (records) in a period and the second argument says a period is 101 ms long. The settings defined by ```--bullet-conf ./bullet_settings.yaml``` and the arguments here run all components in the topology with a parallelism of 1. So there will be one spout that is producing ~200 rps. @@ -313,7 +318,7 @@ private BulletRecord generateRecord() { This ```generateRecord``` method generates some fields randomly and inserts them into a BulletRecord. Note that the BulletRecord is typed and all data must be inserted with the proper types. -If you put Bullet on your data, you will need to write a Spout (or a topology if your reading is complex), that reads from your data source and emits BulletRecords with the fields you wish to be query-able placed into a BulletRecord similar to this example. +If you put Bullet on your data, you will need to write a spout (or a topology if your reading is complex), that reads from your data source and emits BulletRecords with the fields you wish to be query-able placed into a BulletRecord similar to this example. 
### PubSub @@ -386,8 +391,9 @@ Finally, we configured the UI with the custom environment specific settings file { "default": { "queryHost": "http://localhost:9999", - "queryNamespace": "api/bullet", + "queryNamespace": "api/bullet/queries", "queryPath": "ws-query", + "validationPath": "validate-query", "queryStompRequestChannel": "/server/request", "queryStompResponseChannel": "/client/response", "schemaHost": "http://localhost:9999", @@ -399,14 +405,14 @@ Finally, we configured the UI with the custom environment specific settings file } ], "bugLink": "https://github.com/bullet-db/bullet-ui/issues", - "modelVersion": 3, + "modelVersion": 4, "migrations": { "deletions": "query" }, "defaultValues": { "aggregationMaxSize": 1024, "rawMaxSize": 500, - "durationMaxSecs": 86400, + "durationMaxSecs": 9007199254740, "distributionNumberOfPoints": 11, "distributionQuantilePoints": "0, 0.25, 0.5, 0.75, 0.9, 1", "distributionQuantileStart": 0, @@ -414,7 +420,7 @@ Finally, we configured the UI with the custom environment specific settings file "distributionQuantileIncrement": 0.1, "windowEmitFrequencyMinSecs": 1, "everyForRecordBasedWindow": 1, - "everyForTimeBasedWindow": 2, + "everyForTimeBasedWindow": 2000, "sketches": { "countDistinctMaxEntries": 16384, "groupByMaxEntries": 512, diff --git a/docs/releases.md b/docs/releases.md index c05a238e..58a5d729 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -10,7 +10,7 @@ API (Java and Scala) docs can also be found for the releases below. ## Download -For downloading any artifact listed below manually, you should preferably use the [**JCenter mirror here**](https://jcenter.bintray.com/com/yahoo/bullet/). For resolving artifacts in your build tool, follow the directions in each of the components' Package Manager Setup sections. +For downloading any artifact listed below manually, you should preferably use the [**JCenter mirror here**](https://jcenter.bintray.com/com/yahoo/bullet/). For resolving artifacts in your build tool, follow the direcions in each of the components' Package Manager Setup sections. ----- @@ -30,6 +30,7 @@ The core Bullet logic (a library) that can be used to implement Bullet on differ | Date | Release | Highlights | APIDocs | | ------------ | ------------------------------------------------------------------------------------- | ---------- | ------- | +| 2021-01-04 | [**1.2.0**](https://github.com/bullet-db/bullet-core/releases/tag/bullet-core-1.2.0) | Storage layer updates and extensions | [JavaDocs](apidocs/bullet-core/1.2.0/index.html) | | 2020-10-30 | [**1.1.0**](https://github.com/bullet-db/bullet-core/releases/tag/bullet-core-1.1.0) | Ternary Logic, Bullet Record 1.1 | [JavaDocs](apidocs/bullet-core/1.1.0/index.html) | | 2020-10-02 | [**1.0.0**](https://github.com/bullet-db/bullet-core/releases/tag/bullet-core-1.0.0) | Major release - Expressions, Storage, Async queries, No JSON queries | [JavaDocs](apidocs/bullet-core/1.0.0/index.html) | | 2019-02-01 | [**0.6.6**](https://github.com/bullet-db/bullet-core/releases/tag/bullet-core-0.6.6) | QueryManager partition leak cleanup | [JavaDocs](apidocs/bullet-core/0.6.6/index.html) | @@ -85,14 +86,14 @@ The implementation of Bullet on Storm. 
Due to major API changes between Storm <= | Date | Storm 1.0+ | Storm 0.10 | Highlights | APIDocs | | ------------ | --------------------------------------------------------------------------- | --------------------------------------------------------------------------- | ---------- | ------- | -| 2021-01-01 | [**1.0.0**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-1.0.0) | - | Bullet Core 1.1, Replay, Storage | [JavaDocs](apidocs/bullet-storm/1.0.0/index.html) | +| 2021-01-12 | [**1.0.0**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-1.0.0) | - | Bullet Core 1.1, Replay, Storage | [JavaDocs](apidocs/bullet-storm/1.0.0/index.html) | | 2019-02-07 | [**0.9.1**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.9.1) | [**0.9.1**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.10-0.9.1) | Bullet DSL 0.1.2 and packaging fixes | [JavaDocs](apidocs/bullet-storm/0.9.1/index.html) | | 2019-02-07 | [**0.9.0**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.9.0) | [**0.9.0**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.10-0.9.0) | Bullet DSL support! | [JavaDocs](apidocs/bullet-storm/0.9.0/index.html) | | 2018-11-26 | [**0.8.5**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.8.5) | [**0.8.5**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.10-0.8.5) | Extended field notation and updates bullet-core to 0.6.4| [JavaDocs](apidocs/bullet-storm/0.8.5/index.html) | | 2018-11-20 | [**0.8.4**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.8.4) | [**0.8.4**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.10-0.8.4) | Partitioning and updates bullet-core to 0.6.2 | [JavaDocs](apidocs/bullet-storm/0.8.4/index.html) | | 2018-06-18 | [**0.8.3**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.8.3) | [**0.8.3**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.10-0.8.3) | Using new bullet-record and bullet-core supporting Integer and Float data types | [JavaDocs](apidocs/bullet-storm/0.8.3/index.html) | -| 2018-04-12 | [**0.8.2**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.8.2) | [**0.8.2**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.10-0.8.2) | Delaying query start in Join Bolt | | -| 2018-04-04 | [**0.8.1**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.8.1) | [**0.8.1**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.10-0.8.1) | Fixed bug in JoinBolt | | +| 2018-04-12 | [**0.8.2**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.8.2) | [**0.8.2**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.10-0.8.2) | Delaying query start in Join bolt | | +| 2018-04-04 | [**0.8.1**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.8.1) | [**0.8.1**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.10-0.8.1) | Fixed bug in Joinbolt | | | 2018-03-30 | [**0.8.0**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.8.0) | [**0.8.0**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.10-0.8.0) | Supports windowing / incremental updates | | | 2017-11-07 | [**0.7.0**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.7.0) | [**0.7.0**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.10-0.7.0) | 
Merge Query and Metadata Streams | | | 2017-10-24 | [**0.6.2**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.6.2) | [**0.6.2**](https://github.com/bullet-db/bullet-storm/releases/tag/bullet-storm-0.10-0.6.2) | Adds a fat jar for using the DRPC PubSub in the Web Service | | @@ -125,7 +126,7 @@ The implementation of Bullet on Spark Streaming. | Date | Release | Highlights | APIDocs | | ------------ | --------------------------------------------------------------------------------- | ---------- | ------- | -| 2021-01-01 | [**1.0.0**](https://github.com/bullet-db/bullet-spark/releases/tag/bullet-spark-1.0.0) | Bullet Core 1.1, Storage | [SparkDocs](apidocs/bullet-spark/1.0.0/index.html) | +| 2021-02-12 | [**1.0.0**](https://github.com/bullet-db/bullet-spark/releases/tag/bullet-spark-1.0.0) | Bullet Core 1.2, DSL | [SparkDocs](apidocs/bullet-spark/1.0.0/index.html) | | 2019-02-07 | [**0.2.2**](https://github.com/bullet-db/bullet-spark/releases/tag/bullet-spark-0.2.2) | Fixes a NPE in JoinStreaming for very short queries | [SparkDocs](apidocs/bullet-spark/0.2.2/index.html) | | 2018-11-26 | [**0.2.1**](https://github.com/bullet-db/bullet-spark/releases/tag/bullet-spark-0.2.1) | Uses bullet-core 0.6.4 and supports extended field notation in queries | [SparkDocs](apidocs/bullet-spark/0.2.1/index.html) | | 2018-11-16 | [**0.2.0**](https://github.com/bullet-db/bullet-spark/releases/tag/bullet-spark-0.2.0) | Uses bullet-core 0.6.1 and adds partitioning support | [SparkDocs](apidocs/bullet-spark/0.2.0/index.html) | @@ -153,7 +154,7 @@ The Web Service implementation that can serve a static schema from a file and ta | Date | Release | Highlights | APIDocs | | ------------ | -------------------------------------------------------------------------------------- | ---------- | ------- | -| 2021-01-01 | [**1.0.0**](https://github.com/bullet-db/bullet-service/releases/tag/bullet-service-0.5.0) | Async queries, Storage, Metrics, BQL only 1.0, Bullet Core 1.0 | [JavaDocs](apidocs/bullet-service/1.0.0/index.html) | +| 2021-01-12 | [**1.0.0**](https://github.com/bullet-db/bullet-service/releases/tag/bullet-service-1.0.0) | Async queries, Storage, Metrics, BQL only 1.0, Bullet Core 1.0 | [JavaDocs](apidocs/bullet-service/1.0.0/index.html) | | 2019-03-07 | [**0.5.0**](https://github.com/bullet-db/bullet-service/releases/tag/bullet-service-0.5.0) | QueryManager API updates | [JavaDocs](apidocs/bullet-service/0.5.0/index.html) | | 2018-11-28 | [**0.4.3**](https://github.com/bullet-db/bullet-service/releases/tag/bullet-service-0.4.3) | Updates bullet-bql to 0.2.1 | [JavaDocs](apidocs/bullet-service/0.4.3/index.html) | | 2018-11-26 | [**0.4.2**](https://github.com/bullet-db/bullet-service/releases/tag/bullet-service-0.4.2) | BQL to JSON endpoint, dead backend reaper, new types in Schema, bullet-core 0.6.4 | [JavaDocs](apidocs/bullet-service/0.4.2/index.html) | @@ -185,7 +186,7 @@ The Bullet UI that lets you build, run, save and visualize results from Bullet. 
| Date | Release | Highlights | | ------------ | -------------------------------------------------------------------------------------- | ---------- | -| 2021-01-01 | [**1.0.0**](https://github.com/bullet-db/bullet-ui/releases/tag/v1.0.0) | Ember 3 Octane, BQL support, new filter operators | +| 2021-01-12 | [**1.0.0**](https://github.com/bullet-db/bullet-ui/releases/tag/v1.0.0) | Ember 3 Octane, BQL support, new filter operators | | 2019-03-18 | [**0.6.2**](https://github.com/bullet-db/bullet-ui/releases/tag/v0.6.2) | Logo update | | 2018-10-05 | [**0.6.1**](https://github.com/bullet-db/bullet-ui/releases/tag/v0.6.1) | Timeseries Graphing, Bar, Pie Charts and FontAwesome | | 2018-07-20 | [**0.6.0**](https://github.com/bullet-db/bullet-ui/releases/tag/v0.6.0) | Supports adding a full default starting query | @@ -241,6 +242,8 @@ A DSL to plug data sources into the Bullet Backend and Web Service. | Date | Release | Highlights | APIDocs | | ------------ | ---------------------------------------------------------------------------------------- | ---------- | ------- | +| 2021-02-17 | [**1.1.0**](https://github.com/bullet-db/bullet-dsl/releases/tag/bullet-dsl-1.1.0) | JSONBulletRecordConverter | [JavaDocs](apidocs/bullet-dsl/1.1.0/index.html) | +| 2021-02-11 | [**1.0.1**](https://github.com/bullet-db/bullet-dsl/releases/tag/bullet-dsl-1.0.1) | Bullet Core 1.2, Unsets default connector/converter | [JavaDocs](apidocs/bullet-dsl/1.0.1/index.html) | | 2020-10-30 | [**1.0.0**](https://github.com/bullet-db/bullet-dsl/releases/tag/bullet-dsl-1.0.0) | Bullet Core 1.1, Types to match Bullet Record 1.1 | [JavaDocs](apidocs/bullet-dsl/1.0.0/index.html) | | 2019-02-07 | [**0.1.1**](https://github.com/bullet-db/bullet-dsl/releases/tag/bullet-dsl-0.1.1) | Interface consolidation, IdentityDeserializer | [JavaDocs](apidocs/bullet-dsl/0.1.1/index.html) | | 2019-02-05 | [**0.1.0**](https://github.com/bullet-db/bullet-dsl/releases/tag/bullet-dsl-0.1.0) | Bullet DSL, Fat jar, Interface refactors | [JavaDocs](apidocs/bullet-dsl/0.1.0/index.html) | @@ -262,6 +265,7 @@ A PubSub implementation using Kafka as the backing PubSub. Can be used with any | Date | Release | Highlights | APIDocs | | ------------ | ------------------------------------------------------------------------------------ | ---------- | ------- | +| 2021-02-17 | [**1.0.1**](https://github.com/bullet-db/bullet-kafka/releases/tag/bullet-kafka-1.0.1) | Bullet Core 1.2 | [JavaDocs](apidocs/bullet-kafka/1.0.1/index.html) | | 2020-10-30 | [**1.0.0**](https://github.com/bullet-db/bullet-kafka/releases/tag/bullet-kafka-1.0.0) | Bullet Core 1.1 | [JavaDocs](apidocs/bullet-kafka/1.0.0/index.html) | | 2018-12-17 | [**0.3.3**](https://github.com/bullet-db/bullet-kafka/releases/tag/bullet-kafka-0.3.3) | Removes adding unnecessary properties to Producers/Consumers | [JavaDocs](apidocs/bullet-kafka/0.3.3/index.html) | | 2018-11-26 | [**0.3.2**](https://github.com/bullet-db/bullet-kafka/releases/tag/bullet-kafka-0.3.2) | Uses bullet-core-0.6.4 | [JavaDocs](apidocs/bullet-kafka/0.3.2/index.html) | @@ -306,7 +310,8 @@ A library facilitating the conversion from Bullet BQL queries to Bullet queries. 
| Date | Release | Highlights | APIDocs | | ------------ | ------------------------------------------------------------------------------------ | ---------- | ------- | -| 2020-01-01 | [**1.0.0**](https://github.com/bullet-db/bullet-bql/releases/tag/bullet-bql-1.0.0) | Expressions, Schema integration, native queries instead of JSON | [JavaDocs](apidocs/bullet-bql/1.0.0/index.html) | +| 2021-01-04 | [**1.1.0**](https://github.com/bullet-db/bullet-bql/releases/tag/bullet-bql-1.1.0) | Updates Bullet Core to 1.2.0 | [JavaDocs](apidocs/bullet-bql/1.1.0/index.html) | +| 2021-01-04 | [**1.0.0**](https://github.com/bullet-db/bullet-bql/releases/tag/bullet-bql-1.0.0) | Expressions, Schema integration, native queries instead of JSON | [JavaDocs](apidocs/bullet-bql/1.0.0/index.html) | | 2018-11-28 | [**0.2.1**](https://github.com/bullet-db/bullet-bql/releases/tag/bullet-bql-0.2.1) | Extended field access notation | [JavaDocs](apidocs/bullet-bql/0.2.1/index.html) | | 2018-09-28 | [**0.2.0**](https://github.com/bullet-db/bullet-bql/releases/tag/bullet-bql-0.2.0) | Adds Post Aggregations and uses bullet-core-0.5.1 | [JavaDocs](apidocs/bullet-bql/0.2.0/index.html) | | 2018-09-06 | [**0.1.2**](https://github.com/bullet-db/bullet-bql/releases/tag/bullet-bql-0.1.2) | Supports CONTAINSKEY, CONTAINSVALUE, SIZEOF, comparing to other fields. Fixes some bugs | [JavaDocs](apidocs/bullet-bql/0.1.2/index.html) | diff --git a/docs/ui/setup.md b/docs/ui/setup.md index 91c9d5b9..5b39f1ec 100644 --- a/docs/ui/setup.md +++ b/docs/ui/setup.md @@ -76,14 +76,15 @@ The configuration for the UI lets you have different instances of Bullet for dif | queryHost | The end point (port included) of your Web Service machine that is talking to the Bullet backend | | queryNamespace | Any qualifiers you have after your host and port on your Web Service running on your ```queryHost``` | | queryPath | The path fragment after the ```queryNamespace``` on your Web Service running on your ```queryHost``` for the WebSocket endpoint | +| validationPath | The path fragment after the ```queryNamespace``` on your Web Service running on your ```queryHost``` for the Query Validation endpoint | | queryStompRequestChannel | The fragment after this is the Stomp Request channel as configured in your Web Service for the WebSocket endpoint | | queryStompResponseChannel | The fragment after this is the Stomp Response channel as configured in your Web Service for the WebSocket endpoint | | schemaHost | The end point (port included) of your Web Service machine that is serving your schema in the JSON API format (see [Web Service setup](../ws/setup.md) for details.)| | schemaNamespace | The path fragment on your schema Web Service running on the ```schemaHost```. There is no ```schemaPath``` because it **must** be ```columns``` in order for the UI to be able fetch the column resource (the fields in your schema).| -| modelVersion | This is used an indicator to apply changes to the stored queries, results etc. It is monotonically increasing. On startup, changes specified in ```migrations``` will be applied if the old modelVersion is not present or is < than this number | +| modelVersion | This is used an indicator to apply changes to the stored queries, results etc. It is monotonically increasing. On startup, changes specified in ```migrations``` will be applied if the old modelVersion is not present or is < than this number. This is generally incremented by the UI once backwards-incompatible changes are made. 
| | migrations | is an object that currently supports one key: ```deletions``` of type string. The value can be set to either ```result``` or ```query```. The former wipes all existing results. The latter wipes everything. See ```modelVersion``` above. | | helpLinks | Is a list of objects, where each object is a help link. These links populate the "Help" drop-down on the UI's top navbar. You can add links to explain your data for example | -| defaultQuery | Can either be a [API Query](../ws/api-json.md) or a URL from which one could be fetched dynamically. The UI makes this the query created on every newly created Query. You could use this as a way to have user specific (for example, cookie based) filters created for your users or customize an aggregation when they create a new query in the UI. Note that if you have are accessing a map subfield and your field value in the filter is set as ```foo.bar``` and you want ```bar``` to be the subfield in the UI query builder, you will need to add a key called ```subfield``` in the filter (not supported by the API) and set its value to ```true``` | +| defaultQuery | Can either be an [API Query](../ws/api.md) or a URL from which one could be fetched dynamically. The UI makes this the starting query for every newly created query. You could use this as a way to have user specific (for example, cookie based) filters created for your users or to customize an aggregation when they create a new query in the UI. Note that Builder queries do not support all API queries yet, but whatever query you specify here (as long as it's a valid query) will be supported in the BQL query page in the UI. If it is not possible to convert your query into a Builder query, a default one will be used instead. | | bugLink | Is a URL that by default points to the issues page for the UI GitHub repository. You can change it to point to your own custom JIRA queue or something else | | defaultValues | Is an object that lets you configure defaults for various query parameters and lets you tie your custom backend settings to the UI | @@ -93,15 +94,15 @@ These are the properties in the ```defaultValues``` object. The Validated column | --------------------------------------- | --------- | ------- | ---------------- | | aggregationMaxSize | Yes | Yes | The size used when doing a Count Distinct, Distinct, Group By, or Distribution query. Set this to your max aggregations size in your backend configuration | | rawMaxSize | Yes | Yes | The maximum size for a Raw query. Set this to your max raw aggregation size in your backend configuration | -| durationMaxSecs | Yes | Yes | The maximum duration for a query. Set this to the seconds version of milliseconds max duration in your backend configuration | +| durationMaxSecs | Yes | Yes | The maximum duration for a query. Set this to the seconds version of the milliseconds max duration in your backend configuration | | distributionNumberOfPoints | Yes | No | The default value filled in for the Number of Points field for all Distribution aggregations | | distributionQuantilePoints | No | No | The default value filled in for the Points field for Quantile Distribution aggregations | | distributionQuantileStart | No | No | The default value filled in for the Start field for Quantile Distribution aggregations | | distributionQuantileEnd | No | No | The default value filled in for the End field for Quantile Distribution aggregations | | distributionQuantileIncrement | No | No | The default value filled in for the Increment field for Quantile Distribution aggregations | -| windowEmitFrequencyMinSecs | Yes | No | The minimum time interval at which a time based window can be returned. Set this to the minimum window emit frequency from your backend configuration | +| windowEmitFrequencyMinSecs | Yes | No | The minimum time interval, in seconds, at which a time based window can be returned. Set this to the minimum window emit frequency from your backend configuration | | everyForRecordBasedWindow | No | No | The default value for the number of records in a window for a record based window | -| everyForTimeBasedWindow | No | No | The default value for the number of records in a window for a time based window | +| everyForTimeBasedWindow | No | No | The default value for the length of a window, in milliseconds, for a time based window | | sketches.countDistinctMaxEntries | No | Yes | The maximum entries configured for your Count Distinct sketch in your backend configuration | | sketches.groupByMaxEntries | No | Yes | The maximum entries configured for your Group sketch in your backend configuration | | sketches.distributionMaxEntries | No | Yes | The maximum entries configured for your Distribution sketch in your backend configuration | @@ -126,14 +127,17 @@ These are the properties in the ```defaultValues``` object. The Validated column You can specify values for each property above in the ```env-settings.json``` file. These will be used when running a custom instance of the UI (see [above](#Running)). The ```default``` property in the ```env-settings.json``` loads default settings for the UI that can be selectively overridden based on which environment you are running on. All settings explained above have default values -that are the same as the [default backend settings](https://github.com/bullet-db/bullet-storm/blob/master/src/main/resources/bullet_defaults.yaml). However, the defaults do not add the ```defaultQuery``` setting explained above. +that are the same as the [default backend settings](https://github.com/bullet-db/bullet-storm/blob/master/src/main/resources/bullet_defaults.yaml). 
```json { "default": { "queryHost": "http://localhost:5555", - "queryNamespace": "api/bullet", - "queryPath": "query", + "queryNamespace": "api/bullet/queries", + "queryPath": "ws-query", + "validationPath": "validate-query", + "queryStompRequestChannel": "/server/request", + "queryStompResponseChannel": "/client/response", "schemaHost": "http://localhost:5555", "schemaNamespace": "api/bullet", "helpLinks": [ @@ -143,17 +147,23 @@ that are the same as the [default backend settings](https://github.com/bullet-db } ], "bugLink": "https://github.com/bullet-db/bullet-ui/issues", - "modelVersion": 1, + "modelVersion": 4, + "migrations": { + "deletions": "query" + }, + "defaultQuery": "SELECT COUNT(*) FROM STREAM(60000, TIME) WINDOWING TUMBLING(2000, TIME);", "defaultValues": { - "aggregationMaxSize": 512, + "aggregationMaxSize": 500, "rawMaxSize": 100, - "durationMaxSecs": 120, + "durationMaxSecs": 9007199254740, "distributionNumberOfPoints": 11, "distributionQuantilePoints": "0, 0.25, 0.5, 0.75, 0.9, 1", "distributionQuantileStart": 0, "distributionQuantileEnd": 1, "distributionQuantileIncrement": 0.1, - "queryTimeoutSecs": 3, + "windowEmitFrequencyMinSecs": 1, + "everyForRecordBasedWindow": 1, + "everyForTimeBasedWindow": 2000, "sketches": { "countDistinctMaxEntries": 16384, "groupByMaxEntries": 512, @@ -163,17 +173,24 @@ that are the same as the [default backend settings](https://github.com/bullet-db "topKErrorType": "No False Negatives" }, "metadataKeyMapping": { - "theta": "theta", - "uniquesEstimate": "uniques_estimate", - "queryCreationTime": "query_receive_time", - "queryTerminationTime": "query_finish_time", - "estimatedResult": "was_estimated", - "standardDeviations": "standard_deviations", - "normalizedRankError": "normalized_rank_error", - "maximumCountError": "maximum_count_error", - "itemsSeen": "items_seen", - "minimumValue": "minimum_value", - "maximumValue": "maximum_value" + "querySection": "Query", + "windowSection": "Window", + "sketchSection": "Sketch", + "theta": "Theta", + "uniquesEstimate": "Uniques Estimate", + "queryCreationTime": "Receive Time", + "queryTerminationTime": "Finish Time", + "estimatedResult": "Was Estimated", + "standardDeviations": "Standard Deviations", + "normalizedRankError": "Normalized Rank Error", + "maximumCountError": "Maximum Count Error", + "itemsSeen": "Items Seen", + "minimumValue": "Minimum Value", + "maximumValue": "Maximum Value", + "windowNumber": "Number", + "windowSize": "Size", + "windowEmitTime": "Emit Time", + "expectedEmitTime": "Expected Emit Time" } } } @@ -215,13 +232,14 @@ To cement all this, if you wanted an instance of the UI in your CI environment, Your UI on your CI environment will: - * POST to ```http://bullet-ws.dev.domain.com:4080/bullet/api/drpc``` for UI created Bullet queries - * GET the schema from ```http://bullet-ws.dev.domain.com:4080/bullet/api/columns``` + * Talk using Websockets to ```http://bullet-ws.dev.domain.com:4080/api/bullet/ws-query``` for UI created Bullet queries + * GET the schema from ```http://bullet-ws.dev.domain.com:4080/api/bullet/columns``` + * Validate queries in the BQL page with ```http://bullet-ws.dev.domain.com:4080/api/bullet/validate-query``` * Populate an additional link on the Help drop-down pointing to ```http://data.docs.domain.com``` * Allow queries to run as long as 300 seconds * Use 32768 in the help menu for the max number of unique elements that can be counted exactly * Allow only 50 points to be generated for Distribution queries - * GET and cache a defaultQuery from 
```http://bullet-ws.dev.domain.com:4080/custom-endpoint/api/defaultQuery``` + * GET and cache a default query from ```http://bullet-ws.dev.domain.com:4080/custom-endpoint/api/defaultQuery``` You would make express use these settings by running diff --git a/docs/ws/api-bql.md b/docs/ws/api-bql.md deleted file mode 100644 index db0f3df4..00000000 --- a/docs/ws/api-bql.md +++ /dev/null @@ -1,192 +0,0 @@ -# Bullet BQL API - -This section gives a comprehensive overview of the Web Service API for launching Bullet BQL queries. - -For examples of BQL queries, see the [examples page](examples.md). - -BQL queries that are received by the Web Service will be detenced and automatically converted to [the JSON format](api-json.md) before being sent to the backend (which requires the basic JSON format). This conversion is done in the web service using [the bullet-bql library](../releases/#bullet-bql). - -## Overview - -Bullet-BQL provides users with a friendly SQL-like API to submit queries to the Web Service instead of using the more cumbersome [JSON API](api-json.md). - -## Statement Syntax - - SELECT DISTINCT? select_clause - FROM from_clause - ( WHERE where_clause )? - ( GROUP BY groupBy_clause )? - ( HAVING having_clause )? - ( ORDER BY orderBy_clause )? - ( WINDOWING windowing_clause )? - ( LIMIT limit_clause )?; - -where `select_clause` is one of - - * - COUNT( DISTINCT reference_expr ( , reference_expr )? ) - group_function ( AS? ColumnReference )? ( , group_function ( AS? ColumnReference )? )? ( , reference_expr ( AS? ColumnReference )? )? - reference_expr ( AS? ColumnReference )? ( , reference_expr ( AS? ColumnReference )? )? - distribution_type( reference_expr, input_mode ) ( AS? ColumnReference )? - TOP ( ( Integer | Long ) ( , Integer | Long ) )? , reference_expr ( , reference_expr )? ) ( AS? ColumnReference )? - - -`reference_expr` is one of `ColumnReference` or `Dereference`. - -and `group_function` is one of `SUM(reference_expr)`, `MIN(reference_expr)`, `MAX(reference_expr)`, `AVG(reference_expr)` and `COUNT(*)`. `reference_expr` is one of ColumnReference and Dereference. `distribution_type` is one of `QUANTILE`, `FREQ` and `CUMFREQ`. The 1st number in `TOP` is K, and the 2nd number is an optional threshold. The `input_mode` is one of - - LINEAR, ( Integer | Long ) evenly spaced - REGION, ( Integer | Long ), ( Integer | Long ), ( Integer | Long ) evenly spaced in a region - MANUAL, ( Integer | Long ) (, ( Integer | Long ) )* defined points - -and `from_clause` is one of - - STREAM() default time duration will be set from BQLConfig - STREAM( ( Long | MAX ), TIME ) time based duration control. - STREAM( ( Long | MAX ), TIME, ( Long | MAX ), RECORD ) time and record based duration control. - -`RECORD` will be supported in the future. - -and `where_clause` is one of - - NOT where_clause - where_clause AND where_clause - where_clause OR where_clause - reference_expr IS NOT? NULL - reference_expr IS NOT? EMPTY - reference_expr IS NOT? DISTINCT FROM value_expr - reference_expr NOT? BETWEEN value_expr AND value_expr - reference_expr NOT? IN ( value_expr ( , value_expr )* ) - reference_expr NOT? LIKE ( value_expr ( , value_expr )* ) - reference_expr NOT? CONTAINSKEY ( value_expr ( , value_expr )* ) - reference_expr NOT? CONTAINSVALUE ( value_expr ( , value_expr )* ) - reference_expr ( = | <> | != | < | > | <= | >= ) value_expr - SIZEOF(reference_expr) ( = | <> | != ) value_expr - SIZEOF(reference_expr) NOT? IN ( value_expr ( , value_expr )* ) - SIZEOF(reference_expr) NOT? 
DISTINCT FROM value_expr - -`value_expr` is one of Null, Boolean, Integer, Long, Double, Decimal, String or `reference_expr`. - -and `groupBy_clause` is one of - - () group all - reference_expr ( , reference_expr )* group by - ( reference_expr ( , reference_expr )* ) group by - -and `HAVING` and `ORDER BY` are only supported for TopK. In which case, `having_clause` is - - COUNT(*) >= Integer - -and `orderBy_clause` is - - COUNT(*) - -and `windowing_clause` is one of - - ( EVERY, ( Integer | Long ), ( TIME | RECORD ), include ) - ( TUMBLING, ( Integer | Long ), ( TIME | RECORD ) ) - -`include` is one of - - ALL - FIRST, ( Integer | Long ), ( TIME | RECORD ) - LAST, ( Integer | Long ), ( TIME | RECORD ) will be supported - -and `limit_clause` is one of - - Integer | Long - ALL will be supported - -## Data Types - -* **Null**: `NULL`. - -* **Boolean**: `TRUE`, `FALSE`. - -* **Integer**: 32-bit signed two’s complement integer with a minimum value of `-2^31` and a maximum value of `2^31 - 1`. Example: `65`. - -* **Long**: 64-bit signed two’s complement integer with a minimum value of `-2^63 + 1` and a maximum value of `2^63 - 1`. Example: `9223372036854775807`, `-9223372036854775807`. - -* **Double**: 64-bit inexact, variable-precision with a minimum value of `2^-1074` and a maximum value of `(2-2^-52)·2^1023`. Example: `1.7976931348623157E+308`, `.17976931348623157E+309`, `4.9E-324`. - -* **Decimal**: decimal number can be treated as Double, String or ParsingException. This is controlled by `ParsingOptions`. `1.7976931348623157`, `.17976931348623157`. - -* **String**: character string which can have escapes. Example: `'this is a string'`, `'this is ''another'' string'`. - -* **ColumnReference**: representation of a column field. Unquoted ColumnReference must start with a letter or `_`. Example: `column_name` or `column_name.foo` or `column_name.foo.bar` or `column_name.0.bar`. - -* **All**: representation of all columns. Example: `*`. `column_name.*` is interpreted as `column_name`. 
- -## Reserved Keywords - -| Keyword | SQL:2016 | SQL-92 | -| --------------------- | :-------------: | :-----------: | -| `ALTER` | reserved | reserved | -| `AND` | reserved | reserved | -| `AS` | reserved | reserved | -| `BETWEEN` | reserved | reserved | -| `BY` | reserved | reserved | -| `CASE` | reserved | reserved | -| `CAST` | reserved | reserved | -| `CONSTRAINT` | reserved | reserved | -| `CREATE` | reserved | reserved | -| `CROSS` | reserved | reserved | -| `CUBE` | reserved | | -| `CURRENT_DATE` | reserved | reserved | -| `CURRENT_TIME` | reserved | reserved | -| `CURRENT_TIMESTAMP` | reserved | reserved | -| `CURRENT_USER` | reserved | | -| `DEALLOCATE` | reserved | reserved | -| `DELETE` | reserved | reserved | -| `DESCRIBE` | reserved | reserved | -| `DISTINCT` | reserved | reserved | -| `DROP` | reserved | reserved | -| `ELSE` | reserved | reserved | -| `END` | reserved | reserved | -| `ESCAPE` | reserved | reserved | -| `EXCEPT` | reserved | reserved | -| `EXECUTE` | reserved | reserved | -| `EXISTS` | reserved | reserved | -| `EXTRACT` | reserved | reserved | -| `FALSE` | reserved | reserved | -| `FOR` | reserved | reserved | -| `FROM` | reserved | reserved | -| `FULL` | reserved | reserved | -| `GROUP` | reserved | reserved | -| `GROUPING` | reserved | | -| `HAVING` | reserved | reserved | -| `IN` | reserved | reserved | -| `INNER` | reserved | reserved | -| `INSERT` | reserved | reserved | -| `INTERSECT` | reserved | reserved | -| `INTO` | reserved | reserved | -| `IS` | reserved | reserved | -| `JOIN` | reserved | reserved | -| `LEFT` | reserved | reserved | -| `LIKE` | reserved | reserved | -| `LOCALTIME` | reserved | | -| `LOCALTIMESTAMP` | reserved | | -| `NATURAL` | reserved | reserved | -| `NORMALIZE` | reserved | | -| `NOT` | reserved | reserved | -| `NULL` | reserved | reserved | -| `ON` | reserved | reserved | -| `OR` | reserved | reserved | -| `ORDER` | reserved | reserved | -| `OUTER` | reserved | reserved | -| `PREPARE` | reserved | reserved | -| `RECURSIVE` | reserved | | -| `RIGHT` | reserved | reserved | -| `ROLLUP` | reserved | | -| `SELECT` | reserved | reserved | -| `TABLE` | reserved | reserved | -| `THEN` | reserved | reserved | -| `TRUE` | reserved | reserved | -| `UESCAPE` | reserved | | -| `UNION` | reserved | reserved | -| `UNNEST` | reserved | | -| `USING` | reserved | reserved | -| `VALUES` | reserved | reserved | -| `WHEN` | reserved | reserved | -| `WHERE` | reserved | reserved | -| `WITH` | reserved | reserved | diff --git a/docs/ws/api-json.md b/docs/ws/api-json.md index c28328a8..5ac355f9 100644 --- a/docs/ws/api-json.md +++ b/docs/ws/api-json.md @@ -1,8 +1,8 @@ # Bullet JSON API -This section gives a comprehensive overview of the Web Service API for launching Bullet JSON queries. +This section gives a comprehensive overview of the old Web Service API for launching Bullet JSON queries. This was **deprecated** in favor of [BQL](api.md) in Bullet 1.0+. -The JSON API is the actual Query format that is expected by the backend. [The BQL API](api-bql.md) is a more user-friendly API which can also be used - the Web Service will automatically detect the BQL query and convert the query to this JSON format before submitting it to the backend. With the addition of Post Aggregations and Expressions, +The JSON API is the old Query format that was expected by the API and the Backend prior to Bullet 1.0. 
[The API](api.md) is a more user-friendly API which can also be used - the Web Service prior to Bullet 1.0 will automatically detect the BQL query and convert the query to this JSON format before submitting it to the backend. With the addition of Post Aggregations and Expressions, it is a lot easier to use BQL rather than construct the JSON. The Bullet Web Service also provides [an API](https://github.com/bullet-db/bullet-service/releases/tag/bullet-service-0.4.2) to convert BQL to JSON if you so desire. * For info on how to use the UI, see the [UI Usage section](../ui/usage.md) diff --git a/docs/ws/api.md b/docs/ws/api.md new file mode 100644 index 00000000..c7b8c044 --- /dev/null +++ b/docs/ws/api.md @@ -0,0 +1,147 @@ +# Bullet API + +This section gives a comprehensive overview of the Web Service API for launching Bullet queries. + +For examples of queries, see the [examples page](examples.md). + +BQL is the interface that is exposed to users to query Bullet. BQL queries that are received by the Web Service are converted to an underlying querying format before being sent to the backend. This conversion is done in the web service using [the bullet-bql library](../releases/#bullet-bql). + +## Overview + +Bullet-BQL provides users with a friendly SQL-like API to submit queries to the Web Service. + +## Statement Syntax + + SELECT select + FROM stream + ( WHERE expression )? + ( GROUP BY expression ( , expression )* )? + ( HAVING expression )? + ( ORDER BY orderBy )? + ( WINDOWING window )? + ( LIMIT Integer )? + ';'? + +where `select` is + + DISTINCT? selectItem ( , selectItem )* + +and `selectItem` is one of + + expression ( AS? identifier )? + * + +and `expression` is one of + + valueExpression + fieldExpression + listExpression + expression IS NULL + expression IS NOT NULL + unaryExpression + functionExpression + expression NOT? IN expression + expression RLIKE ANY? expression + expression ( * | / ) expression + expression ( + | - ) expression + expression ( < | <= | > | >= ) ( ANY | ALL )? expression + expression ( = | != ) ( ANY | ALL )? expression + expression AND expression + expression XOR expression + expression OR expression + ( expression ) + +where `valueExpression` is one of Null, Boolean, Integer, Long, Float, Double, or String + +and `fieldExpression` is one of + + identifier ( : fieldType )? + identifier [ Integer ] ( : fieldType )? + identifier [ Integer ] . identifier ( : fieldType )? + identifier . identifier ( : fieldType )? + identifier . identifier . identifier ( : fieldType )? 
+ +`fieldType` is one of + + primitiveType + LIST [ primitiveType ] + MAP [ primitiveType ] + LIST [ MAP [ primitiveType ] ] + MAP [ MAP [ primitiveType ] ] + +and `primitiveType` is `INTEGER`, `LONG`, `FLOAT`, `DOUBLE`, `BOOLEAN`, or `STRING` + +where `listExpression` is one of + + [] + [ expression ( , expression )* ] + +`unaryExpression` is + + ( NOT | SIZEOF ) ( expression ) with optional parentheses + +`functionExpression` is one of + + ( SIZEIS | CONTAINSKEY | CONTAINSVALUE | FILTER ) ( expression, expression ) + IF ( expression ( , expression )* ) three arguments + aggregateExpression + CAST ( expression AS primitiveType ) + +where `aggregateExpression` is one of + + COUNT ( * ) + ( SUM | AVG | MIN | MAX ) ( expression ) + COUNT ( DISTINCT expression ( , expression )* ) + distributionType ( expression, inputMode ) + TOP ( Integer ( , Integer )?, expression ( , expression )* ) + +where `distributionType` is `QUANTILE`, `FREQ`, or `CUMFREQ` + +and `inputMode` is one of + + LINEAR, Integer evenly spaced + REGION, Number, Number, Number evenly spaced in a region + MANUAL, Number ( , Number )* defined points + + +and `stream` is one of + + STREAM() default time duration will be set from BQLConfig + STREAM( ( Integer | MAX ), TIME ) time based duration control + +`RECORD` will be supported in the future. + +and `orderBy` is + + expression ( ASC | DESC )? ( , expression ( ASC | DESC )? )* + +and `window` is one of + + EVERY ( Integer, ( TIME | RECORD ), include ) + TUMBLING ( Integer, ( TIME | RECORD ) ) + +`include` is one of + + ALL + FIRST, Integer, ( TIME | RECORD ) + + +## Data Types + +* **Null**: `NULL`. + +* **Boolean**: `TRUE`, `FALSE`. + +* **Integer**: 32-bit signed two’s complement integer with a minimum value of `-2^31` and a maximum value of `2^31 - 1`. Example: `65`. + +* **Long**: 64-bit signed two’s complement integer with a minimum value of `-2^63 + 1` and a maximum value of `2^63 - 1`. Example: `9223372036854775807`, `-9223372036854775807`. + +* **Float**: 32-bit inexact, variable-precision with a minimum value of `2^-149` and a maximum value of `(2-2^-23)·2^127`. Example: `1.70141183E+38`, `1.17549435E-38`, `0.15625`. + +* **Double**: 64-bit inexact, variable-precision with a minimum value of `2^-1074` and a maximum value of `(2-2^-52)·2^1023`. Example: `1.7976931348623157E+308`, `.17976931348623157E+309`, `4.9E-324`. + +* **String**: character string which can have escapes. Example: `'this is a string'`, `'this is ''another'' string'`. + +* **Identifier**: representation of a field. Unquoted identifier must start with a letter or `_`. Example: `column_name`, `column_name.foo`, `column_name.foo.bar`, `column_name[0].bar`, or `"123column"`. + +* **All**: representation of all fields. Example: `*`. diff --git a/docs/ws/examples.md b/docs/ws/examples.md index e870fc89..0f7f367c 100644 --- a/docs/ws/examples.md +++ b/docs/ws/examples.md @@ -24,7 +24,7 @@ If you wanted to write a smaller or shorter query to, for example, quickly test !!! note "WINDOW?" - There is only one unified data stream in Bullet, so for clarity the ```FROM``` clause is given a ```STREAM``` function to denote the look-forward time window for the Bullet query. + There is only one unified data stream in Bullet, so for clarity the ```FROM``` clause is given a ```STREAM``` function to denote the look-forward time window for the Bullet query. 
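+For instance, a minimal query that looks at the stream for a minute and returns a count of the records seen in each 2 second tumbling window could be:
+
+```
+SELECT COUNT(*) FROM STREAM(60000, TIME) WINDOWING TUMBLING(2000, TIME);
+```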
### Simple Filtering @@ -81,7 +81,7 @@ WHERE id = 'btsg8l9b234ha' AND page_id IS NOT NULL LIMIT 10; ``` -The above query finds all events where id is set to 'btsg8l9b234ha' and page_id is not null, projects the fields selected above with their aliases (timestamp as ts, etc.) and limits the results to at most 10 records. The query would wait at most 20 seconds for records to show up. +The above query finds all events where id is set to 'btsg8l9b234ha' and page_id is not null, projects the fields selected above with their aliases (timestamp as ts, etc.) and limits the results to at most 10 records. The query would wait at most 20 seconds for records to show up. The resulting response could look like (only 3 events were generated that matched the criteria): @@ -123,7 +123,7 @@ The resulting response could look like (only 3 events were generated that matche For the following examples, we will simply show and explain the queries. They also use the extended syntax for specifying values in a filter using the ```kind``` field. -#### SIZEIS Filter +#### SIZEOF Filter This query checks to see if the size of the ```data_map``` is equal to 4 and returns all records that satisfy this. diff --git a/docs/ws/setup.md b/docs/ws/setup.md index 15e7d695..ce931cff 100644 --- a/docs/ws/setup.md +++ b/docs/ws/setup.md @@ -2,16 +2,14 @@ The Web Service is a Java JAR file that you can deploy on a machine to communicate with the Bullet Backend. You then plug in a particular Bullet PubSub implementation such as [Kafka PubSub](../pubsub/kafka.md) or [Storm DRPC PubSub](../pubsub/storm-drpc.md). For an example on how to set up a Bullet backend, see the [Storm example setup](../backend/storm-setup.md). -There are two main purposes for this layer at this time: +There are three main purposes for this layer at this time: -1) It provides an endpoint that can serve a [JSON API schema](http://jsonapi.org/format/) for the Bullet UI. Currently, static schemas from a file are supported. +1) It converts queries and sends them through the PubSub to the backend. It handles responses from the backend for both synchronous and asynchronous queries. -2) It generates unique identifiers and other metadata for a JSON Bullet query before sending the query to the Bullet backend. It wraps errors if the backend is unreachable. +2) It provides an endpoint that can serve a [JSON API schema](http://jsonapi.org/format/) for the Bullet UI. Currently, static schemas from a file are supported. +3) It manages metadata for queries, such as unique identifiers, and stores queries for resilience for Backends that support replaying. -!!! note "That's it?" - - The Web Service essentially just wraps the PubSub layer and provides some helpful endpoints. When incremental updates drop, it will translate a PubSub's streaming responses back into incremental results for the user. It is also there to be a point of abstraction for implementing things like security, monitoring, access-control, rate-limiting, sharding, different query formats (e.g. SQL Bullet queries) etc, which are planned in the near future. ## Prerequisites @@ -48,18 +46,39 @@ You can also add ```sources``` or ```javad ## Configuration -There are two levels of configuration: +There are a few different modules in the Web Service: + +1. **API**: Configure the Web Service, the web server, the names of various endpoints, and other Spring Boot settings. You can also configure certain top-level settings for the various modules below - such as the number of publishers and subscribers to use for the PubSub etc.
+2. **PubSub**: Configure what PubSub to use and the various settings for it. +3. **Schema** (Optional): Configure the Schema file (that powers the [UI](../ui/usage.md)). +4. **Query** (Optional): Configure the various query defaults for queries coming into the API. You can also point to the schema used by the BQL module to do type-checking and other semantic validation. +5. **Storage** (Optional): Configure what Storage to use and the various settings for it through another configuration file. +6. **Asynchronous Queries** (Optional): Configure the Asynchronous query module which lets you send queries to an API but not wait for the results. The results, when received, are sent through a PubSubResponder interface that you can plug in - such as email or writing to another PubSub etc. +7. **Metrics** (Optional): Configure the Metrics collection system which collects various statistics about the endpoints and sends them through a Publisher interface that you can plug in. You can use this for monitoring status codes and errors. +8. **Status** (Optional): Configure the Status checking system which disables the API if the backend is down or unreachable. It works by sending a simple query through and waiting for results periodically. + +### API Configuration + +Take a look at the [settings](https://github.com/bullet-db/bullet-service/blob/master/src/main/resources/application.yaml) for a list of the settings that are configured. The Web Service settings start with ```bullet.```. You can configure various WebSocket settings and other API level configuration. + +If you provide a custom settings ```application.yaml```, you will **need** to specify the default values in this file since the framework uses your file instead of these defaults. You can also pass in overrides as command-line arguments when launching the server. + +#### Spring Boot Configuration + +You can also configure various Spring and web server settings here. Take a look at [this page](https://docs.spring.io/spring-boot/docs/current/reference/html/common-application-properties.html) for the various values you can supply. -1. You can configure the Web Service, the web server and other Spring Boot settings through a configuration file (or through the command line when launching). By default, this file is called ```application.yaml```. This is where settings like where to find your schema file (optional for if you want to power the [UI](../ui/usage.md) schema with a static file) or how many publishers and subscribers to use for your PubSub etc. -2. You can configure what PubSub to use, the various settings for it through another configuration file. The location for this file is provided in the Web Service configuration step above. +### PubSub Configuration + +You configure the PubSub by providing a configuration YAML file and setting the ```bullet.pubsub.config``` to its path. In *that* file, you will set these two settings at a minimum (see the sketch below): -### Web Service Configuration +1. ```bullet.pubsub.class.name``` should be set to the fully qualified class name of your PubSub implementation. Example: ```com.yahoo.bullet.kafka.KafkaPubSub``` for the [Kafka PubSub](../pubsub/kafka.md). +2. ```bullet.pubsub.context.name: QUERY_SUBMISSION```. The Web Service requires the PubSub to be in the ```QUERY_SUBMISSION``` context. -Take a look at the [settings](https://github.com/bullet-db/bullet-service/blob/master/src/main/resources/application.yaml) for a list of the settings that are configured. The Web Service settings start with ```bullet.```.
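+A minimal sketch of such a file, assuming the Kafka PubSub, could look like the following; the Kafka-specific keys are only illustrative and your chosen PubSub documents its own settings:
+
+```yaml
+# Plug in the PubSub implementation and run it in query submission mode
+bullet.pubsub.class.name: "com.yahoo.bullet.kafka.KafkaPubSub"
+bullet.pubsub.context.name: "QUERY_SUBMISSION"
+# PubSub-specific settings follow, for example the Kafka request and response topics (illustrative)
+bullet.pubsub.kafka.request.topic.name: "bullet.requests"
+bullet.pubsub.kafka.response.topic.name: "bullet.responses"
+```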
+You will also specify other parameters that your chosen PubSub requires or can use. -If you provide a custom settings ```application.yaml```, you will **need** to specify the default values in this file since the framework uses your file instead of these defaults. +In the top level configuration for the PubSub in ```application.yaml```, you may configure the number of threads for reading from and writing to the PubSub as well as enabling and configuring the built-in [REST PubSub](../pubsub/rest.md) if you choose to use that. -#### File based schema +### Schema Configuration The Web Service can also provide an endpoint that serves your data schema to your UI. You do not necessarily have to use this to serve your schema. The UI can use any JSON API schema specification. But if your schema is fixed or does not change often, it might be simpler for you to use this endpoint to provide the schema for the UI, instead of creating a new one. The Web Service also takes care to provide the right [CORS](https://developer.mozilla.org/en-US/docs/Web/HTTP/Access_control_CORS) headers so that your UI can communicate with it. @@ -67,18 +86,39 @@ You can use [sample_columns.json](https://github.com/bullet-db/bullet-service/bl Once you have your schema file, you can provide it to the Web Service by setting the ```bullet.schema.file``` to the path to your file. -#### Spring Boot Configuration -You can also configure various Spring and web server here. Take a look at [this page](https://docs.spring.io/spring-boot/docs/current/reference/html/common-application-properties.html) page for the various values you can supply. +### Query Configuration -### PubSub Configuration +You can provide a file containing the various query defaults and maximums by using the `bullet.query.config` setting. This is used by the query building module to make sure incoming queries respect various configurations provided here such as default aggregation sizes or minimum window emit intervals etc. You can also point to a schema file (ideally the same one used if you chose to enable the schema module) that the query builder layer can use for advanced checking. See [the defaults](https://github.com/bullet-db/bullet-service/blob/master/src/main/resources/query_defaults.yaml) for more information. -You configure the PubSub by providing a configuration YAML file and setting the ```bullet.pubsub.config``` to its path. In *that* file, you will set these two settings at a minimum: +!!! note "Querying and Schema" + If you provide a schema, the query creation layer in BQL can leverage this for type-checking and advanced semantic validation. This can error out otherwise erroneous queries right at the API layer without having to run a query that returns no results or errors out in the backend. -1. ```bullet.pubsub.class.name``` should be set to the fully qualified package to your PubSub implementation. Example: ```com.yahoo.bullet.kafka.KafkaPubSub``` for the [Kafka PubSub](../pubsub/kafka.md). -2. ```bullet.pubsub.context.name: QUERY_SUBMISSION```. The Web Service requires the PubSub to be in the ```QUERY_SUBMISSION``` context. +### Storage Configuration -You will also specify other parameters that your chosen PubSub requires or can use. +This module lets you set up a Storage layer to write queries to when submitted. These are cleaned up when the query is terminated and the final result sent back to the API. This is particularly relevant if your Bullet instance is fielding long-running queries that need to be resilient.
This, coupled with a Backend implementation that can leverage the Storage, lets you recreate queries in the Backend in case of component failure or restarts. The Storage layer is also particularly relevant if you're using the asynchronous query module with a PubSubResponder interface that relies on the Storage to do additional metadata lookups. + +You can configure and provide a Storage implementation by implementing the [StorageManager interface](https://github.com/bullet-db/bullet-core/blob/master/src/main/java/com/yahoo/bullet/storage/StorageManager.java). Note that you cannot turn off the Storage module in the API but by default, the `NullStorageManager` is used, which does nothing. You can provide a configuration yaml file that supplies your particular settings for your StorageManager by using the `bullet.storage.config` setting. See [the defaults](https://github.com/bullet-db/bullet-service/blob/master/src/main/resources/storage_defaults.yaml) for more details. + +!!! note "So you DO have persistence?" + + This is not the same as storing the data. Bullet's philosophy is to avoid storing the incoming data stream that it is fielding queries on. This layer is meant for storing query related information. When we extend the storage to storing intermediate results in the backend for extra resiliency between windows, the size of the storage should still be well defined for sketch-based aggregations. + +### Asynchronous Query Configuration + +This module enables the asynchronous query submission endpoint (the `bullet.endpoint.async` setting) that lets you submit queries to it without having to hang around for the results to stream back. Instead, you use [the PubSubResponder interface](https://github.com/bullet-db/bullet-core/blob/master/src/main/java/com/yahoo/bullet/pubsub/PubSubResponder.java) to provide an instance that is used to write results that come back for that query. You can use this for *alerting* use-cases where you need to send e-mails on certain alert queries being triggered or if you want your results written to a PubSub that you can consume in a different manner etc. + +By default, this module is disabled. However, out of the box, it is configured to use [a standard Bullet PubSubResponder](https://github.com/bullet-db/bullet-core/blob/master/src/main/java/com/yahoo/bullet/pubsub/BulletPubSubResponder.java) that we provide to write the result back to a REST PubSub that is assumed to be running locally. You can change this to write the results to your own PubSub if you desire or plug in something else entirely. You can provide a configuration yaml file that supplies your particular settings for your PubSubResponder by using the `bullet.async.config` setting. See the [defaults](https://github.com/bullet-db/bullet-service/blob/master/src/main/resources/async_defaults.yaml) for more information. + +### Metrics Configuration + +This module lets you monitor the Web Service for information on what is happening. It tracks the various status codes and publishes them using the [MetricPublisher interface](https://github.com/bullet-db/bullet-core/blob/master/src/main/java/com/yahoo/bullet/common/metrics/MetricPublisher.java) to a place of your choice. By default, the [HTTPMetricEventPublisher](https://github.com/bullet-db/bullet-core/blob/master/src/main/java/com/yahoo/bullet/common/metrics/HTTPMetricEventPublisher.java) is configured, which can post to a URL of your choice.
+ +You can provide a configuration yaml file that supplies your particular settings for your MetricPublisher by using the `bullet.metric.config` setting. See the [defaults](https://github.com/bullet-db/bullet-service/blob/master/src/main/resources/metric_defaults.yaml) for more information. + +### Status Configuration + +This module periodically sends a *tick* query to the backend to make sure it is functioning properly. You can configure various settings for it here. If enabled, this module can disable the whole API if the backend is unreachable. This can be used if you front multiple Web Service instances talking to different instances of a backend behind a proxy and take down the backends one at a time for upgrades. ## Launch @@ -114,4 +154,4 @@ You should receive a random record flowing through Bullet instantly (if you left If you provided a path to a schema file in your configuration file when you [launch](#launch) the Web Service, you can also HTTP GET your schema at ```http://localhost:5555/api/bullet/columns``` -If you did not, the schema in [sample_columns.json](https://github.com/bullet-db/bullet-service/blob/master/src/main/resources/sample_columns.json) is the response. The Web Service converts it to a JSON API response and provides the right headers for CORS. +If you did not, the schema in [sample_fields.json](https://github.com/bullet-db/bullet-service/blob/master/src/main/resources/sample_fields.json) is the response. The Web Service converts it to a JSON API response and provides the right headers for CORS. diff --git a/examples/Makefile b/examples/Makefile index ca849b77..3b39aba9 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -17,7 +17,7 @@ build-spark: mkdir -p bullet-examples/backend/spark cp install-all-spark.sh bullet-examples/ cd spark && mvn package - cp spark/target/bullet-spark-example-0.0.1-SNAPSHOT.jar bullet-examples/backend/spark/bullet-spark-example.jar + cp spark/target/bullet-spark-example-*-SNAPSHOT.jar bullet-examples/backend/spark/bullet-spark-example.jar cp spark/src/main/resources/bullet_spark_kafka_settings.yaml bullet-examples/backend/spark build-web-service: diff --git a/examples/install-all-spark.sh b/examples/install-all-spark.sh index 236a69b5..4cd21eff 100755 --- a/examples/install-all-spark.sh +++ b/examples/install-all-spark.sh @@ -2,15 +2,15 @@ set -euo pipefail -BULLET_EXAMPLES_VERSION=0.6.1 -BULLET_UI_VERSION=0.6.2 -BULLET_WS_VERSION=0.4.3 -BULLET_KAFKA_VERSION=0.3.2 -BULLET_SPARK_VERSION=0.2.2 -KAFKA_VERSION=0.11.0.1 -SPARK_VERSION=2.2.1 -NVM_VERSION=0.33.1 -NODE_VERSION=6.9.4 +BULLET_EXAMPLES_VERSION=1.0.0 +BULLET_UI_VERSION=1.0.0 +BULLET_WS_VERSION=1.0.0 +BULLET_KAFKA_VERSION=1.0.1 +BULLET_SPARK_VERSION=1.0.0 +KAFKA_VERSION=2.3.1 +SPARK_VERSION=3.0.1 +NVM_VERSION=0.37.2 +NODE_VERSION=10.20.1 KAFKA_TOPIC_REQUESTS=bullet.requests KAFKA_TOPIC_RESPONSES=bullet.responses @@ -130,42 +130,37 @@ create_topics() { println "Done!" } -install_web_service() { - local BULLET_WEB_SERVICE="bullet-service-${BULLET_WS_VERSION}-embedded.jar" - - println "Downloading Bullet Web Service version ${BULLET_WS_VERSION}..." - download "http://jcenter.bintray.com/com/yahoo/bullet/bullet-service/${BULLET_WS_VERSION}" "${BULLET_WEB_SERVICE}" - - println "Installing Bullet Web Service..." 
- cp ${BULLET_DOWNLOADS}/${BULLET_WEB_SERVICE} ${BULLET_HOME}/service/ - cp ${BULLET_EXAMPLES}/web-service/example_kafka_pubsub_config.yaml ${BULLET_HOME}/service/ - cp ${BULLET_EXAMPLES}/web-service/example_columns.json ${BULLET_HOME}/service/ - export BULLET_WS_JAR=${BULLET_HOME}/service/${BULLET_WEB_SERVICE} +launch_bullet_web_service() { + local BULLET_WS_JAR="bullet-service-${BULLET_WS_VERSION}-embedded.jar" + local BULLET_SERVICE_HOME="${BULLET_HOME}/service" - println "Done!" -} + println "Downloading Bullet Web Service ${BULLET_WS_VERSION}..." + download "http://jcenter.bintray.com/com/yahoo/bullet/bullet-service/${BULLET_WS_VERSION}" "${BULLET_WS_JAR}" -launch_web_service() { - local BULLET_SERVICE_HOME="${BULLET_HOME}/service" + println "Configuring Bullet Web Service and plugging in Kafka PubSub..." + cp "${BULLET_DOWNLOADS}/${BULLET_WS_JAR}" "${BULLET_SERVICE_HOME}/bullet-service.jar" + cp "${BULLET_EXAMPLES}/web-service/"example_* "${BULLET_SERVICE_HOME}/" - println "Launching Bullet Web Service..." + println "Launching Bullet Web Service with the Kafka PubSub..." cd "${BULLET_SERVICE_HOME}" - java -Dloader.path=${BULLET_HOME}/pubsub/bullet-kafka-${BULLET_KAFKA_VERSION}-fat.jar -jar ${BULLET_WS_JAR} \ - --bullet.pubsub.config=${BULLET_SERVICE_HOME}/example_kafka_pubsub_config.yaml \ - --bullet.schema.file=${BULLET_SERVICE_HOME}/example_columns.json \ - --server.port=9999 \ - --logging.path=${BULLET_SERVICE_HOME} \ - --logging.file=log.txt &> ${BULLET_SERVICE_HOME}/log.txt & + java -Dloader.path=${BULLET_HOME}/pubsub/bullet-kafka-${BULLET_KAFKA_VERSION}-fat.jar -jar ${BULLET_SERVICE_HOME}/bullet-service.jar \ + --bullet.pubsub.config=${BULLET_SERVICE_HOME}/example_kafka_pubsub_config.yaml \ + --bullet.query.config=${BULLET_SERVICE_HOME}/example_query_config.yaml \ + --bullet.schema.file=${BULLET_SERVICE_HOME}/example_columns.json \ + --server.port=9999 --logging.path="${BULLET_SERVICE_HOME}" \ + --logging.file=log.txt &> "${BULLET_SERVICE_HOME}/log.txt" & println "Sleeping for 15 s to ensure Bullet Web Service is up..." sleep 15 - println "Getting one random record from Bullet through the Web Service..." - curl -s -H 'Content-Type: text/plain' -X POST -d '{"aggregation": {"size": 1}}' http://localhost:9999/api/bullet/sse-query - println "" + println "Testing the Web Service" println "Getting column schema from the Web Service..." println "" curl -s http://localhost:9999/api/bullet/columns + println "" + println "Getting one random record from Bullet through the Web Service..." + curl -s -H 'Content-Type: text/plain' -X POST -d 'SELECT * FROM STREAM(2000, TIME) LIMIT 1;' http://localhost:9999/api/bullet/queries/sse-query + println "" println "Finished Bullet Web Service test!" } @@ -217,7 +212,7 @@ launch_bullet_spark() { install_node() { # NVM unset var bug - set +u + set +eu println "Trying to install nvm. If there is a failure, manually perform: " println " curl -s https://raw.githubusercontent.com/creationix/nvm/v${NVM_VERSION}/install.sh | bash" @@ -236,7 +231,7 @@ install_node() { nvm install "v${NODE_VERSION}" nvm use "v${NODE_VERSION}" - set -u + set -eu println "Done!" 
} @@ -269,7 +264,7 @@ cleanup() { pkill -f "[e]xpress-server.js" pkill -f "[e]xample_kafka_pubsub_config.yaml" - pkill -f "[b]ullet-spark" + pkill -9 -f "[b]ullet-spark" ${KAFKA_INSTALL_DIR}/bin/kafka-server-stop.sh ${KAFKA_INSTALL_DIR}/bin/zookeeper-server-stop.sh @@ -316,8 +311,7 @@ launch() { install_bullet_spark launch_bullet_spark - install_web_service - launch_web_service + launch_bullet_web_service install_node launch_bullet_ui diff --git a/examples/install-all-storm.sh b/examples/install-all-storm.sh index 414a6421..b46cb980 100755 --- a/examples/install-all-storm.sh +++ b/examples/install-all-storm.sh @@ -2,12 +2,12 @@ set -euo pipefail -BULLET_EXAMPLES_VERSION=0.6.1 -BULLET_UI_VERSION=0.6.2 -BULLET_WS_VERSION=0.4.3 -STORM_VERSION=1.2.2 -NVM_VERSION=0.33.1 -NODE_VERSION=6.9.4 +BULLET_EXAMPLES_VERSION=1.0.0 +BULLET_UI_VERSION=1.0.0 +BULLET_WS_VERSION=1.0.0 +STORM_VERSION=2.2.0 +NVM_VERSION=0.37.2 +NODE_VERSION=10.20.1 println() { local DATE @@ -132,7 +132,8 @@ launch_bullet_web_service() { println "Launching Bullet Web Service with the built-in REST PubSub enabled..." cd "${BULLET_SERVICE_HOME}" java -jar ${BULLET_SERVICE_HOME}/bullet-service.jar \ - --bullet.pubsub.config=${BULLET_SERVICE_HOME}/example_rest_pubsub_config.yaml \ + --bullet.pubsub.config=${BULLET_SERVICE_HOME}/example_rest_pubsub_config.yaml \ + --bullet.query.config=${BULLET_SERVICE_HOME}/example_query_config.yaml \ --bullet.schema.file=${BULLET_SERVICE_HOME}/example_columns.json \ --server.port=9999 --bullet.pubsub.builtin.rest.enabled=true --logging.path="${BULLET_SERVICE_HOME}" \ --logging.file=log.txt &> "${BULLET_SERVICE_HOME}/log.txt" & @@ -141,19 +142,19 @@ launch_bullet_web_service() { sleep 15 println "Testing the Web Service" - println "" - println "Getting one random record from Bullet through the Web Service..." - curl -s -H 'Content-Type: text/plain' -X POST -d '{"aggregation": {"size": 1}}' http://localhost:9999/api/bullet/sse-query - println "" println "Getting column schema from the Web Service..." println "" curl -s http://localhost:9999/api/bullet/columns + println "" + println "Getting one random record from Bullet through the Web Service..." + curl -s -H 'Content-Type: text/plain' -X POST -d 'SELECT * FROM STREAM(2000, TIME) LIMIT 1;' http://localhost:9999/api/bullet/queries/sse-query + println "" println "Finished Bullet Web Service test!" } install_node() { # NVM unset var bug - set +u + set +eu println "Trying to install nvm. If there is a failure, manually perform: " println " curl -s https://raw.githubusercontent.com/creationix/nvm/v${NVM_VERSION}/install.sh | bash" @@ -172,7 +173,7 @@ install_node() { nvm install "v${NODE_VERSION}" nvm use "v${NODE_VERSION}" - set -u + set -eu println "Done!" 
} diff --git a/examples/spark/bin/launch.sh b/examples/spark/bin/launch.sh deleted file mode 100644 index d23299ab..00000000 --- a/examples/spark/bin/launch.sh +++ /dev/null @@ -1,2 +0,0 @@ - -# This will launch bullet-spark diff --git a/examples/spark/pom.xml b/examples/spark/pom.xml index 78696ba9..a173904a 100644 --- a/examples/spark/pom.xml +++ b/examples/spark/pom.xml @@ -7,11 +7,12 @@ 1.0.0-SNAPSHOT jar - 2.11.7 - 2.11 - 2.3.0 + 2.12.8 + 2.12 + 3.0.1 1.0.0 - 1.0.0 + 1.1.0 + 1.9.2 @@ -56,6 +57,11 @@ bullet-record ${bullet.record.version} + + org.apache.avro + avro + ${avro.version} + src/main/scala diff --git a/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml b/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml index ba4c7312..f7827b78 100644 --- a/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml +++ b/examples/spark/src/main/resources/bullet_spark_kafka_settings.yaml @@ -100,7 +100,8 @@ bullet.pubsub.kafka.response.topic.name: "bullet.responses" ######################################################################################################################## ######################################################################################################################## bullet.query.aggregation.raw.max.size: 500 -bullet.query.aggregation.max.size: 1024 +# This setting is enforced in the API at this time +# bullet.query.aggregation.max.size: 1024 bullet.query.aggregation.count.distinct.sketch.entries: 16384 bullet.query.aggregation.group.sketch.entries: 1024 bullet.query.aggregation.distribution.sketch.entries: 1024 diff --git a/examples/storm/pom.xml b/examples/storm/pom.xml index 5824d42c..e625e022 100644 --- a/examples/storm/pom.xml +++ b/examples/storm/pom.xml @@ -27,9 +27,9 @@ 1.8 1.8 1.0.0 - 1.0.0 - 1.0.0 - 2.1.0 + 1.2.0 + 1.1.0 + 2.2.0 @@ -56,6 +56,24 @@ test --> + + + com.yahoo.bullet + bullet-storm + ${bullet.storm.version} + + + com.yahoo.bullet + bullet-record + ${bullet.record.version} + + + com.yahoo.bullet + bullet-core + ${bullet.core.version} + + log4j log4j @@ -91,21 +109,6 @@ ${storm.version} --> - - com.yahoo.bullet - bullet-storm - ${bullet.storm.version} - - - com.yahoo.bullet - bullet-record - ${bullet.record.version} - - - com.yahoo.bullet - bullet-core - ${bullet.core.version} - diff --git a/examples/storm/src/main/java/com/yahoo/bullet/storm/examples/RandomSpout.java b/examples/storm/src/main/java/com/yahoo/bullet/storm/examples/RandomSpout.java index 9bae1659..f3235ef1 100644 --- a/examples/storm/src/main/java/com/yahoo/bullet/storm/examples/RandomSpout.java +++ b/examples/storm/src/main/java/com/yahoo/bullet/storm/examples/RandomSpout.java @@ -35,7 +35,7 @@ public class RandomSpout extends BaseRichSpout { protected SpoutOutputCollector outputCollector; public static final String RECORD_FIELD = "record"; - // This is the message ID for all tuples. This enables acking from this Spout to the FilterBolt. However + // This is the message ID for all tuples. This enables acking from this Spout to the Filterbolt. However // this spout does not handle dealing with failures. So, we use this as a way to simply enable acking. 
public static final Long DUMMY_ID = 42L; diff --git a/examples/storm/src/main/resources/bullet_settings.yaml b/examples/storm/src/main/resources/bullet_settings.yaml index 1c852357..82c7d50c 100644 --- a/examples/storm/src/main/resources/bullet_settings.yaml +++ b/examples/storm/src/main/resources/bullet_settings.yaml @@ -39,7 +39,8 @@ bullet.topology.bullet.spout.memory.off.heap.load: 160.0 # Bullet Core settings bullet.query.aggregation.raw.max.size: 500 -bullet.query.aggregation.max.size: 1024 +# This setting is enforced in the API at this time +# bullet.query.aggregation.max.size: 1024 bullet.query.aggregation.count.distinct.sketch.entries: 16384 bullet.query.aggregation.group.sketch.entries: 1024 bullet.query.aggregation.distribution.sketch.entries: 1024 diff --git a/examples/ui/env-settings.json b/examples/ui/env-settings.json index 27643f77..9c1bc744 100644 --- a/examples/ui/env-settings.json +++ b/examples/ui/env-settings.json @@ -1,8 +1,9 @@ { "default": { "queryHost": "http://localhost:9999", - "queryNamespace": "api/bullet", + "queryNamespace": "api/bullet/queries", "queryPath": "ws-query", + "validationPath": "validate-query", "queryStompRequestChannel": "/server/request", "queryStompResponseChannel": "/client/response", "schemaHost": "http://localhost:9999", @@ -14,14 +15,14 @@ } ], "bugLink": "https://github.com/bullet-db/bullet-ui/issues", - "modelVersion": 3, + "modelVersion": 4, "migrations": { "deletions": "query" }, "defaultValues": { "aggregationMaxSize": 1024, "rawMaxSize": 500, - "durationMaxSecs": 86400, + "durationMaxSecs": 9007199254740, "distributionNumberOfPoints": 11, "distributionQuantilePoints": "0, 0.25, 0.5, 0.75, 0.9, 1", "distributionQuantileStart": 0, @@ -29,7 +30,7 @@ "distributionQuantileIncrement": 0.1, "windowEmitFrequencyMinSecs": 1, "everyForRecordBasedWindow": 1, - "everyForTimeBasedWindow": 2, + "everyForTimeBasedWindow": 2000, "sketches": { "countDistinctMaxEntries": 16384, "groupByMaxEntries": 512, diff --git a/examples/web-service/example_query_config.yaml b/examples/web-service/example_query_config.yaml new file mode 100644 index 00000000..aa1b6dc1 --- /dev/null +++ b/examples/web-service/example_query_config.yaml @@ -0,0 +1,4 @@ +# Custom or notable settings for the example +# Settings not overridden will default to values in bullet_defaults.yaml in the bullet-core artifact. + +bullet.query.aggregation.max.size: 1024 diff --git a/mkdocs.yml b/mkdocs.yml index 82939f59..8c8a9b22 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -32,9 +32,9 @@ pages: - Web Service: - Setup: ws/setup.md - API: - - BQL: ws/api-bql.md - - JSON: ws/api-json.md - - Query Examples: ws/examples.md + - BQL: ws/api.md + - Examples: ws/examples.md + - JSON (DEPRECATED): ws/api-json.md - UI: - Setup: ui/setup.md - Usage: ui/usage.md @@ -50,7 +50,7 @@ markdown_extensions: extra: collapse_toc: true include_search: true - service_version: v0.6.1 + service_version: v1.0.0 extra_css: - css/extra.css