Skip to content

Commit

Permalink
1.2.3 release changes (#298)
Browse files Browse the repository at this point in the history
* update docs (#276)

* Update Go version and migrate to Go modules (#287)

* Adding go.mod and go.sum to migrate to Go modules from Glide

* Removed old Vendor folder after migrating to Go modules

* Adding new Vendor folder after migrating to Go modules

* Removed Glide files, references in readme and updated CI/CD

* Migrated from bolt to bbolt for cache module to resolve race issue

* Added tags from the envelope to the events

* Updated tile.yml with new config

* Updated README

* Updated migration script to prevent unnecessary migration (#288)

* Updated cache.GetApp logic to retrieve app info from boltdb database as well

* Bugfix/disconnect due to slow consumer event drop (#289)

* dropping events when the queue is full

* Corrected the newly added test case with data race condition

* minor change in logging

* added event drop warning threshold in config

* minor update in the logging

* Updated comments and log info

* Trimmed white space for ApiEndPoint and SplunkHost

* Updated docs with required field & optional value

* Added default value in README

* Updated events doc

* Removed SPLUNK_VERSION config as not needed anymore

* Removed SplunkVersion from eventsink.splunk_test

* Updated README with app level index routing (#292)

* Updated README with app level index routing

* Removed unnecessary changes

* added required config to enable index routing

* updated index routing doc

* added warning in index routing docs

* upgraded module version

* upgraded gogo/protobuf version

* executed go mod tidy

* upgraded indirect dependency

* updated version in a build example in readme

* Corrected index routing docs

* Added more details (#299)

* Updated docs around

- Add Tags: Performance impact of adding tags in events
- How to detect slow downstream related issues and event drops

* added 'drop_warn_threshold' config in tile (#300)

Co-authored-by: Matthew Heidemann <[email protected]>
Co-authored-by: Shubham Jain <[email protected]>
Co-authored-by: harshit-splunk <[email protected]>
Co-authored-by: harshit-splunk <[email protected]>
  • Loading branch information
5 people authored Oct 14, 2021
1 parent ecf9398 commit fcbda2d
Show file tree
Hide file tree
Showing 2,728 changed files with 252,289 additions and 1,103,131 deletions.
1 change: 1 addition & 0 deletions .circleci/ci_nozzle_manifest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ applications:
JOB_INDEX: -1
JOB_HOST: localhost
ADD_APP_INFO: AppName,OrgName,OrgGuid,SpaceName,SpaceGuid
ADD_TAGS: true
IGNORE_MISSING_APP: true
MISSING_APP_CACHE_INVALIDATE_TTL: 3600s
APP_CACHE_INVALIDATE_TTL: 86440s
Expand Down
17 changes: 7 additions & 10 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,14 @@ version: 2 # use CircleCI 2.0
jobs:
build:
docker:
- image: circleci/golang:1.12
- image: circleci/golang:1.17
working_directory: /go/src/github.com/cloudfoundry-community/splunk-firehose-nozzle
steps: # steps that comprise the `build` job
- checkout # check out source code to working directory
- run:
name: Install Dependencies
command: |
go get -u -v -t github.com/Masterminds/glide
glide install --strip-vendor
go mod vendor
- run:
name: Builder
command: make build
Expand All @@ -26,7 +25,7 @@ jobs:

deploy-nozzle:
docker:
- image: circleci/golang:1.12
- image: circleci/golang:1.17
working_directory: /go/src/github.com/cloudfoundry-community/splunk-firehose-nozzle
steps: # steps that comprise the `deploy` job
- attach_workspace:
Expand All @@ -35,8 +34,7 @@ jobs:
- run:
name: Install dependencies
command: |
go get -u -v -t github.com/Masterminds/glide
glide install --strip-vendor
go mod vendor
cp -R /tmp/splunk-firehose-nozzle .
- run:
name: Deploy nozzle
Expand All @@ -47,7 +45,7 @@ jobs:
tile-builder:
docker:
- image: circleci/golang:1.12
- image: circleci/golang:1.17
working_directory: /go/src/github.com/cloudfoundry-community/splunk-firehose-nozzle
steps:
- attach_workspace:
Expand All @@ -66,7 +64,7 @@ jobs:
execute_tests:
docker:
- image: circleci/golang:1.12
- image: circleci/golang:1.17
working_directory: /go/src/github.com/cloudfoundry-community/splunk-firehose-nozzle
steps:
- attach_workspace:
Expand All @@ -75,8 +73,7 @@ jobs:
- run:
name: Install dependencies
command: |
go get -u -v -t github.com/Masterminds/glide
glide install --strip-vendor
go mod vendor
cp -R /tmp/splunk-firehose-nozzle .
- run:
name: Deploy data-gen
Expand Down
10 changes: 7 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,17 @@ endif
default: installdeps build

installdeps:
glide install --strip-vendor
go mod vendor

updatedeps:
glide update
go get -u
go mod tidy
go mod vendor

initdeps:
glide create
go mod init
go mod tidy
go mod vendor

# -gcflags '-N -l' for debug
# -ldflags -w for prod
Expand Down
123 changes: 72 additions & 51 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,50 +57,47 @@ or later. Earlier versions should use `cloud_controller.admin` instead.
- - - -
#### Environment Parameters
You can declare parameters by making a copy of the scripts/nozzle.sh.template.
* `DEBUG`: Enable debug mode (forward to standard out instead of Splunk).
* `DEBUG`: Enable debug mode (forward to standard out instead of Splunk). (Default: false).

__Cloud Foundry configuration parameters:__
* `API_ENDPOINT`: Cloud Foundry API endpoint address.
* `CLIENT_ID`: UAA Client ID (Must have authorities and grant_types described above).
* `CLIENT_SECRET`: Secret for Client ID.
* `API_ENDPOINT`: Cloud Foundry API endpoint address. It is a required parameter.
* `CLIENT_ID`: UAA Client ID (Must have authorities and grant_types described above). It is a required parameter.
* `CLIENT_SECRET`: Secret for Client ID. It is a required parameter.

__Splunk configuration parameters:__
* `SPLUNK_TOKEN`: [Splunk HTTP event collector token](http://docs.splunk.com/Documentation/Splunk/latest/Data/UsetheHTTPEventCollector/).
* `SPLUNK_HOST`: Splunk HTTP event collector host. example: https://example.cloud.splunk.com:8088
* `SPLUNK_INDEX`: The Splunk index events will be sent to. Warning: Setting an invalid index will cause events to be lost. This index must match one of the selected indexes for the Splunk HTTP event collector token used for the SPLUNK_TOKEN parameter.
* `SPLUNK_TOKEN`: [Splunk HTTP event collector token](http://docs.splunk.com/Documentation/Splunk/latest/Data/UsetheHTTPEventCollector/). It is a required parameter.
* `SPLUNK_HOST`: Splunk HTTP event collector host. example: https://example.cloud.splunk.com:8088. It is a required parameter.
* `SPLUNK_INDEX`: The Splunk index events will be sent to. Warning: Setting an invalid index will cause events to be lost. This index must match one of the selected indexes for the Splunk HTTP event collector token used for the SPLUNK_TOKEN parameter. It is a required parameter.

__Advanced Configuration Features:__
* `JOB_NAME`: Tags nozzle log events with job name.
* `JOB_INDEX`: Tags nozzle log events with job index.
* `JOB_HOST`: Tags nozzle log events with job host.
* `JOB_NAME`: Tags nozzle log events with job name. It is optional. (Default: 'splunk-nozzle')
* `JOB_INDEX`: Tags nozzle log events with job index. (Default: -1)
* `JOB_HOST`: Tags nozzle log events with job host. (Default: "")
* `SKIP_SSL_VALIDATION_CF`: Skips SSL certificate validation for connection to Cloud Foundry. Secure communications will not check SSL certificates against a trusted certificate authority.
This is recommended for dev environments only. (Default: false)
* `SKIP_SSL_VALIDATION_SPLUNK`: Skips SSL certificate validation for connection to Splunk. Secure communications will not check SSL certificates against a trusted certificate authority. (Default: false)
This is recommended for dev environments only.
* `SKIP_SSL_VALIDATION_SPLUNK`: Skips SSL certificate validation for connection to Splunk. Secure communications will not check SSL certificates against a trusted certificate authority.
This is recommended for dev environments only.
* `FIREHOSE_SUBSCRIPTION_ID`: Tags nozzle events with a Firehose subscription id. See https://docs.pivotal.io/pivotalcf/1-11/loggregator/log-ops-guide.html.
* `FIREHOSE_KEEP_ALIVE`: Keep alive duration for the Firehose consumer.
* `ADD_APP_INFO`: Enrich raw data with app info. A comma separated list of app metadata (AppName,OrgName,OrgGuid,SpaceName,SpaceGuid).
* `IGNORE_MISSING_APP`: If the application is missing, then stop repeatedly querying application info from Cloud Foundry.
* `MISSING_APP_CACHE_INVALIDATE_TTL`: How frequently the missing app info cache invalidates (in s/m/h. For example, 3600s or 60m or 1h).
* `APP_CACHE_INVALIDATE_TTL`: How frequently the app info local cache invalidates (in s/m/h. For example, 3600s or 60m or 1h).
* `ORG_SPACE_CACHE_INVALIDATE_TTL`: How frequently the org and space cache invalidates (in s/m/h. For example, 3600s or 60m or 1h).
* `APP_LIMITS`: Restrict to APP_LIMITS the most updated apps per request when populating the app metadata cache.
* `BOLTDB_PATH`: Bolt database path.
* `EVENTS`: A comma separated list of events to include. Possible values: ValueMetric,CounterEvent,Error,LogMessage,HttpStartStop,ContainerMetric
* `EXTRA_FIELDS`: Extra fields to annotate your events with (format is key:value,key:value).
* `FLUSH_INTERVAL`: Time interval (in s/m/h. For example, 3600s or 60m or 1h) for flushing queue to Splunk regardless of CONSUMER_QUEUE_SIZE. Protects against stale events in low throughput systems.
* `CONSUMER_QUEUE_SIZE`: Sets the internal consumer queue buffer size. Events will be pushed to Splunk after queue is full.
* `HEC_BATCH_SIZE`: Set the batch size for the events to push to HEC (Splunk HTTP Event Collector).
* `HEC_RETRIES`: Retry count for sending events to Splunk. After expiring, events will begin dropping causing data loss.
* `HEC_WORKERS`: Set the amount of Splunk HEC workers to increase concurrency while ingesting in Splunk.
* `ENABLE_EVENT_TRACING`: Enables event trace logging. Splunk events will now contain a UUID, Splunk Nozzle Event Counts, and a Subscription-ID for Splunk correlation searches.
* `SPLUNK_VERSION`: The Splunk version that determines how HEC ingests metadata fields. Only required for Splunk version 6.3 or below.
* `FIREHOSE_SUBSCRIPTION_ID`: Tags nozzle events with a Firehose subscription id. See https://docs.pivotal.io/pivotalcf/1-11/loggregator/log-ops-guide.html. (Default: splunk-firehose)
* `FIREHOSE_KEEP_ALIVE`: Keep alive duration for the Firehose consumer. (Default: 25s)
* `ADD_APP_INFO`: Enrich raw data with app info. A comma separated list of app metadata (AppName,OrgName,OrgGuid,SpaceName,SpaceGuid). (Default: "")
* `ADD_TAGS`: Add additional tags from envelope to splunk event. (Default: false)
(Please note: Adding tags / Enabling this feature may slightly impact the performance due to the increased event size)
* `IGNORE_MISSING_APP`: If the application is missing, then stop repeatedly querying application info from Cloud Foundry. (Default: true)
* `MISSING_APP_CACHE_INVALIDATE_TTL`: How frequently the missing app info cache invalidates (in s/m/h. For example, 3600s or 60m or 1h). (Default: 0s)
* `APP_CACHE_INVALIDATE_TTL`: How frequently the app info local cache invalidates (in s/m/h. For example, 3600s or 60m or 1h). (Default: 0s)
* `ORG_SPACE_CACHE_INVALIDATE_TTL`: How frequently the org and space cache invalidates (in s/m/h. For example, 3600s or 60m or 1h). (Default: 72h)
* `APP_LIMITS`: Restrict to APP_LIMITS the most updated apps per request when populating the app metadata cache. Keep it at 0 to update all the apps. (Default: 0)
* `BOLTDB_PATH`: Bolt database path. (Default: cache.db)
* `EVENTS`: A comma separated list of events to include. It is a required field. Possible values: ValueMetric,CounterEvent,Error,LogMessage,HttpStartStop,ContainerMetric. If no eventtype is selected, nozzle will automatically select LogMessage to keep the nozzle running. (Default: "ValueMetric,CounterEvent,ContainerMetric")
* `EXTRA_FIELDS`: Extra fields to annotate your events with (format is key:value,key:value). (Default: "")
* `FLUSH_INTERVAL`: Time interval (in s/m/h. For example, 3600s or 60m or 1h) for flushing queue to Splunk regardless of CONSUMER_QUEUE_SIZE. Protects against stale events in low throughput systems. (Default: 5s)
* `CONSUMER_QUEUE_SIZE`: Sets the internal consumer queue buffer size. Events will be pushed to Splunk after queue is full. (Default: 10000)
* `HEC_BATCH_SIZE`: Set the batch size for the events to push to HEC (Splunk HTTP Event Collector). (Default: 100)
* `HEC_RETRIES`: Retry count for sending events to Splunk. After expiring, events will begin dropping causing data loss. (Default: 5)
* `HEC_WORKERS`: Set the amount of Splunk HEC workers to increase concurrency while ingesting in Splunk. (Default: 8)
* `ENABLE_EVENT_TRACING`: Enables event trace logging. Splunk events will now contain a UUID, Splunk Nozzle Event Counts, and a Subscription-ID for Splunk correlation searches. (Default: false)
* `STATUS_MONITOR_INTERVAL`: Time interval (in s/m/h. For example, 3600s or 60m or 1h) for monitoring memory queue pressure. Use to help with back-pressure insights. (Increases CPU load. Use for insights purposes only) Default is 0s (Disabled).
### Please note
> SPLUNK_VERSION configuration parameter is only required for Splunk version 6.3 and below.
For Splunk version 6.3 or below, please deploy nozzle via CLI. Update nozzle_manifest.yml with splunk_version (For example: SPLUNK_VERSION: 6.3) as an env variable and [deploy nozzle as an app via CLI](#push-as-an-app-to-cloud-foundry).

**[Tile](https://network.pivotal.io/products/splunk-nozzle/)** only supports deployment for Splunk version 6.4 or above
* `DROP_WARN_THRESHOLD`: Threshold for the count of dropped events in case the downstream is slow. Based on the threshold, the errors will be logged.

- - - -

Expand Down Expand Up @@ -157,26 +154,42 @@ specifying correct "--boltdb-path" flag or "BOLTDB_PATH" environment variable.
Set F2S_DISABLE_LOGGING = true as a environment variable in applications's manifest to disable logging.


### Index routing
## Index routing
Index routing is a feature that can be used to send different Cloud Foundry logs to different indexes for better ACL and data retention control in Splunk.

#### Per application index routing via application manifest
In your app manifest provide an environment variable called `SPLUNK_INDEX` and assign it the index you would like to send the app data to
### Per application index routing via application manifest
To enable per app index routing,
* Please set environment variable `SPLUNK_INDEX` in your application's manifest ([example below](#example-manifest-file))
* Make sure Splunk nozzle is configured with `ADD_APP_INFO` (Select at least one of AppName,OrgName,OrgGuid,SpaceName,SpaceGuid) to enable app info caching
* Make sure `SPLUNK_INDEX` specified in app's manifest exist in Splunk and can receive data for the configured Splunk HEC token.

> **WARNING**: If `SPLUNK_INDEX` is invalid, events from other apps may also get lost, as Splunk will drop the entire event batch if any event in the batch is invalid (i.e. has an invalid index)

There are two ways to set the variable:

In your app manifest provide an environment variable called `SPLUNK_INDEX` and assign it the index you would like to send the app data to.

#### Example Manifest file
```
applications:
- name: console
- name: <App-Name>
memory: 256M
disk_quota: 256M
host: console
timeout: 180
buildpack: https://github.com/SUSE/stratos-buildpack
health-check-type: port
...
env:
SPLUNK_INDEX: testing_index
SPLUNK_INDEX: <SPLUNK_INDEX>
...
```
You can also update the env on the fly using cf-cli command:
```
cf set-env <APP_NAME> SPLUNK_INDEX <ENV_VAR_VALUE>
```
#### Please note
> If you are updating the env on the fly, make sure that `APP_CACHE_INVALIDATE_TTL` is greater than 0s. Otherwise the cached app info will not be updated and events will not be sent to the required index.
#### Index routing via Splunk configuration
### Index routing via Splunk configuration
Logs can be routed using fields such as app ID/name, space ID/name or org ID/name.
Users can configure the Splunk configuration files props.conf and transforms.conf on Splunk indexers or Splunk Heavy Forwarders if deployed.
Expand Down Expand Up @@ -337,7 +350,6 @@ A correct setup logs a start message with configuration parameters of the Nozzle
skip-ssl: true
splunk-host: http://localhost:8088
splunk-index: atomic
splunk-version: 8.1
subscription-id: splunk-firehose
trace-logging: true
status-monitor-interval: 0s
Expand Down Expand Up @@ -366,7 +378,17 @@ As the Splunk Firehose Nozzle sends data to Splunk via HTTPS using the HTTP Even
sourcetype="cf:splunknozzle" "dropping events"
</pre>
### 4. Check for data loss inside the Splunk Firehose Nozzle:
### 4. Check for dropped events due to slow downstream(Network/Splunk):
If the nozzle emits the ‘dropped events’ warning saying that downstream is slow, then the network or Splunk environment might need to be scaled (e.g. Splunk HEC receiver node, Splunk Indexer, LB, etc.)
Run the following search to determine if Splunk has indexed any events indicating such issues.
<pre class="terminal">
sourcetype="cf:splunknozzle" "dropped Total of"
</pre>
### 5. Check for data loss inside the Splunk Firehose Nozzle:
If "Event Tracing" is enabled, extra metadata will be attached to events. This allows searches to calculate the percentage of data loss inside the Splunk Firehose Nozzle, if applicable.
Expand Down Expand Up @@ -398,10 +420,9 @@ Make sure you have the following installed on your workstation:
| Software | Version
| --- | --- |
| go | go1.12.x
| glide | 0.12.x
| go | go1.17.x
Then install all dependent packages via [Glide](https://glide.sh/):
Then make sure that all dependent packages are there:
```
$ cd <REPO_ROOT_DIRECTORY>
Expand All @@ -421,7 +442,7 @@ $ chmod +x tools/nozzle.sh
Build project:
```
$ make VERSION=1.2.2
$ make VERSION=1.2.3
```
Run tests with [Ginkgo](http://onsi.github.io/ginkgo/)
Expand Down
61 changes: 48 additions & 13 deletions cache/boltdb.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,17 @@ import (

"code.cloudfoundry.org/lager"

"github.com/boltdb/bolt"
cfclient "github.com/cloudfoundry-community/go-cfclient"
json "github.com/mailru/easyjson"
bolt "go.etcd.io/bbolt"
)

const (
APP_BUCKET = "AppBucket"
)

var (
MissingAndIgnoredErr = errors.New("App was missed and ignored")
ErrMissingAndIgnored = errors.New("App was missing from the in-memory cache and ignored")
)

type BoltdbConfig struct {
Expand Down Expand Up @@ -127,13 +127,12 @@ func (c *Boltdb) Close() error {
return c.appdb.Close()
}

// GetApp tries first get app info from cache. If caches doesn't have this
// app info (cache miss), it issues API to retrieve the app info from remote
// if the app is not already missing and clients don't ignore the missing app
// info, and then add the app info to the cache
// On the other hand, if the app is already missing and clients want to
// save remote API and ignore missing app, then a nil app info and an error
// will be returned.
// GetApp tries to retrieve the app info from in-memory cache. If it finds the app then it returns.
// If the app is added to missing app cache then it will return ErrMissingAndIgnored.
// If the app is not found in in-memory cache and missing app cache, it'll make an API request
// to retrieve the app info from remote. If found, the app will be added to the cache and returns.
// If not found on remote, it'll try to retrieve from the boltdb database. If found, returns.
// If not found and the IgnoreMissingApps config is enabled, the app will be added to missingApps cache.
func (c *Boltdb) GetApp(appGuid string) (*App, error) {
app, err := c.getAppFromCache(appGuid)
if err != nil {
Expand All @@ -146,9 +145,26 @@ func (c *Boltdb) GetApp(appGuid string) (*App, error) {
return app, nil
}

// First time seeing app
// App was not found in in-memory cache. Try to retrieve from remote and boltdb database.
app, err = c.getAppFromRemote(appGuid)

if app == nil {
// Not able to find the app from remote. App may be deleted.
// Check if the app is available in boltdb cache
dbApp, _ := c.getAppFromDatabase(appGuid)
if dbApp != nil {
c.config.Logger.Debug(fmt.Sprint("Using old app info for cf_app_id ", appGuid))
c.lock.Lock()
c.cache[appGuid] = dbApp
c.lock.Unlock()
c.fillOrgAndSpace(dbApp)
return dbApp, nil
}
}

if err != nil {
// App is not available from in-memory cache, boltdb database or remote
// Adding to missing app cache
if c.config.IgnoreMissingApps {
// Record this missing app
c.lock.Lock()
Expand Down Expand Up @@ -207,7 +223,7 @@ func (c *Boltdb) getAppFromCache(appGuid string) (*App, error) {
if c.config.IgnoreMissingApps && alreadyMissed {
// already missed
c.lock.RUnlock()
return nil, MissingAndIgnoredErr
return nil, ErrMissingAndIgnored
}
c.lock.RUnlock()

Expand Down Expand Up @@ -239,6 +255,25 @@ func (c *Boltdb) getAllAppsFromBoltDB() (map[string]*App, error) {
return apps, nil
}

// getAppFromDatabase looks up a single app by GUID in the boltdb-backed
// persistent cache. It returns (nil, nil) on a cache miss (app not stored,
// or the bucket has not been created yet) and a non-nil error only when the
// read transaction fails or the stored payload cannot be unmarshaled.
func (c *Boltdb) getAppFromDatabase(appGuid string) (*App, error) {
	var appData []byte
	err := c.appdb.View(func(tx *bolt.Tx) error {
		b := tx.Bucket([]byte(APP_BUCKET))
		if b == nil {
			// Bucket not created yet; treat as a cache miss rather than panic on b.Get.
			return nil
		}
		if v := b.Get([]byte(appGuid)); v != nil {
			// Copy the value: bbolt byte slices are only valid for the
			// life of the transaction, but we unmarshal after View returns.
			appData = make([]byte, len(v))
			copy(appData, v)
		}
		return nil
	})
	if err != nil {
		// Previously this error was silently dropped; surface it to the caller.
		return nil, err
	}
	if appData == nil {
		return nil, nil
	}
	var app App
	if err := json.Unmarshal(appData, &app); err != nil {
		return nil, err
	}
	return &app, nil
}

func (c *Boltdb) getAllAppsFromRemote() (map[string]*App, error) {
c.config.Logger.Info("Retrieving apps from remote")

Expand Down Expand Up @@ -341,12 +376,12 @@ func (c *Boltdb) fillDatabase(apps map[string]*App) {
c.appdb.Update(func(tx *bolt.Tx) error {
serialize, err := json.Marshal(app)
if err != nil {
return fmt.Errorf("Error Marshaling data: %s", err)
return fmt.Errorf("error Marshaling data: %s", err)
}

b := tx.Bucket([]byte(APP_BUCKET))
if err := b.Put([]byte(app.Guid), serialize); err != nil {
return fmt.Errorf("Error inserting data: %s", err)
return fmt.Errorf("error inserting data: %s", err)
}
return nil
})
Expand Down
Loading

0 comments on commit fcbda2d

Please sign in to comment.