From 5cbfaef745bc0ec032abf95d52b692fc53d64a97 Mon Sep 17 00:00:00 2001 From: Thomas Jackson Date: Fri, 23 Feb 2018 18:29:19 -0800 Subject: [PATCH 01/10] Add replsetConf metrics We already pull the replsetStatus metrics, but things such as priority, hidden, and votes are in the conf. This exposes them. For now the tags mimic what is in mongo (so they match the docs) -- sadly this does mean that the tags don't match what the other replset metrics have (set -> id, etc.) --- collector/mongodb_collector.go | 13 +++ collector/replset_conf.go | 144 +++++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 collector/replset_conf.go diff --git a/collector/mongodb_collector.go b/collector/mongodb_collector.go index a16e8635..a01837cc 100644 --- a/collector/mongodb_collector.go +++ b/collector/mongodb_collector.go @@ -68,6 +68,7 @@ func NewMongodbCollector(opts MongodbCollectorOpts) *MongodbCollector { func (exporter *MongodbCollector) Describe(ch chan<- *prometheus.Desc) { (&ServerStatus{}).Describe(ch) (&ReplSetStatus{}).Describe(ch) + (&ReplSetConf{}).Describe(ch) (&DatabaseStatus{}).Describe(ch) if exporter.Opts.CollectTopMetrics { @@ -88,6 +89,7 @@ func (exporter *MongodbCollector) Collect(ch chan<- prometheus.Metric) { if exporter.Opts.CollectReplSet { glog.Info("Collecting ReplSet Status") exporter.collectReplSetStatus(mongoSess, ch) + exporter.collectReplSetConf(mongoSess, ch) } if exporter.Opts.CollectOplog { glog.Info("Collecting Oplog Status") @@ -145,6 +147,17 @@ func (exporter *MongodbCollector) collectReplSetStatus(session *mgo.Session, ch return replSetStatus } +func (exporter *MongodbCollector) collectReplSetConf(session *mgo.Session, ch chan<- prometheus.Metric) *ReplSetConf { + replSetConf := GetReplSetConf(session) + + if replSetConf != nil { + glog.Info("exporting ReplSetConf Metrics") + replSetConf.Export(ch) + } + + return replSetConf +} + func (exporter *MongodbCollector) collectOplogStatus(session *mgo.Session, ch chan<- 
prometheus.Metric) *OplogStatus { oplogStatus := GetOplogStatus(session) diff --git a/collector/replset_conf.go b/collector/replset_conf.go new file mode 100644 index 00000000..e451240d --- /dev/null +++ b/collector/replset_conf.go @@ -0,0 +1,144 @@ +package collector + +import ( + "gopkg.in/mgo.v2" + "gopkg.in/mgo.v2/bson" + + "github.com/golang/glog" + "github.com/prometheus/client_golang/prometheus" +) + +var ( + memberHidden = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: subsystem, + Name: "member_hidden", + Help: "This field conveys if the member is hidden (1) or not-hidden (0).", + }, []string{"id", "host"}) + memberArbiter = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: subsystem, + Name: "member_arbiter", + Help: "This field conveys if the member is an arbiter (1) or not (0).", + }, []string{"id", "host"}) + memberBuildIndexes = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: subsystem, + Name: "member_build_indexes", + Help: "This field conveys if the member is builds indexes (1) or not (0).", + }, []string{"id", "host"}) + memberPriority = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: subsystem, + Name: "member_priority", + Help: "This field conveys the priority of a given member", + }, []string{"id", "host"}) + memberVotes = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: subsystem, + Name: "member_votes", + Help: "This field conveys the number of votes of a given member", + }, []string{"id", "host"}) +) + +// Although the docs say that it returns a map with id etc. 
it *actually* returns +// that wrapped in a map +type OuterReplSetConf struct { + Config ReplSetConf `bson:"config"` +} + +// ReplSetConf keeps the data returned by the GetReplSetConf method +type ReplSetConf struct { + Id string `bson:"_id"` + Version int `bson:"version"` + Members []MemberConf `bson:"members"` +} + +/* +Example: +"settings" : { + "chainingAllowed" : true, + "heartbeatIntervalMillis" : 2000, + "heartbeatTimeoutSecs" : 10, + "electionTimeoutMillis" : 5000, + "getLastErrorModes" : { + + }, + "getLastErrorDefaults" : { + "w" : 1, + "wtimeout" : 0 + } +} +*/ +type ReplSetConfSettings struct { +} + +// MemberConf represents an array element of ReplSetConf.Members +type MemberConf struct { + Id int32 `bson:"_id"` + Host string `bson:"host"` + ArbiterOnly bool `bson:"arbiterOnly"` + BuildIndexes bool `bson:"buildIndexes"` + Hidden bool `bson:"hidden"` + Priority int32 `bson:"priority"` + + Tags map[string]string `bson:"tags"` + SlaveDelay float64 `bson:"saveDelay"` + Votes int32 `bson:"votes"` +} + +// Export exports the replSetGetConfig member settings to be consumed by prometheus +func (replConf *ReplSetConf) Export(ch chan<- prometheus.Metric) { + for _, member := range replConf.Members { + ls := prometheus.Labels{ + "id": replConf.Id, + "host": member.Host, + } + if member.Hidden { + memberHidden.With(ls).Set(1) + } else { + memberHidden.With(ls).Set(0) + } + + if member.ArbiterOnly { + memberArbiter.With(ls).Set(1) + } else { + memberArbiter.With(ls).Set(0) + } + + if member.BuildIndexes { + memberBuildIndexes.With(ls).Set(1) + } else { + memberBuildIndexes.With(ls).Set(0) + } + + memberPriority.With(ls).Set(float64(member.Priority)) + memberVotes.With(ls).Set(float64(member.Votes)) + } + // collect metrics + memberHidden.Collect(ch) + memberArbiter.Collect(ch) + memberBuildIndexes.Collect(ch) + memberPriority.Collect(ch) + memberVotes.Collect(ch) +} + +// Describe describes the replSetGetConfig metrics for prometheus +func (replConf *ReplSetConf) Describe(ch 
chan<- *prometheus.Desc) { + memberHidden.Describe(ch) + memberArbiter.Describe(ch) + memberBuildIndexes.Describe(ch) + memberPriority.Describe(ch) + memberVotes.Describe(ch) +} + +// GetReplSetConf returns the replica status info +func GetReplSetConf(session *mgo.Session) *ReplSetConf { + result := &OuterReplSetConf{} + err := session.DB("admin").Run(bson.D{{"replSetGetConfig", 1}}, result) + if err != nil { + glog.Error("Failed to get replSet config.") + return nil + } + return &result.Config +} From a77d216168c00632205c14e431c3bf3046a2d08c Mon Sep 17 00:00:00 2001 From: Thomas Jackson Date: Tue, 6 Mar 2018 12:46:52 -0800 Subject: [PATCH 02/10] Add socket timeout CLI flag --- collector/mongodb_collector.go | 4 ++++ mongodb_exporter.go | 2 ++ shared/connection.go | 3 ++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/collector/mongodb_collector.go b/collector/mongodb_collector.go index a01837cc..789d04af 100644 --- a/collector/mongodb_collector.go +++ b/collector/mongodb_collector.go @@ -1,6 +1,8 @@ package collector import ( + "time" + "github.com/dcu/mongodb_exporter/shared" "github.com/golang/glog" "github.com/prometheus/client_golang/prometheus" @@ -36,6 +38,7 @@ type MongodbCollectorOpts struct { CollectConnPoolStats bool UserName string AuthMechanism string + SocketTimeout time.Duration } func (in MongodbCollectorOpts) toSessionOps() shared.MongoSessionOpts { @@ -47,6 +50,7 @@ func (in MongodbCollectorOpts) toSessionOps() shared.MongoSessionOpts { TLSHostnameValidation: in.TLSHostnameValidation, UserName: in.UserName, AuthMechanism: in.AuthMechanism, + SocketTimeout: in.SocketTimeout, } } diff --git a/mongodb_exporter.go b/mongodb_exporter.go index b69fa7be..24f4a7fc 100644 --- a/mongodb_exporter.go +++ b/mongodb_exporter.go @@ -56,6 +56,7 @@ var ( mongodbCollectCollectionMetrics = flag.Bool("mongodb.collect.collection", false, "Collect MongoDB collection metrics") mongodbCollectProfileMetrics = flag.Bool("mongodb.collect.profile", false, 
"Collect MongoDB profile metrics") mongodbCollectConnPoolStats = flag.Bool("mongodb.collect.connpoolstats", false, "Collect MongoDB connpoolstats") + mongodbSocketTimeout = flag.Duration("mongodb.socket-timeout", 0, "timeout for socket operations to mongodb") version = flag.Bool("version", false, "Print mongodb_exporter version") ) @@ -160,6 +161,7 @@ func registerCollector() { CollectConnPoolStats: *mongodbCollectConnPoolStats, UserName: *mongodbUserName, AuthMechanism: *mongodbAuthMechanism, + SocketTimeout: *mongodbSocketTimeout, }) prometheus.MustRegister(mongodbCollector) } diff --git a/shared/connection.go b/shared/connection.go index 6dbe71c3..c51fbd0c 100644 --- a/shared/connection.go +++ b/shared/connection.go @@ -26,6 +26,7 @@ type MongoSessionOpts struct { TLSHostnameValidation bool UserName string AuthMechanism string + SocketTimeout time.Duration } // MongoSession creates a Mongo session @@ -55,7 +56,7 @@ func MongoSession(opts MongoSessionOpts) *mgo.Session { } session.SetMode(mgo.Eventual, true) session.SetSyncTimeout(syncMongodbTimeout) - session.SetSocketTimeout(0) + session.SetSocketTimeout(opts.SocketTimeout) return session } From 2838dbd1e1b2646429725cb27c1c19d30c1b1276 Mon Sep 17 00:00:00 2001 From: Thomas Jackson Date: Thu, 15 Mar 2018 15:20:01 -0700 Subject: [PATCH 03/10] Add option to tail oplog and get per ns/api counts Mongo's internal metrics are only API level (insert/update/delete). This adds the ability to split those out by ns as well. 
--- collector/mongodb_collector.go | 17 ++++++++ collector/oplog_tail.go | 71 ++++++++++++++++++++++++++++++++++ glide.lock | 31 +++++++++++---- glide.yaml | 2 + mongodb_exporter.go | 2 + 5 files changed, 115 insertions(+), 8 deletions(-) create mode 100644 collector/oplog_tail.go diff --git a/collector/mongodb_collector.go b/collector/mongodb_collector.go index 789d04af..97da59cc 100644 --- a/collector/mongodb_collector.go +++ b/collector/mongodb_collector.go @@ -31,6 +31,7 @@ type MongodbCollectorOpts struct { TLSHostnameValidation bool CollectReplSet bool CollectOplog bool + TailOplog bool CollectTopMetrics bool CollectDatabaseMetrics bool CollectCollectionMetrics bool @@ -100,6 +101,11 @@ func (exporter *MongodbCollector) Collect(ch chan<- prometheus.Metric) { exporter.collectOplogStatus(mongoSess, ch) } + if exporter.Opts.TailOplog { + glog.Info("Collecting Oplog Tail Stats") + exporter.collectOplogTailStats(mongoSess, ch) + } + if exporter.Opts.CollectTopMetrics { glog.Info("Collecting Top Metrics") exporter.collectTopStatus(mongoSess, ch) @@ -173,6 +179,17 @@ func (exporter *MongodbCollector) collectOplogStatus(session *mgo.Session, ch ch return oplogStatus } +func (exporter *MongodbCollector) collectOplogTailStats(session *mgo.Session, ch chan<- prometheus.Metric) *OplogTailStats { + oplogTailStats := GetOplogTailStats(session) + + if oplogTailStats != nil { + glog.Info("exporting oplogTailStats Metrics") + oplogTailStats.Export(ch) + } + + return oplogTailStats +} + func (exporter *MongodbCollector) collectTopStatus(session *mgo.Session, ch chan<- prometheus.Metric) *TopStatus { topStatus := GetTopStatus(session) if topStatus != nil { diff --git a/collector/oplog_tail.go b/collector/oplog_tail.go new file mode 100644 index 00000000..20de5170 --- /dev/null +++ b/collector/oplog_tail.go @@ -0,0 +1,71 @@ +package collector + +import ( + "fmt" + + "github.com/prometheus/client_golang/prometheus" + "github.com/rwynn/gtm" + "gopkg.in/mgo.v2" +) + +var ( + 
oplogEntryCount = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: Namespace, + Subsystem: "oplogtail", + Name: "entry_count", + Help: "The total number of entries observed in the oplog by ns/op", + }, []string{"ns", "op"}) + oplogTailError = prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: Namespace, + Subsystem: "oplogtail", + Name: "tail_error", + Help: "The total number of errors while tailing the oplog", + }) +) + +var tailer *OplogTailStats + +type OplogTailStats struct{} + +func (o *OplogTailStats) Start(session *mgo.Session) { + defer session.Close() + session.SetMode(mgo.Monotonic, true) + + ctx := gtm.Start(session, nil) + + // ctx.OpC is a channel to read ops from + // ctx.ErrC is a channel to read errors from + // ctx.Stop() stops all go routines started by gtm.Start + for { + // loop forever receiving events + select { + case err := <-ctx.ErrC: + // handle errors + fmt.Println(err) + case op := <-ctx.OpC: + oplogEntryCount.WithLabelValues(op.Namespace, op.Operation).Add(1) + } + } +} + +// Export exports metrics to Prometheus +func (status *OplogTailStats) Export(ch chan<- prometheus.Metric) { + oplogEntryCount.Collect(ch) + oplogTailError.Collect(ch) +} + +// Describe describes metrics collected +func (status *OplogTailStats) Describe(ch chan<- *prometheus.Desc) { + oplogEntryCount.Describe(ch) + oplogTailError.Describe(ch) +} + +func GetOplogTailStats(session *mgo.Session) *OplogTailStats { + if tailer == nil { + tailer = &OplogTailStats{} + // Start a tailer with a copy of the session (to avoid messing with the other metrics in the session) + tailer.Start(session.Copy()) + } + + return tailer +} diff --git a/glide.lock b/glide.lock index f7fe5f54..786ae441 100644 --- a/glide.lock +++ b/glide.lock @@ -1,41 +1,56 @@ -hash: 7dc238d7118210ebe55b508ed1d11af8c39132025807b9cddc2ec2489d9926f6 -updated: 2016-11-23T09:05:11.556107146-08:00 +hash: 039dc285fae0ee4b97a236ec9935fff74f31b66c011fef0fdf64ab746d07a02a +updated: 
2018-03-15T15:08:18.90732609-07:00 imports: - name: github.com/beorn7/perks version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9 subpackages: - quantile +- name: github.com/globalsign/mgo + version: baa28fcb8e7d5dfab92026c0920cb6c9ae72faa2 + subpackages: + - bson + - internal/json + - internal/sasl + - internal/scram - name: github.com/golang/glog version: 23def4e6c14b4da8ac2ed8007337bc5eb5007998 - name: github.com/golang/protobuf - version: 8ee79997227bf9b34611aee7946ae64735e6fd93 + version: 17ce1425424ab154092bbb43af630bd647f3bb0d subpackages: - proto - name: github.com/matttproud/golang_protobuf_extensions version: c12348ce28de40eed0136aa2b644d0ee0650e56c subpackages: - pbutil +- name: github.com/pkg/errors + version: 645ef00459ed84a119197bfb8d8205042c6df63d - name: github.com/prometheus/client_golang version: c5b7fccd204277076155f10851dad72b76a49317 subpackages: - prometheus - name: github.com/prometheus/client_model - version: fa8ad6fec33561be4280a8f0514318c79d7f6cb6 + version: 6f3806018612930941127f2a7c6c453ba2c527d2 subpackages: - go - name: github.com/prometheus/common - version: 0d5de9d6d8629cb8bee6d4674da4127cd8b615a3 + version: e3fb1a1acd7605367a2b378bc2e2f893c05174b7 subpackages: - expfmt - - model - internal/bitbucket.org/ww/goautoneg + - model - name: github.com/prometheus/procfs - version: abf152e5f3e97f2fafac028d2cc06c1feb87ffa5 + version: a6e9df898b1336106c743392c48ee0b71f5c4efa + subpackages: + - xfs +- name: github.com/rwynn/gtm + version: c5642730dfa1ae9ceaebf33baa28d19a24bb0714 +- name: github.com/serialx/hashring + version: 6a9381c5a83e926b9f1fd907395a581e69747e96 - name: gopkg.in/mgo.v2 version: 3f83fa5005286a7fe593b055f0d7771a7dce4655 subpackages: - bson + - internal/json - internal/sasl - internal/scram - - internal/json testImports: [] diff --git a/glide.yaml b/glide.yaml index 17134852..59ae1d54 100644 --- a/glide.yaml +++ b/glide.yaml @@ -8,3 +8,5 @@ import: - package: gopkg.in/mgo.v2 subpackages: - bson +- package: github.com/rwynn/gtm + 
version: c5642730dfa1ae9ceaebf33baa28d19a24bb0714 diff --git a/mongodb_exporter.go b/mongodb_exporter.go index 24f4a7fc..9d0607fa 100644 --- a/mongodb_exporter.go +++ b/mongodb_exporter.go @@ -50,6 +50,7 @@ var ( mongodbUserName = flag.String("mongodb.username", "", "Username to connect to Mongodb") mongodbAuthMechanism = flag.String("mongodb.mechanism", "", "auth mechanism to connect to Mongodb (ie: MONGODB-X509)") mongodbCollectOplog = flag.Bool("mongodb.collect.oplog", true, "collect Mongodb Oplog status") + mongodbCollectOplogTail = flag.Bool("mongodb.collect.oplog_tail", false, "tail Mongodb Oplog to get stats") mongodbCollectReplSet = flag.Bool("mongodb.collect.replset", true, "collect Mongodb replica set status") mongodbCollectTopMetrics = flag.Bool("mongodb.collect.top", false, "collect Mongodb Top metrics") mongodbCollectDatabaseMetrics = flag.Bool("mongodb.collect.database", false, "collect MongoDB database metrics") @@ -153,6 +154,7 @@ func registerCollector() { TLSCaFile: *mongodbTLSCa, TLSHostnameValidation: !(*mongodbTLSDisableHostnameValidation), CollectOplog: *mongodbCollectOplog, + TailOplog: *mongodbCollectOplogTail, CollectReplSet: *mongodbCollectReplSet, CollectTopMetrics: *mongodbCollectTopMetrics, CollectDatabaseMetrics: *mongodbCollectDatabaseMetrics, From 2a7f2e111e7646fcf2ca00fe1071a595bb3120f3 Mon Sep 17 00:00:00 2001 From: Thomas Jackson Date: Thu, 15 Mar 2018 17:02:37 -0700 Subject: [PATCH 04/10] Update glide dep to handle errors Before gtm would stop collecting if it hit any errors on iterators (as it would corrupt the session). This update moves to a forked version which solves that issue. 
--- collector/oplog_tail.go | 7 +++---- glide.lock | 16 ++++------------ glide.yaml | 3 ++- 3 files changed, 9 insertions(+), 17 deletions(-) diff --git a/collector/oplog_tail.go b/collector/oplog_tail.go index 20de5170..abc01ee9 100644 --- a/collector/oplog_tail.go +++ b/collector/oplog_tail.go @@ -1,8 +1,7 @@ package collector import ( - "fmt" - + "github.com/golang/glog" "github.com/prometheus/client_golang/prometheus" "github.com/rwynn/gtm" "gopkg.in/mgo.v2" @@ -32,6 +31,7 @@ func (o *OplogTailStats) Start(session *mgo.Session) { session.SetMode(mgo.Monotonic, true) ctx := gtm.Start(session, nil) + defer ctx.Stop() // ctx.OpC is a channel to read ops from // ctx.ErrC is a channel to read errors from @@ -40,8 +40,7 @@ func (o *OplogTailStats) Start(session *mgo.Session) { // loop forever receiving events select { case err := <-ctx.ErrC: - // handle errors - fmt.Println(err) + glog.Errorf("Error getting entry from oplog: %v", err) case op := <-ctx.OpC: oplogEntryCount.WithLabelValues(op.Namespace, op.Operation).Add(1) } diff --git a/glide.lock b/glide.lock index 786ae441..2699fcb2 100644 --- a/glide.lock +++ b/glide.lock @@ -1,17 +1,10 @@ -hash: 039dc285fae0ee4b97a236ec9935fff74f31b66c011fef0fdf64ab746d07a02a -updated: 2018-03-15T15:08:18.90732609-07:00 +hash: 2daa265ce1037e5ac801ef074714e212b1ff69f2fea2e5c69a6c05ece9326f31 +updated: 2018-03-15T17:01:48.086832728-07:00 imports: - name: github.com/beorn7/perks version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9 subpackages: - quantile -- name: github.com/globalsign/mgo - version: baa28fcb8e7d5dfab92026c0920cb6c9ae72faa2 - subpackages: - - bson - - internal/json - - internal/sasl - - internal/scram - name: github.com/golang/glog version: 23def4e6c14b4da8ac2ed8007337bc5eb5007998 - name: github.com/golang/protobuf @@ -22,8 +15,6 @@ imports: version: c12348ce28de40eed0136aa2b644d0ee0650e56c subpackages: - pbutil -- name: github.com/pkg/errors - version: 645ef00459ed84a119197bfb8d8205042c6df63d - name: 
github.com/prometheus/client_golang version: c5b7fccd204277076155f10851dad72b76a49317 subpackages: @@ -43,7 +34,8 @@ imports: subpackages: - xfs - name: github.com/rwynn/gtm - version: c5642730dfa1ae9ceaebf33baa28d19a24bb0714 + version: 495abc277593067479c3c528864191804ab04cf7 + repo: https://github.com/jacksontj/gtm.git - name: github.com/serialx/hashring version: 6a9381c5a83e926b9f1fd907395a581e69747e96 - name: gopkg.in/mgo.v2 diff --git a/glide.yaml b/glide.yaml index 59ae1d54..63ca8349 100644 --- a/glide.yaml +++ b/glide.yaml @@ -9,4 +9,5 @@ import: subpackages: - bson - package: github.com/rwynn/gtm - version: c5642730dfa1ae9ceaebf33baa28d19a24bb0714 + version: 495abc277593067479c3c528864191804ab04cf7 + repo: https://github.com/jacksontj/gtm.git From 628c5aa0ad64067f23246a80ad9d9b323b029ee2 Mon Sep 17 00:00:00 2001 From: Thomas Jackson Date: Fri, 16 Mar 2018 08:51:40 -0700 Subject: [PATCH 05/10] background the tailer Instead of making the first GetOplogTailStats block indefinitely we can simply background it and move on. 
--- collector/oplog_tail.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collector/oplog_tail.go b/collector/oplog_tail.go index abc01ee9..77c13a36 100644 --- a/collector/oplog_tail.go +++ b/collector/oplog_tail.go @@ -63,7 +63,7 @@ func GetOplogTailStats(session *mgo.Session) *OplogTailStats { if tailer == nil { tailer = &OplogTailStats{} // Start a tailer with a copy of the session (to avoid messing with the other metrics in the session) - tailer.Start(session.Copy()) + go tailer.Start(session.Copy()) } return tailer From ecbbf6a6bfc79611e04c120e23d87cee5a518037 Mon Sep 17 00:00:00 2001 From: Thomas Jackson Date: Fri, 16 Mar 2018 08:52:16 -0700 Subject: [PATCH 06/10] Actually use the oplogTailError --- collector/oplog_tail.go | 1 + 1 file changed, 1 insertion(+) diff --git a/collector/oplog_tail.go b/collector/oplog_tail.go index 77c13a36..bf842283 100644 --- a/collector/oplog_tail.go +++ b/collector/oplog_tail.go @@ -40,6 +40,7 @@ func (o *OplogTailStats) Start(session *mgo.Session) { // loop forever receiving events select { case err := <-ctx.ErrC: + oplogTailError.Add(1) glog.Errorf("Error getting entry from oplog: %v", err) case op := <-ctx.OpC: oplogEntryCount.WithLabelValues(op.Namespace, op.Operation).Add(1) From b668087c0638e356a59b2c7717834a1db175c563 Mon Sep 17 00:00:00 2001 From: Thomas Jackson Date: Mon, 19 Mar 2018 09:29:01 -0700 Subject: [PATCH 07/10] Make collection level metric logging go under verbose --- collector/collection_status.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collector/collection_status.go b/collector/collection_status.go index 030690d0..e63c360b 100644 --- a/collector/collection_status.go +++ b/collector/collection_status.go @@ -101,7 +101,7 @@ func CollectCollectionStatus(session *mgo.Session, db string, ch chan<- promethe for _, collection_name := range collection_names { collStats := GetCollectionStatus(session, db, collection_name) if collStats != nil { - glog.Infof("exporting 
Database Metrics for db=%q, table=%q", db, collection_name) + glog.V(1).Infof("exporting Database Metrics for db=%q, table=%q", db, collection_name) collStats.Export(ch) } } From 4ee1cca5c25687c0598cd698316218730b68795c Mon Sep 17 00:00:00 2001 From: Thomas Jackson Date: Mon, 19 Mar 2018 09:42:48 -0700 Subject: [PATCH 08/10] Set timeout longer for oplog tail --- collector/oplog_tail.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/collector/oplog_tail.go b/collector/oplog_tail.go index bf842283..c739824d 100644 --- a/collector/oplog_tail.go +++ b/collector/oplog_tail.go @@ -1,6 +1,8 @@ package collector import ( + "time" + "github.com/golang/glog" "github.com/prometheus/client_golang/prometheus" "github.com/rwynn/gtm" @@ -27,6 +29,11 @@ var tailer *OplogTailStats type OplogTailStats struct{} func (o *OplogTailStats) Start(session *mgo.Session) { + // Override the socket timeout for oplog tailing + // Here we want a long-running socket, otherwise we cause lots of locks + // which seriously impede oplog performance + session.SetSocketTimeout(time.Second * 120) + defer session.Close() session.SetMode(mgo.Monotonic, true) From b81e77d0f515dd19fcb80c6bb6eec17f6dd4a087 Mon Sep 17 00:00:00 2001 From: Thomas Jackson Date: Mon, 19 Mar 2018 11:34:39 -0700 Subject: [PATCH 09/10] Set timeout on DB-side cursor as well --- collector/oplog_tail.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/collector/oplog_tail.go b/collector/oplog_tail.go index c739824d..35f57bbc 100644 --- a/collector/oplog_tail.go +++ b/collector/oplog_tail.go @@ -7,6 +7,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/rwynn/gtm" "gopkg.in/mgo.v2" + "gopkg.in/mgo.v2/bson" ) var ( @@ -32,7 +33,11 @@ func (o *OplogTailStats) Start(session *mgo.Session) { // Override the socket timeout for oplog tailing // Here we want a long-running socket, otherwise we cause lots of locks // which seriously impede oplog performance - 
session.SetSocketTimeout(time.Second * 120) + timeout := time.Second * 120 + session.SetSocketTimeout(timeout) + // Set cursor timeout + var tmp map[string]interface{} + session.Run(bson.D{{"setParameter", 1}, {"cursorTimeoutMillis", timeout / time.Millisecond}}, &tmp) defer session.Close() session.SetMode(mgo.Monotonic, true) From 733087c46ee214601175ebf5f781d589cc7390d9 Mon Sep 17 00:00:00 2001 From: Thomas Jackson Date: Fri, 6 Apr 2018 09:47:18 -0700 Subject: [PATCH 10/10] Add tcmalloc metrics --- collector/server_status.go | 7 +++ collector/tcmalloc.go | 121 +++++++++++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+) create mode 100644 collector/tcmalloc.go diff --git a/collector/server_status.go b/collector/server_status.go index a5c0eed4..bc8ee5ec 100644 --- a/collector/server_status.go +++ b/collector/server_status.go @@ -56,6 +56,7 @@ type ServerStatus struct { Opcounters *OpcountersStats `bson:"opcounters"` OpcountersRepl *OpcountersReplStats `bson:"opcountersRepl"` + TCMallocStats *TCMallocStats `bson:"tcmalloc"` Mem *MemStats `bson:"mem"` Metrics *MetricsStats `bson:"metrics"` @@ -104,6 +105,9 @@ func (status *ServerStatus) Export(ch chan<- prometheus.Metric) { if status.OpcountersRepl != nil { status.OpcountersRepl.Export(ch) } + if status.TCMallocStats != nil { + status.TCMallocStats.Export(ch) + } if status.Mem != nil { status.Mem.Export(ch) } @@ -168,6 +172,9 @@ func (status *ServerStatus) Describe(ch chan<- *prometheus.Desc) { if status.OpcountersRepl != nil { status.OpcountersRepl.Describe(ch) } + if status.TCMallocStats != nil { + status.TCMallocStats.Describe(ch) + } if status.Mem != nil { status.Mem.Describe(ch) } diff --git a/collector/tcmalloc.go b/collector/tcmalloc.go new file mode 100644 index 00000000..dba74aaa --- /dev/null +++ b/collector/tcmalloc.go @@ -0,0 +1,121 @@ +package collector + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +var ( + tcmallocGeneral = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 
+ Namespace: Namespace, + Name: "tcmalloc_generic_heap", + Help: "High-level summary metricsInternal metrics from tcmalloc", + }, []string{"type"}) + tcmallocPageheapBytes = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Name: "tcmalloc_pageheap_bytes", + Help: "Sizes for tcpmalloc pageheaps", + }, []string{"type"}) + tcmallocPageheapCounts = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Name: "tcmalloc_pageheap_count", + Help: "Sizes for tcpmalloc pageheaps", + }, []string{"type"}) + + tcmallocCacheBytes = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: Namespace, + Name: "tcmalloc_cache_bytes", + Help: "Sizes for tcpmalloc caches in bytes", + }, []string{"cache", "type"}) + + tcmallocAggressiveDecommit = prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: Namespace, + Name: "tcmalloc_aggressive_memory_decommit", + Help: "Whether aggressive_memory_decommit is on", + }) + + tcmallocFreeBytes = prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: Namespace, + Name: "tcmalloc_free_bytes", + Help: "Total free bytes of tcmalloc", + }) +) + +// TCMallocStats tracks the mem stats metrics. 
+type TCMallocStats struct { + Generic GenericTCMAllocStats `bson:"generic"` + Details DetailedTCMallocStats `bson:"tcmalloc"` +} + +type GenericTCMAllocStats struct { + CurrentAllocatedBytes float64 `bson:"current_allocated_bytes"` + HeapSize float64 `bson:"heap_size"` +} + +type DetailedTCMallocStats struct { + PageheapFreeBytes float64 `bson:"pageheap_free_bytes"` + PageheapUnmappedBytes float64 `bson:"pageheap_unmapped_bytes"` + PageheapComittedBytes float64 `bson:"pageheap_committed_bytes"` + PageheapScavengeCount float64 `bson:"pageheap_scavenge_count"` + PageheapCommitCount float64 `bson:"pageheap_commit_count"` + PageheapTotalCommitBytes float64 `bson:"pageheap_total_commit_bytes"` + PageheapDecommitCount float64 `bson:"pageheap_decommit_count"` + PageheapTotalDecommitBytes float64 `bson:"pageheap_total_decommit_bytes"` + PageheapReserveCount float64 `bson:"pageheap_reserve_count"` + PageheapTotalReserveBytes float64 `bson:"pageheap_total_reserve_bytes"` + + MaxTotalThreadCacheBytes float64 `bson:"max_total_thread_cache_bytes"` + CurrentTotalThreadCacheBytes float64 `bson:"current_total_thread_cache_bytes"` + CentralCacheFreeBytes float64 `bson:"central_cache_free_bytes"` + TransferCacheFreeBytes float64 `bson:"transfer_cache_free_bytes"` + ThreadCacheFreeBytes float64 `bson:"thread_cache_free_bytes"` + + TotalFreeBytes float64 `bson:"total_free_bytes"` + AggressiveMemoryDecommit float64 `bson:"aggressive_memory_decommit"` +} + +// Export exports the data to prometheus. 
+func (m *TCMallocStats) Export(ch chan<- prometheus.Metric) { + // Generic metrics + tcmallocGeneral.WithLabelValues("allocated").Set(m.Generic.CurrentAllocatedBytes) + tcmallocGeneral.WithLabelValues("total").Set(m.Generic.HeapSize) + tcmallocGeneral.Collect(ch) + + // Pageheap + tcmallocPageheapBytes.WithLabelValues("free").Set(m.Details.PageheapFreeBytes) + tcmallocPageheapBytes.WithLabelValues("unmapped").Set(m.Details.PageheapUnmappedBytes) + tcmallocPageheapBytes.WithLabelValues("comitted").Set(m.Details.PageheapComittedBytes) + tcmallocPageheapBytes.WithLabelValues("total_commit").Set(m.Details.PageheapTotalCommitBytes) + tcmallocPageheapBytes.WithLabelValues("total_decommit").Set(m.Details.PageheapTotalDecommitBytes) + tcmallocPageheapBytes.WithLabelValues("total_reserve").Set(m.Details.PageheapTotalReserveBytes) + tcmallocPageheapBytes.Collect(ch) + + tcmallocPageheapCounts.WithLabelValues("scavenge").Set(m.Details.PageheapScavengeCount) + tcmallocPageheapCounts.WithLabelValues("commit").Set(m.Details.PageheapCommitCount) + tcmallocPageheapCounts.WithLabelValues("decommit").Set(m.Details.PageheapDecommitCount) + tcmallocPageheapCounts.WithLabelValues("reserve").Set(m.Details.PageheapReserveCount) + tcmallocPageheapCounts.Collect(ch) + + tcmallocCacheBytes.WithLabelValues("thread_cache", "max_total").Set(m.Details.MaxTotalThreadCacheBytes) + tcmallocCacheBytes.WithLabelValues("thread_cache", "current_total").Set(m.Details.CurrentTotalThreadCacheBytes) + tcmallocCacheBytes.WithLabelValues("central_cache", "free").Set(m.Details.CentralCacheFreeBytes) + tcmallocCacheBytes.WithLabelValues("transfer_cache", "free").Set(m.Details.TransferCacheFreeBytes) + tcmallocCacheBytes.WithLabelValues("thread_cache", "free").Set(m.Details.ThreadCacheFreeBytes) + tcmallocCacheBytes.Collect(ch) + + tcmallocAggressiveDecommit.Set(m.Details.AggressiveMemoryDecommit) + tcmallocAggressiveDecommit.Collect(ch) + + tcmallocFreeBytes.Set(m.Details.TotalFreeBytes) + 
tcmallocFreeBytes.Collect(ch) + +} + +// Describe describes the metrics for prometheus +func (m *TCMallocStats) Describe(ch chan<- *prometheus.Desc) { + tcmallocGeneral.Describe(ch) + tcmallocPageheapBytes.Describe(ch) + tcmallocPageheapCounts.Describe(ch) + tcmallocCacheBytes.Describe(ch) + tcmallocAggressiveDecommit.Describe(ch) + tcmallocFreeBytes.Describe(ch) +}