Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refs 4739: metric check for failing snap tasks on RH repos #833

Merged
merged 2 commits into from
Oct 14, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pkg/dao/interfaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ type MetricsDao interface {
PendingTasksAverageLatency(ctx context.Context) float64
PendingTasksCount(ctx context.Context) int64
PendingTasksOldestTask(ctx context.Context) float64
RHReposNoSuccessfulSnapshotTaskIn36Hours(ctx context.Context) int64
dominikvagner marked this conversation as resolved.
Show resolved Hide resolved
}

//go:generate $GO_OUTPUT/mockery --name TaskInfoDao --filename task_info_mock.go --inpackage
Expand Down
21 changes: 21 additions & 0 deletions pkg/dao/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,3 +142,24 @@ func (d metricsDaoImpl) PendingTasksOldestTask(ctx context.Context) float64 {
}
return time.Since(*task.Queued).Seconds()
}

func (d metricsDaoImpl) RHReposNoSuccessfulSnapshotTaskIn36Hours(ctx context.Context) int64 {
var output int64 = -1
date := time.Now().Add(-36 * time.Hour).Format(time.RFC3339)

subQuery := d.db.WithContext(ctx).
Model(&models.RepositoryConfiguration{}).
Select("repository_configurations.uuid, bool_or(tasks.status ILIKE ? AND tasks.finished_at > ?) AS has_successful_tasks", fmt.Sprintf("%%%s%%", config.TaskStatusCompleted), date).
dominikvagner marked this conversation as resolved.
Show resolved Hide resolved
Joins("LEFT OUTER JOIN tasks ON repository_configurations.repository_uuid = tasks.object_uuid").
Where("repository_configurations.org_id = ?", config.RedHatOrg).
Where("repository_configurations.snapshot IS TRUE").
Where("tasks.type = ?", config.RepositorySnapshotTask).
Group("repository_configurations.uuid")

d.db.WithContext(ctx).
Table("(?) AS sq", subQuery).
Where("sq.has_successful_tasks IS FALSE").
Count(&output)

return output
}
18 changes: 18 additions & 0 deletions pkg/dao/metrics_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

84 changes: 84 additions & 0 deletions pkg/dao/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -320,3 +320,87 @@ func (s *MetricsSuite) TestPendingTasksOldestTask() {
oldestQeuedAt := s.dao.PendingTasksOldestTask(context.Background())
assert.True(t, oldestQeuedAt > 1)
}

func (s *MetricsSuite) TestRHReposNoSuccessfulSnapshotTaskIn36Hours() {
t := s.T()

initialCount := s.dao.RHReposNoSuccessfulSnapshotTaskIn36Hours(context.Background())
assert.NotEqual(t, -1, initialCount)

rcs, err := seeds.SeedRepositoryConfigurations(s.tx, 4, seeds.SeedOptions{OrgID: config.RedHatOrg})
assert.NoError(t, err)
assert.Equal(t, 4, len(rcs))

r1, r2, r3, r4 := rcs[0], rcs[1], rcs[2], rcs[3]

_, err = seeds.SeedTasks(s.tx, 1, seeds.TaskSeedOptions{
RepoConfigUUID: r1.UUID,
RepoUUID: r1.RepositoryUUID,
Typename: config.RepositorySnapshotTask,
QueuedAt: utils.Ptr(time.Now().Add(-10 * time.Hour)),
FinishedAt: utils.Ptr(time.Now().Add(-9 * time.Hour)),
Status: config.TaskStatusCompleted,
})
assert.NoError(t, err)
_, err = seeds.SeedTasks(s.tx, 1, seeds.TaskSeedOptions{
RepoConfigUUID: r1.UUID,
RepoUUID: r1.RepositoryUUID,
Typename: config.IntrospectTask,
QueuedAt: utils.Ptr(time.Now().Add(-5 * time.Hour)),
FinishedAt: utils.Ptr(time.Now().Add(-4 * time.Hour)),
Status: config.TaskStatusFailed,
})
assert.NoError(t, err)

_, err = seeds.SeedTasks(s.tx, 1, seeds.TaskSeedOptions{
RepoConfigUUID: r2.UUID,
RepoUUID: r2.RepositoryUUID,
Typename: config.RepositorySnapshotTask,
QueuedAt: utils.Ptr(time.Now().Add(-40 * time.Hour)),
FinishedAt: utils.Ptr(time.Now().Add(-39 * time.Hour)),
Status: config.TaskStatusCompleted,
})
assert.NoError(t, err)
_, err = seeds.SeedTasks(s.tx, 1, seeds.TaskSeedOptions{
RepoConfigUUID: r2.UUID,
RepoUUID: r2.RepositoryUUID,
Typename: config.RepositorySnapshotTask,
QueuedAt: utils.Ptr(time.Now().Add(-30 * time.Hour)),
FinishedAt: utils.Ptr(time.Now().Add(-29 * time.Hour)),
Status: config.TaskStatusFailed,
})
assert.NoError(t, err)

_, err = seeds.SeedTasks(s.tx, 1, seeds.TaskSeedOptions{
RepoConfigUUID: r3.UUID,
RepoUUID: r3.RepositoryUUID,
Typename: config.RepositorySnapshotTask,
QueuedAt: utils.Ptr(time.Now().Add(-30 * time.Hour)),
FinishedAt: utils.Ptr(time.Now().Add(-29 * time.Hour)),
Status: config.TaskStatusCompleted,
})
assert.NoError(t, err)
_, err = seeds.SeedTasks(s.tx, 1, seeds.TaskSeedOptions{
RepoConfigUUID: r3.UUID,
RepoUUID: r3.RepositoryUUID,
Typename: config.RepositorySnapshotTask,
QueuedAt: utils.Ptr(time.Now().Add(-20 * time.Hour)),
FinishedAt: utils.Ptr(time.Now().Add(-19 * time.Hour)),
Status: config.TaskStatusFailed,
})
assert.NoError(t, err)

_, err = seeds.SeedTasks(s.tx, 1, seeds.TaskSeedOptions{
RepoConfigUUID: r4.UUID,
RepoUUID: r4.RepositoryUUID,
Typename: config.RepositorySnapshotTask,
QueuedAt: utils.Ptr(time.Now().Add(-20 * time.Hour)),
FinishedAt: utils.Ptr(time.Now().Add(-19 * time.Hour)),
Status: config.TaskStatusFailed,
})
assert.NoError(t, err)

// expecting r2, r4 to be additionally counted in this metric
count := s.dao.RHReposNoSuccessfulSnapshotTaskIn36Hours(context.Background())
assert.Equal(t, 2+initialCount, count)
}
11 changes: 10 additions & 1 deletion pkg/instrumentation/custom/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ import (
"gorm.io/gorm"
)

const tickerDelay = 30 // in seconds // could be good to match this with the scrapper frequency
const tickerDelay = 30 // in seconds // could be good to match this with the scrapper frequency
const snapshottingFailCheckDelay = 60 * 60 // in seconds

type Collector struct {
context context.Context
Expand Down Expand Up @@ -88,13 +89,21 @@ func (c *Collector) iterate() {
}
}

func (c *Collector) snapshottingFailCheckIterate() {
ctx := c.context
c.metrics.RHRepositories36HourWithoutSuccessfulSnapTask.Set(float64(c.dao.RHReposNoSuccessfulSnapshotTaskIn36Hours(ctx)))
}

func (c *Collector) Run() {
log.Info().Msg("Starting metrics collector go routine")
ticker := time.NewTicker(tickerDelay * time.Second)
snapshottingFailCheckTicker := time.NewTicker(snapshottingFailCheckDelay * time.Second)
for {
select {
case <-ticker.C:
c.iterate()
case <-snapshottingFailCheckTicker.C:
c.snapshottingFailCheckIterate()
case <-c.context.Done():
log.Info().Msgf("Stopping metrics collector go routine")
ticker.Stop()
Expand Down
7 changes: 7 additions & 0 deletions pkg/instrumentation/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ const (
TaskStatsLabelPendingCount = "task_stats_pending_count"
TaskStatsLabelOldestWait = "task_stats_oldest_wait"
TaskStatsLabelAverageWait = "task_stats_average_wait"
RHRepositories36HourWithoutSuccessfulSnapTask = "rh_repositories_36_hour_without_successful_snap_task"
)

type Metrics struct {
Expand All @@ -43,6 +44,7 @@ type Metrics struct {
TaskStats prometheus.GaugeVec
OrgTotal prometheus.Gauge
RHCertExpiryDays prometheus.Gauge
RHRepositories36HourWithoutSuccessfulSnapTask prometheus.Gauge
reg *prometheus.Registry
}

Expand Down Expand Up @@ -119,6 +121,11 @@ func NewMetrics(reg *prometheus.Registry) *Metrics {
Name: RHCertExpiryDays,
Help: "Number of days until the Red Hat client certificate expires",
}),
RHRepositories36HourWithoutSuccessfulSnapTask: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
Namespace: NameSpace,
Name: RHRepositories36HourWithoutSuccessfulSnapTask,
Help: "Number of Red Hat repositories that haven't had successful snapshot task in the last 36 hours.",
}),
}

reg.MustRegister(collectors.NewBuildInfoCollector())
Expand Down
6 changes: 6 additions & 0 deletions pkg/seeds/seeds.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ func SeedRepositoryConfigurations(db *gorm.DB, size int, options SeedOptions) ([
OrgID: createOrgId(options.OrgID),
RepositoryUUID: repos[i].UUID,
LastSnapshotTaskUUID: options.TaskID,
Snapshot: true,
}
repoConfigurations = append(repoConfigurations, repoConfig)
}
Expand Down Expand Up @@ -404,6 +405,7 @@ type TaskSeedOptions struct {
RepoConfigUUID string
RepoUUID string
QueuedAt *time.Time
FinishedAt *time.Time
}

func SeedTasks(db *gorm.DB, size int, options TaskSeedOptions) ([]models.TaskInfo, error) {
Expand Down Expand Up @@ -469,6 +471,10 @@ func SeedTasks(db *gorm.DB, size int, options TaskSeedOptions) ([]models.TaskInf
}
started := time.Now().Add(time.Minute * time.Duration(i+5))
finished := time.Now().Add(time.Minute * time.Duration(i+10))
if options.FinishedAt != nil {
started = (*options.FinishedAt).Add(-5 * time.Minute)
finished = *options.FinishedAt
}
tasks[i] = models.TaskInfo{
Id: uuid.New(),
Typename: typename,
Expand Down
Loading