Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refs 4739: metric check for failing snap tasks on RH repos #833

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pkg/dao/interfaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ type MetricsDao interface {
PendingTasksAverageLatency(ctx context.Context) float64
PendingTasksCount(ctx context.Context) int64
PendingTasksOldestTask(ctx context.Context) float64
RHReposNoSuccessfulSnapshotTaskIn36Hours(ctx context.Context) int64
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
RHReposNoSuccessfulSnapshotTaskIn36Hours(ctx context.Context) int64
RHReposSnapshotNotCompletedInLast36HoursCount(ctx context.Context) int64

This is a nitpick, but what do you think about renaming this metric?

"No" can also be an abbreviation for "number", so "RHReposNoSuccessfulSnapshotTaskIn36Hours" could be interpreted as "Number of sucessful snapshot tasks"

}

//go:generate $GO_OUTPUT/mockery --name TaskInfoDao --filename task_info_mock.go --inpackage
Expand Down
21 changes: 21 additions & 0 deletions pkg/dao/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,3 +142,24 @@ func (d metricsDaoImpl) PendingTasksOldestTask(ctx context.Context) float64 {
}
return time.Since(*task.Queued).Seconds()
}

func (d metricsDaoImpl) RHReposNoSuccessfulSnapshotTaskIn36Hours(ctx context.Context) int64 {
var output int64 = -1
date := time.Now().Add(-36 * time.Hour).Format(time.RFC3339)

subQuery := d.db.WithContext(ctx).
Model(&models.RepositoryConfiguration{}).
Select("repository_configurations.uuid, bool_or(tasks.status ILIKE ? AND tasks.finished_at > ?) AS has_successful_tasks", fmt.Sprintf("%%%s%%", config.TaskStatusCompleted), date).
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Select("repository_configurations.uuid, bool_or(tasks.status ILIKE ? AND tasks.finished_at > ?) AS has_successful_tasks", fmt.Sprintf("%%%s%%", config.TaskStatusCompleted), date).
Select("repository_configurations.uuid, bool_or(tasks.status = ? AND tasks.finished_at > ?) AS has_successful_tasks", config.TaskStatusCompleted), date).

this can be simplified slightly

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This query looks good. I believe it would miss the (very unlikely) case where there's no snapshot tasks for a repository. I don't think this would happen naturally, but maybe could happen if there's a bug with task cleanup.

Copy link
Contributor

@rverdile rverdile Oct 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you think about handling that case? I could see just adding a second query and adding the two counts together as a simple solution.

Joins("LEFT OUTER JOIN tasks ON repository_configurations.repository_uuid = tasks.object_uuid").
Where("repository_configurations.org_id = ?", config.RedHatOrg).
Where("repository_configurations.snapshot IS TRUE").
Where("tasks.type = ?", config.RepositorySnapshotTask).
Group("repository_configurations.uuid")

d.db.WithContext(ctx).
Table("(?) AS sq", subQuery).
Where("sq.has_successful_tasks IS FALSE").
Count(&output)

return output
}
18 changes: 18 additions & 0 deletions pkg/dao/metrics_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

84 changes: 84 additions & 0 deletions pkg/dao/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -320,3 +320,87 @@ func (s *MetricsSuite) TestPendingTasksOldestTask() {
oldestQeuedAt := s.dao.PendingTasksOldestTask(context.Background())
assert.True(t, oldestQeuedAt > 1)
}

func (s *MetricsSuite) TestRHReposNoSuccessfulSnapshotTaskIn36Hours() {
t := s.T()

initialCount := s.dao.RHReposNoSuccessfulSnapshotTaskIn36Hours(context.Background())
assert.NotEqual(t, -1, initialCount)

rcs, err := seeds.SeedRepositoryConfigurations(s.tx, 4, seeds.SeedOptions{OrgID: config.RedHatOrg})
assert.NoError(t, err)
assert.Equal(t, 4, len(rcs))

r1, r2, r3, r4 := rcs[0], rcs[1], rcs[2], rcs[3]

_, err = seeds.SeedTasks(s.tx, 1, seeds.TaskSeedOptions{
RepoConfigUUID: r1.UUID,
RepoUUID: r1.RepositoryUUID,
Typename: config.RepositorySnapshotTask,
QueuedAt: utils.Ptr(time.Now().Add(-10 * time.Hour)),
FinishedAt: utils.Ptr(time.Now().Add(-9 * time.Hour)),
Status: config.TaskStatusCompleted,
})
assert.NoError(t, err)
_, err = seeds.SeedTasks(s.tx, 1, seeds.TaskSeedOptions{
RepoConfigUUID: r1.UUID,
RepoUUID: r1.RepositoryUUID,
Typename: config.IntrospectTask,
QueuedAt: utils.Ptr(time.Now().Add(-5 * time.Hour)),
FinishedAt: utils.Ptr(time.Now().Add(-4 * time.Hour)),
Status: config.TaskStatusFailed,
})
assert.NoError(t, err)

_, err = seeds.SeedTasks(s.tx, 1, seeds.TaskSeedOptions{
RepoConfigUUID: r2.UUID,
RepoUUID: r2.RepositoryUUID,
Typename: config.RepositorySnapshotTask,
QueuedAt: utils.Ptr(time.Now().Add(-40 * time.Hour)),
FinishedAt: utils.Ptr(time.Now().Add(-39 * time.Hour)),
Status: config.TaskStatusCompleted,
})
assert.NoError(t, err)
_, err = seeds.SeedTasks(s.tx, 1, seeds.TaskSeedOptions{
RepoConfigUUID: r2.UUID,
RepoUUID: r2.RepositoryUUID,
Typename: config.RepositorySnapshotTask,
QueuedAt: utils.Ptr(time.Now().Add(-30 * time.Hour)),
FinishedAt: utils.Ptr(time.Now().Add(-29 * time.Hour)),
Status: config.TaskStatusFailed,
})
assert.NoError(t, err)

_, err = seeds.SeedTasks(s.tx, 1, seeds.TaskSeedOptions{
RepoConfigUUID: r3.UUID,
RepoUUID: r3.RepositoryUUID,
Typename: config.RepositorySnapshotTask,
QueuedAt: utils.Ptr(time.Now().Add(-30 * time.Hour)),
FinishedAt: utils.Ptr(time.Now().Add(-29 * time.Hour)),
Status: config.TaskStatusCompleted,
})
assert.NoError(t, err)
_, err = seeds.SeedTasks(s.tx, 1, seeds.TaskSeedOptions{
RepoConfigUUID: r3.UUID,
RepoUUID: r3.RepositoryUUID,
Typename: config.RepositorySnapshotTask,
QueuedAt: utils.Ptr(time.Now().Add(-20 * time.Hour)),
FinishedAt: utils.Ptr(time.Now().Add(-19 * time.Hour)),
Status: config.TaskStatusFailed,
})
assert.NoError(t, err)

_, err = seeds.SeedTasks(s.tx, 1, seeds.TaskSeedOptions{
RepoConfigUUID: r4.UUID,
RepoUUID: r4.RepositoryUUID,
Typename: config.RepositorySnapshotTask,
QueuedAt: utils.Ptr(time.Now().Add(-20 * time.Hour)),
FinishedAt: utils.Ptr(time.Now().Add(-19 * time.Hour)),
Status: config.TaskStatusFailed,
})
assert.NoError(t, err)

// expecting r2, r4 to be additionally counted in this metric
count := s.dao.RHReposNoSuccessfulSnapshotTaskIn36Hours(context.Background())
assert.Equal(t, 2+initialCount, count)
}
11 changes: 10 additions & 1 deletion pkg/instrumentation/custom/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ import (
"gorm.io/gorm"
)

const tickerDelay = 30 // in seconds // could be good to match this with the scrapper frequency
const tickerDelay = 30 // in seconds // could be good to match this with the scrapper frequency
const snapshottingFailCheckDelay = 60 * 60 // in seconds

type Collector struct {
context context.Context
Expand Down Expand Up @@ -88,13 +89,21 @@ func (c *Collector) iterate() {
}
}

func (c *Collector) snapshottingFailCheckIterate() {
ctx := c.context
c.metrics.RHRepositories36HourWithoutSuccessfulSnapTask.Set(float64(c.dao.RHReposNoSuccessfulSnapshotTaskIn36Hours(ctx)))
}

func (c *Collector) Run() {
log.Info().Msg("Starting metrics collector go routine")
ticker := time.NewTicker(tickerDelay * time.Second)
snapshottingFailCheckTicker := time.NewTicker(snapshottingFailCheckDelay * time.Second)
for {
select {
case <-ticker.C:
c.iterate()
case <-snapshottingFailCheckTicker.C:
c.snapshottingFailCheckIterate()
case <-c.context.Done():
log.Info().Msgf("Stopping metrics collector go routine")
ticker.Stop()
Expand Down
7 changes: 7 additions & 0 deletions pkg/instrumentation/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ const (
TaskStatsLabelPendingCount = "task_stats_pending_count"
TaskStatsLabelOldestWait = "task_stats_oldest_wait"
TaskStatsLabelAverageWait = "task_stats_average_wait"
RHRepositories36HourWithoutSuccessfulSnapTask = "rh_repositories_36_hour_without_successful_snap_task"
)

type Metrics struct {
Expand All @@ -43,6 +44,7 @@ type Metrics struct {
TaskStats prometheus.GaugeVec
OrgTotal prometheus.Gauge
RHCertExpiryDays prometheus.Gauge
RHRepositories36HourWithoutSuccessfulSnapTask prometheus.Gauge
reg *prometheus.Registry
}

Expand Down Expand Up @@ -119,6 +121,11 @@ func NewMetrics(reg *prometheus.Registry) *Metrics {
Name: RHCertExpiryDays,
Help: "Number of days until the Red Hat client certificate expires",
}),
RHRepositories36HourWithoutSuccessfulSnapTask: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
Namespace: NameSpace,
Name: RHRepositories36HourWithoutSuccessfulSnapTask,
Help: "Number of Red Hat repositories that haven't had successful snapshot task in the last 36 hours.",
}),
}

reg.MustRegister(collectors.NewBuildInfoCollector())
Expand Down
6 changes: 6 additions & 0 deletions pkg/seeds/seeds.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ func SeedRepositoryConfigurations(db *gorm.DB, size int, options SeedOptions) ([
OrgID: createOrgId(options.OrgID),
RepositoryUUID: repos[i].UUID,
LastSnapshotTaskUUID: options.TaskID,
Snapshot: true,
}
repoConfigurations = append(repoConfigurations, repoConfig)
}
Expand Down Expand Up @@ -404,6 +405,7 @@ type TaskSeedOptions struct {
RepoConfigUUID string
RepoUUID string
QueuedAt *time.Time
FinishedAt *time.Time
}

func SeedTasks(db *gorm.DB, size int, options TaskSeedOptions) ([]models.TaskInfo, error) {
Expand Down Expand Up @@ -469,6 +471,10 @@ func SeedTasks(db *gorm.DB, size int, options TaskSeedOptions) ([]models.TaskInf
}
started := time.Now().Add(time.Minute * time.Duration(i+5))
finished := time.Now().Add(time.Minute * time.Duration(i+10))
if options.FinishedAt != nil {
started = (*options.FinishedAt).Add(-5 * time.Minute)
finished = *options.FinishedAt
}
tasks[i] = models.TaskInfo{
Id: uuid.New(),
Typename: typename,
Expand Down
Loading