diff --git a/internal/thirdparty/promclient/client.go b/internal/thirdparty/promclient/client.go index ea1745fd1..5397ef1d4 100644 --- a/internal/thirdparty/promclient/client.go +++ b/internal/thirdparty/promclient/client.go @@ -43,24 +43,25 @@ func WithTime(t time.Time) QueryOption { } type RealClient struct { - prometheuses map[string]promv1.API + mimirMetrics promv1.API + mimirRules promv1.API log logrus.FieldLogger } func New(clusters []string, tenant string, log logrus.FieldLogger) (*RealClient, error) { - proms := map[string]promv1.API{} - - for _, cluster := range clusters { - client, err := api.NewClient(api.Config{Address: fmt.Sprintf("https://prometheus.%s.%s.cloud.nais.io", cluster, tenant)}) - if err != nil { - return nil, err - } + mimirMetrics, err := api.NewClient(api.Config{Address: "http://mimir-query-frontend:8080"}) + if err != nil { + return nil, err + } - proms[cluster] = promv1.NewAPI(client) + mimirAlerts, err := api.NewClient(api.Config{Address: "http://mimir-ruler:8080"}) + if err != nil { + return nil, err } return &RealClient{ - prometheuses: proms, + mimirMetrics: promv1.NewAPI(mimirMetrics), + mimirRules: promv1.NewAPI(mimirAlerts), log: log, }, nil } @@ -72,7 +73,7 @@ func (c *RealClient) QueryAll(ctx context.Context, query string, opts ...QueryOp } wg := pool.NewWithResults[*result]().WithContext(ctx) - for env := range c.prometheuses { + for _, env := range []string{"dev"} { wg.Go(func(ctx context.Context) (*result, error) { v, err := c.Query(ctx, env, query, opts...) if err != nil { @@ -97,10 +98,7 @@ func (c *RealClient) QueryAll(ctx context.Context, query string, opts ...QueryOp } func (c *RealClient) Query(ctx context.Context, environmentName string, query string, opts ...QueryOption) (prom.Vector, error) { - client, ok := c.prometheuses[environmentName] - if !ok { - return nil, fmt.Errorf("no prometheus client for environment %s", environmentName) - } + client := c.mimirMetrics opt := &QueryOpts{ Time: time.Now().Add(-5 * time.Minute), @@ -130,19 +128,13 @@ func (c *RealClient) Query(ctx context.Context, environmentName string, query st } func (c *RealClient) QueryRange(ctx context.Context, environment string, query string, promRange promv1.Range) (prom.Value, promv1.Warnings, error) { - client, ok := c.prometheuses[environment] - if !ok { - return nil, nil, fmt.Errorf("no prometheus client for environment %s", environment) - } - + client := c.mimirMetrics return client.QueryRange(ctx, query, promRange) } func (c *RealClient) Rules(ctx context.Context, environment string, teamSlug slug.Slug) (promv1.RulesResult, error) { - api, ok := c.prometheuses[environment] - if !ok { - return promv1.RulesResult{}, fmt.Errorf("no prometheus client for environment %s", environment) - } + api := c.mimirRules + res, err := api.Rules(ctx) if err != nil { return promv1.RulesResult{}, err @@ -160,7 +152,7 @@ func (c *RealClient) RulesAll(ctx context.Context, teamSlug slug.Slug) (map[stri } wg := pool.NewWithResults[*item]().WithContext(ctx) - for env := range c.prometheuses { + for _, env := range []string{"dev"} { wg.Go(func(ctx context.Context) (*item, error) { res, err := c.Rules(ctx, env, teamSlug) if err != nil { diff --git a/internal/utilization/model.go b/internal/utilization/model.go index 65a39d2e9..c55cdd322 100644 --- a/internal/utilization/model.go +++ b/internal/utilization/model.go @@ -42,7 +42,7 @@ func (e UtilizationResourceType) String() string { return string(e) } -func (e *UtilizationResourceType) UnmarshalGQL(v interface{}) error { +func (e *UtilizationResourceType) UnmarshalGQL(v any) error { str, ok := v.(string) if !ok { return fmt.Errorf("enums must be strings") @@ -133,7 +133,7 @@ type WorkloadUtilizationRecommendations struct { } func (w WorkloadUtilizationRecommendations) CPURequestCores(ctx context.Context) (float64, error) { - v, err := w.client.Query(ctx, w.environmentName, fmt.Sprintf(cpuRequestRecommendation, w.workloadName, w.teamSlug, w.start.Hour(), w.start.Add(time.Hour*12).Hour())) + v, err := w.client.Query(ctx, w.environmentName, fmt.Sprintf(cpuRequestRecommendation, w.environmentName, w.teamSlug, w.workloadName, w.start.Hour(), w.start.Add(time.Hour*12).Hour())) if err != nil { return 0, err } @@ -144,7 +144,7 @@ func (w WorkloadUtilizationRecommendations) CPURequestCores(ctx context.Context) } func (w WorkloadUtilizationRecommendations) MemoryRequestBytes(ctx context.Context) (int64, error) { - v, err := w.client.Query(ctx, w.environmentName, fmt.Sprintf(memoryRequestRecommendation, w.workloadName, w.teamSlug, w.start.Hour(), w.start.Add(time.Hour*12).Hour())) + v, err := w.client.Query(ctx, w.environmentName, fmt.Sprintf(memoryRequestRecommendation, w.environmentName, w.teamSlug, w.workloadName, w.start.Hour(), w.start.Add(time.Hour*12).Hour())) if err != nil { return 0, err } @@ -154,7 +154,7 @@ func (w WorkloadUtilizationRecommendations) MemoryRequestBytes(ctx context.Conte } func (w WorkloadUtilizationRecommendations) MemoryLimitBytes(ctx context.Context) (int64, error) { - v, err := w.client.Query(ctx, w.environmentName, fmt.Sprintf(memoryLimitRecommendation, w.workloadName, w.teamSlug, w.start.Hour(), w.start.Add(time.Hour*12).Hour())) + v, err := w.client.Query(ctx, w.environmentName, fmt.Sprintf(memoryLimitRecommendation, w.environmentName, w.teamSlug, w.workloadName, w.start.Hour(), w.start.Add(time.Hour*12).Hour())) if err != nil { return 0, err } diff --git a/internal/utilization/queries.go b/internal/utilization/queries.go index 26814c457..1f7b3f28a 100644 --- a/internal/utilization/queries.go +++ b/internal/utilization/queries.go @@ -17,35 +17,35 @@ import ( ) const ( - appCPULimit = `max by (container, namespace) (kube_pod_container_resource_limits{namespace=%q, container=%q, resource="cpu", unit="core"})` - appCPURequest = `max by (container, namespace) (kube_pod_container_resource_requests{namespace=%q, container=%q, resource="cpu",unit="core"})` - appCPUUsage = `rate(container_cpu_usage_seconds_total{namespace=%q, container=%q}[5m])` - appCPUUsageAvg = `avg by (container, namespace) (rate(container_cpu_usage_seconds_total{namespace=%q, container=%q}[5m]))` - appMemoryLimit = `max by (container, namespace) (kube_pod_container_resource_limits{namespace=%q, container=%q, resource="memory", unit="byte"})` - appMemoryRequest = `max by (container, namespace) (kube_pod_container_resource_requests{namespace=%q, container=%q, resource="memory",unit="byte"})` - appMemoryUsage = `last_over_time(container_memory_working_set_bytes{namespace=%q, container=%q}[5m])` - appMemoryUsageAvg = `avg by (container, namespace) (last_over_time(container_memory_working_set_bytes{namespace=%q, container=%q}[5m]))` - instanceCPUUsage = `rate(container_cpu_usage_seconds_total{namespace=%q, container=%q, pod=%q}[5m])` - instanceMemoryUsage = `last_over_time(container_memory_working_set_bytes{namespace=%q, container=%q, pod=%q}[5m])` - teamCPURequest = `sum by (container, owner_kind) (kube_pod_container_resource_requests{namespace=%q, container!~%q, resource="cpu",unit="core"} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})` - teamCPUUsage = `sum by (container, owner_kind) (rate(container_cpu_usage_seconds_total{namespace=%q, container!~%q}[5m]) * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"} )` - teamMemoryRequest = `sum by (container, owner_kind) (kube_pod_container_resource_requests{namespace=%q, container!~%q, resource="memory",unit="byte"} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})` - teamMemoryUsage = `sum by (container, owner_kind) (container_memory_working_set_bytes{namespace=%q, container!~%q} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})` - teamsCPURequest = `sum by (namespace, owner_kind) (kube_pod_container_resource_requests{namespace!~%q, container!~%q, resource="cpu",unit="core"} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})` - teamsCPUUsage = `sum by (namespace, owner_kind) (rate(container_cpu_usage_seconds_total{namespace!~%q, container!~%q}[5m]) * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})` - teamsMemoryRequest = `sum by (namespace, owner_kind) (kube_pod_container_resource_requests{namespace!~%q, container!~%q, resource="memory",unit="byte"} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})` - teamsMemoryUsage = `sum by (namespace, owner_kind) (container_memory_working_set_bytes{namespace!~%q, container!~%q} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})` + appCPULimit = `max by (container, namespace) (kube_pod_container_resource_limits{k8s_cluster_name=%q,namespace=%q, container=%q, resource="cpu", unit="core"})` + appCPURequest = `max by (container, namespace) (kube_pod_container_resource_requests{k8s_cluster_name=%q, namespace=%q, container=%q, resource="cpu",unit="core"})` + appCPUUsage = `rate(container_cpu_usage_seconds_total{k8s_cluster_name=%q, namespace=%q, container=%q}[5m])` + appCPUUsageAvg = `avg by (container, namespace) (rate(container_cpu_usage_seconds_total{k8s_cluster_name=%q, namespace=%q, container=%q}[5m]))` + appMemoryLimit = `max by (container, namespace) (kube_pod_container_resource_limits{k8s_cluster_name=%q, namespace=%q, container=%q, resource="memory", unit="byte"})` + appMemoryRequest = `max by (container, namespace) (kube_pod_container_resource_requests{k8s_cluster_name=%q, namespace=%q, container=%q, resource="memory",unit="byte"})` + appMemoryUsage = `last_over_time(container_memory_working_set_bytes{k8s_cluster_name=%q, namespace=%q, container=%q}[5m])` + appMemoryUsageAvg = `avg by (container, namespace) (last_over_time(container_memory_working_set_bytes{k8s_cluster_name=%q, namespace=%q, container=%q}[5m]))` + instanceCPUUsage = `rate(container_cpu_usage_seconds_total{k8s_cluster_name=%q, namespace=%q, container=%q, pod=%q}[5m])` + instanceMemoryUsage = `last_over_time(container_memory_working_set_bytes{k8s_cluster_name=%q, namespace=%q, container=%q, pod=%q}[5m])` + teamCPURequest = `sum by (container, owner_kind) (kube_pod_container_resource_requests{k8s_cluster_name=%q, namespace=%q, container!~%q, resource="cpu",unit="core"} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})` + teamCPUUsage = `sum by (container, owner_kind) (rate(container_cpu_usage_seconds_total{k8s_cluster_name=%q, namespace=%q, container!~%q}[5m]) * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"} )` + teamMemoryRequest = `sum by (container, owner_kind) (kube_pod_container_resource_requests{k8s_cluster_name=%q, namespace=%q, container!~%q, resource="memory",unit="byte"} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})` + teamMemoryUsage = `sum by (container, owner_kind) (container_memory_working_set_bytes{k8s_cluster_name=%q, namespace=%q, container!~%q} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})` + teamsCPURequest = `sum by (namespace, owner_kind) (kube_pod_container_resource_requests{k8s_cluster_name=%q, namespace!~%q, container!~%q, resource="cpu",unit="core"} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})` + teamsCPUUsage = `sum by (namespace, owner_kind) (rate(container_cpu_usage_seconds_total{k8s_cluster_name=%q, namespace!~%q, container!~%q}[5m]) * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})` + teamsMemoryRequest = `sum by (namespace, owner_kind) (kube_pod_container_resource_requests{k8s_cluster_name=%q, namespace!~%q, container!~%q, resource="memory",unit="byte"} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})` + teamsMemoryUsage = `sum by (namespace, owner_kind) (container_memory_working_set_bytes{k8s_cluster_name=%q, namespace!~%q, container!~%q} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})` cpuRequestRecommendation = `max( avg_over_time( - rate(container_cpu_usage_seconds_total{container=%q,namespace=%q}[5m])[1w:5m] + rate(container_cpu_usage_seconds_total{k8s_cluster_name=%q,namespace=%q, container=%q}[5m])[1w:5m] ) and on () (hour() >= %d and hour() < %d and day_of_week() > 0 and day_of_week() < 6) )` memoryRequestRecommendation = `max( avg_over_time( - quantile_over_time(0.8, container_memory_working_set_bytes{container=%q,namespace=%q}[5m])[1w:5m] + quantile_over_time(0.8, container_memory_working_set_bytes{k8s_cluster_name=%q,namespace=%q,container=%q}[5m])[1w:5m] ) and on () time() >= (hour() >= %d and hour() < %d and day_of_week() > 0 and day_of_week() < 6) @@ -54,7 +54,7 @@ const ( max_over_time( quantile_over_time( 0.95, - container_memory_working_set_bytes{container=%q,namespace=%q}[5m] + container_memory_working_set_bytes{k8s_cluster_name=%q,namespace=%q, container=%q}[5m] )[1w:5m] ) and on () @@ -204,7 +204,7 @@ func WorkloadResourceRequest(ctx context.Context, env string, teamSlug slug.Slug c := fromContext(ctx).client - v, err := c.Query(ctx, env, fmt.Sprintf(q, teamSlug, workloadName)) + v, err := c.Query(ctx, env, fmt.Sprintf(q, env, teamSlug, workloadName)) if err != nil { return 0, err } @@ -219,7 +219,7 @@ func WorkloadResourceLimit(ctx context.Context, env string, teamSlug slug.Slug, c := fromContext(ctx).client - v, err := c.Query(ctx, env, fmt.Sprintf(q, teamSlug, workloadName)) + v, err := c.Query(ctx, env, fmt.Sprintf(q, env, teamSlug, workloadName)) if err != nil { return nil, err } @@ -239,7 +239,7 @@ func WorkloadResourceUsage(ctx context.Context, env string, teamSlug slug.Slug, c := fromContext(ctx).client - v, err := c.Query(ctx, env, fmt.Sprintf(q, teamSlug, workloadName)) + v, err := c.Query(ctx, env, fmt.Sprintf(q, env, teamSlug, workloadName)) if err != nil { return 0, err } @@ -247,20 +247,20 @@ func WorkloadResourceUsage(ctx context.Context, env string, teamSlug slug.Slug, return ensuredVal(v), nil } -func queryPrometheusRange(ctx context.Context, environmentName string, teamSlug slug.Slug, workloadName string, queryTemplate string, start time.Time, end time.Time, step int) ([]*UtilizationSample, error) { +func queryPrometheusRange(ctx context.Context, env string, teamSlug slug.Slug, workloadName string, queryTemplate string, start time.Time, end time.Time, step int) ([]*UtilizationSample, error) { c := fromContext(ctx).client // Format the query - query := fmt.Sprintf(queryTemplate, teamSlug, workloadName) + query := fmt.Sprintf(queryTemplate, env, teamSlug, workloadName) // Perform the query - v, warnings, err := c.QueryRange(ctx, environmentName, query, promv1.Range{Start: start, End: end, Step: time.Duration(step) * time.Second}) + v, warnings, err := c.QueryRange(ctx, env, query, promv1.Range{Start: start, End: end, Step: time.Duration(step) * time.Second}) if err != nil { return nil, err } if len(warnings) > 0 { fromContext(ctx).log.WithFields(logrus.Fields{ - "environment": environmentName, + "environment": env, "warnings": strings.Join(warnings, ", "), }).Warn("prometheus query warnings") }