42 changes: 17 additions & 25 deletions internal/thirdparty/promclient/client.go
@@ -43,24 +43,25 @@ func WithTime(t time.Time) QueryOption {
 }
 
 type RealClient struct {
-	prometheuses map[string]promv1.API
+	mimirMetrics promv1.API
+	mimirRules   promv1.API
 	log          logrus.FieldLogger
 }
 
 func New(clusters []string, tenant string, log logrus.FieldLogger) (*RealClient, error) {
-	proms := map[string]promv1.API{}
-
-	for _, cluster := range clusters {
-		client, err := api.NewClient(api.Config{Address: fmt.Sprintf("https://prometheus.%s.%s.cloud.nais.io", cluster, tenant)})
-		if err != nil {
-			return nil, err
-		}
+	mimirMetrics, err := api.NewClient(api.Config{Address: "http://mimir-query-frontend:8080"})
+	if err != nil {
+		return nil, err
+	}
 
-		proms[cluster] = promv1.NewAPI(client)
-	}
+	mimirAlerts, err := api.NewClient(api.Config{Address: "http://mimir-ruler:8080"})
+	if err != nil {
+		return nil, err
+	}
 
 	return &RealClient{
-		prometheuses: proms,
+		mimirMetrics: promv1.NewAPI(mimirMetrics),
+		mimirRules:   promv1.NewAPI(mimirAlerts),
 		log:          log,
 	}, nil
 }
@@ -72,7 +73,7 @@ func (c *RealClient) QueryAll(ctx context.Context, query string, opts ...QueryOp
 	}
 	wg := pool.NewWithResults[*result]().WithContext(ctx)
 
-	for env := range c.prometheuses {
+	for _, env := range []string{"dev"} {
 		wg.Go(func(ctx context.Context) (*result, error) {
 			v, err := c.Query(ctx, env, query, opts...)
 			if err != nil {
@@ -97,10 +98,7 @@ func (c *RealClient) QueryAll(ctx context.Context, query string, opts ...QueryOp
 }
 
 func (c *RealClient) Query(ctx context.Context, environmentName string, query string, opts ...QueryOption) (prom.Vector, error) {
-	client, ok := c.prometheuses[environmentName]
-	if !ok {
-		return nil, fmt.Errorf("no prometheus client for environment %s", environmentName)
-	}
+	client := c.mimirMetrics
 
 	opt := &QueryOpts{
 		Time: time.Now().Add(-5 * time.Minute),
Expand Down Expand Up @@ -130,19 +128,13 @@ func (c *RealClient) Query(ctx context.Context, environmentName string, query st
}

func (c *RealClient) QueryRange(ctx context.Context, environment string, query string, promRange promv1.Range) (prom.Value, promv1.Warnings, error) {
client, ok := c.prometheuses[environment]
if !ok {
return nil, nil, fmt.Errorf("no prometheus client for environment %s", environment)
}

client := c.mimirMetrics
return client.QueryRange(ctx, query, promRange)
}

func (c *RealClient) Rules(ctx context.Context, environment string, teamSlug slug.Slug) (promv1.RulesResult, error) {
api, ok := c.prometheuses[environment]
if !ok {
return promv1.RulesResult{}, fmt.Errorf("no prometheus client for environment %s", environment)
}
api := c.mimirRules

res, err := api.Rules(ctx)
if err != nil {
return promv1.RulesResult{}, err
@@ -160,7 +152,7 @@ func (c *RealClient) RulesAll(ctx context.Context, teamSlug slug.Slug) (map[stri
 	}
 	wg := pool.NewWithResults[*item]().WithContext(ctx)
 
-	for env := range c.prometheuses {
+	for _, env := range []string{"dev"} {
 		wg.Go(func(ctx context.Context) (*item, error) {
 			res, err := c.Rules(ctx, env, teamSlug)
 			if err != nil {
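Net effect of this file: the per-cluster Prometheus map (and its https://prometheus.<cluster>.<tenant>.cloud.nais.io endpoints) is replaced by two fixed Mimir endpoints, mimir-query-frontend:8080 for metric queries and mimir-ruler:8080 for rules, and the target environment is selected by a label matcher instead of by which server receives the query. Note that both fan-out loops are pinned to a hardcoded []string{"dev"}, so QueryAll and RulesAll cover only one environment as written. Below is a minimal sketch of the new query path, assuming the addresses above and the k8s_cluster_name label introduced in queries.go; the up metric and the "dev" value are placeholders:

package main

import (
	"context"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/api"
	promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
	// One client for all environments: the Mimir query-frontend serves
	// metrics from every ingested cluster behind a single address.
	client, err := api.NewClient(api.Config{Address: "http://mimir-query-frontend:8080"})
	if err != nil {
		panic(err)
	}
	mimir := promv1.NewAPI(client)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// The environment is picked with a label matcher rather than a
	// per-cluster endpoint.
	v, warnings, err := mimir.Query(ctx, `up{k8s_cluster_name="dev"}`, time.Now())
	if err != nil {
		panic(err)
	}
	if len(warnings) > 0 {
		fmt.Println("warnings:", warnings)
	}
	fmt.Println(v)
}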
8 changes: 4 additions & 4 deletions internal/utilization/model.go
@@ -42,7 +42,7 @@ func (e UtilizationResourceType) String() string {
 	return string(e)
 }
 
-func (e *UtilizationResourceType) UnmarshalGQL(v interface{}) error {
+func (e *UtilizationResourceType) UnmarshalGQL(v any) error {
 	str, ok := v.(string)
 	if !ok {
 		return fmt.Errorf("enums must be strings")
@@ -133,7 +133,7 @@ type WorkloadUtilizationRecommendations struct {
 }
 
 func (w WorkloadUtilizationRecommendations) CPURequestCores(ctx context.Context) (float64, error) {
-	v, err := w.client.Query(ctx, w.environmentName, fmt.Sprintf(cpuRequestRecommendation, w.workloadName, w.teamSlug, w.start.Hour(), w.start.Add(time.Hour*12).Hour()))
+	v, err := w.client.Query(ctx, w.environmentName, fmt.Sprintf(cpuRequestRecommendation, w.environmentName, w.teamSlug, w.workloadName, w.start.Hour(), w.start.Add(time.Hour*12).Hour()))
 	if err != nil {
 		return 0, err
 	}
@@ -144,7 +144,7 @@ func (w WorkloadUtilizationRecommendations) CPURequestCores(ctx context.Context)
 }
 
 func (w WorkloadUtilizationRecommendations) MemoryRequestBytes(ctx context.Context) (int64, error) {
-	v, err := w.client.Query(ctx, w.environmentName, fmt.Sprintf(memoryRequestRecommendation, w.workloadName, w.teamSlug, w.start.Hour(), w.start.Add(time.Hour*12).Hour()))
+	v, err := w.client.Query(ctx, w.environmentName, fmt.Sprintf(memoryRequestRecommendation, w.environmentName, w.teamSlug, w.workloadName, w.start.Hour(), w.start.Add(time.Hour*12).Hour()))
 	if err != nil {
 		return 0, err
 	}
@@ -154,7 +154,7 @@ func (w WorkloadUtilizationRecommendations) MemoryRequestBytes(ctx context.Conte
 }
 
 func (w WorkloadUtilizationRecommendations) MemoryLimitBytes(ctx context.Context) (int64, error) {
-	v, err := w.client.Query(ctx, w.environmentName, fmt.Sprintf(memoryLimitRecommendation, w.workloadName, w.teamSlug, w.start.Hour(), w.start.Add(time.Hour*12).Hour()))
+	v, err := w.client.Query(ctx, w.environmentName, fmt.Sprintf(memoryLimitRecommendation, w.environmentName, w.teamSlug, w.workloadName, w.start.Hour(), w.start.Add(time.Hour*12).Hour()))
 	if err != nil {
 		return 0, err
 	}
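These call sites now pass w.environmentName as the first formatting argument because the rewritten templates in queries.go put k8s_cluster_name=%q ahead of namespace=%q and container=%q. Since fmt.Sprintf consumes verbs positionally, an environment/workload swap here would silently produce a query that matches nothing. A runnable sketch of that coupling, using a trimmed-down stand-in for cpuRequestRecommendation and hypothetical values:

package main

import "fmt"

// Trimmed-down stand-in for the real cpuRequestRecommendation template;
// only the label order and the trailing %d hour bounds matter here.
const cpuRequestRecommendation = `rate(container_cpu_usage_seconds_total{k8s_cluster_name=%q,namespace=%q, container=%q}[5m]) and on () (hour() >= %d and hour() < %d)`

func main() {
	query := fmt.Sprintf(cpuRequestRecommendation,
		"dev",     // k8s_cluster_name — the new leading argument
		"my-team", // namespace (team slug)
		"my-app",  // container (workload name)
		6, 18,     // hour window: start.Hour() and start.Add(12h).Hour()
	)
	fmt.Println(query)
}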
56 changes: 28 additions & 28 deletions internal/utilization/queries.go
@@ -17,35 +17,35 @@ import (
 )
 
 const (
-	appCPULimit         = `max by (container, namespace) (kube_pod_container_resource_limits{namespace=%q, container=%q, resource="cpu", unit="core"})`
-	appCPURequest       = `max by (container, namespace) (kube_pod_container_resource_requests{namespace=%q, container=%q, resource="cpu",unit="core"})`
-	appCPUUsage         = `rate(container_cpu_usage_seconds_total{namespace=%q, container=%q}[5m])`
-	appCPUUsageAvg      = `avg by (container, namespace) (rate(container_cpu_usage_seconds_total{namespace=%q, container=%q}[5m]))`
-	appMemoryLimit      = `max by (container, namespace) (kube_pod_container_resource_limits{namespace=%q, container=%q, resource="memory", unit="byte"})`
-	appMemoryRequest    = `max by (container, namespace) (kube_pod_container_resource_requests{namespace=%q, container=%q, resource="memory",unit="byte"})`
-	appMemoryUsage      = `last_over_time(container_memory_working_set_bytes{namespace=%q, container=%q}[5m])`
-	appMemoryUsageAvg   = `avg by (container, namespace) (last_over_time(container_memory_working_set_bytes{namespace=%q, container=%q}[5m]))`
-	instanceCPUUsage    = `rate(container_cpu_usage_seconds_total{namespace=%q, container=%q, pod=%q}[5m])`
-	instanceMemoryUsage = `last_over_time(container_memory_working_set_bytes{namespace=%q, container=%q, pod=%q}[5m])`
-	teamCPURequest      = `sum by (container, owner_kind) (kube_pod_container_resource_requests{namespace=%q, container!~%q, resource="cpu",unit="core"} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})`
-	teamCPUUsage        = `sum by (container, owner_kind) (rate(container_cpu_usage_seconds_total{namespace=%q, container!~%q}[5m]) * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"} )`
-	teamMemoryRequest   = `sum by (container, owner_kind) (kube_pod_container_resource_requests{namespace=%q, container!~%q, resource="memory",unit="byte"} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})`
-	teamMemoryUsage     = `sum by (container, owner_kind) (container_memory_working_set_bytes{namespace=%q, container!~%q} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})`
-	teamsCPURequest     = `sum by (namespace, owner_kind) (kube_pod_container_resource_requests{namespace!~%q, container!~%q, resource="cpu",unit="core"} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})`
-	teamsCPUUsage       = `sum by (namespace, owner_kind) (rate(container_cpu_usage_seconds_total{namespace!~%q, container!~%q}[5m]) * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})`
-	teamsMemoryRequest  = `sum by (namespace, owner_kind) (kube_pod_container_resource_requests{namespace!~%q, container!~%q, resource="memory",unit="byte"} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})`
-	teamsMemoryUsage    = `sum by (namespace, owner_kind) (container_memory_working_set_bytes{namespace!~%q, container!~%q} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})`
+	appCPULimit         = `max by (container, namespace) (kube_pod_container_resource_limits{k8s_cluster_name=%q,namespace=%q, container=%q, resource="cpu", unit="core"})`
+	appCPURequest       = `max by (container, namespace) (kube_pod_container_resource_requests{k8s_cluster_name=%q, namespace=%q, container=%q, resource="cpu",unit="core"})`
+	appCPUUsage         = `rate(container_cpu_usage_seconds_total{k8s_cluster_name=%q, namespace=%q, container=%q}[5m])`
+	appCPUUsageAvg      = `avg by (container, namespace) (rate(container_cpu_usage_seconds_total{k8s_cluster_name=%q, namespace=%q, container=%q}[5m]))`
+	appMemoryLimit      = `max by (container, namespace) (kube_pod_container_resource_limits{k8s_cluster_name=%q, namespace=%q, container=%q, resource="memory", unit="byte"})`
+	appMemoryRequest    = `max by (container, namespace) (kube_pod_container_resource_requests{k8s_cluster_name=%q, namespace=%q, container=%q, resource="memory",unit="byte"})`
+	appMemoryUsage      = `last_over_time(container_memory_working_set_bytes{k8s_cluster_name=%q, namespace=%q, container=%q}[5m])`
+	appMemoryUsageAvg   = `avg by (container, namespace) (last_over_time(container_memory_working_set_bytes{k8s_cluster_name=%q, namespace=%q, container=%q}[5m]))`
+	instanceCPUUsage    = `rate(container_cpu_usage_seconds_total{k8s_cluster_name=%q, namespace=%q, container=%q, pod=%q}[5m])`
+	instanceMemoryUsage = `last_over_time(container_memory_working_set_bytes{k8s_cluster_name=%q, namespace=%q, container=%q, pod=%q}[5m])`
+	teamCPURequest      = `sum by (container, owner_kind) (kube_pod_container_resource_requests{k8s_cluster_name=%q, namespace=%q, container!~%q, resource="cpu",unit="core"} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})`
+	teamCPUUsage        = `sum by (container, owner_kind) (rate(container_cpu_usage_seconds_total{k8s_cluster_name=%q, namespace=%q, container!~%q}[5m]) * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"} )`
+	teamMemoryRequest   = `sum by (container, owner_kind) (kube_pod_container_resource_requests{k8s_cluster_name=%q, namespace=%q, container!~%q, resource="memory",unit="byte"} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})`
+	teamMemoryUsage     = `sum by (container, owner_kind) (container_memory_working_set_bytes{k8s_cluster_name=%q, namespace=%q, container!~%q} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})`
+	teamsCPURequest     = `sum by (namespace, owner_kind) (kube_pod_container_resource_requests{k8s_cluster_name=%q, namespace!~%q, container!~%q, resource="cpu",unit="core"} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})`
+	teamsCPUUsage       = `sum by (namespace, owner_kind) (rate(container_cpu_usage_seconds_total{k8s_cluster_name=%q, namespace!~%q, container!~%q}[5m]) * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})`
+	teamsMemoryRequest  = `sum by (namespace, owner_kind) (kube_pod_container_resource_requests{k8s_cluster_name=%q, namespace!~%q, container!~%q, resource="memory",unit="byte"} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})`
+	teamsMemoryUsage    = `sum by (namespace, owner_kind) (container_memory_working_set_bytes{k8s_cluster_name=%q, namespace!~%q, container!~%q} * on(pod,namespace) group_left(owner_kind) kube_pod_owner{owner_kind="ReplicaSet"})`
 
 	cpuRequestRecommendation = `max(
 		avg_over_time(
-			rate(container_cpu_usage_seconds_total{container=%q,namespace=%q}[5m])[1w:5m]
+			rate(container_cpu_usage_seconds_total{k8s_cluster_name=%q,namespace=%q, container=%q}[5m])[1w:5m]
 		)
 		and on ()
 		(hour() >= %d and hour() < %d and day_of_week() > 0 and day_of_week() < 6)
 	)`
 	memoryRequestRecommendation = `max(
 		avg_over_time(
-			quantile_over_time(0.8, container_memory_working_set_bytes{container=%q,namespace=%q}[5m])[1w:5m]
+			quantile_over_time(0.8, container_memory_working_set_bytes{k8s_cluster_name=%q,namespace=%q,container=%q}[5m])[1w:5m]
 		)
 		and on ()
 		(hour() >= %d and hour() < %d and day_of_week() > 0 and day_of_week() < 6)
@@ -54,7 +54,7 @@ const (
 		max_over_time(
 			quantile_over_time(
 				0.95,
-				container_memory_working_set_bytes{container=%q,namespace=%q}[5m]
+				container_memory_working_set_bytes{k8s_cluster_name=%q,namespace=%q, container=%q}[5m]
 			)[1w:5m]
 		)
 		and on ()
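To make the recommendation templates concrete: rendered with hypothetical arguments ("dev", "my-team", "my-app", 6, 18), cpuRequestRecommendation averages the 5-minute CPU rate over a one-week subquery at 5-minute resolution and takes the maximum. The and on () clause gates the result on the evaluation-time clock, so the expression only returns data when the query itself runs between 06:00 and 18:00 UTC on a weekday (PromQL's day_of_week() is 0 for Sunday). The expanded query, shown as a Go raw string:

package main

import "fmt"

// cpuRequestRecommendation after fmt.Sprintf with the hypothetical
// arguments "dev", "my-team", "my-app", 6, 18.
const rendered = `max(
	avg_over_time(
		rate(container_cpu_usage_seconds_total{k8s_cluster_name="dev",namespace="my-team", container="my-app"}[5m])[1w:5m]
	)
	and on ()
	(hour() >= 6 and hour() < 18 and day_of_week() > 0 and day_of_week() < 6)
)`

func main() { fmt.Println(rendered) }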
@@ -204,7 +204,7 @@ func WorkloadResourceRequest(ctx context.Context, env string, teamSlug slug.Slug
 
 	c := fromContext(ctx).client
 
-	v, err := c.Query(ctx, env, fmt.Sprintf(q, teamSlug, workloadName))
+	v, err := c.Query(ctx, env, fmt.Sprintf(q, env, teamSlug, workloadName))
 	if err != nil {
 		return 0, err
 	}
@@ -219,7 +219,7 @@ func WorkloadResourceLimit(ctx context.Context, env string, teamSlug slug.Slug,
 
 	c := fromContext(ctx).client
 
-	v, err := c.Query(ctx, env, fmt.Sprintf(q, teamSlug, workloadName))
+	v, err := c.Query(ctx, env, fmt.Sprintf(q, env, teamSlug, workloadName))
 	if err != nil {
 		return nil, err
 	}
@@ -239,28 +239,28 @@ func WorkloadResourceUsage(ctx context.Context, env string, teamSlug slug.Slug,
 
 	c := fromContext(ctx).client
 
-	v, err := c.Query(ctx, env, fmt.Sprintf(q, teamSlug, workloadName))
+	v, err := c.Query(ctx, env, fmt.Sprintf(q, env, teamSlug, workloadName))
 	if err != nil {
 		return 0, err
 	}
 
 	return ensuredVal(v), nil
 }
 
-func queryPrometheusRange(ctx context.Context, environmentName string, teamSlug slug.Slug, workloadName string, queryTemplate string, start time.Time, end time.Time, step int) ([]*UtilizationSample, error) {
+func queryPrometheusRange(ctx context.Context, env string, teamSlug slug.Slug, workloadName string, queryTemplate string, start time.Time, end time.Time, step int) ([]*UtilizationSample, error) {
 	c := fromContext(ctx).client
 
 	// Format the query
-	query := fmt.Sprintf(queryTemplate, teamSlug, workloadName)
+	query := fmt.Sprintf(queryTemplate, env, teamSlug, workloadName)
 
 	// Perform the query
-	v, warnings, err := c.QueryRange(ctx, environmentName, query, promv1.Range{Start: start, End: end, Step: time.Duration(step) * time.Second})
+	v, warnings, err := c.QueryRange(ctx, env, query, promv1.Range{Start: start, End: end, Step: time.Duration(step) * time.Second})
 	if err != nil {
 		return nil, err
 	}
 	if len(warnings) > 0 {
 		fromContext(ctx).log.WithFields(logrus.Fields{
-			"environment": environmentName,
+			"environment": env,
 			"warnings":    strings.Join(warnings, ", "),
 		}).Warn("prometheus query warnings")
 	}
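The range path mirrors the instant path: the helper formats a template with cluster, namespace, and container, then issues a range query whose step arrives as seconds and is converted to a time.Duration. A self-contained sketch of the same pattern against the prometheus/client_golang API; the address and template come from this diff, while the label values and the 24h/5m window are invented for illustration:

package main

import (
	"context"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/api"
	promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
	"github.com/prometheus/common/model"
)

func main() {
	client, err := api.NewClient(api.Config{Address: "http://mimir-query-frontend:8080"})
	if err != nil {
		panic(err)
	}
	mimir := promv1.NewAPI(client)

	// Same argument order as the updated helpers: cluster, namespace, container.
	const tmpl = `avg by (container, namespace) (rate(container_cpu_usage_seconds_total{k8s_cluster_name=%q, namespace=%q, container=%q}[5m]))`
	query := fmt.Sprintf(tmpl, "dev", "my-team", "my-app")

	end := time.Now()
	r := promv1.Range{Start: end.Add(-24 * time.Hour), End: end, Step: 5 * time.Minute}

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	v, warnings, err := mimir.QueryRange(ctx, query, r)
	if err != nil {
		panic(err)
	}
	if len(warnings) > 0 {
		fmt.Println("warnings:", warnings)
	}

	// Range queries come back as a matrix: one stream of samples per series.
	if m, ok := v.(model.Matrix); ok {
		for _, stream := range m {
			fmt.Println(stream.Metric, len(stream.Values), "samples")
		}
	}
}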