// Copyright 2019 The prometheus-operator Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
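
// Package e2e contains the end-to-end tests for the monitoring stack deployed
// in the "monitoring" namespace. The tests expect a running cluster reachable
// through the kubeconfig referenced by the KUBECONFIG environment variable.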
package e2e

import (
	"context"
	"log"
	"os"
	"strings"
	"testing"
	"time"

	"github.com/Jeffail/gabs"
	"github.com/pkg/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)
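
// promClient is the Prometheus API helper shared by all tests in this
// package. It is initialized in testMain once the Kubernetes client has been
// built.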
var promClient *prometheusClient

func TestMain(m *testing.M) {
	os.Exit(testMain(m))
}

// testMain works around the fact that one cannot use `defer` in TestMain
// because `os.Exit` does not honor `defer` statements. For more details see:
// http://blog.englund.nu/golang,/testing/2017/03/12/using-defer-in-testmain.html
func testMain(m *testing.M) int {
	kubeConfigPath, ok := os.LookupEnv("KUBECONFIG")
	if !ok {
		log.Fatal("failed to retrieve KUBECONFIG env var")
	}

	config, err := clientcmd.BuildConfigFromFlags("", kubeConfigPath)
	if err != nil {
		log.Fatal(err)
	}

	kubeClient, err := kubernetes.NewForConfig(config)
	if err != nil {
		log.Fatal(errors.Wrap(err, "creating kubeClient failed"))
	}

	promClient = newPrometheusClient(kubeClient)

	return m.Run()
}
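
// TestQueryPrometheus verifies that Prometheus scrapes the expected targets:
// for every job listed below, the `up` query must return at least the
// expected number of healthy series.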
func TestQueryPrometheus(t *testing.T) {
	queries := []struct {
		query   string
		expectN int
	}{
		{
			query:   `up{job="node-exporter"} == 1`,
			expectN: 1,
		}, {
			// query:   `up{job="kubelet"} == 1`,
			// expectN: 1,
			// }, {
			query:   `up{job="apiserver"} == 1`,
			expectN: 1,
		}, {
			query:   `up{job="kube-state-metrics"} == 1`,
			expectN: 1,
		}, {
			query:   `up{job="prometheus-k8s"} == 1`,
			expectN: 1,
		}, {
			query:   `up{job="prometheus-operator"} == 1`,
			expectN: 1,
		}, {
			query:   `up{job="alertmanager-main"} == 1`,
			expectN: 2,
		},
	}

	// Wait for the Prometheus pod to answer queries at all before verifying
	// the results.
	err := wait.Poll(5*time.Second, 1*time.Minute, func() (bool, error) {
		_, err := promClient.query("up")
		return err == nil, nil
	})
	if err != nil {
		t.Fatal(errors.Wrap(err, "wait for prometheus-k8s"))
	}

	err = wait.Poll(5*time.Second, 1*time.Minute, func() (bool, error) {
		defer t.Log("---------------------------\n")
		for _, q := range queries {
			n, err := promClient.query(q.query)
			if err != nil {
				return false, err
			}
			if n < q.expectN {
				// Don't return an error as targets may only become visible after a while.
				t.Logf("expected at least %d results for %q but got %d", q.expectN, q.query, n)
				return false, nil
			}
			t.Logf("query %q succeeded", q.query)
		}
		return true, nil
	})
	if err != nil {
		t.Fatal(err)
	}
}
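
// TestDroppedMetrics ensures that no metric marked as deprecated in its help
// text is still queryable from Prometheus.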
func TestDroppedMetrics(t *testing.T) {
	// Fetch the metadata of all metrics across all jobs.
	md, err := promClient.metadata("{job=~\".+\"}")
	if err != nil {
		t.Fatal(err)
	}
	for _, k := range md {
		// Check whether the metric's help text marks it as deprecated.
		if strings.Contains(k.Help, "Deprecated") {
			// Query Prometheus for the deprecated metric; it must not return any series.
			n, err := promClient.query(k.Metric)
			if err != nil {
				t.Fatal(err)
			}
			if n > 0 {
				t.Fatalf("deprecated metric %s (help: %q) is still exposed", k.Metric, k.Help)
			}
		}
	}
}
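
// TestTargetsScheme checks that every active target is scraped over HTTPS,
// except for the jobs explicitly excluded below which are still scraped over
// plain HTTP.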
func TestTargetsScheme(t *testing.T) {
	// Fetch all active scrape targets.
	tgs, err := promClient.targets()
	if err != nil {
		t.Fatal(err)
	}

	// Jobs excluded from the HTTPS check.
	// TODO(paulfantom): This should be reduced as we secure connections for those components.
	exclude := map[string]bool{
		"alertmanager-main": true,
		"prometheus-k8s":    true,
		"kube-dns":          true,
		"grafana":           true,
	}

	for _, k := range tgs.Active {
		job := k.Labels["job"]
		if k.DiscoveredLabels["__scheme__"] == "http" && !exclude[string(job)] {
			t.Fatalf("target exposing metrics over HTTP instead of HTTPS: %+v", k)
		}
	}
}

// TestFailedRuleEvaluations detects recording and alerting rules that may
// trigger "many-to-many" evaluation errors when multiple kube-state-metrics
// instances are running.
func TestFailedRuleEvaluations(t *testing.T) {
	// Scale kube-state-metrics to 2 replicas.
	kClient := promClient.kubeClient
	scale, err := kClient.AppsV1().Deployments("monitoring").GetScale(context.Background(), "kube-state-metrics", metav1.GetOptions{})
	if err != nil {
		t.Fatal(err)
	}
	scale.Spec.Replicas = 2
	scale, err = kClient.AppsV1().Deployments("monitoring").UpdateScale(context.Background(), "kube-state-metrics", scale, metav1.UpdateOptions{})
	if err != nil {
		t.Fatal(err)
	}

	// Roll back to 1 replica at the end of the test.
	defer func() {
		scale, err := kClient.AppsV1().Deployments("monitoring").GetScale(context.Background(), "kube-state-metrics", metav1.GetOptions{})
		if err != nil {
			t.Fatal(err)
		}
		scale.Spec.Replicas = 1
		_, err = kClient.AppsV1().Deployments("monitoring").UpdateScale(context.Background(), "kube-state-metrics", scale, metav1.UpdateOptions{})
		if err != nil {
			t.Fatal(err)
		}
	}()

	// Wait for the 2 replicas of kube-state-metrics to be successfully scraped.
	err = wait.Poll(5*time.Second, 1*time.Minute, func() (bool, error) {
		n, err := promClient.query(`up{job="kube-state-metrics"} == 1`)
		if err != nil {
			return false, err
		}
		if n != 2 {
			t.Logf("expecting 2 kube-state-metrics targets, got %d", n)
			return false, nil
		}
		return true, nil
	})
	if err != nil {
		t.Fatal(err)
	}

	// Wait for all rule groups to be evaluated at least once without error.
	now := time.Now()
	err = wait.Poll(30*time.Second, 5*time.Minute, func() (bool, error) {
		rsp, err := promClient.apiRequest("/api/v1/rules", "type", "")
		if err != nil {
			return false, err
		}

		res, err := gabs.ParseJSON(rsp.Data)
		if err != nil {
			return false, err
		}

		groups, err := res.Path("groups").Children()
		if err != nil {
			return false, err
		}
		if len(groups) == 0 {
			return false, errors.New("got 0 rule groups")
		}

		for _, group := range groups {
			groupName := group.Path("name").Data().(string)

			// Ensure the group has been evaluated since the test started polling.
			lastEvalString := group.Path("lastEvaluation").Data().(string)
			lastEval, err := time.Parse(time.RFC3339Nano, lastEvalString)
			if err != nil {
				return false, err
			}
			if lastEval.Before(now) {
				t.Logf("%s not yet evaluated", groupName)
				return false, nil
			}

			rules, err := group.Path("rules").Children()
			if err != nil {
				return false, err
			}
			if len(rules) == 0 {
				return false, errors.Errorf("got 0 rules in group %s", groupName)
			}
			for _, rule := range rules {
				health := rule.Path("health").Data().(string)
				if health != "ok" {
					return false, errors.Errorf("error evaluating rule: %v", rule)
				}
			}
		}
		return true, nil
	})
	if err != nil {
		t.Fatal(err)
	}
}