package service import ( "errors" "fmt" json "github.com/json-iterator/go" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" "github.com/spf13/cast" "github.com/tidwall/gjson" "gitlab.wodcloud.com/smart-operation/so-operation-api/src/bean/entity" "gitlab.wodcloud.com/smart-operation/so-operation-api/src/bean/vo/response" "gitlab.wodcloud.com/smart-operation/so-operation-api/src/common/conf" "gitlab.wodcloud.com/smart-operation/so-operation-api/src/pkg/beagle/constant" "gitlab.wodcloud.com/smart-operation/so-operation-api/src/service/k8s" "gitlab.wodcloud.com/smart-operation/so-operation-api/src/util" "go.uber.org/zap" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" "net/http" "net/url" "strings" "sync" ) var prometheusRuleLabel map[string]string var once sync.Once func initPrometheusRuleLabel() { once.Do(func() { str := conf.Options.PrometheusRuleLabel err := json.Unmarshal([]byte(str), &prometheusRuleLabel) if err != nil { prometheusRuleLabel = map[string]string{ // 返回默认标签 "source": "aiops-systemmonitor-api", } } }) } // GetPrometheusRuleLabel 返回 prometheusRuleLabel 单例 func GetPrometheusRuleLabel() map[string]string { initPrometheusRuleLabel() return prometheusRuleLabel } type PrometheusRuleSvc struct { User entity.SystemUserInfo } func (p *PrometheusRuleSvc) Create(data response.AlertRulesItem) (err error) { prometheusRuleObjName := k8s.GetPrometheusRuleId(data.Id) pr := monitoringv1.PrometheusRule{ ObjectMeta: v1.ObjectMeta{ Name: prometheusRuleObjName, Namespace: conf.Options.MonitorMatchNs, Labels: k8s.GetAlertDefLabels(), }, } group := monitoringv1.RuleGroup{} groupInterval := monitoringv1.Duration(fmt.Sprintf("%d%s", data.CheckPeriod, "m")) group.Interval = &groupInterval ruleFor := monitoringv1.Duration(fmt.Sprintf("%d%s", data.Duration, data.DurationUnit)) group.Name = k8s.GetPrometheusRuleGroupName(data.MetricConfigId, string(*group.Interval)) for _, v := range data.AlertRange { item := fmt.Sprintf(`%s%s"%s"`, v.MetricLabel, v.Compare, v.Value) // http_requests_total{method="GET",pod="LeaseGrant"} data.Expr = strings.ReplaceAll(data.Expr, v.VariableName, item) } for k, v := range data.AlertCondition { labels := map[string]string{ "severity": "warning", "risk_level": cast.ToString(v.RiskLevel), "risk_level_name": constant.RiskLeveText(v.RiskLevel), "namespace": conf.Options.MonitorMatchNs, "alert_rules_id": data.Id, "metric_config_id": data.MetricConfigId, } for key, value := range GetPrometheusRuleLabel() { labels[key] = value } rule := monitoringv1.Rule{ // promhttp超过5万次告警-prom指标控制器请求数-较大风险-3 Alert: fmt.Sprintf("%s-%s-%s-%d", data.MetricName, data.MetricConfigName, constant.RiskLeveText(v.RiskLevel), k+1), For: &ruleFor, Labels: labels, Annotations: map[string]string{ "value": "{{ $value }}", "summary": fmt.Sprintf("分组名:%s, 检查周期:%s, 持续时间:%s", group.Name, string(groupInterval), string(ruleFor)), "description": "", }, } var ( condition int expr string ) if v.ThresholdsMin != nil { condition += 1 } if v.ThresholdsMax != nil { condition += 2 } // 为"空"状态下,默认表达式已经有比较判断,故直接使用表达式即可 if data.AlertRuleTypeName == "空" { condition = 0 } switch condition { default: expr = data.Expr case 1: expr = fmt.Sprintf("%s <= %s", cast.ToString(v.ThresholdsMin), data.Expr) case 2: expr = fmt.Sprintf("%s <= %s", data.Expr, cast.ToString(v.ThresholdsMax)) case 3: expr = fmt.Sprintf("%s <= %s <=%s", cast.ToString(v.ThresholdsMin), data.Expr, cast.ToString(v.ThresholdsMax)) } // 校验表达式正确性 err = CheckPrometheusQuerySyntax(expr) if err != nil { return } rule.Expr = intstr.FromString(expr) group.Rules = append(group.Rules, rule) } pr.Spec.Groups = append(pr.Spec.Groups, group) header := map[string]string{"Authorization": "Bearer " + conf.Options.KubernetesToken} prSvc := k8s.PrometheusRule{Header: header} conf.Logger.Info("pr", zap.Any("pr", pr)) err = prSvc.Create(&pr) return } func (p *PrometheusRuleSvc) Get(data response.AlertRulesItem) (obj *monitoringv1.PrometheusRule, exist bool, err error) { prometheusRuleObjName := k8s.GetPrometheusRuleId(data.Id) pr := monitoringv1.PrometheusRule{ ObjectMeta: v1.ObjectMeta{ Name: prometheusRuleObjName, Namespace: conf.Options.MonitorMatchNs, Labels: k8s.GetAlertDefLabels(), }, } header := map[string]string{"Authorization": "Bearer " + conf.Options.KubernetesToken} prSvc := k8s.PrometheusRule{Header: header} conf.Logger.Info("pr", zap.Any("pr", pr)) obj, err = prSvc.Get(pr.Namespace, pr.Name) if obj != nil && err == nil { exist = true } return } // CheckPrometheusQuerySyntax 校验普罗米修斯语法正确性 func CheckPrometheusQuerySyntax(expr string) error { params := url.Values{} params.Add("query", expr) query := params.Encode() webUrl := fmt.Sprintf("%s%s%s", conf.Options.PrometheusHost, "/api/v1/query?", query) resp, _ := util.Request(webUrl, http.MethodGet, nil, nil) if resp.StatusCode() != http.StatusOK { return errors.New(fmt.Sprintf("%s, err: %s", "普罗米修斯语法PromQL错误", gjson.GetBytes(resp.Body(), "error").String())) } return nil } func (p *PrometheusRuleSvc) Delete(data response.AlertRulesItem) (err error) { prometheusRuleObjName := k8s.GetPrometheusRuleId(data.Id) pr := monitoringv1.PrometheusRule{ ObjectMeta: v1.ObjectMeta{ Name: prometheusRuleObjName, Namespace: conf.Options.MonitorMatchNs, Labels: k8s.GetAlertDefLabels(), }, } header := map[string]string{"Authorization": "Bearer " + conf.Options.KubernetesToken} prSvc := k8s.PrometheusRule{Header: header} conf.Logger.Info("pr", zap.Any("pr", pr)) err = prSvc.Delete(pr.Namespace, pr.Name) return }