diff --git a/src/bean/entity/alert_rules.go b/src/bean/entity/alert_rules.go index 1033f80882f37ddfe161f5296a66b0ed9698332f..995c46e1386e1d8ba48ac57e18a4c45f1bfacbf4 100644 --- a/src/bean/entity/alert_rules.go +++ b/src/bean/entity/alert_rules.go @@ -36,8 +36,8 @@ type RulesAlertRange struct { } type AlertCondition struct { + ThresholdsMin int `json:"thresholds_min" form:"thresholds_min" binding:"required,lt=ThresholdsMax"` ThresholdsMax int `json:"thresholds_max" form:"thresholds_max" binding:"required"` - ThresholdsMin int `json:"thresholds_min" form:"thresholds_min" binding:"required"` RiskLevel int `json:"risk_level" form:"risk_level" binding:"required,oneof=1 2 3 4"` } diff --git a/src/controller/alert_rules.go b/src/controller/alert_rules.go index 3388c30acd1edcae5c9defa0d2fe68176140b8c6..9bea8b00bc34ff9311f6c063dfe59baa1475f1c9 100644 --- a/src/controller/alert_rules.go +++ b/src/controller/alert_rules.go @@ -6,6 +6,7 @@ import ( "gitlab.wodcloud.com/smart-operation/so-operation-api/src/pkg/beagle/resp" "gitlab.wodcloud.com/smart-operation/so-operation-api/src/router/middleware/header" "gitlab.wodcloud.com/smart-operation/so-operation-api/src/service" + "sort" ) // AddAlertRules 新增任务 @@ -16,6 +17,25 @@ func AddAlertRules(c *gin.Context) { return } + /* + [ + { + "thresholds_min": 12, + "thresholds_max": 66, + "risk_level": 1 + }, + { + "thresholds_min": 66, + "thresholds_max": 80, + "risk_level": 2 + } + ] + */ + // TODO 阈值范围判断,阈值上下限是否存在交叉 + sort.SliceStable(req.AlertCondition, func(i, j int) bool { + return req.AlertCondition[i].ThresholdsMin < req.AlertCondition[j].ThresholdsMin + }) + svc := service.AlertRulesSvc{User: header.GetUser(c)} err := svc.Add(req) if err != nil { diff --git a/src/pkg/beagle/constant/constant.go b/src/pkg/beagle/constant/constant.go index 1e530083d5ff463a0001977aa83b921350538279..3d3e196b345c85eb5ba7151f7d2ad358952f0c25 100644 --- a/src/pkg/beagle/constant/constant.go +++ b/src/pkg/beagle/constant/constant.go @@ -124,11 +124,24 @@ var OpTypeIntMap = map[OpType]int{ } // RiskLevel 风险等级 -type RiskLevel int - const ( - RiskLevelLow RiskLevel = iota + 1 // 1:低风险 - RiskLevelModerate // 1:一般风险 - RiskLevelHigh // 3:较大风险 - RiskLevelCritical // 4:重大风险 + RiskLevelLow = iota + 1 // 1:低风险 + RiskLevelModerate // 1:一般风险 + RiskLevelHigh // 3:较大风险 + RiskLevelCritical // 4:重大风险 ) + +func RiskLeveText(code int) string { + switch code { + case RiskLevelLow: + return "低风险" + case RiskLevelModerate: + return "一般风险" + case RiskLevelHigh: + return "较大风险" + case RiskLevelCritical: + return "重大风险" + default: + return "" + } +} diff --git a/src/service/alert_rules.go b/src/service/alert_rules.go index 36495bde79b28ee8866b7e17d8298e40fca92d3d..da64e4edfe337372bda1fd8474a494f2f6a54735 100644 --- a/src/service/alert_rules.go +++ b/src/service/alert_rules.go @@ -39,10 +39,20 @@ func (a *AlertRulesSvc) Add(req request.AddAlertRules) error { data.NotifyRecipients = util.ConvertToString(req.NotifyRecipients) switch req.DetectionType { case 1: + // TODO 暂时不能做事务,需要先插入数据,再进行查询 _, err = db.Insert(&data) if err != nil { return err } + + var item response.AlertRulesItem + item, err = a.GetDataById(request.DetailAlertRules{Id: data.Id}) + + // TODO 1.插入数据到 prometheus.yml --> rule_files + prSvc := PrometheusRuleSvc{User: a.User} + prSvc.Create(item) + _ = item + case 2: // 自定义 _, err = db.Transaction(func(session *xorm.Session) (interface{}, error) { // 添加自定义分类 diff --git a/src/service/k8s/prometheusrule.go b/src/service/k8s/prometheusrule.go index a9b9d05321a8e7f97ed76f950997722c8b1c4b22..a4d22d004872c79a08c5726d6e8b6fb15764f8a8 100644 --- a/src/service/k8s/prometheusrule.go +++ b/src/service/k8s/prometheusrule.go @@ -14,10 +14,29 @@ var ( PrometheusRuleGroup = "monitoring.beagle.io" // kubectl api-resources | grep -i prome PrometheusRuleVersion = "v1" PrometheusRuleKind = "PrometheusRule" + Namespace = "beagle-monitoring" PrometheusRuleApiVersion = PrometheusRuleGroup + "/" + PrometheusRuleVersion PrometheusRuleName = strings.ToLower(PrometheusRuleKind) + "s." + PrometheusRuleGroup + PrometheusRuleNamePrefix = "beagle-prometheus-so-operation-api-rules" // beagle-monitoring beagle-prometheus-prometheus-operator 43d ) +var AlertDefLabels = map[string]string{ + "app": "prometheus", + "app.bd-apaas.com/cluster-component": "monitoring", + "prometheus-operator": "monitoring", + "release": "beagle-prometheus", +} + +// GetPrometheusRuleName 获取规则CRD名称 +func GetPrometheusRuleName(alertRulesId string) string { + return fmt.Sprintf("%s-%s", PrometheusRuleNamePrefix, alertRulesId) +} + +// GetPrometheusRuleGroupName 获取规则组名称 +func GetPrometheusRuleGroupName(alertName string, groupInterval string) string { + return fmt.Sprintf("%s-group-%s", alertName, groupInterval) +} + type PrometheusRule struct { Header map[string]string } diff --git a/src/service/k8s/prometheusrule_test.go b/src/service/k8s/prometheusrule_test.go index aa24e6541c20a490dfec885fccbd99e2f6a6b851..fd06db24156a39743167fb98097c760bda2eafe9 100644 --- a/src/service/k8s/prometheusrule_test.go +++ b/src/service/k8s/prometheusrule_test.go @@ -2,7 +2,7 @@ package k8s import ( "fmt" - v1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" "github.com/spf13/cast" "gitlab.wodcloud.com/smart-operation/so-operation-api/src/common/conf" "gitlab.wodcloud.com/smart-operation/so-operation-api/src/pkg/beagle/constant" @@ -13,14 +13,7 @@ import ( ) var svc PrometheusRule -var obj v1.PrometheusRule - -var AlertDefLabels = map[string]string{ - "app": "prometheus", - "app.bd-apaas.com/cluster-component": "monitoring", - "prometheus-operator": "monitoring", - "release": "beagle-prometheus", -} +var pr monitoringv1.PrometheusRule func init() { conf.Options = &conf.Config{} @@ -29,43 +22,66 @@ func init() { header := make(map[string]string) header["Authorization"] = "Bearer " + conf.Options.KubernetesToken svc = PrometheusRule{Header: header} - obj = v1.PrometheusRule{} - obj.Name = "beagle-prometheus-operation-api-rules-101" - obj.Namespace = "beagle-monitoring" - obj.Labels = AlertDefLabels + pr = monitoringv1.PrometheusRule{} + + alertRulesId := "83343ef6-4a99-47bd-abb4-bcff52feb2ec" // 预警规则id + prometheusRuleName := GetPrometheusRuleName(alertRulesId) + pr.Name = prometheusRuleName + pr.Namespace = "beagle-monitoring" + pr.Labels = AlertDefLabels } func TestRule(t *testing.T) { - group := v1.RuleGroup{} - group.Name = "demo-group-1m" - rule := v1.Rule{} - rule.Alert = "kubernetes启动状态" - rule.Annotations = make(map[string]string) - rule.Annotations["value"] = "{{ $value }}" - rule.Annotations["message"] = "启动了" - rule.Expr = intstr.FromString(`up{endpoint="https", instance="192.168.1.244:6443", job="apiserver", namespace="default", service="kubernetes"} == 1`) - ruleFor := v1.Duration("1m") - rule.For = &ruleFor - rule.Labels = make(map[string]string) - rule.Labels["severity"] = "warning" - rule.Labels["severity_name"] = "一般风险" - rule.Labels["risk_level"] = cast.ToString(constant.RiskLevelLow) - rule.Labels["source"] = "so-operation-api" - rule.Labels["alert_id"] = "101" - group.Rules = append(group.Rules, rule) - obj.Spec.Groups = append(obj.Spec.Groups, group) - err := svc.Create(&obj) + group := monitoringv1.RuleGroup{} + groupInterval := monitoringv1.Duration("5m") + group.Interval = &groupInterval + rule1 := monitoringv1.Rule{} + rule1.Alert = "kubernetes启动状态" + group.Name = GetPrometheusRuleGroupName(rule1.Alert, string(*group.Interval)) + rule1.Annotations = make(map[string]string) + rule1.Annotations["value"] = "{{ $value }}" + rule1.Annotations["message"] = "启动了" + rule1.Expr = intstr.FromString(`up{endpoint="https", instance="192.168.1.244:6443", job="apiserver", namespace="default", service="kubernetes"} == 1`) + ruleFor := monitoringv1.Duration("1m") + rule1.For = &ruleFor + rule1.Labels = map[string]string{ + "severity": "warning", + "risk_level": cast.ToString(constant.RiskLevelLow), + "risk_level_name": constant.RiskLeveText(constant.RiskLevelLow), + "source": "so-operation-api", + "alert_rules_id": "83343ef6-4a99-47bd-abb4-bcff52feb2ec", + } + + rule2 := monitoringv1.Rule{} + rule2.Alert = "kubernetes启动状态" + rule2.Annotations = make(map[string]string) + rule2.Annotations["value"] = "{{ $value }}" + rule2.Annotations["message"] = "启动了" + rule2.Expr = intstr.FromString(`up{endpoint="https", instance="192.168.1.244:6443", job="apiserver", namespace="default", service="kubernetes"} == 0`) + ruleFor2 := monitoringv1.Duration("1m") + rule2.For = &ruleFor2 + rule2.Labels = map[string]string{ + "severity": "warning", + "risk_level": cast.ToString(constant.RiskLevelModerate), + "risk_level_name": constant.RiskLeveText(constant.RiskLevelModerate), + "source": "so-operation-api", + "alert_rules_id": "83343ef6-4a99-47bd-abb4-bcff52feb2ec", + } + + group.Rules = append(group.Rules, rule1, rule2) + pr.Spec.Groups = append(pr.Spec.Groups, group) + err := svc.Create(&pr) if err != nil { - fmt.Println("添加失败" + err.Error()) + log.Println("添加失败" + err.Error()) } else { - fmt.Println("添加成功") + log.Println("添加成功") } } func TestGet(t *testing.T) { // 获取对象 - ruleObj, err := svc.Get(obj.Namespace, obj.Name) + ruleObj, err := svc.Get(pr.Namespace, pr.Name) if err != nil { fmt.Println("获取失败" + err.Error()) } else { @@ -75,7 +91,7 @@ func TestGet(t *testing.T) { } func TestUpdate(t *testing.T) { - ruleObj, err := svc.Get(obj.Namespace, obj.Name) + ruleObj, err := svc.Get(pr.Namespace, pr.Name) if err != nil { fmt.Println("获取失败" + err.Error()) } else { @@ -85,7 +101,7 @@ func TestUpdate(t *testing.T) { ruleObj.Name = "galileo-101" ruleObj.Spec.Groups[0].Rules[0].Alert = "请求次数大于10" ruleObj.Spec.Groups[0].Name = "2分钟组" - ruleFor := v1.Duration("2m") + ruleFor := monitoringv1.Duration("2m") ruleObj.Spec.Groups[0].Rules[0].For = &ruleFor ruleObj.Spec.Groups[0].Rules[0].Expr = intstr.FromString(`up{endpoint="https", instance="192.168.1.244:6443", job="apiserver", namespace="default", service="kubernetes"} == 0`) err = svc.Update(ruleObj) @@ -99,7 +115,7 @@ func TestUpdate(t *testing.T) { func TestDelete(t *testing.T) { // 删除 time.Sleep(time.Second * 10) - err := svc.Delete(obj.Namespace, obj.Name) + err := svc.Delete(pr.Namespace, pr.Name) if err != nil { fmt.Println("删除失败" + err.Error()) } else { diff --git a/src/service/prometheusrule.go b/src/service/prometheusrule.go new file mode 100644 index 0000000000000000000000000000000000000000..cac5e77ba7ce3059dcb987e9608c205ae848ef6a --- /dev/null +++ b/src/service/prometheusrule.go @@ -0,0 +1,64 @@ +package service + +import ( + "fmt" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/spf13/cast" + "gitlab.wodcloud.com/smart-operation/so-operation-api/src/bean/entity" + "gitlab.wodcloud.com/smart-operation/so-operation-api/src/bean/vo/response" + "gitlab.wodcloud.com/smart-operation/so-operation-api/src/common/conf" + "gitlab.wodcloud.com/smart-operation/so-operation-api/src/pkg/beagle/constant" + "gitlab.wodcloud.com/smart-operation/so-operation-api/src/service/k8s" + "k8s.io/apimachinery/pkg/util/intstr" + "strings" +) + +type PrometheusRuleSvc struct { + User entity.SystemUserInfo +} + +func (p *PrometheusRuleSvc) Create(data response.AlertRulesItem) { + header := make(map[string]string) + header["Authorization"] = "Bearer " + conf.Options.KubernetesToken + svc := k8s.PrometheusRule{Header: header} + pr := monitoringv1.PrometheusRule{} + prometheusRuleName := k8s.GetPrometheusRuleName(data.Id) + pr.Name = prometheusRuleName + pr.Namespace = k8s.Namespace + pr.Labels = k8s.AlertDefLabels + + group := monitoringv1.RuleGroup{} + groupInterval := monitoringv1.Duration(fmt.Sprintf("%d%s", data.CheckPeriod, "m")) + group.Interval = &groupInterval + ruleFor := monitoringv1.Duration(fmt.Sprintf("%d%s", data.Duration, data.DurationUnit)) + + // [{"variable_name":"$pod$","metric_name":"http_requests_total","metric_label":"pod","chinese_name":"demoString","is_required":true,"is_linked":true,"value":"LeaseGrant","compare":"="}] + // [{"thresholds_max":100,"thresholds_min":0,"risk_level":4}] + group.Name = k8s.GetPrometheusRuleGroupName(data.MetricConfigId, string(*group.Interval)) + for _, v := range data.AlertRange { + item := fmt.Sprintf("%s%s%s", v.MetricLabel, v.Compare, v.Value) // pod=LeaseGrant + data.Expr = strings.ReplaceAll(data.Expr, v.VariableName, item) + } + for _, v := range data.AlertCondition { + rule := monitoringv1.Rule{ + Alert: data.MetricConfigName, + For: &ruleFor, + Labels: map[string]string{ + "severity": "warning", + "risk_level": cast.ToString(v.RiskLevel), + "risk_level_name": constant.RiskLeveText(v.RiskLevel), + "source": "so-operation-api", + "alert_rules_id": data.MetricConfigId, + }, + Annotations: map[string]string{ + "value": "{{ $value }}", + "summary": "概述", + "description": "描述", + }, + } + expr := fmt.Sprintf("%d <= %s <=%d", v.ThresholdsMin, data.Expr, v.ThresholdsMax) + rule.Expr = intstr.FromString(expr) + group.Rules = append(group.Rules, rule) + } + _ = svc +}