Commit f6a7c526 authored by 李科

ci: 同步prometheus告警规则

parent b4a22d18
......@@ -36,8 +36,8 @@ type RulesAlertRange struct {
}
type AlertCondition struct {
ThresholdsMin int `json:"thresholds_min" form:"thresholds_min" binding:"required,lt=ThresholdsMax"`
ThresholdsMax int `json:"thresholds_max" form:"thresholds_max" binding:"required"`
ThresholdsMin int `json:"thresholds_min" form:"thresholds_min" binding:"required"`
RiskLevel int `json:"risk_level" form:"risk_level" binding:"required,oneof=1 2 3 4"`
}
......
......@@ -6,6 +6,7 @@ import (
"gitlab.wodcloud.com/smart-operation/so-operation-api/src/pkg/beagle/resp"
"gitlab.wodcloud.com/smart-operation/so-operation-api/src/router/middleware/header"
"gitlab.wodcloud.com/smart-operation/so-operation-api/src/service"
"sort"
)
// AddAlertRules 新增任务
......@@ -16,6 +17,25 @@ func AddAlertRules(c *gin.Context) {
return
}
/*
[
{
"thresholds_min": 12,
"thresholds_max": 66,
"risk_level": 1
},
{
"thresholds_min": 66,
"thresholds_max": 80,
"risk_level": 2
}
]
*/
// TODO 阈值范围判断,阈值上下限是否存在交叉
sort.SliceStable(req.AlertCondition, func(i, j int) bool {
return req.AlertCondition[i].ThresholdsMin < req.AlertCondition[j].ThresholdsMin
})
svc := service.AlertRulesSvc{User: header.GetUser(c)}
err := svc.Add(req)
if err != nil {
......
......@@ -124,11 +124,24 @@ var OpTypeIntMap = map[OpType]int{
}
// RiskLevel is the risk level attached to an alert.
// The level codes start at 1 (iota + 1).
type RiskLevel int
const (
RiskLevelLow RiskLevel = iota + 1 // 1: low risk
RiskLevelModerate // 2: moderate risk
RiskLevelHigh // 3: high risk
RiskLevelCritical // 4: critical risk
RiskLevelLow = iota + 1 // 1: low risk
RiskLevelModerate // 2: moderate risk
RiskLevelHigh // 3: high risk
RiskLevelCritical // 4: critical risk
)
// RiskLeveText maps a risk level code to its Chinese display name.
// It returns an empty string when the code is not one of the known levels.
func RiskLeveText(code int) string {
	var text string
	switch {
	case code == RiskLevelLow:
		text = "低风险"
	case code == RiskLevelModerate:
		text = "一般风险"
	case code == RiskLevelHigh:
		text = "较大风险"
	case code == RiskLevelCritical:
		text = "重大风险"
	}
	return text
}
......@@ -39,10 +39,20 @@ func (a *AlertRulesSvc) Add(req request.AddAlertRules) error {
data.NotifyRecipients = util.ConvertToString(req.NotifyRecipients)
switch req.DetectionType {
case 1:
// TODO 暂时不能做事务,需要先插入数据,再进行查询
_, err = db.Insert(&data)
if err != nil {
return err
}
var item response.AlertRulesItem
item, err = a.GetDataById(request.DetailAlertRules{Id: data.Id})
// TODO 1.插入数据到 prometheus.yml --> rule_files
prSvc := PrometheusRuleSvc{User: a.User}
prSvc.Create(item)
_ = item
case 2: // 自定义
_, err = db.Transaction(func(session *xorm.Session) (interface{}, error) {
// 添加自定义分类
......
......@@ -14,10 +14,29 @@ var (
PrometheusRuleGroup = "monitoring.beagle.io" // kubectl api-resources | grep -i prome
PrometheusRuleVersion = "v1"
PrometheusRuleKind = "PrometheusRule"
Namespace = "beagle-monitoring"
PrometheusRuleApiVersion = PrometheusRuleGroup + "/" + PrometheusRuleVersion
PrometheusRuleName = strings.ToLower(PrometheusRuleKind) + "s." + PrometheusRuleGroup
PrometheusRuleNamePrefix = "beagle-prometheus-so-operation-api-rules" // beagle-monitoring beagle-prometheus-prometheus-operator 43d
)
// AlertDefLabels is the default label set assigned to the Labels field of
// the PrometheusRule objects built in this project.
// NOTE(review): presumably these labels are what the Prometheus operator's
// rule selector matches on — confirm against the cluster configuration.
var AlertDefLabels = map[string]string{
"app": "prometheus",
"app.bd-apaas.com/cluster-component": "monitoring",
"prometheus-operator": "monitoring",
"release": "beagle-prometheus",
}
// GetPrometheusRuleName returns the name of the PrometheusRule resource for
// an alert rule: PrometheusRuleNamePrefix and alertRulesId joined by "-".
func GetPrometheusRuleName(alertRulesId string) string {
	name := fmt.Sprint(PrometheusRuleNamePrefix, "-", alertRulesId)
	return name
}
// GetPrometheusRuleGroupName returns the rule group name in the form
// "<alertName>-group-<groupInterval>".
func GetPrometheusRuleGroupName(alertName string, groupInterval string) string {
	// All operands are strings, so fmt.Sprint concatenates them without
	// inserting any separators.
	groupName := fmt.Sprint(alertName, "-group-", groupInterval)
	return groupName
}
// PrometheusRule is the handle used to work with PrometheusRule resources.
// Header holds header key/value pairs supplied by the caller; callers in this
// project set "Authorization" to a bearer token.
// NOTE(review): presumably Header is sent with each Kubernetes API request —
// confirm against the method implementations.
type PrometheusRule struct {
Header map[string]string
}
......
......@@ -2,7 +2,7 @@ package k8s
import (
"fmt"
v1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"github.com/spf13/cast"
"gitlab.wodcloud.com/smart-operation/so-operation-api/src/common/conf"
"gitlab.wodcloud.com/smart-operation/so-operation-api/src/pkg/beagle/constant"
......@@ -13,14 +13,7 @@ import (
)
var svc PrometheusRule
var obj v1.PrometheusRule
var AlertDefLabels = map[string]string{
"app": "prometheus",
"app.bd-apaas.com/cluster-component": "monitoring",
"prometheus-operator": "monitoring",
"release": "beagle-prometheus",
}
var pr monitoringv1.PrometheusRule
func init() {
conf.Options = &conf.Config{}
......@@ -29,43 +22,66 @@ func init() {
header := make(map[string]string)
header["Authorization"] = "Bearer " + conf.Options.KubernetesToken
svc = PrometheusRule{Header: header}
obj = v1.PrometheusRule{}
obj.Name = "beagle-prometheus-operation-api-rules-101"
obj.Namespace = "beagle-monitoring"
obj.Labels = AlertDefLabels
pr = monitoringv1.PrometheusRule{}
alertRulesId := "83343ef6-4a99-47bd-abb4-bcff52feb2ec" // 预警规则id
prometheusRuleName := GetPrometheusRuleName(alertRulesId)
pr.Name = prometheusRuleName
pr.Namespace = "beagle-monitoring"
pr.Labels = AlertDefLabels
}
func TestRule(t *testing.T) {
group := v1.RuleGroup{}
group.Name = "demo-group-1m"
rule := v1.Rule{}
rule.Alert = "kubernetes启动状态"
rule.Annotations = make(map[string]string)
rule.Annotations["value"] = "{{ $value }}"
rule.Annotations["message"] = "启动了"
rule.Expr = intstr.FromString(`up{endpoint="https", instance="192.168.1.244:6443", job="apiserver", namespace="default", service="kubernetes"} == 1`)
ruleFor := v1.Duration("1m")
rule.For = &ruleFor
rule.Labels = make(map[string]string)
rule.Labels["severity"] = "warning"
rule.Labels["severity_name"] = "一般风险"
rule.Labels["risk_level"] = cast.ToString(constant.RiskLevelLow)
rule.Labels["source"] = "so-operation-api"
rule.Labels["alert_id"] = "101"
group.Rules = append(group.Rules, rule)
obj.Spec.Groups = append(obj.Spec.Groups, group)
err := svc.Create(&obj)
group := monitoringv1.RuleGroup{}
groupInterval := monitoringv1.Duration("5m")
group.Interval = &groupInterval
rule1 := monitoringv1.Rule{}
rule1.Alert = "kubernetes启动状态"
group.Name = GetPrometheusRuleGroupName(rule1.Alert, string(*group.Interval))
rule1.Annotations = make(map[string]string)
rule1.Annotations["value"] = "{{ $value }}"
rule1.Annotations["message"] = "启动了"
rule1.Expr = intstr.FromString(`up{endpoint="https", instance="192.168.1.244:6443", job="apiserver", namespace="default", service="kubernetes"} == 1`)
ruleFor := monitoringv1.Duration("1m")
rule1.For = &ruleFor
rule1.Labels = map[string]string{
"severity": "warning",
"risk_level": cast.ToString(constant.RiskLevelLow),
"risk_level_name": constant.RiskLeveText(constant.RiskLevelLow),
"source": "so-operation-api",
"alert_rules_id": "83343ef6-4a99-47bd-abb4-bcff52feb2ec",
}
rule2 := monitoringv1.Rule{}
rule2.Alert = "kubernetes启动状态"
rule2.Annotations = make(map[string]string)
rule2.Annotations["value"] = "{{ $value }}"
rule2.Annotations["message"] = "启动了"
rule2.Expr = intstr.FromString(`up{endpoint="https", instance="192.168.1.244:6443", job="apiserver", namespace="default", service="kubernetes"} == 0`)
ruleFor2 := monitoringv1.Duration("1m")
rule2.For = &ruleFor2
rule2.Labels = map[string]string{
"severity": "warning",
"risk_level": cast.ToString(constant.RiskLevelModerate),
"risk_level_name": constant.RiskLeveText(constant.RiskLevelModerate),
"source": "so-operation-api",
"alert_rules_id": "83343ef6-4a99-47bd-abb4-bcff52feb2ec",
}
group.Rules = append(group.Rules, rule1, rule2)
pr.Spec.Groups = append(pr.Spec.Groups, group)
err := svc.Create(&pr)
if err != nil {
fmt.Println("添加失败" + err.Error())
log.Println("添加失败" + err.Error())
} else {
fmt.Println("添加成功")
log.Println("添加成功")
}
}
func TestGet(t *testing.T) {
// 获取对象
ruleObj, err := svc.Get(obj.Namespace, obj.Name)
ruleObj, err := svc.Get(pr.Namespace, pr.Name)
if err != nil {
fmt.Println("获取失败" + err.Error())
} else {
......@@ -75,7 +91,7 @@ func TestGet(t *testing.T) {
}
func TestUpdate(t *testing.T) {
ruleObj, err := svc.Get(obj.Namespace, obj.Name)
ruleObj, err := svc.Get(pr.Namespace, pr.Name)
if err != nil {
fmt.Println("获取失败" + err.Error())
} else {
......@@ -85,7 +101,7 @@ func TestUpdate(t *testing.T) {
ruleObj.Name = "galileo-101"
ruleObj.Spec.Groups[0].Rules[0].Alert = "请求次数大于10"
ruleObj.Spec.Groups[0].Name = "2分钟组"
ruleFor := v1.Duration("2m")
ruleFor := monitoringv1.Duration("2m")
ruleObj.Spec.Groups[0].Rules[0].For = &ruleFor
ruleObj.Spec.Groups[0].Rules[0].Expr = intstr.FromString(`up{endpoint="https", instance="192.168.1.244:6443", job="apiserver", namespace="default", service="kubernetes"} == 0`)
err = svc.Update(ruleObj)
......@@ -99,7 +115,7 @@ func TestUpdate(t *testing.T) {
func TestDelete(t *testing.T) {
// 删除
time.Sleep(time.Second * 10)
err := svc.Delete(obj.Namespace, obj.Name)
err := svc.Delete(pr.Namespace, pr.Name)
if err != nil {
fmt.Println("删除失败" + err.Error())
} else {
......
package service
import (
"fmt"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"github.com/spf13/cast"
"gitlab.wodcloud.com/smart-operation/so-operation-api/src/bean/entity"
"gitlab.wodcloud.com/smart-operation/so-operation-api/src/bean/vo/response"
"gitlab.wodcloud.com/smart-operation/so-operation-api/src/common/conf"
"gitlab.wodcloud.com/smart-operation/so-operation-api/src/pkg/beagle/constant"
"gitlab.wodcloud.com/smart-operation/so-operation-api/src/service/k8s"
"k8s.io/apimachinery/pkg/util/intstr"
"strings"
)
// PrometheusRuleSvc turns alert rules into Prometheus rule objects.
// User is supplied by the caller when the service is constructed.
type PrometheusRuleSvc struct {
User entity.SystemUserInfo
}
// Create assembles the PrometheusRule object for the given alert rule: one
// rule group whose interval is data.CheckPeriod minutes, containing one
// alerting rule per entry in data.AlertCondition.
//
// The alert expression is data.Expr with every AlertRange variable replaced
// by its "<label><compare><value>" matcher, wrapped in a threshold range
// check "<min> <= (<expr>) <= <max>".
//
// NOTE(review): the assembled group is not attached to pr.Spec.Groups and
// nothing is submitted through svc yet, so this method currently has no
// observable effect. Wiring up the actual creation is still pending.
func (p *PrometheusRuleSvc) Create(data response.AlertRulesItem) {
	header := make(map[string]string)
	header["Authorization"] = "Bearer " + conf.Options.KubernetesToken
	svc := k8s.PrometheusRule{Header: header}

	pr := monitoringv1.PrometheusRule{}
	pr.Name = k8s.GetPrometheusRuleName(data.Id)
	pr.Namespace = k8s.Namespace
	pr.Labels = k8s.AlertDefLabels

	group := monitoringv1.RuleGroup{}
	groupInterval := monitoringv1.Duration(fmt.Sprintf("%dm", data.CheckPeriod))
	group.Interval = &groupInterval
	group.Name = k8s.GetPrometheusRuleGroupName(data.MetricConfigId, string(*group.Interval))

	ruleFor := monitoringv1.Duration(fmt.Sprintf("%d%s", data.Duration, data.DurationUnit))

	// Example AlertRange:
	// [{"variable_name":"$pod$","metric_name":"http_requests_total","metric_label":"pod","chinese_name":"demoString","is_required":true,"is_linked":true,"value":"LeaseGrant","compare":"="}]
	// Example AlertCondition:
	// [{"thresholds_max":100,"thresholds_min":0,"risk_level":4}]
	baseExpr := data.Expr
	for _, v := range data.AlertRange {
		// An empty placeholder would make ReplaceAll insert the matcher
		// between every character of the expression, so skip it.
		if v.VariableName == "" {
			continue
		}
		// NOTE(review): the value is inserted as-is (e.g. pod=LeaseGrant);
		// PromQL label matchers require a quoted value, so confirm that
		// v.Value already carries its quotes.
		matcher := fmt.Sprintf("%s%s%s", v.MetricLabel, v.Compare, v.Value)
		baseExpr = strings.ReplaceAll(baseExpr, v.VariableName, matcher)
	}

	group.Rules = make([]monitoringv1.Rule, 0, len(data.AlertCondition))
	for _, v := range data.AlertCondition {
		// Parenthesize the expression so that low-precedence operators
		// inside it (and/or/unless) cannot split the range comparison.
		expr := fmt.Sprintf("%d <= (%s) <= %d", v.ThresholdsMin, baseExpr, v.ThresholdsMax)
		rule := monitoringv1.Rule{
			Alert: data.MetricConfigName,
			For:   &ruleFor,
			Expr:  intstr.FromString(expr),
			Labels: map[string]string{
				"severity":        "warning",
				"risk_level":      cast.ToString(v.RiskLevel),
				"risk_level_name": constant.RiskLeveText(v.RiskLevel),
				"source":          "so-operation-api",
				// The alert rule id, i.e. the same id the resource name is
				// derived from, not the metric config id.
				"alert_rules_id": data.Id,
			},
			Annotations: map[string]string{
				"value":       "{{ $value }}",
				"summary":     "概述",
				"description": "描述",
			},
		}
		group.Rules = append(group.Rules, rule)
	}

	// svc is not used until the rule is actually submitted (see NOTE above).
	_ = svc
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment