liveness: add health status liveness probe sidecar #1665

Closed · wants to merge 8 commits
1 change: 1 addition & 0 deletions .commitlintrc.yml
@@ -37,6 +37,7 @@ rules:
- e2e
- helm
- journal
- liveness
- rbd
- rebase
- revert
6 changes: 4 additions & 2 deletions cmd/cephcsi.go
@@ -66,10 +66,12 @@ func init() {
flag.BoolVar(&conf.ForceKernelCephFS, "forcecephkernelclient", false, "enable Ceph Kernel clients on kernel < 4.17 which support quotas")

// liveness/grpc metrics related flags
flag.IntVar(&conf.MetricsPort, "metricsport", 8080, "TCP port for liveness/grpc metrics requests")
flag.StringVar(&conf.MetricsPort, "metricsport", "8080", "TCP port for liveness/grpc metrics requests")
Member:

I think this is wrong; it will likely miss any integer-formatting checks. Better to keep it an int.

Author:

Technically yes. I'm trying to avoid the conversion from int to string here, since JoinHostPort() from the net package expects the port in string form.

Also see:
https://github.com/kubernetes-csi/livenessprobe/blob/bba64df584a52d98ddf1904b7f8ec20a2828257c/cmd/livenessprobe/main.go#L39
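For reference, a minimal standalone sketch of the two options being weighed here — an int flag converted with strconv.Itoa versus a string flag passed straight to net.JoinHostPort. The flag names mirror this PR; everything else is illustrative, not the PR's final code:

```go
package main

import (
	"flag"
	"fmt"
	"net"
	"strconv"
)

func main() {
	// Option A: keep the flag an int; flag parsing rejects non-numeric input,
	// but net.JoinHostPort needs a strconv.Itoa conversion.
	metricsPort := flag.Int("metricsport", 8080, "TCP port for liveness/grpc metrics requests")

	// Option B: keep the flag a string; no conversion needed, but a value like
	// "80a0" only fails when the listener actually starts.
	healthzPort := flag.String("healthzport", "9808", "TCP port for healthz requests")

	flag.Parse()

	fmt.Println(net.JoinHostPort("0.0.0.0", strconv.Itoa(*metricsPort))) // "0.0.0.0:8080"
	fmt.Println(net.JoinHostPort("0.0.0.0", *healthzPort))               // "0.0.0.0:9808"
}
```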

Collaborator:

Won't it cause an upgrade issue if someone just updates the image (which is the case for minor releases)?

Author:

I don't think it will cause any upgrade issues.

Collaborator:

Can we please check once?

flag.StringVar(&conf.MetricsPath, "metricspath", "/metrics", "path of prometheus endpoint where metrics will be available")
flag.DurationVar(&conf.PollTime, "polltime", time.Second*pollTime, "time interval in seconds between each poll")
flag.DurationVar(&conf.PoolTimeout, "timeout", time.Second*probeTimeout, "probe timeout in seconds")
flag.DurationVar(&conf.ProbeTimeout, "timeout", time.Second*probeTimeout, "probe timeout in seconds")
flag.StringVar(&conf.HealthzPort, "healthzport", "9808", "TCP ports for listening healthz requests")
Member:

Ports are integers, not strings.

Author:

Yeah, my previous comments should answer this.

Collaborator:

Why do we need one more port? Can't we just add one more endpoint on the metrics port?

Author:

OK, this is something I followed from the kubernetes-csi livenessprobe project; it's there to give users the option. I'm fine if we just want to use a single port for both metrics and health status.

Check: https://github.com/ceph/ceph-csi/pull/1560/files#diff-e3217918d2c8805d4f5446edf4350a9ee17a8616deb3ce92352a83765c730db5R163
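If the single-port option were preferred, a rough sketch of that alternative could register both paths on one server. This is not this PR's implementation; the handler body, address, and port are placeholders:

```go
package main

import (
	"net"
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	mux := http.NewServeMux()
	// Prometheus metrics and the health check share a single listener.
	mux.Handle("/metrics", promhttp.Handler())
	mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
		// Placeholder: the real handler would probe the CSI driver first.
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte("ok"))
	})

	addr := net.JoinHostPort("0.0.0.0", "8680")
	if err := http.ListenAndServe(addr, mux); err != nil {
		panic(err)
	}
}
```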

flag.StringVar(&conf.HealthzPath, "healthzpath", "/healthz", "path of liveness endpoint where health status will be available")

flag.BoolVar(&conf.EnableGRPCMetrics, "enablegrpcmetrics", false, "[DEPRECATED] enable grpc metrics")
flag.StringVar(&conf.HistogramOption, "histogramoption", "0.5,2,6",
5 changes: 4 additions & 1 deletion deploy/cephfs/kubernetes/csi-cephfsplugin-provisioner.yaml
@@ -148,10 +148,13 @@ spec:
image: quay.io/cephcsi/cephcsi:canary
args:
- "--type=liveness"
- "--v=5"
- "--endpoint=$(CSI_ENDPOINT)"
- "--metricsport=8681"
- "--metricspath=/metrics"
- "--polltime=60s"
- "--healthzport=9681"
- "--healthzpath=/healthz"
- "--polltime=15s"
Member:

Just because it "looks long" is not a good reason. Maybe you can look for other projects that do something like this and see what time they default to?

Collaborator:

you can always pool the CSIDriver when you get a request to get live status.

Author:

Maybe you mean probe? Yes, we can always run a probe check on request.

As pointed out, the other referenced projects are using 2s, and I feel 15s is a good interval, but I will leave it up to the maintainers.

- "--timeout=3s"
env:
- name: CSI_ENDPOINT
5 changes: 4 additions & 1 deletion deploy/cephfs/kubernetes/csi-cephfsplugin.yaml
@@ -99,10 +99,13 @@ spec:
image: quay.io/cephcsi/cephcsi:canary
args:
- "--type=liveness"
- "--v=5"
- "--endpoint=$(CSI_ENDPOINT)"
- "--metricsport=8681"
- "--metricspath=/metrics"
- "--polltime=60s"
- "--healthzport=9681"
- "--healthzpath=/healthz"
- "--polltime=15s"
- "--timeout=3s"
env:
- name: CSI_ENDPOINT
5 changes: 4 additions & 1 deletion deploy/rbd/kubernetes/csi-rbdplugin-provisioner.yaml
@@ -152,10 +152,13 @@ spec:
image: quay.io/cephcsi/cephcsi:canary
args:
- "--type=liveness"
- "--v=5"
- "--endpoint=$(CSI_ENDPOINT)"
- "--metricsport=8680"
- "--metricspath=/metrics"
- "--polltime=60s"
- "--healthzport=9680"
- "--healthzpath=/healthz"
- "--polltime=15s"
- "--timeout=3s"
env:
- name: CSI_ENDPOINT
5 changes: 4 additions & 1 deletion deploy/rbd/kubernetes/csi-rbdplugin.yaml
@@ -102,10 +102,13 @@ spec:
image: quay.io/cephcsi/cephcsi:canary
args:
- "--type=liveness"
- "--v=5"
- "--endpoint=$(CSI_ENDPOINT)"
- "--metricsport=8680"
- "--metricspath=/metrics"
- "--polltime=60s"
- "--healthzport=9680"
- "--healthzpath=/healthz"
- "--polltime=15s"
- "--timeout=3s"
env:
- name: CSI_ENDPOINT
1 change: 1 addition & 0 deletions docs/development-guide.md
@@ -146,6 +146,7 @@ The `component` in the subject of the commit message can be one of the following

* `cephfs`: bugs or enhancements related to CephFS
* `rbd`: bugs or enhancements related to RBD
* `liveness`: bugs or enhancements related to Liveness
* `doc`: documentation updates
* `util`: utilities shared between components use `cephfs` or `rbd` if the
change is only relevant for one of the type of storage
101 changes: 82 additions & 19 deletions internal/liveness/liveness.go
@@ -18,6 +18,8 @@ package liveness

import (
"context"
"net"
"net/http"
"time"

"github.com/ceph/ceph-csi/internal/util"
@@ -29,6 +31,11 @@ import (
"google.golang.org/grpc"
)

type probeConn struct {
conn *grpc.ClientConn
config *util.Config
}

var (
liveness = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "csi",
@@ -37,57 +44,113 @@ var (
})
)

func getLiveness(timeout time.Duration, csiConn *grpc.ClientConn) {
ctx, cancel := context.WithTimeout(context.Background(), timeout)
func (c *probeConn) checkProbe(w http.ResponseWriter, req *http.Request) {
ctx, cancel := context.WithTimeout(req.Context(), c.config.ProbeTimeout)
defer cancel()

util.TraceLogMsg("Sending probe request to CSI driver")
ready, err := rpc.Probe(ctx, csiConn)
util.TraceLog(ctx, "Healthz req: Sending probe request to CSI driver %s", c.config.DriverName)
ready, err := rpc.Probe(ctx, c.conn)
if err != nil {
w.WriteHeader(http.StatusInternalServerError)
_, err = w.Write([]byte(err.Error()))
if err != nil {
util.ErrorLog(ctx, "Healthz req: write failed: %v", err)
}
util.ErrorLog(ctx, "Healthz req: health check failed: %v", err)
Collaborator:

Won't the err value be overwritten when you call w.Write()?

Author:

Good catch, will fix it.
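A sketch of the fix being agreed on here, inside checkProbe and using the identifiers from this diff: keep the write error in its own variable so the probe error is still available for logging (not necessarily the final code):

```go
ready, err := rpc.Probe(ctx, c.conn)
if err != nil {
	w.WriteHeader(http.StatusInternalServerError)
	// Write into a separate variable so err still holds the probe failure.
	if _, werr := w.Write([]byte(err.Error())); werr != nil {
		util.ErrorLog(ctx, "Healthz req: write failed: %v", werr)
	}
	util.ErrorLog(ctx, "Healthz req: health check failed: %v", err)
	return
}
```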

return
}

if !ready {
w.WriteHeader(http.StatusInternalServerError)
_, err = w.Write([]byte("Healthz req: driver responded but is not ready"))
if err != nil {
util.ErrorLog(ctx, "Healthz req: write failed: %v", err)
}

util.ErrorLog(ctx, "Healthz req: driver responded but is not ready")
Comment on lines +64 to +70
Collaborator:

Looks like this can go into a helper function.

Author:

Sure.
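A hedged sketch of what such a helper might look like; the name writeResponse and its exact shape are assumptions, not this PR's final code:

```go
// writeResponse sets the status code, writes the body, and logs any failure
// in one place, so checkProbe only decides what to report.
func writeResponse(ctx context.Context, w http.ResponseWriter, statusCode int, msg string) {
	w.WriteHeader(statusCode)
	if _, err := w.Write([]byte(msg)); err != nil {
		util.ErrorLog(ctx, "Healthz req: write failed: %v", err)
	}
	if statusCode != http.StatusOK {
		util.ErrorLog(ctx, "Healthz req: %s", msg)
	}
}

// Usage inside checkProbe:
//   writeResponse(ctx, w, http.StatusInternalServerError, "driver responded but is not ready")
//   writeResponse(ctx, w, http.StatusOK, "ok")
```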

return
}

w.WriteHeader(http.StatusOK)
_, err = w.Write([]byte(`ok`))
if err != nil {
util.ErrorLog(ctx, "Healthz req: write failed: %v", err)
}
util.ExtendedLog(ctx, "Healthz req: Health check succeeded")
}
Comment on lines +52 to +80
Collaborator:

Suggested change
ready, err := rpc.Probe(ctx, c.conn)
if err != nil {
w.WriteHeader(http.StatusInternalServerError)
_, err = w.Write([]byte(err.Error()))
if err != nil {
util.ErrorLog(ctx, "Healthz req: write failed: %v", err)
}
util.ErrorLog(ctx, "Healthz req: health check failed: %v", err)
return
}
if !ready {
w.WriteHeader(http.StatusInternalServerError)
_, err = w.Write([]byte("Healthz req: driver responded but is not ready"))
if err != nil {
util.ErrorLog(ctx, "Healthz req: write failed: %v", err)
}
util.ErrorLog(ctx, "Healthz req: driver responded but is not ready")
return
}
w.WriteHeader(http.StatusOK)
_, err = w.Write([]byte(`ok`))
if err != nil {
util.ErrorLog(ctx, "Healthz req: write failed: %v", err)
}
util.ExtendedLog(ctx, "Healthz req: Health check succeeded")
}
resp:="ok"
statuscode=http.StatusOK
ready, err := rpc.Probe(ctx, c.conn)
if err!=nil{
resp=err.Error()
statuscode=http.StatusInternalServerError
}else{
if !ready{
resp:="Healthz req: driver responded but is not ready"
statuscode=http.StatusInternalServerError
}
}
w.WriteHeader(statuscode)
_, err = w.Write([]byte(resp))
if err != nil {
util.ErrorLog(ctx, "Healthz req: write failed: %v", err)
}


func getLiveness(c *probeConn) {
ctx, cancel := context.WithTimeout(context.Background(), c.config.ProbeTimeout)
defer cancel()

util.TraceLog(ctx, "Metrics req: Sending probe request to CSI driver: %s", c.config.DriverName)
ready, err := rpc.Probe(ctx, c.conn)
if err != nil {
liveness.Set(0)
util.ErrorLogMsg("health check failed: %v", err)
util.ErrorLog(ctx, "Metrics req: health check failed: %v", err)
Collaborator:

The ctx-based logging is not very helpful here, as the context doesn't contain any of the information we need.

Author:

OK. Will it hurt if we have ctx?

Collaborator:

It won't hurt, but since we have a separate function for logging without a context, let's use it.
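Both variants already appear elsewhere in this diff; a short contrast of the intended usage (illustrative, not a new API):

```go
// In the periodic poll loop there is no request context, so the Msg variant fits:
util.ErrorLogMsg("health check failed: %v", err)

// In the healthz handler a request-scoped context is available:
util.ErrorLog(ctx, "Healthz req: health check failed: %v", err)
```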

return
}

if !ready {
liveness.Set(0)
util.ErrorLogMsg("driver responded but is not ready")
util.ErrorLog(ctx, "Metrics req: driver responded but is not ready")
return
}
liveness.Set(1)
util.ExtendedLogMsg("Health check succeeded")
util.ExtendedLog(ctx, "Metrics req: Health check succeeded")
}

func recordLiveness(endpoint, drivername string, pollTime, timeout time.Duration) {
liveMetricsManager := metrics.NewCSIMetricsManager(drivername)
func recordLiveness(c *probeConn) {
// register prometheus metrics
err := prometheus.Register(liveness)
if err != nil {
util.FatalLogMsg(err.Error())
}

csiConn, err := connlib.Connect(endpoint, liveMetricsManager)
if err != nil {
// connlib should retry forever so a returned error should mean
// the grpc client is misconfigured rather than an error on the network
util.FatalLogMsg("failed to establish connection to CSI driver: %v", err)
}

// get liveness periodically
ticker := time.NewTicker(pollTime)
ticker := time.NewTicker(c.config.PollTime)
defer ticker.Stop()
for range ticker.C {
getLiveness(timeout, csiConn)
getLiveness(c)
}
}

// Run starts liveness collection and prometheus endpoint.
func Run(conf *util.Config) {
util.ExtendedLogMsg("Liveness Running")

liveMetricsManager := metrics.NewCSIMetricsManager("")

csiConn, err := connlib.Connect(conf.Endpoint, liveMetricsManager)
if err != nil {
// connlib should retry forever so a returned error should mean
// the grpc client is misconfigured rather than an error on the network
util.FatalLogMsg("failed to establish connection to CSI driver: %v", err)
}

conf.DriverName, err = rpc.GetDriverName(context.Background(), csiConn)
if err != nil {
util.FatalLogMsg("failed to get CSI driver name: %v", err)
}
liveMetricsManager.SetDriverName(conf.DriverName)
util.ExtendedLogMsg("CSI driver: %s, Endpoint: %s", conf.DriverName, conf.Endpoint)

pc := &probeConn{
config: conf,
conn: csiConn,
}

// start liveness collection
go recordLiveness(conf.Endpoint, conf.DriverName, conf.PollTime, conf.PoolTimeout)
go recordLiveness(pc)

// start up prometheus endpoint
util.StartMetricsServer(conf)

address := net.JoinHostPort(conf.MetricsIP, conf.HealthzPort)
http.HandleFunc(conf.HealthzPath, pc.checkProbe)
util.ExtendedLogMsg("Serving Health requests on: http://%s%s", address, conf.HealthzPath)
err = http.ListenAndServe(address, nil)
Comment on lines +149 to +152
Collaborator:

Why are we starting one more server? Is it needed?

Author:

Yeah, this is only needed if we want to listen on separate ports. For a single port with multiple paths like '/metrics' and '/healthz', we can live with a single server.

Collaborator:

I would suggest starting one server, not multiple servers.

if err != nil {
util.FatalLogMsg("failed to listen on address %s: %v", address, err)
}
}
16 changes: 10 additions & 6 deletions internal/util/httpserver.go
@@ -4,7 +4,6 @@ import (
"net"
"net/http"
"net/url"
"strconv"

"github.com/prometheus/client_golang/prometheus/promhttp"
)
@@ -17,10 +16,15 @@ func ValidateURL(c *Config) error {

// StartMetricsServer starts http server.
func StartMetricsServer(c *Config) {
addr := net.JoinHostPort(c.MetricsIP, strconv.Itoa(c.MetricsPort))
addr := net.JoinHostPort(c.MetricsIP, c.MetricsPort)
http.Handle(c.MetricsPath, promhttp.Handler())
err := http.ListenAndServe(addr, nil)
if err != nil {
FatalLogMsg("failed to listen on address %v: %s", addr, err)
}
ExtendedLogMsg("Serving Metrics requests on: http://%s%s", addr, c.MetricsPath)

// Spawn a new go routine to listen on specified endpoint
go func() {
err := http.ListenAndServe(addr, nil)
if err != nil {
FatalLogMsg("failed to listen on address %v: %s", addr, err)
}
}()
}
6 changes: 4 additions & 2 deletions internal/util/util.go
@@ -81,9 +81,11 @@ type Config struct {
HistogramOption string // Histogram option for grpc metrics, should be comma separated value, ex:= "0.5,2,6" where start=0.5 factor=2, count=6
MetricsIP string // TCP port for liveness/ metrics requests
PidLimit int // PID limit to configure through cgroups")
MetricsPort int // TCP port for liveness/grpc metrics requests
MetricsPort string // TCP port for liveness/grpc metrics requests
HealthzPort string // TCP port for liveness/health requests
HealthzPath string // path for liveness/health requests
PollTime time.Duration // time interval in seconds between each poll
PoolTimeout time.Duration // probe timeout in seconds
ProbeTimeout time.Duration // probe timeout in seconds
EnableGRPCMetrics bool // option to enable grpc metrics

IsControllerServer bool // if set to true start provisoner server