diff --git a/charts/gundeck/templates/deployment.yaml b/charts/gundeck/templates/deployment.yaml
index c70ddec4d1..55ed01e9ab 100644
--- a/charts/gundeck/templates/deployment.yaml
+++ b/charts/gundeck/templates/deployment.yaml
@@ -175,6 +175,22 @@ spec:
           lifecycle:
             preStop:
               exec:
-                command: ["sh", "-c", "sleep 10"]
+                command:
+                  - sh
+                  - -eu
+                  - -c
+                  - |
+                    PORT={{ .Values.service.internalPort }}
+                    # Best effort: ask gundeck to enter drain mode; never fail the hook on this.
+                    curl -sS -m 2 "http://127.0.0.1:${PORT}/i/drain" || true
+                    i=1
+                    while [ "$i" -le {{ .Values.preStopDrainSeconds }} ]; do
+                      # ss(8) takes the state filter before the address expression.
+                      if connections=$(ss -tan state established "sport = :${PORT}" 2>/dev/null | sed -n '2,$p' | wc -l); then
+                        printf 'gundeck preStop: connections=%s\n' "$connections"
+                      fi
+                      sleep 1
+                      i=$((i+1))
+                    done
           resources:
 {{ toYaml .Values.resources | indent 12 }}
diff --git a/charts/gundeck/values.yaml b/charts/gundeck/values.yaml
index 1a989c8510..9451c9710f 100644
--- a/charts/gundeck/values.yaml
+++ b/charts/gundeck/values.yaml
@@ -16,7 +16,10 @@ resources:
     memory: "1Gi"
 
 # Should be greater than Warp's graceful shutdown (default 30s).
-terminationGracePeriodSeconds: 40
+terminationGracePeriodSeconds: 60
+
+# Seconds to log connection counts during preStop (after /i/drain); spent out of terminationGracePeriodSeconds
+preStopDrainSeconds: 20
 config:
   logLevel: Info
   logFormat: StructuredJSON
diff --git a/libs/wire-api/src/Wire/API/Routes/Internal/Gundeck.hs b/libs/wire-api/src/Wire/API/Routes/Internal/Gundeck.hs
index a89c1ab3d3..1bfd2e50c1 100644
--- a/libs/wire-api/src/Wire/API/Routes/Internal/Gundeck.hs
+++ b/libs/wire-api/src/Wire/API/Routes/Internal/Gundeck.hs
@@ -101,6 +101,7 @@ instance (HasOpenApi sub) => HasOpenApi (ReqBodyHack :> sub) where
 type InternalAPI =
   "i"
     :> ( Named "i-status" ("status" :> Get '[JSON] NoContent)
+           :<|> Named "i-drain" ("drain" :> Get '[JSON] NoContent)
            :<|> Named "i-push" ("push" :> "v2" :> ReqBody '[JSON] [Push] :> Post '[JSON] NoContent)
            :<|> ( "presences"
                     :> ( Named "i-presences-get-for-users" (QueryParam' [Required, Strict] "ids" (CommaSeparatedList UserId) :> Get '[JSON] [Presence])
diff --git a/nix/wire-server.nix b/nix/wire-server.nix
index 5063f8c4f8..3ba6df7638 100644
--- a/nix/wire-server.nix
+++ b/nix/wire-server.nix
@@ -315,6 +315,7 @@ let
     coreutils
     dig
     curl
+    iproute2
     less
     gnutar
     gzip
diff --git a/services/gundeck/src/Gundeck/API/Internal.hs b/services/gundeck/src/Gundeck/API/Internal.hs
index c1c1591ab8..d540d497e8 100644
--- a/services/gundeck/src/Gundeck/API/Internal.hs
+++ b/services/gundeck/src/Gundeck/API/Internal.hs
@@ -33,6 +33,7 @@ import Gundeck.Push.Data qualified as PushTok
 import Gundeck.Push.Native.Types qualified as PushTok
 import Imports
 import Servant
+import System.Logger.Class qualified as Log
 import Wire.API.Push.Token qualified as PushTok
 import Wire.API.Push.V2
 import Wire.API.Routes.Internal.Gundeck
@@ -41,6 +42,7 @@ import Wire.API.Routes.Named
 servantSitemap :: ServerT InternalAPI Gundeck
 servantSitemap =
   Named @"i-status" statusH
+    :<|> Named @"i-drain" drainH
     :<|> Named @"i-push" pushH
     :<|> ( Named @"i-presences-get-for-users" Presence.listAllH
              :<|> Named @"i-presences-get-for-user" Presence.listH
@@ -55,6 +57,13 @@ servantSitemap =
 statusH :: (Applicative m) => m NoContent
 statusH = pure NoContent
 
+drainH :: Gundeck NoContent
+drainH = do
+  -- Flip the server into drain mode so all responses set Connection: close
+  setDrainMode True
+  Log.info $ Log.msg (Log.val "Entering drain mode: setting Connection: close on all responses")
+  pure NoContent
+
 pushH :: [Push] -> Gundeck NoContent
 pushH ps = NoContent <$ Push.push ps
 
diff --git a/services/gundeck/src/Gundeck/Env.hs b/services/gundeck/src/Gundeck/Env.hs
index e3670c13a8..0dbb11ff1f 100644
--- a/services/gundeck/src/Gundeck/Env.hs
+++ b/services/gundeck/src/Gundeck/Env.hs
@@ -49,6 +49,7 @@ import Network.TLS as TLS
 import Network.TLS.Extra qualified as TLS
 import System.Logger qualified as Log
 import System.Logger.Extended qualified as Logger
+import UnliftIO.IORef qualified as URef
 
 data Env = Env
   { _reqId :: !RequestId,
@@ -61,7 +62,8 @@ data Env = Env
     _awsEnv :: !Aws.Env,
     _time :: !(IO Milliseconds),
     _threadBudgetState :: !(Maybe ThreadBudgetState),
-    _rabbitMqChannel :: MVar Channel
+    _rabbitMqChannel :: MVar Channel,
+    _drainMode :: URef.IORef Bool
   }
 
 makeLenses ''Env
@@ -105,7 +107,8 @@ createEnv o = do
       }
   mtbs <- mkThreadBudgetState `mapM` (o ^. settings . maxConcurrentNativePushes)
   rabbitMqChannelMVar <- Q.mkRabbitMqChannelMVar l (Just "gundeck") (o ^. rabbitmq)
-  pure $! (rThread : rAdditionalThreads,) $! Env (RequestId defRequestId) o l n p r rAdditional a io mtbs rabbitMqChannelMVar
+  drainingRef <- URef.newIORef False
+  pure $! (rThread : rAdditionalThreads,) $! Env (RequestId defRequestId) o l n p r rAdditional a io mtbs rabbitMqChannelMVar drainingRef
 
 reqIdMsg :: RequestId -> Logger.Msg -> Logger.Msg
 reqIdMsg = ("request" Logger..=) . unRequestId
@@ -158,3 +161,10 @@ createRedisPool l ep username password identifier = do
 
 safeShowConnInfo :: Redis.ConnectInfo -> String
 safeShowConnInfo connInfo = show $ connInfo {Redis.connectAuth = "[REDACTED]" <$ Redis.connectAuth connInfo}
+
+-- | Set drain mode on or off.
+--
+-- The flag is written by a request handler and read by the WAI middleware for
+-- every request, hence the atomic write (which acts as a memory barrier).
+setDrainModeIO :: Env -> Bool -> IO ()
+setDrainModeIO env v = URef.atomicWriteIORef (env ^. drainMode) v
diff --git a/services/gundeck/src/Gundeck/Monad.hs b/services/gundeck/src/Gundeck/Monad.hs
index 4d3d9607dd..c946708896 100644
--- a/services/gundeck/src/Gundeck/Monad.hs
+++ b/services/gundeck/src/Gundeck/Monad.hs
@@ -26,6 +26,7 @@ module Gundeck.Monad
     manager,
     cstate,
     createEnv,
+    setDrainMode,
 
     -- * Gundeck monad
     Gundeck,
@@ -217,3 +218,9 @@ getRabbitMqChan = do
       Log.err $ Log.msg (Log.val "Could not retrieve RabbitMQ channel")
       throwM $ mkError status500 "internal-server-error" "Could not retrieve RabbitMQ channel"
     Just chan -> pure chan
+
+-- | Enable/disable drain mode in the server environment.
+setDrainMode :: Bool -> Gundeck ()
+setDrainMode v = do
+  env <- ask
+  liftIO $ setDrainModeIO env v
diff --git a/services/gundeck/src/Gundeck/Run.hs b/services/gundeck/src/Gundeck/Run.hs
index f3f1ed140d..e75706a5c9 100644
--- a/services/gundeck/src/Gundeck/Run.hs
+++ b/services/gundeck/src/Gundeck/Run.hs
@@ -75,6 +75,7 @@ import Servant qualified
 import System.Logger qualified as Log
 import System.Logger.Class qualified as MonadLogger
 import UnliftIO.Async qualified as Async
+import UnliftIO.IORef qualified as URef
 import Util.Options
 import Wire.API.Notification
 import Wire.API.Routes.Public.Gundeck (GundeckAPI)
@@ -151,7 +152,11 @@ run opts = withTracer \tracer -> do
     middleware env = do
       otelMiddleWare <- newOpenTelemetryWaiMiddleware
+      -- Resolve the pod name once here rather than on every request.
+      servingPod <- lookupEnv "HOSTNAME"
       pure $
-        versionMiddleware (foldMap expandVersionExp (opts ^. settings . disabledAPIVersions))
+        serverIdentityHeaderMiddleware servingPod
+          . drainConnectionCloseMiddleware env
+          . versionMiddleware (foldMap expandVersionExp (opts ^. settings . disabledAPIVersions))
           . otelMiddleWare
           . requestIdMiddleware (env ^. applog) defaultRequestIdHeaderName
           . Metrics.servantPrometheusMiddleware (Proxy @(GundeckAPI :<|> InternalAPI))
@@ -159,6 +164,25 @@ run opts = withTracer \tracer -> do
           . GZip.gzip GZip.def
           . catchErrors (env ^. applog) defaultRequestIdHeaderName
 
+    -- While the drain flag is set, add "Connection: close" to every response.
+    drainConnectionCloseMiddleware :: Env -> Middleware
+    drainConnectionCloseMiddleware env app req sendResponse =
+      app req $ \res -> do
+        -- Check the flag when the response is ready, not when the request arrived,
+        -- so requests already in flight when drain mode starts are covered too.
+        draining <- URef.readIORef (env ^. drainMode)
+        sendResponse $ if draining then addClose res else res
+      where
+        addClose = mapResponseHeaders (("Connection", "close") :)
+
+    -- Add an "X-Serving-Pod" response header carrying the given pod name;
+    -- responses pass through unchanged when no name is available.
+    serverIdentityHeaderMiddleware :: Maybe String -> Middleware
+    serverIdentityHeaderMiddleware hostname app req sendResponse =
+      app req (sendResponse . addHdr)
+      where
+        addHdr = maybe id (\hn -> mapResponseHeaders (("X-Serving-Pod", fromString hn) :)) hostname
+
     mkApp :: Env -> Wai.Application
     mkApp env0 req cont = do
       let rid = getRequestId defaultRequestIdHeaderName req