[Metrics] Add SSH Latency (#7538)

lloyd-brown · lloydbrownjr · web-flow · commit 93cb798182af · 2025-10-30T14:36:49.000-07:00
* Add new metric.

* Add version support.

* Change metric.

* Working latency metric.

* Tweak wording for metric.

* Up interval.

* Bump version again.

* Add locking.

* Remove lock

---------

Co-authored-by: lloydbrownjr <lloydbrown@berkeley.edu>
diff --git a/sky/metrics/utils.py b/sky/metrics/utils.py
@@ -143,6 +143,24 @@
     'RSS increment after requests', ['name'],
     buckets=_MEM_BUCKETS)
 
+SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS = prom.Histogram(
+    'sky_apiserver_websocket_ssh_latency_seconds',
+    ('Time taken for ssh message to go from client to API server and back '
+     'to the client. This does not include: latency to reach the pod, '
+     'overhead from sending through the k8s port-forward tunnel, or '
+     'ssh server lag on the destination pod.'),
+    ['pid'],  # Label value is the PID of the observing server process.
+    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
+             0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
+             5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
+             50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
+             240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
+             420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
+             600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
+             780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
+             960.0, 980.0, 1000.0, float('inf')),
+)
+
 
 @contextlib.contextmanager
 def time_it(name: str, group: str = 'default'):
diff --git a/sky/server/constants.py b/sky/server/constants.py
@@ -10,7 +10,7 @@
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION = 21
+API_VERSION = 22
 
 # The minimum peer API version that the code should still work with.
 # Notes (dev):
diff --git a/sky/server/server.py b/sky/server/server.py
@@ -6,6 +6,7 @@
 from concurrent.futures import ThreadPoolExecutor
 import contextlib
 import datetime
+from enum import IntEnum
 import hashlib
 import json
 import multiprocessing
@@ -15,6 +16,7 @@
 import re
 import resource
 import shutil
+import struct
 import sys
 import threading
 import traceback
@@ -1809,13 +1811,25 @@ async def health(request: fastapi.Request) -> responses.APIHealthResponse:
     )
 
 
+class KubernetesSSHMessageType(IntEnum):
+    REGULAR_DATA = 0  # SSH payload bytes follow the type byte.
+    PINGPONG = 1  # Type byte + 4-byte ping id, echoed back to the sender.
+    LATENCY_MEASUREMENT = 2  # Type byte + 8-byte latency in milliseconds.
+
+
 @app.websocket('/kubernetes-pod-ssh-proxy')
-async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
-                                   cluster_name: str) -> None:
+async def kubernetes_pod_ssh_proxy(
+        websocket: fastapi.WebSocket,
+        cluster_name: str,
+        client_version: Optional[int] = None) -> None:
     """Proxies SSH to the Kubernetes pod with websocket."""
     await websocket.accept()
     logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')
 
+    timestamps_supported = client_version is not None and client_version > 21
+    logger.info(f'Websocket timestamps supported: {timestamps_supported}, \
+        client_version = {client_version}')
+
     # Run core.status in another thread to avoid blocking the event loop.
     with ThreadPoolExecutor(max_workers=1) as thread_pool_executor:
         cluster_records = await context_utils.to_thread_with_executor(
@@ -1870,6 +1884,42 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
         async def websocket_to_ssh():
             try:
                 async for message in websocket.iter_bytes():
+                    if timestamps_supported:
+                        type_size = struct.calcsize('!B')
+                        message_type = struct.unpack('!B',
+                                                     message[:type_size])[0]
+                        if (message_type ==
+                                KubernetesSSHMessageType.REGULAR_DATA):
+                            # Regular data - strip type byte and forward to SSH
+                            message = message[type_size:]
+                        elif message_type == KubernetesSSHMessageType.PINGPONG:
+                            # PING message - respond with PONG (type 1)
+                            ping_id_size = struct.calcsize('!I')
+                            if len(message) != type_size + ping_id_size:
+                                raise ValueError('Invalid PING message '
+                                                 f'length: {len(message)}')
+                            # Return the same PING message, so that the client
+                            # can measure the latency.
+                            await websocket.send_bytes(message)
+                            continue
+                        elif (message_type ==
+                              KubernetesSSHMessageType.LATENCY_MEASUREMENT):
+                            # Latency measurement from client
+                            latency_size = struct.calcsize('!Q')
+                            if len(message) != type_size + latency_size:
+                                raise ValueError(
+                                    'Invalid latency measurement '
+                                    f'message length: {len(message)}')
+                            avg_latency_ms = struct.unpack(
+                                '!Q',
+                                message[type_size:type_size + latency_size])[0]
+                            latency_seconds = avg_latency_ms / 1000
+                            metrics_utils.SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS.labels(pid=os.getpid()).observe(latency_seconds)  # pylint: disable=line-too-long
+                            continue
+                        else:
+                            # Unknown message type.
+                            raise ValueError(
+                                f'Unknown message type: {message_type}')
                     writer.write(message)
                     try:
                         await writer.drain()
@@ -1900,6 +1950,11 @@ async def ssh_to_websocket():
                             nonlocal ssh_failed
                             ssh_failed = True
                         break
+                    if timestamps_supported:
+                        # Prepend message type byte (0 = regular data)
+                        message_type_bytes = struct.pack(
+                            '!B', KubernetesSSHMessageType.REGULAR_DATA.value)
+                        data = message_type_bytes + data
                     await websocket.send_bytes(data)
             except Exception:  # pylint: disable=broad-except
                 pass
diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py
@@ -548,3 +548,6 @@
 
 ARM64_ARCH = 'arm64'
 X86_64_ARCH = 'x86_64'
+
+SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR = (  # '1' turns off latency pings.
+    f'{SKYPILOT_ENV_VAR_PREFIX}SSH_DISABLE_LATENCY_MEASUREMENT')
diff --git a/sky/templates/websocket_proxy.py b/sky/templates/websocket_proxy.py
@@ -11,15 +11,23 @@
 import asyncio
 from http.cookiejar import MozillaCookieJar
 import os
+import struct
 import sys
-from typing import Dict
+import time
+from typing import Dict, Optional
 from urllib.request import Request
 
+import requests
 import websockets
 from websockets.asyncio.client import ClientConnection
 from websockets.asyncio.client import connect
 
+from sky.server import constants
+from sky.server.server import KubernetesSSHMessageType
+from sky.skylet import constants as skylet_constants
+
 BUFFER_SIZE = 2**16  # 64KB
+HEARTBEAT_INTERVAL_SECONDS = 10  # Delay between latency PING messages.
 
 # Environment variable for a file path to the API cookie file.
 # Keep in sync with server/constants.py
@@ -28,6 +36,8 @@
 # Keep in sync with server/constants.py
 API_COOKIE_FILE_DEFAULT_LOCATION = '~/.sky/cookies.txt'
 
+MAX_UNANSWERED_PINGS = 100  # Pending-ping records kept before a reset.
+
 
 def _get_cookie_header(url: str) -> Dict[str, str]:
     """Extract Cookie header value from a cookie jar for a specific URL"""
@@ -49,7 +59,7 @@ def _get_cookie_header(url: str) -> Dict[str, str]:
     return {'Cookie': cookie_header}
 
 
-async def main(url: str) -> None:
+async def main(url: str, timestamps_supported: bool) -> None:
     cookie_header = _get_cookie_header(url)
     async with connect(url,
                        ping_interval=None,
@@ -75,45 +85,149 @@ async def main(url: str) -> None:
                 asyncio.streams.FlowControlMixin, sys.stdout)  # type: ignore
             stdout_writer = asyncio.StreamWriter(transport, protocol, None,
                                                  loop)
+            # Dictionary to store last ping time for latency measurement
+            last_ping_time_dict: Optional[Dict[int, float]] = None
+            if timestamps_supported:
+                last_ping_time_dict = {}
+
+            # Use an Event to signal when websocket is closed
+            websocket_closed_event = asyncio.Event()
+            websocket_lock = asyncio.Lock()
 
-            await asyncio.gather(stdin_to_websocket(stdin_reader, websocket),
-                                 websocket_to_stdout(websocket, stdout_writer))
+            await asyncio.gather(
+                stdin_to_websocket(stdin_reader, websocket,
+                                   timestamps_supported, websocket_closed_event,
+                                   websocket_lock),
+                websocket_to_stdout(websocket, stdout_writer,
+                                    timestamps_supported, last_ping_time_dict,
+                                    websocket_closed_event, websocket_lock),
+                latency_monitor(websocket, last_ping_time_dict,
+                                websocket_closed_event, websocket_lock),
+                return_exceptions=True)
         finally:
             if old_settings:
                 termios.tcsetattr(sys.stdin.fileno(), termios.TCSADRAIN,
                                   old_settings)
 
 
+async def latency_monitor(websocket: ClientConnection,
+                          last_ping_time_dict: Optional[dict],
+                          websocket_closed_event: asyncio.Event,
+                          websocket_lock: asyncio.Lock):
+    """Periodically send PING messages (type 1) until the websocket closes."""
+    if last_ping_time_dict is None:
+        return
+    next_id = 0
+    while True:
+        try:
+            # Wait on the close event, not a plain sleep, to exit promptly.
+            await asyncio.wait_for(websocket_closed_event.wait(),
+                                   HEARTBEAT_INTERVAL_SECONDS)
+            return
+        except asyncio.TimeoutError:
+            pass  # Still open after a full interval: send the next PING.
+        try:
+            if len(last_ping_time_dict) >= MAX_UNANSWERED_PINGS:
+                last_ping_time_dict.clear()  # No pongs: keep dict bounded.
+            next_id += 1
+            last_ping_time_dict[next_id] = time.time()
+            async with websocket_lock:
+                await websocket.send(struct.pack(
+                    '!BI', KubernetesSSHMessageType.PINGPONG.value, next_id))
+        except websockets.exceptions.ConnectionClosed as e:
+            print(f'Failed to send PING message: {e}', file=sys.stderr)
+            return
+        except Exception as e:
+            print(f'Error in latency_monitor: {e}', file=sys.stderr)
+            websocket_closed_event.set()
+            raise
+
+
 async def stdin_to_websocket(reader: asyncio.StreamReader,
-                             websocket: ClientConnection):
+                             websocket: ClientConnection,
+                             timestamps_supported: bool,
+                             websocket_closed_event: asyncio.Event,
+                             websocket_lock: asyncio.Lock):
     try:
-        while True:
+        while not websocket_closed_event.is_set():
             # Read at most BUFFER_SIZE bytes, this not affect
             # responsiveness since it will return as soon as
             # there is at least one byte.
             # The BUFFER_SIZE is chosen to be large enough to improve
             # throughput.
             data = await reader.read(BUFFER_SIZE)
+
             if not data:
                 break
-            await websocket.send(data)
+            if timestamps_supported:
+                # Prefix the payload with the REGULAR_DATA type byte (0).
+                message_type_bytes = struct.pack(
+                    '!B', KubernetesSSHMessageType.REGULAR_DATA.value)
+                data = message_type_bytes + data
+            async with websocket_lock:  # Serializes sends across tasks.
+                await websocket.send(data)
+
     except Exception as e:  # pylint: disable=broad-except
         print(f'Error in stdin_to_websocket: {e}', file=sys.stderr)
     finally:
-        await websocket.close()
+        async with websocket_lock:
+            await websocket.close()
+        websocket_closed_event.set()  # Tells the other loops to stop.
 
 
 async def websocket_to_stdout(websocket: ClientConnection,
-                              writer: asyncio.StreamWriter):
+                              writer: asyncio.StreamWriter,
+                              timestamps_supported: bool,
+                              last_ping_time_dict: Optional[dict],
+                              websocket_closed_event: asyncio.Event,
+                              websocket_lock: asyncio.Lock):
+    """Relay websocket data to stdout; answer PONGs with a latency report."""
     try:
-        while True:
+        while not websocket_closed_event.is_set():
             message = await websocket.recv()
+            if (timestamps_supported and len(message) > 0 and
+                    last_ping_time_dict is not None):
+                message_type = struct.unpack('!B', message[:1])[0]
+                if message_type == KubernetesSSHMessageType.REGULAR_DATA.value:
+                    # Regular data - strip type byte and write to stdout
+                    message = message[1:]
+                elif message_type == KubernetesSSHMessageType.PINGPONG.value:
+                    # PONG response - calculate latency and send measurement
+                    if len(message) != struct.calcsize('!BI'):
+                        raise ValueError(
+                            f'Invalid PONG message length: {len(message)}')
+                    pong_id = struct.unpack('!I', message[1:5])[0]
+                    pong_time = time.time()
+                    ping_time = last_ping_time_dict.pop(pong_id, None)
+                    if ping_time is None:
+                        # Unknown id, or its record was already dropped.
+                        continue
+                    # time.time() may step backwards; clamp so that packing
+                    # the unsigned '!Q' field below cannot raise struct.error.
+                    latency_ms = max(0, int((pong_time - ping_time) * 1000))
+
+                    # Send latency measurement (type 2)
+                    message_type_bytes = struct.pack(
+                        '!B',
+                        KubernetesSSHMessageType.LATENCY_MEASUREMENT.value)
+                    latency_bytes = struct.pack('!Q', latency_ms)
+                    message = message_type_bytes + latency_bytes
+                    # Send to server.
+                    async with websocket_lock:
+                        await websocket.send(message)
+                    continue
+            # Payload (type byte stripped when framed): write it to stdout.
             writer.write(message)
             await writer.drain()
     except websockets.exceptions.ConnectionClosed:
         print('WebSocket connection closed', file=sys.stderr)
     except Exception as e:  # pylint: disable=broad-except
         print(f'Error in websocket_to_stdout: {e}', file=sys.stderr)
+        raise
+    finally:
+        async with websocket_lock:
+            await websocket.close()
+        websocket_closed_event.set()
 
 
 if __name__ == '__main__':
@@ -123,11 +237,25 @@ async def websocket_to_stdout(websocket: ClientConnection,
         # TODO(aylei): Remove this after 0.10.0
         server_url = f'http://{server_url}'
 
+    health_url = f'{server_url}/api/health'
+    health_response = requests.get(health_url)
+    health_data = health_response.json()
+    timestamps_are_supported = int(health_data['api_version']) > 21
+    disable_latency_measurement = os.environ.get(
+        skylet_constants.SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR, '0') == '1'
+    timestamps_are_supported = (timestamps_are_supported and
+                                not disable_latency_measurement)
+
     server_proto, server_fqdn = server_url.split('://')
     websocket_proto = 'ws'
     if server_proto == 'https':
         websocket_proto = 'wss'
     server_url = f'{websocket_proto}://{server_fqdn}'
+
+    client_version_str = (f'&client_version={constants.API_VERSION}'
+                          if timestamps_are_supported else '')
+
     websocket_url = (f'{server_url}/kubernetes-pod-ssh-proxy'
-                     f'?cluster_name={sys.argv[2]}')
-    asyncio.run(main(websocket_url))
+                     f'?cluster_name={sys.argv[2]}'
+                     f'{client_version_str}')
+    asyncio.run(main(websocket_url, timestamps_are_supported))