Skip to content

Commit cd6d98d

Browse files
authored
Merge pull request #1527 from IBM/query_performance_optimizations
Query and code performance optimizations in services
2 parents f66e4a3 + e6b44fa commit cd6d98d

File tree

10 files changed

+639
-405
lines changed

10 files changed

+639
-405
lines changed

.env.example

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -733,26 +733,41 @@ PROMPT_CACHE_SIZE=100
733733
MAX_PROMPT_SIZE=102400
734734
PROMPT_RENDER_TIMEOUT=10
735735

736+
#####################################
737+
# MCP Server Health Check Configurations
738+
#####################################
739+
736740
# Health Check Configuration
737741
HEALTH_CHECK_INTERVAL=60
738742
# Health check timeout in seconds (default: 10, matches config.py)
739743
HEALTH_CHECK_TIMEOUT=10
740744
UNHEALTHY_THRESHOLD=5
741745
# Gateway URL validation timeout in seconds (default: 5, matches config.py)
742746
GATEWAY_VALIDATION_TIMEOUT=5
747+
# Maximum number of concurrent health checks per worker. Prevents resource exhaustion during health check operations (default: 20, matches config.py)
748+
MAX_CONCURRENT_HEALTH_CHECKS=20
743749

744750
# File lock name for gateway service leader election
745751
# Used to coordinate multiple gateway instances when running in cluster mode
746752
# Default: "gateway_service_leader.lock"
747753
FILELOCK_NAME=gateway_service_leader.lock
748754

755+
756+
#####################################
757+
# Default Root Paths
758+
#####################################
759+
749760
# Default root paths (JSON array)
750761
# List of default root paths for resource resolution
751762
# Example: ["/api/v1", "/mcp"]
752763
# Default: []
753764
DEFAULT_ROOTS=[]
754765

766+
767+
#####################################
755768
# OpenTelemetry Observability Configuration
769+
#####################################
770+
756771
# Enable distributed tracing and metrics collection
757772
# Options: true (default), false
758773
OTEL_ENABLE_OBSERVABILITY=true

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1996,6 +1996,7 @@ ENABLE_METRICS=false
19961996
| `UNHEALTHY_THRESHOLD` | Fail-count before peer deactivation, | `3` | int > 0 |
19971997
| | Set to -1 if deactivation is not needed. | | |
19981998
| `GATEWAY_VALIDATION_TIMEOUT` | Gateway URL validation timeout (secs) | `5` | int > 0 |
1999+
| `MAX_CONCURRENT_HEALTH_CHECKS` | Max concurrent health checks | `20` | int > 0 |
19992000
| `FILELOCK_NAME` | File lock for leader election | `gateway_service_leader.lock` | string |
20002001
| `DEFAULT_ROOTS` | Default root paths for resources | `[]` | JSON array |
20012002

mcpgateway/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -925,6 +925,7 @@ def parse_issuers(cls, v: Any) -> list[str]:
925925
health_check_interval: int = 60 # seconds
926926
health_check_timeout: int = 10 # seconds
927927
unhealthy_threshold: int = 5 # after this many failures, mark as Offline
928+
max_concurrent_health_checks: int = 20 # maximum concurrent health checks per worker
928929

929930
# Validation Gateway URL
930931
gateway_validation_timeout: int = 5 # seconds

mcpgateway/services/gateway_service.py

Lines changed: 250 additions & 172 deletions
Large diffs are not rendered by default.

mcpgateway/services/server_service.py

Lines changed: 156 additions & 70 deletions
Large diffs are not rendered by default.

mcpgateway/services/tool_service.py

Lines changed: 90 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -307,30 +307,27 @@ async def get_top_tools(self, db: Session, limit: Optional[int] = 5) -> List[Top
307307
- success_rate: Success rate percentage, or None if no metrics.
308308
- last_execution: Timestamp of the last execution, or None if no metrics.
309309
"""
310+
311+
success_rate = case(
312+
(func.count(ToolMetric.id) > 0, func.sum(case((ToolMetric.is_success.is_(True), 1), else_=0)).cast(Float) * 100 / func.count(ToolMetric.id)), else_=None # pylint: disable=not-callable
313+
)
314+
310315
query = (
311-
db.query(
316+
select(
312317
DbTool.id,
313318
DbTool.name,
314319
func.count(ToolMetric.id).label("execution_count"), # pylint: disable=not-callable
315-
func.avg(ToolMetric.response_time).label("avg_response_time"), # pylint: disable=not-callable
316-
case(
317-
(
318-
func.count(ToolMetric.id) > 0, # pylint: disable=not-callable
319-
func.sum(case((ToolMetric.is_success.is_(True), 1), else_=0)).cast(Float) / func.count(ToolMetric.id) * 100, # pylint: disable=not-callable
320-
),
321-
else_=None,
322-
).label("success_rate"),
323-
func.max(ToolMetric.timestamp).label("last_execution"), # pylint: disable=not-callable
320+
func.avg(ToolMetric.response_time).label("avg_response_time"),
321+
success_rate.label("success_rate"),
322+
func.max(ToolMetric.timestamp).label("last_execution"),
324323
)
325-
.outerjoin(ToolMetric)
324+
.outerjoin(ToolMetric, ToolMetric.tool_id == DbTool.id)
326325
.group_by(DbTool.id, DbTool.name)
327326
.order_by(desc("execution_count"))
327+
.limit(limit or 5)
328328
)
329329

330-
if limit is not None:
331-
query = query.limit(limit)
332-
333-
results = query.all()
330+
results = db.execute(query).all()
334331

335332
return build_top_performers(results)
336333

@@ -363,36 +360,38 @@ def _convert_tool_to_read(self, tool: DbTool, include_metrics: bool = True) -> T
363360
tool_dict = tool.__dict__.copy()
364361
tool_dict.pop("_sa_instance_state", None)
365362

366-
if include_metrics:
367-
tool_dict["metrics"] = tool.metrics_summary
368-
else:
369-
tool_dict["metrics"] = None
370-
371363
tool_dict["execution_count"] = tool.execution_count
364+
tool_dict["metrics"] = tool.metrics_summary if include_metrics else None
372365

373366
tool_dict["request_type"] = tool.request_type
374367
tool_dict["annotations"] = tool.annotations or {}
375368

376-
decoded_auth_value = decode_auth(tool.auth_value)
377-
if tool.auth_type == "basic":
378-
decoded_bytes = base64.b64decode(decoded_auth_value["Authorization"].split("Basic ")[1])
379-
username, password = decoded_bytes.decode("utf-8").split(":")
380-
tool_dict["auth"] = {
381-
"auth_type": "basic",
382-
"username": username,
383-
"password": "********" if password else None,
384-
}
385-
elif tool.auth_type == "bearer":
386-
tool_dict["auth"] = {
387-
"auth_type": "bearer",
388-
"token": "********" if decoded_auth_value["Authorization"] else None,
389-
}
390-
elif tool.auth_type == "authheaders":
391-
tool_dict["auth"] = {
392-
"auth_type": "authheaders",
393-
"auth_header_key": next(iter(decoded_auth_value)),
394-
"auth_header_value": "********" if decoded_auth_value[next(iter(decoded_auth_value))] else None,
395-
}
369+
# Only decode auth if auth_type is set
370+
if tool.auth_type and tool.auth_value:
371+
decoded_auth_value = decode_auth(tool.auth_value)
372+
if tool.auth_type == "basic":
373+
decoded_bytes = base64.b64decode(decoded_auth_value["Authorization"].split("Basic ")[1])
374+
username, password = decoded_bytes.decode("utf-8").split(":")
375+
tool_dict["auth"] = {
376+
"auth_type": "basic",
377+
"username": username,
378+
"password": "********" if password else None,
379+
}
380+
elif tool.auth_type == "bearer":
381+
tool_dict["auth"] = {
382+
"auth_type": "bearer",
383+
"token": "********" if decoded_auth_value["Authorization"] else None,
384+
}
385+
elif tool.auth_type == "authheaders":
386+
# Get first key
387+
first_key = next(iter(decoded_auth_value))
388+
tool_dict["auth"] = {
389+
"auth_type": "authheaders",
390+
"auth_header_key": first_key,
391+
"auth_header_value": "********" if decoded_auth_value[first_key] else None,
392+
}
393+
else:
394+
tool_dict["auth"] = None
396395
else:
397396
tool_dict["auth"] = None
398397

@@ -791,10 +790,17 @@ async def list_tools(
791790
if has_more:
792791
tools = tools[:page_size] # Trim to page_size
793792

793+
# Batch fetch team names for all tools at once
794+
team_ids = {getattr(t, "team_id", None) for t in tools if getattr(t, "team_id", None)}
795+
team_name_map = {}
796+
if team_ids:
797+
teams = db.query(EmailTeam.id, EmailTeam.name).filter(EmailTeam.id.in_(team_ids), EmailTeam.is_active.is_(True)).all()
798+
team_name_map = {team.id: team.name for team in teams}
799+
794800
# Convert to ToolRead objects
795801
result = []
796802
for t in tools:
797-
team_name = self._get_team_name(db, getattr(t, "team_id", None))
803+
team_name = team_name_map.get(getattr(t, "team_id", None))
798804
t.team = team_name
799805
result.append(self._convert_tool_to_read(t))
800806

@@ -944,9 +950,17 @@ async def list_tools_for_user(
944950
# query = query.offset(skip).limit(limit)
945951

946952
tools = db.execute(query).scalars().all()
953+
954+
# Batch fetch team names for all tools at once
955+
tool_team_ids = {getattr(t, "team_id", None) for t in tools if getattr(t, "team_id", None)}
956+
team_name_map = {}
957+
if tool_team_ids:
958+
teams = db.query(EmailTeam.id, EmailTeam.name).filter(EmailTeam.id.in_(tool_team_ids), EmailTeam.is_active.is_(True)).all()
959+
team_name_map = {team.id: team.name for team in teams}
960+
947961
result = []
948962
for t in tools:
949-
team_name = self._get_team_name(db, getattr(t, "team_id", None))
963+
team_name = team_name_map.get(getattr(t, "team_id", None))
950964
t.team = team_name
951965
result.append(self._convert_tool_to_read(t))
952966
return result
@@ -1876,31 +1890,53 @@ async def aggregate_metrics(self, db: Session) -> Dict[str, Any]:
18761890
>>> from unittest.mock import MagicMock
18771891
>>> service = ToolService()
18781892
>>> db = MagicMock()
1879-
>>> db.execute.return_value.scalar.return_value = 0
1893+
>>> # Mock the row result object returned by db.execute().one()
1894+
>>> mock_result_row = MagicMock()
1895+
>>> mock_result_row.total = 10
1896+
>>> mock_result_row.successful = 8
1897+
>>> mock_result_row.failed = 2
1898+
>>> mock_result_row.min_rt = 50.0
1899+
>>> mock_result_row.max_rt = 250.0
1900+
>>> mock_result_row.avg_rt = 150.0
1901+
>>> mock_result_row.last_time = "2023-01-01T12:00:00"
1902+
>>> db.execute.return_value.one.return_value = mock_result_row
18801903
>>> import asyncio
18811904
>>> result = asyncio.run(service.aggregate_metrics(db))
18821905
>>> isinstance(result, dict)
18831906
True
1907+
>>> result['total_executions']
1908+
10
1909+
>>> result['failure_rate']
1910+
0.2
18841911
"""
18851912

1886-
total = db.execute(select(func.count(ToolMetric.id))).scalar() or 0 # pylint: disable=not-callable
1887-
successful = db.execute(select(func.count(ToolMetric.id)).where(ToolMetric.is_success.is_(True))).scalar() or 0 # pylint: disable=not-callable
1888-
failed = db.execute(select(func.count(ToolMetric.id)).where(ToolMetric.is_success.is_(False))).scalar() or 0 # pylint: disable=not-callable
1913+
# Query to get all aggregated metrics at once
1914+
result = db.execute(
1915+
select(
1916+
func.count(ToolMetric.id).label("total"), # pylint: disable=not-callable
1917+
func.sum(case((ToolMetric.is_success.is_(True), 1), else_=0)).label("successful"), # pylint: disable=not-callable
1918+
func.sum(case((ToolMetric.is_success.is_(False), 1), else_=0)).label("failed"), # pylint: disable=not-callable
1919+
func.min(ToolMetric.response_time).label("min_rt"), # pylint: disable=not-callable
1920+
func.max(ToolMetric.response_time).label("max_rt"), # pylint: disable=not-callable
1921+
func.avg(ToolMetric.response_time).label("avg_rt"), # pylint: disable=not-callable
1922+
func.max(ToolMetric.timestamp).label("last_time"), # pylint: disable=not-callable
1923+
)
1924+
).one()
1925+
1926+
total = result.total or 0
1927+
successful = result.successful or 0
1928+
failed = result.failed or 0
18891929
failure_rate = failed / total if total > 0 else 0.0
1890-
min_rt = db.execute(select(func.min(ToolMetric.response_time))).scalar()
1891-
max_rt = db.execute(select(func.max(ToolMetric.response_time))).scalar()
1892-
avg_rt = db.execute(select(func.avg(ToolMetric.response_time))).scalar()
1893-
last_time = db.execute(select(func.max(ToolMetric.timestamp))).scalar()
18941930

18951931
return {
18961932
"total_executions": total,
18971933
"successful_executions": successful,
18981934
"failed_executions": failed,
18991935
"failure_rate": failure_rate,
1900-
"min_response_time": min_rt,
1901-
"max_response_time": max_rt,
1902-
"avg_response_time": avg_rt,
1903-
"last_execution_time": last_time,
1936+
"min_response_time": result.min_rt,
1937+
"max_response_time": result.max_rt,
1938+
"avg_response_time": result.avg_rt,
1939+
"last_execution_time": result.last_time,
19041940
}
19051941

19061942
async def reset_metrics(self, db: Session, tool_id: Optional[int] = None) -> None:

mcpgateway/static/admin.js

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22839,17 +22839,22 @@ function initializeRealTimeMonitoring() {
2283922839

2284022840
// --- Gateway Events ---
2284122841
// Handlers for specific states
22842-
// eventSource.addEventListener("gateway_activated", (e) => handleEntityEvent("gateway", e));
22842+
2284322843
// eventSource.addEventListener("gateway_deactivated", (e) => handleEntityEvent("gateway", e));
22844+
eventSource.addEventListener("gateway_activated", (e) =>
22845+
handleEntityEvent("gateway", e),
22846+
);
2284422847
eventSource.addEventListener("gateway_offline", (e) =>
2284522848
handleEntityEvent("gateway", e),
2284622849
);
2284722850

2284822851
// --- Tool Events ---
2284922852
// Handlers for specific states
2285022853

22851-
// eventSource.addEventListener("tool_activated", (e) => handleEntityEvent("tool", e));
2285222854
// eventSource.addEventListener("tool_deactivated", (e) => handleEntityEvent("tool", e));
22855+
eventSource.addEventListener("tool_activated", (e) =>
22856+
handleEntityEvent("tool", e),
22857+
);
2285322858
eventSource.addEventListener("tool_offline", (e) =>
2285422859
handleEntityEvent("tool", e),
2285522860
);

0 commit comments

Comments
 (0)