Skip to content

Rack fails to come up when one switch does not come up in a timely manner #9678

@rcgoodfellow

Description

@rcgoodfellow

On London we've observed the following.

We hit this condition

https://github.com/oxidecomputer/omicron/blame/4175bf66d6c69a037a8bad64c55401defec4e164/sled-agent/src/bootstrap/early_networking.rs#L209-L223

as seen in the logs

BRM42220030 # cat /pool/ext/062523e4-e44c-4a01-8231-febed9979916/crypt/debug/global/oxide-sled-agent:default.log.1768626882 | grep 'Only found' | grep 'passed' | looker
00:15:35.144Z WARN SledAgent (RSS): Only found one switch (expected two), but passed wait time of 300s: returning
    file = sled-agent/src/bootstrap/early_networking.rs:213
    switch_found = {Switch1: fd00:1122:3344:102::2}
    total_elapsed = 313.58204264s

I believe the idea here is to come up with one switch and then continue trying to bring up the second switch once Nexus is up. However, the latter part does not seem to be happening. We are stuck in RSS handoff, with this log in sled-agent:

2026-01-17 02:33:45.955Z INFO SledAgent/664 (RSS) on BRM42220030: Failed to handoff to nexus: Error Response: status: 404 Not Found; headers: {"content-type": "application/json", "x-request-id": "a3aab6a4-d92c-4b20-852b-e6e7de3999a5", "content-length": "151", "date": "Sat, 17 Jan 2026 02:33:45 GMT"}; value: Error { error_code: Some("ObjectNotFound"), message: "not found: switch-port with name \"qsfp0\"", request_id: "a3aab6a4-d92c-4b20-852b-e6e7de3999a5" }
    file = sled-agent/src/rack_setup/service.rs:1066

and this corresponding log in nexus

22:41:52.322Z INFO 275f2274-fddb-4b32-b60a-d53b65c420a5 (dropshot_lockstep): discovered ports for switch1: [
        Internal(
            Internal(
                "int0",
            ),
        ),
        Rear(
            Rear(
                "rear0",
            ),
        ),
        Rear(
            Rear(
                "rear1",
            ),
        ),
        Rear(
            Rear(
                "rear2",
            ),
        ),
        Rear(
            Rear(
                "rear3",
            ),
        ),
        Rear(
            Rear(
                "rear4",
            ),
        ),
        Rear(
            Rear(
                "rear5",
            ),
        ),
        Rear(
            Rear(
                "rear6",
            ),
        ),
        Rear(
            Rear(
                "rear7",
            ),
        ),
        Rear(
            Rear(
                "rear8",
            ),
        ),
        Rear(
            Rear(
                "rear9",
            ),
        ),
        Rear(
            Rear(
                "rear10",
            ),
        ),
        Rear(
            Rear(
                "rear11",
            ),
        ),
        Rear(
            Rear(
                "rear12",
            ),
        ),
        Rear(
            Rear(
                "rear13",
            ),
        ),
        Rear(
            Rear(
                "rear14",
            ),
        ),
        Rear(
            Rear(
                "rear15",
            ),
        ),
        Rear(
            Rear(
                "rear16",
            ),
        ),
        Rear(
            Rear(
                "rear17",
            ),
        ),
        Rear(
            Rear(
                "rear18",
            ),
        ),
        Rear(
            Rear(
                "rear19",
            ),
        ),
        Rear(
            Rear(
                "rear20",
            ),
        ),
        Rear(
            Rear(
                "rear21",
            ),
        ),
        Rear(
            Rear(
                "rear22",
            ),
        ),
        Rear(
            Rear(
                "rear23",
            ),
        ),
        Rear(
            Rear(
                "rear24",
            ),
        ),
        Rear(
            Rear(
                "rear25",
            ),
        ),
        Rear(
            Rear(
                "rear26",
            ),
        ),
        Rear(
            Rear(
                "rear27",
            ),
        ),
        Rear(
            Rear(
                "rear28",
            ),
        ),
        Rear(
            Rear(
                "rear29",
            ),
        ),
        Rear(
            Rear(
                "rear30",
            ),
        ),
        Rear(
            Rear(
                "rear31",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp0",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp1",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp2",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp3",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp4",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp5",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp6",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp7",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp8",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp9",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp10",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp11",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp12",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp13",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp14",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp15",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp16",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp17",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp18",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp19",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp20",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp21",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp22",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp23",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp24",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp25",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp26",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp27",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp28",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp29",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp30",
            ),
        ),
        Qsfp(
            Qsfp(
                "qsfp31",
            ),
        ),
    ]
    authenticated = true
    file = nexus/src/app/rack.rs:346
    local_addr = [fd00:1122:3344:102::4]:12232
    method = PUT
    remote_addr = [fd00:1122:3344:102::1]:45447
    req_id = 067f6c02-f80a-4185-b5b6-04b8144e3a32
    type = user_builtin
    uri = /racks/d3a56402-aca7-4f9d-8d68-f294621f2b20/initialization-complete
    user_builtin_id = 001de000-05e4-4000-8000-000000000002
22:41:52.322Z INFO 275f2274-fddb-4b32-b60a-d53b65c420a5 (dropshot_lockstep): populating ports for switch1: [
        Name(
            "qsfp0",
        ),
        Name(
            "qsfp1",
        ),
        Name(
            "qsfp2",
        ),
        Name(
            "qsfp3",
        ),
        Name(
            "qsfp4",
        ),
        Name(
            "qsfp5",
        ),
        Name(
            "qsfp6",
        ),
        Name(
            "qsfp7",
        ),
        Name(
            "qsfp8",
        ),
        Name(
            "qsfp9",
        ),
        Name(
            "qsfp10",
        ),
        Name(
            "qsfp11",
        ),
        Name(
            "qsfp12",
        ),
        Name(
            "qsfp13",
        ),
        Name(
            "qsfp14",
        ),
        Name(
            "qsfp15",
        ),
        Name(
            "qsfp16",
        ),
        Name(
            "qsfp17",
        ),
        Name(
            "qsfp18",
        ),
        Name(
            "qsfp19",
        ),
        Name(
            "qsfp20",
        ),
        Name(
            "qsfp21",
        ),
        Name(
            "qsfp22",
        ),
        Name(
            "qsfp23",
        ),
        Name(
            "qsfp24",
        ),
        Name(
            "qsfp25",
        ),
        Name(
            "qsfp26",
        ),
        Name(
            "qsfp27",
        ),
        Name(
            "qsfp28",
        ),
        Name(
            "qsfp29",
        ),
        Name(
            "qsfp30",
        ),
        Name(
            "qsfp31",
        ),
    ]
    authenticated = true
    file = nexus/src/app/rack.rs:356
    local_addr = [fd00:1122:3344:102::4]:12232
    method = PUT
    remote_addr = [fd00:1122:3344:102::1]:45447
    req_id = 067f6c02-f80a-4185-b5b6-04b8144e3a32
    type = user_builtin
    uri = /racks/d3a56402-aca7-4f9d-8d68-f294621f2b20/initialization-complete
    user_builtin_id = 001de000-05e4-4000-8000-000000000002
22:41:52.477Z INFO 275f2274-fddb-4b32-b60a-d53b65c420a5 (dropshot_lockstep): request completed
    error_message_external = not found: switch-port with name "qsfp0"
    error_message_internal = not found: switch-port with name "qsfp0"
    file = /home/alan/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/dropshot-0.16.6/src/server.rs:855
    latency_us = 216182
    local_addr = [fd00:1122:3344:102::4]:12232
    method = PUT
    remote_addr = [fd00:1122:3344:102::1]:45447
    req_id = 067f6c02-f80a-4185-b5b6-04b8144e3a32
    response_code = 404
    uri = /racks/d3a56402-aca7-4f9d-8d68-f294621f2b20/initialization-complete

It appears that we are not reporting switch0 in the rack initialization API call to Nexus, so Nexus does not create database records for switch0. Subsequent operations during rack initialization, however, require that switch0's ports be populated; they are not, which is presumably why the lookup of switch port "qsfp0" fails with a 404.

The CockroachDB state shows that switch ports were populated only for switch1:

root@[fd00:1122:3344:104::4]:32221/omicron> select switch_location, count(*) from switch_port group by switch_location;
  switch_location | count
------------------+--------
  switch1         |    32
(1 row)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions