diff --git a/sled-agent/src/bootstrap/early_networking.rs b/sled-agent/src/bootstrap/early_networking.rs index 0fbea9555ca..1bd0c8ad10a 100644 --- a/sled-agent/src/bootstrap/early_networking.rs +++ b/sled-agent/src/bootstrap/early_networking.rs @@ -122,15 +122,16 @@ impl<'a> EarlyNetworkSetup<'a> { /// * If `rack_network_config` specifies only one switch with an uplink, /// blocks until we can find that switch zone's underlay address. /// * If `rack_network_config` specifies two switches with uplinks, we will - /// block for a while (~5 minutes, currently) trying to find both - /// corresponding switch zones. If we pass our deadline without having + /// block up to the `wait_for_at_least_one` duration trying to find both + /// corresponding switch zones. If we pass the deadline without having /// found both, we will return as soon after that as we can find one of /// the switch zone's addresses. pub async fn lookup_uplinked_switch_zone_underlay_addrs( &self, resolver: &DnsResolver, config: &RackNetworkConfig, - ) -> HashSet { + wait_for_at_least_one: Duration, + ) -> HashMap { // Which switches have configured ports? let uplinked_switches = config .ports @@ -140,58 +141,17 @@ impl<'a> EarlyNetworkSetup<'a> { // If we have no uplinks, we have nothing to look up. if uplinked_switches.is_empty() { - return HashSet::new(); + return HashMap::new(); } - let uplinked_switch_zone_addrs = self - .lookup_switch_zone_underlay_addrs_impl( - resolver, - Some(uplinked_switches), - ) - .await; - - // lookup_switch_zone_underlay_addrs_impl should not return until it - // finds at least one of the uplinked_switches - assert!(!uplinked_switch_zone_addrs.is_empty()); - - uplinked_switch_zone_addrs.into_values().collect() - } - - /// Dynamically looks up (via internal DNS and queries to MGS) the underlay - /// addresses of the switch zones. 
- /// - /// Blocks until either: - /// - /// * We found the location and underlay addresses of all switch zones - /// reported by internal DNS - /// * We've passed a substantial deadline (~5 minutes, currently) and have - /// found the location and underlay address of one switch zone reported by - /// internal DNS - pub async fn lookup_switch_zone_underlay_addrs( - &self, - resolver: &DnsResolver, - ) -> HashMap { - self.lookup_switch_zone_underlay_addrs_impl(resolver, None).await - } - - async fn lookup_switch_zone_underlay_addrs_impl( - &self, - resolver: &DnsResolver, - switches_to_find: Option>, - ) -> HashMap { - // We will wait up to 5 minutes to try to find all switches; if we pass - // the 5 minute mark, we will return as soon as we find at least one - // switch. - const MAX_SWITCH_ZONE_WAIT_TIME: Duration = Duration::from_secs(60 * 5); - let query_start = Instant::now(); - retry_notify( + let uplinked_switch_zone_addrs = retry_notify( retry_policy_switch_mapping(), || async { match self .lookup_switch_zone_underlay_addrs_one_attempt( resolver, - switches_to_find.as_ref(), + &uplinked_switches, ) .await? { @@ -206,16 +166,16 @@ impl<'a> EarlyNetworkSetup<'a> { } LookupSwitchZoneAddrsResult::PartialSuccess(map) => { let elapsed = query_start.elapsed(); - if elapsed >= MAX_SWITCH_ZONE_WAIT_TIME { + if elapsed >= wait_for_at_least_one { // We only found one switch when we are expecting // two, but we've been waiting for too long: go with // just one. 
warn!( self.log, "Only found one switch (expected two), \ - but passed wait time of \ - {MAX_SWITCH_ZONE_WAIT_TIME:?}: returning"; + but passed requested wait time: returning"; "switch_found" => ?map, + "requested_wait_time" => ?wait_for_at_least_one, "total_elapsed" => ?elapsed, ); Ok(map) @@ -231,16 +191,25 @@ impl<'a> EarlyNetworkSetup<'a> { } }, |error, delay| { + let elapsed = query_start.elapsed(); warn!( self.log, "Failed to look up switch zone locations"; "error" => #%error, "retry_after" => ?delay, + "requested_wait_time" => ?wait_for_at_least_one, + "total_elapsed" => ?elapsed, ); }, ) .await - .expect("Expected an infinite retry loop finding switch zones") + .expect("Expected an infinite retry loop finding switch zones"); + + // The retry loop above should not return until it + // finds at least one of the uplinked_switches + assert!(!uplinked_switch_zone_addrs.is_empty()); + + uplinked_switch_zone_addrs } // TODO: #3601 Audit switch location discovery logic for robustness @@ -250,14 +219,12 @@ impl<'a> EarlyNetworkSetup<'a> { async fn lookup_switch_zone_underlay_addrs_one_attempt( &self, resolver: &DnsResolver, - switches_to_find: Option<&HashSet>, + switches_to_find: &HashSet, ) -> Result> { - if let Some(switches_to_find) = switches_to_find { - // We should only be called with a nonempty `switches_to_find`; - // otherwise we'll never return: we always want to find at least one - // of these switches. - assert!(!switches_to_find.is_empty()); - } + // We should only be called with a nonempty `switches_to_find`; + // otherwise we'll never return: we always want to find at least one + // of these switches. + assert!(!switches_to_find.is_empty()); // We might have stale DNS results; clear our resolver's cache. resolver.clear_cache(); @@ -313,45 +280,19 @@ impl<'a> EarlyNetworkSetup<'a> { } } - if let Some(switches_to_find) = switches_to_find { - // Were we tasked with finding _specific_ switches?
If so, filter - // `switch_location_map` down to just the ones we care about, and - // then return total/partial/no success based on what's left. - switch_location_map - .retain(|location, _addr| switches_to_find.contains(location)); + // Filter `switch_location_map` down to just the ones we care about, + // and then return total/partial/no success based on what's left. + switch_location_map + .retain(|location, _addr| switches_to_find.contains(location)); - if switch_location_map.is_empty() { - Err(BackoffError::transient( - "No switch locations found".to_string(), - )) - } else if switch_location_map.len() == switches_to_find.len() { - Ok(LookupSwitchZoneAddrsResult::TotalSuccess( - switch_location_map, - )) - } else { - Ok(LookupSwitchZoneAddrsResult::PartialSuccess( - switch_location_map, - )) - } + if switch_location_map.is_empty() { + Err(BackoffError::transient( + "No switch locations found".to_string(), + )) + } else if switch_location_map.len() == switches_to_find.len() { + Ok(LookupSwitchZoneAddrsResult::TotalSuccess(switch_location_map)) } else { - // We were not tasked with finding specific switches: we're done if - // we found both, or if we found one and internal DNS only gave us - // one IP address (e.g., in test environments with just one switch). 
- if switch_location_map.is_empty() { - Err(BackoffError::transient( - "No switch locations found".to_string(), - )) - } else if switch_location_map.len() == 1 - && switch_zone_addrs.len() > 1 - { - Ok(LookupSwitchZoneAddrsResult::PartialSuccess( - switch_location_map, - )) - } else { - Ok(LookupSwitchZoneAddrsResult::TotalSuccess( - switch_location_map, - )) - } + Ok(LookupSwitchZoneAddrsResult::PartialSuccess(switch_location_map)) } } diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index e5c0cdd0e94..c456ac958f6 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -1382,7 +1382,17 @@ impl ServiceInner { // Ask MGS in each switch zone which switch it is. let switch_mgmt_addrs = EarlyNetworkSetup::new(&self.log) - .lookup_switch_zone_underlay_addrs(&resolver) + .lookup_uplinked_switch_zone_underlay_addrs( + &resolver, + &config.rack_network_config, + // We're willing to wait forever to find all the switches that have + // configured uplinks; if we attempt to proceed without doing + // so, we'll fail handing off to Nexus later. (Ideally we could + // complete RSS with only one switch up and then configure the + // second later, but that currently doesn't work.) + // + Duration::MAX, + ) .await; rss_step.update(RssStep::InitNtp); diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 7c9691370cb..a1d0eb0a964 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -1031,7 +1031,35 @@ impl ServiceManager { &self, zone_args: &ZoneArgs<'_>, ) -> Result, Error> { - // Only some services currently need OPTE ports + // As a part of setting up OPTE ports, we notify dendrite on all + // switches that have an uplink about the new required NAT entries. This + // requires finding the switch zone IP addresses. We currently block + // until either: + // + // 1. We find all switch zone IPs. + // 2. We find at least one switch zone IP and this timeout elapses.
// + // If Nexus is up, we don't really need to do any of this work; it has a + // background task that will sync NAT entries periodically. However, + // it's critical that we set up NAT entries for boundary NTP in + // particular during cold boot; otherwise, we won't be able to timesync + // and bring the rack up. + // + // The choice of timeout here is a tension between wanting to wait for + // both switches and not wanting to block zone startup indefinitely if + // one of the scrimlets or switches is unavailable for an extended + // period of time. We should probably revisit this entirely - maybe + // sled-agent should have its own NAT config reconciler for cold boot + // (although it's unclear how something like that would interact with + // Nexus)? + // + // We'll pick 5 minutes, which has historically been the timeout here + // and should hopefully give enough time for a "just rebooted" scrimlet + // to bring its switch zone up, if we get unlucky in coincidental + // timings. + const WAIT_FOR_ALL_SWITCH_ZONES_TIMEOUT: Duration = + Duration::from_secs(5 * 60); + if !matches!( zone_args.omicron_type(), Some(OmicronZoneType::ExternalDns { .. }) @@ -1065,11 +1093,12 @@ impl ServiceManager { .lookup_uplinked_switch_zone_underlay_addrs( resolver, rack_network_config, + WAIT_FOR_ALL_SWITCH_ZONES_TIMEOUT, ) .await; let dpd_clients: Vec = uplinked_switch_zone_addrs - .iter() + .values() .map(|addr| { DpdClient::new( &format!("http://[{}]:{}", addr, DENDRITE_PORT),