Skip to content

Commit fb41aef

Browse files
Flavio Criscianilmbarros
authored andcommitted
Fix sandbox cleanup
Driver and Sanbox have 2 different stores where the endpoints are saved It is possible that the 2 store go out of sync if the endpoint is added to the driver but there is a crash before the sandbox join. On restart now we take the list of endpoints from the network and we assign them back to the sandbox (This is a balena cherry-pick of moby#1805) Signed-off-by: Flavio Crisciani <[email protected]>
1 parent 4d09346 commit fb41aef

File tree

2 files changed

+44
-23
lines changed

2 files changed

+44
-23
lines changed

endpoint.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -844,10 +844,6 @@ func (ep *endpoint) Delete(force bool) error {
844844
}
845845
}
846846

847-
if err = n.getController().deleteFromStore(ep); err != nil {
848-
return err
849-
}
850-
851847
defer func() {
852848
if err != nil && !force {
853849
ep.dbExists = false
@@ -864,6 +860,11 @@ func (ep *endpoint) Delete(force bool) error {
864860
return err
865861
}
866862

863+
// This has to come after the sandbox and the driver to guarantee that can be the source of truth on restart cases
864+
if err = n.getController().deleteFromStore(ep); err != nil {
865+
return err
866+
}
867+
867868
ep.releaseAddress()
868869

869870
if err := n.getEpCnt().DecEndpointCnt(); err != nil {

sandbox_store.go

Lines changed: 39 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
package libnetwork
22

33
import (
4+
"container/heap"
45
"encoding/json"
5-
"sync"
66

77
"github.com/docker/libnetwork/datastore"
88
"github.com/docker/libnetwork/osl"
@@ -207,6 +207,40 @@ func (c *controller) sandboxCleanup(activeSandboxes map[string]interface{}) {
207207
return
208208
}
209209

210+
// Get all the endpoints
211+
// Use the network as the source of truth so that if there was an issue before the sandbox registered the endpoint
212+
// this will be taken anyway
213+
endpointsInSandboxID := map[string][]*endpoint{}
214+
nl, err := c.getNetworksForScope(datastore.LocalScope)
215+
if err != nil {
216+
logrus.Warnf("Could not get list of networks during sandbox cleanup: %v", err)
217+
return
218+
}
219+
220+
for _, n := range nl {
221+
var epl []*endpoint
222+
epl, err = n.getEndpointsFromStore()
223+
if err != nil {
224+
logrus.Warnf("Could not get list of endpoints in network %s during sandbox cleanup: %v", n.name, err)
225+
continue
226+
}
227+
for _, ep := range epl {
228+
ep, err = n.getEndpointFromStore(ep.id)
229+
if err != nil {
230+
logrus.Warnf("Could not get endpoint in network %s during sandbox cleanup: %v", n.name, err)
231+
continue
232+
}
233+
if ep.sandboxID == "" {
234+
logrus.Warnf("Endpoint %s not associated to any sandbox, deleting it", ep.id)
235+
ep.Delete(true)
236+
continue
237+
}
238+
239+
// Append the endpoint to the corresponding sandboxID
240+
endpointsInSandboxID[ep.sandboxID] = append(endpointsInSandboxID[ep.sandboxID], ep)
241+
}
242+
}
243+
210244
for _, kvo := range kvol {
211245
sbs := kvo.(*sbState)
212246

@@ -252,25 +286,11 @@ func (c *controller) sandboxCleanup(activeSandboxes map[string]interface{}) {
252286
c.sandboxes[sb.id] = sb
253287
c.Unlock()
254288

255-
for _, eps := range sbs.Eps {
256-
n, err := c.getNetworkFromStore(eps.Nid)
257-
var ep *endpoint
258-
if err != nil {
259-
logrus.Errorf("getNetworkFromStore for nid %s failed while trying to build sandbox for cleanup: %v", eps.Nid, err)
260-
n = &network{id: eps.Nid, ctrlr: c, drvOnce: &sync.Once{}, persist: true}
261-
ep = &endpoint{id: eps.Eid, network: n, sandboxID: sbs.ID}
262-
} else {
263-
ep, err = n.getEndpointFromStore(eps.Eid)
264-
if err != nil {
265-
logrus.Errorf("getEndpointFromStore for eid %s failed while trying to build sandbox for cleanup: %v", eps.Eid, err)
266-
ep = &endpoint{id: eps.Eid, network: n, sandboxID: sbs.ID}
267-
}
268-
}
269-
if _, ok := activeSandboxes[sb.ID()]; ok && err != nil {
270-
logrus.Errorf("failed to restore endpoint %s in %s for container %s due to %v", eps.Eid, eps.Nid, sb.ContainerID(), err)
271-
continue
289+
// Restore all the endpoints that are supposed to be in this sandbox
290+
if eps, ok := endpointsInSandboxID[sb.id]; ok {
291+
for _, ep := range eps {
292+
heap.Push(&sb.endpoints, ep)
272293
}
273-
sb.addEndpoint(ep)
274294
}
275295

276296
if _, ok := activeSandboxes[sb.ID()]; !ok {

0 commit comments

Comments
 (0)