NM-288: fix orphan nodes on bulk host deletion and refactor node cleanup (#3948)

* fix(go): set persistent keep alive when registering host using sso;

* fix(go): run posture check violations on delete;

* fix(go): upsert node on approving pending host;

* fix(go): resolve concurrency issues during group delete cleanup;

* fix(go): update doc links;

* fix(go): add created and updated fields to host;

* fix(go): skip delete and update superadmin on sync users;

* fix(go): use conn directly for now;

* fix(go): remove acl for idp groups;

* fix(go): quote fields;

* fix(go): use filters with count;

* feat(go): add a search query;

* fix(go): cleanup acls;

* fix(go): review fixes;

* fix(go): remove additional loop;

* fix(go): fix

* v1.5.1: separate out idp sync and reset signals for HA

* v1.5.1: add grps with name for logging

* v1.5.1: clear posture check violations when all checks are deleted

* v1.5.1: set static when default host

* v1.5.1: fix db status check

* rm set max conns

* v1.5.1: reset auto assigned gw when disconnected

* fix(go): skip global network admin and user groups when splitting;

* v1.5.1: fix update node call from client

* fix(go): separate out migration from normal usage;

* fix(go): skip default groups;

* fix(go): create policies for existing groups on network create;

* fix(go): skip fatal log on clickhouse conn;

* fix(go): add posture check cleanup;

* NM-288: populate relevant name for acl types for UI

* NM-288: populate grp names for posture check apis

* NM-228: add network grps api

* NM-288: add network users api

* now check each group's NetworkRoles for either the specific network ID or schema.AllNetworks (all_networks)

* NM-288: check and unassign auto gw when node is disconnected from cli

* NM-288: optimise network users api call

* NM-288: block auto assign when set to use inet gw

* NM-288: pass all network scoped get apis for platform users

* NM-288: fix orphan nodes on bulk host deletion and refactor node cleanup

- Extract cleanupNodeReferences() from DeleteNode to handle best-effort
  cleanup of relay, internet gw, failover, nameserver, ACL, and egress refs
- DisassociateAllNodesFromHost now calls cleanupNodeReferences + DeleteNodeByID
  directly, bypassing redundant host-association updates that could fail and
  leave nodes orphaned
- Track failed node deletions instead of unconditionally clearing host.Nodes;
  return error if any nodes couldn't be deleted to prevent host removal
- Fix DeleteNode returning error even when node was successfully deleted in
  the host-not-found path

* NM-288: cleanupNodeReferences(node) now only runs in the hard-delete path (when purge=true or alreadyDeleted=true), so it won't execute twice — once on soft-delete and again when the zombie manager hard-deletes.

* NM-288: add orphan deletion log

* DisassociateAllNodesFromHost now always returns nil, logging warnings for any partial failures (upsert errors or undeletable nodes). This allows RemoveHost to always proceed to h.Delete()

---------

Co-authored-by: VishalDalwadi <dalwadivishal26@gmail.com>
Co-authored-by: Vishal Dalwadi <51291657+VishalDalwadi@users.noreply.github.com>
This commit is contained in:
Abhishek Kondur
2026-03-31 17:08:49 +05:30
committed by GitHub
parent aebf809f02
commit b97fbc69db
2 changed files with 54 additions and 41 deletions
+16 -5
View File
@@ -422,7 +422,9 @@ func DissasociateNodeFromHost(n *models.Node, h *schema.Host) error {
return UpsertHost(h)
}
// DisassociateAllNodesFromHost - deletes all nodes of the host
// DisassociateAllNodesFromHost - deletes all nodes of the host.
// Performs reference cleanup and directly deletes each node record,
// bypassing host-association updates since the host itself is being removed.
func DisassociateAllNodesFromHost(hostIDStr string) error {
hostID, err := uuid.Parse(hostIDStr)
if err != nil {
@@ -433,20 +435,29 @@ func DisassociateAllNodesFromHost(hostIDStr string) error {
if err := host.Get(db.WithContext(context.TODO())); err != nil {
return err
}
var failedNodes []string
for _, nodeID := range host.Nodes {
node, err := GetNodeByID(nodeID)
if err != nil {
logger.Log(0, "failed to get host node, node id:", nodeID, err.Error())
continue
}
if err := DeleteNode(&node, true); err != nil {
logger.Log(0, "failed to delete node", node.ID.String(), err.Error())
cleanupNodeReferences(&node)
if err := DeleteNodeByID(&node); err != nil {
slog.Error("failed to delete node record", "node", node.ID, "host", hostIDStr, "error", err)
failedNodes = append(failedNodes, nodeID)
continue
}
logger.Log(3, "deleted node", node.ID.String(), "of host", host.ID.String())
}
host.Nodes = []string{}
return UpsertHost(host)
host.Nodes = failedNodes
if err := UpsertHost(host); err != nil {
slog.Error("failed to upsert host after node cleanup", "host", hostIDStr, "error", err)
}
if len(failedNodes) > 0 {
slog.Warn("some nodes could not be deleted during host cleanup", "host", hostIDStr, "failed_count", len(failedNodes), "failed_nodes", failedNodes)
}
return nil
}
// GetDefaultHosts - retrieve all hosts marked as default from DB
+38 -36
View File
@@ -279,17 +279,16 @@ func UpdateNode(currentNode *models.Node, newNode *models.Node) error {
}
// DeleteNode - marks node for deletion (and adds to zombie list) if called by UI or deletes node if called by node
func DeleteNode(node *models.Node, purge bool) error {
alreadyDeleted := node.PendingDelete || node.Action == models.NODE_DELETE
node.Action = models.NODE_DELETE
//delete ext clients if node is ingress gw
// cleanupNodeReferences handles best-effort cleanup of all external references
// to a node (relay, internet gw, failover, nameservers, ACL, egress, enrollment keys).
// Errors are logged but do not prevent node deletion.
func cleanupNodeReferences(node *models.Node) {
if node.IsIngressGateway {
if err := DeleteGatewayExtClients(node.ID.String(), node.Network); err != nil {
slog.Error("failed to delete ext clients", "nodeid", node.ID.String(), "error", err.Error())
}
}
if node.IsRelayed {
// cleanup node from relayednodes on relay node
relayNode, err := GetNodeByID(node.RelayedBy)
if err == nil {
relayedNodes := []string{}
@@ -310,7 +309,6 @@ func DeleteNode(node *models.Node, purge bool) error {
ResetAutoRelayedPeer(node)
}
if node.IsRelay {
// unset all the relayed nodes
SetRelayedNodes(false, node.ID.String(), node.RelayedNodes)
}
if node.InternetGwID != "" {
@@ -330,6 +328,35 @@ func DeleteNode(node *models.Node, purge bool) error {
if node.IsInternetGateway {
UnsetInternetGw(node)
}
filters := make(map[string]bool)
if node.Address.IP != nil {
filters[node.Address.IP.String()] = true
}
if node.Address6.IP != nil {
filters[node.Address6.IP.String()] = true
}
nameservers, _ := (&schema.Nameserver{
NetworkID: node.Network,
}).ListByNetwork(db.WithContext(context.TODO()))
for _, ns := range nameservers {
ns.Servers = FilterOutIPs(ns.Servers, filters)
if len(ns.Servers) > 0 {
_ = ns.Update(db.WithContext(context.TODO()))
} else {
_ = ns.Delete(db.WithContext(context.TODO()))
}
}
go RemoveNodeFromAclPolicy(*node)
go RemoveNodeFromEgress(*node)
go RemoveNodeFromEnrollmentKeys(node)
}
func DeleteNode(node *models.Node, purge bool) error {
alreadyDeleted := node.PendingDelete || node.Action == models.NODE_DELETE
node.Action = models.NODE_DELETE
if !purge && !alreadyDeleted {
newnode := *node
newnode.PendingDelete = true
@@ -342,47 +369,22 @@ func DeleteNode(node *models.Node, purge bool) error {
if alreadyDeleted {
logger.Log(1, "forcibly deleting node", node.ID.String())
}
cleanupNodeReferences(node)
host := &schema.Host{
ID: node.HostID,
}
err := host.Get(db.WithContext(context.TODO()))
if err != nil {
if err := host.Get(db.WithContext(context.TODO())); err != nil {
logger.Log(1, "no host found for node", node.ID.String(), "deleting..")
if delErr := DeleteNodeByID(node); delErr != nil {
logger.Log(0, "failed to delete node", node.ID.String(), delErr.Error())
return delErr
}
return err
logger.Log(1, "deleted orphaned node (no host record found)", node.ID.String())
return nil
}
if err := DissasociateNodeFromHost(node, host); err != nil {
return err
}
filters := make(map[string]bool)
if node.Address.IP != nil {
filters[node.Address.IP.String()] = true
}
if node.Address6.IP != nil {
filters[node.Address6.IP.String()] = true
}
nameservers, _ := (&schema.Nameserver{
NetworkID: node.Network,
}).ListByNetwork(db.WithContext(context.TODO()))
for _, ns := range nameservers {
ns.Servers = FilterOutIPs(ns.Servers, filters)
if len(ns.Servers) > 0 {
_ = ns.Update(db.WithContext(context.TODO()))
} else {
// TODO: deleting a nameserver dns server could cause trouble for other nodes.
// TODO: try to figure out a sequence that works the best.
_ = ns.Delete(db.WithContext(context.TODO()))
}
}
go RemoveNodeFromAclPolicy(*node)
go RemoveNodeFromEgress(*node)
go RemoveNodeFromEnrollmentKeys(node)
return nil
}