mirror of
https://github.com/gravitl/netmaker.git
synced 2026-04-22 16:07:11 +08:00
NM-288: fix orphan nodes on bulk host deletion and refactor node cleanup (#3948)
* fix(go): set persistent keep alive when registering host using sso; * fix(go): run posture check violations on delete; * fix(go): upsert node on approving pending host; * fix(go): resolve concurrency issues during group delete cleanup; * fix(go): update doc links; * fix(go): add created and updated fields to host; * fix(go): skip delete and update superadmin on sync users; * fix(go): use conn directly for now; * fix(go): remove acl for idp groups; * fix(go): quote fields; * fix(go): use filters with count; * feat(go): add a search query; * fix(go): cleanup acls; * fix(go): review fixes; * fix(go): remove additional loop; * fix(go): fix * v1.5.1: separate out idp sync and reset signals for HA * v1.5.1: add grps with name for logging * v1.5.1: clear posture check violations when all checks are deleted * v1.5.1: set static when default host * v1.5.1: fix db status check * rm set max conns * v1.5.1: reset auto assigned gw when disconnected * fix(go): skip global network admin and user groups when splitting; * v1.5.1: fix update node call from client * fix(go): separate out migration from normal usage; * fix(go): skip default groups; * fix(go): create policies for existing groups on network create; * fix(go): skip fatal log on clickhouse conn; * fix(go): add posture check cleanup; * NM-288: populate relevant name for acl types for UI * NM-288: populate grp names for posture check apis * NM-288: add network grps api * NM-288: add network users api * now check each group's NetworkRoles for either the specific network ID or schema.AllNetworks (all_networks) * NM-288: check and unassign auto gw when node is disconnected from cli * NM-288: optimise network users api call * NM-288: block auto assign when set to use inet gw * NM-288: pass all network scoped get apis for platform users * NM-288: fix orphan nodes on bulk host deletion and refactor node cleanup - Extract cleanupNodeReferences() from DeleteNode to handle best-effort cleanup of relay, internet gw, failover, 
nameserver, ACL, and egress refs - DisassociateAllNodesFromHost now calls cleanupNodeReferences + DeleteNodeByID directly, bypassing redundant host-association updates that could fail and leave nodes orphaned - Track failed node deletions instead of unconditionally clearing host.Nodes; return error if any nodes couldn't be deleted to prevent host removal - Fix DeleteNode returning error even when node was successfully deleted in the host-not-found path * NM-288: cleanupNodeReferences(node) now only runs in the hard-delete path (when purge=true or alreadyDeleted=true), so it won't execute twice — once on soft-delete and again when the zombie manager hard-deletes. * NM-288: add orphan deletion log * DisassociateAllNodesFromHost now always returns nil, logging warnings for any partial failures (upsert errors or undeletable nodes). This allows RemoveHost to always proceed to h.Delete() --------- Co-authored-by: VishalDalwadi <dalwadivishal26@gmail.com> Co-authored-by: Vishal Dalwadi <51291657+VishalDalwadi@users.noreply.github.com>
This commit is contained in:
+16
-5
@@ -422,7 +422,9 @@ func DissasociateNodeFromHost(n *models.Node, h *schema.Host) error {
|
||||
return UpsertHost(h)
|
||||
}
|
||||
|
||||
// DisassociateAllNodesFromHost - deletes all nodes of the host
|
||||
// DisassociateAllNodesFromHost - deletes all nodes of the host.
|
||||
// Performs reference cleanup and directly deletes each node record,
|
||||
// bypassing host-association updates since the host itself is being removed.
|
||||
func DisassociateAllNodesFromHost(hostIDStr string) error {
|
||||
hostID, err := uuid.Parse(hostIDStr)
|
||||
if err != nil {
|
||||
@@ -433,20 +435,29 @@ func DisassociateAllNodesFromHost(hostIDStr string) error {
|
||||
if err := host.Get(db.WithContext(context.TODO())); err != nil {
|
||||
return err
|
||||
}
|
||||
var failedNodes []string
|
||||
for _, nodeID := range host.Nodes {
|
||||
node, err := GetNodeByID(nodeID)
|
||||
if err != nil {
|
||||
logger.Log(0, "failed to get host node, node id:", nodeID, err.Error())
|
||||
continue
|
||||
}
|
||||
if err := DeleteNode(&node, true); err != nil {
|
||||
logger.Log(0, "failed to delete node", node.ID.String(), err.Error())
|
||||
cleanupNodeReferences(&node)
|
||||
if err := DeleteNodeByID(&node); err != nil {
|
||||
slog.Error("failed to delete node record", "node", node.ID, "host", hostIDStr, "error", err)
|
||||
failedNodes = append(failedNodes, nodeID)
|
||||
continue
|
||||
}
|
||||
logger.Log(3, "deleted node", node.ID.String(), "of host", host.ID.String())
|
||||
}
|
||||
host.Nodes = []string{}
|
||||
return UpsertHost(host)
|
||||
host.Nodes = failedNodes
|
||||
if err := UpsertHost(host); err != nil {
|
||||
slog.Error("failed to upsert host after node cleanup", "host", hostIDStr, "error", err)
|
||||
}
|
||||
if len(failedNodes) > 0 {
|
||||
slog.Warn("some nodes could not be deleted during host cleanup", "host", hostIDStr, "failed_count", len(failedNodes), "failed_nodes", failedNodes)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetDefaultHosts - retrieve all hosts marked as default from DB
|
||||
|
||||
+38
-36
@@ -279,17 +279,16 @@ func UpdateNode(currentNode *models.Node, newNode *models.Node) error {
|
||||
}
|
||||
|
||||
// DeleteNode - marks node for deletion (and adds to zombie list) if called by UI or deletes node if called by node
|
||||
func DeleteNode(node *models.Node, purge bool) error {
|
||||
alreadyDeleted := node.PendingDelete || node.Action == models.NODE_DELETE
|
||||
node.Action = models.NODE_DELETE
|
||||
//delete ext clients if node is ingress gw
|
||||
// cleanupNodeReferences handles best-effort cleanup of all external references
|
||||
// to a node (relay, internet gw, failover, nameservers, ACL, egress, enrollment keys).
|
||||
// Errors are logged but do not prevent node deletion.
|
||||
func cleanupNodeReferences(node *models.Node) {
|
||||
if node.IsIngressGateway {
|
||||
if err := DeleteGatewayExtClients(node.ID.String(), node.Network); err != nil {
|
||||
slog.Error("failed to delete ext clients", "nodeid", node.ID.String(), "error", err.Error())
|
||||
}
|
||||
}
|
||||
if node.IsRelayed {
|
||||
// cleanup node from relayednodes on relay node
|
||||
relayNode, err := GetNodeByID(node.RelayedBy)
|
||||
if err == nil {
|
||||
relayedNodes := []string{}
|
||||
@@ -310,7 +309,6 @@ func DeleteNode(node *models.Node, purge bool) error {
|
||||
ResetAutoRelayedPeer(node)
|
||||
}
|
||||
if node.IsRelay {
|
||||
// unset all the relayed nodes
|
||||
SetRelayedNodes(false, node.ID.String(), node.RelayedNodes)
|
||||
}
|
||||
if node.InternetGwID != "" {
|
||||
@@ -330,6 +328,35 @@ func DeleteNode(node *models.Node, purge bool) error {
|
||||
if node.IsInternetGateway {
|
||||
UnsetInternetGw(node)
|
||||
}
|
||||
|
||||
filters := make(map[string]bool)
|
||||
if node.Address.IP != nil {
|
||||
filters[node.Address.IP.String()] = true
|
||||
}
|
||||
if node.Address6.IP != nil {
|
||||
filters[node.Address6.IP.String()] = true
|
||||
}
|
||||
nameservers, _ := (&schema.Nameserver{
|
||||
NetworkID: node.Network,
|
||||
}).ListByNetwork(db.WithContext(context.TODO()))
|
||||
for _, ns := range nameservers {
|
||||
ns.Servers = FilterOutIPs(ns.Servers, filters)
|
||||
if len(ns.Servers) > 0 {
|
||||
_ = ns.Update(db.WithContext(context.TODO()))
|
||||
} else {
|
||||
_ = ns.Delete(db.WithContext(context.TODO()))
|
||||
}
|
||||
}
|
||||
|
||||
go RemoveNodeFromAclPolicy(*node)
|
||||
go RemoveNodeFromEgress(*node)
|
||||
go RemoveNodeFromEnrollmentKeys(node)
|
||||
}
|
||||
|
||||
func DeleteNode(node *models.Node, purge bool) error {
|
||||
alreadyDeleted := node.PendingDelete || node.Action == models.NODE_DELETE
|
||||
node.Action = models.NODE_DELETE
|
||||
|
||||
if !purge && !alreadyDeleted {
|
||||
newnode := *node
|
||||
newnode.PendingDelete = true
|
||||
@@ -342,47 +369,22 @@ func DeleteNode(node *models.Node, purge bool) error {
|
||||
if alreadyDeleted {
|
||||
logger.Log(1, "forcibly deleting node", node.ID.String())
|
||||
}
|
||||
cleanupNodeReferences(node)
|
||||
host := &schema.Host{
|
||||
ID: node.HostID,
|
||||
}
|
||||
err := host.Get(db.WithContext(context.TODO()))
|
||||
if err != nil {
|
||||
if err := host.Get(db.WithContext(context.TODO())); err != nil {
|
||||
logger.Log(1, "no host found for node", node.ID.String(), "deleting..")
|
||||
if delErr := DeleteNodeByID(node); delErr != nil {
|
||||
logger.Log(0, "failed to delete node", node.ID.String(), delErr.Error())
|
||||
return delErr
|
||||
}
|
||||
return err
|
||||
logger.Log(1, "deleted orphaned node (no host record found)", node.ID.String())
|
||||
return nil
|
||||
}
|
||||
if err := DissasociateNodeFromHost(node, host); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
filters := make(map[string]bool)
|
||||
if node.Address.IP != nil {
|
||||
filters[node.Address.IP.String()] = true
|
||||
}
|
||||
|
||||
if node.Address6.IP != nil {
|
||||
filters[node.Address6.IP.String()] = true
|
||||
}
|
||||
|
||||
nameservers, _ := (&schema.Nameserver{
|
||||
NetworkID: node.Network,
|
||||
}).ListByNetwork(db.WithContext(context.TODO()))
|
||||
for _, ns := range nameservers {
|
||||
ns.Servers = FilterOutIPs(ns.Servers, filters)
|
||||
if len(ns.Servers) > 0 {
|
||||
_ = ns.Update(db.WithContext(context.TODO()))
|
||||
} else {
|
||||
// TODO: deleting a nameserver dns server could cause trouble for other nodes.
|
||||
// TODO: try to figure out a sequence that works the best.
|
||||
_ = ns.Delete(db.WithContext(context.TODO()))
|
||||
}
|
||||
}
|
||||
|
||||
go RemoveNodeFromAclPolicy(*node)
|
||||
go RemoveNodeFromEgress(*node)
|
||||
go RemoveNodeFromEnrollmentKeys(node)
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user