netmaker/logic/peers.go
Abhishek Kondur 292af315dd NM-271: Scalability Improvements (#3921)
* feat(go): add user schema;

* feat(go): migrate to user schema;

* feat(go): add audit fields;

* feat(go): remove unused fields from the network model;

* feat(go): add network schema;

* feat(go): migrate to network schema;

* refactor(go): add comment to clarify migration logic;

* fix(go): test failures;

* fix(go): test failures;

* feat(go): change membership table to store memberships at all scopes;

* feat(go): add schema for access grants;

* feat(go): remove nameservers from new networks table; ensure db passed for schema functions;

* feat(go): set max conns for sqlite to 1;

* fix(go): issues updating user account status;

* NM-236: streamline operations in HA mode

* NM-236: only master pod should subscribe to updates from clients

* refactor(go): remove converters and access grants;

* refactor(go): add json tags in schema models;

* refactor(go): rename file to migrate_v1_6_0.go;

* refactor(go): add user groups and user roles tables; use schema tables;

* refactor(go): inline get and list from schema package;

* refactor(go): inline get network and list users from schema package;

* fix(go): staticcheck issues;

* fix(go): remove test not in use; fix test case;

* fix(go): validate network;

* fix(go): resolve static checks;

* fix(go): new models errors;

* fix(go): test errors;

* fix(go): handle no records;

* fix(go): add validations for user object;

* fix(go): set correct extclient status;

* fix(go): test error;

* feat(go): make schema the base package;

* feat(go): add host schema;

* feat(go): use schema host everywhere;

* feat(go): inline get host, list hosts and delete host;

* feat(go): use non-ptr value;

* feat(go): use save to upsert all fields;

* feat(go): use save to upsert all fields;

* feat(go): save turn endpoint as string;

* feat(go): check for gorm error record not found;

* fix(go): test failures;

* fix(go): update all network fields;

* fix(go): update all network fields;

* feat(go): add paginated list networks api;

* feat(go): add paginated list users api;

* feat(go): add paginated list hosts api;

* feat(go): add pagination to list groups api;

* fix(go): comment;

* fix(go): implement marshal and unmarshal text for custom types;

* fix(go): implement marshal and unmarshal json for custom types;

* fix(go): just use the old model for unmarshalling;

* fix(go): implement marshal and unmarshal json for custom types;

* NM-271: Import swap: compress/gzip replaced with github.com/klauspost/compress/gzip (2-4x faster, wire-compatible output). Added sync import.
Two sync.Pool variables (gzipWriterPool, bufferPool) reuse gzip.Writer and bytes.Buffer across calls instead of allocating fresh ones per publish.
compressPayload rewritten: pulls writer + buffer from pools, resets them, compresses at gzip.BestSpeed (level 1), copies the result out of the pooled buffer, and returns both objects to the pools.
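
A minimal sketch of that pooled-compression pattern, assumed to live in the mq package; the pool and function names mirror the description above but the body is illustrative, not the PR's diff:

package mq

import (
    "bytes"
    "io"
    "sync"

    "github.com/klauspost/compress/gzip"
)

var (
    gzipWriterPool = sync.Pool{
        New: func() any {
            // BestSpeed (level 1) trades a little ratio for far less CPU.
            w, _ := gzip.NewWriterLevel(io.Discard, gzip.BestSpeed)
            return w
        },
    }
    bufferPool = sync.Pool{
        New: func() any { return new(bytes.Buffer) },
    }
)

// compressPayload reuses a pooled writer and buffer instead of allocating
// fresh ones on every publish, and copies the result out of the pooled buffer
// before returning both objects to their pools.
func compressPayload(data []byte) ([]byte, error) {
    buf := bufferPool.Get().(*bytes.Buffer)
    buf.Reset()
    defer bufferPool.Put(buf)

    zw := gzipWriterPool.Get().(*gzip.Writer)
    zw.Reset(buf)
    if _, err := zw.Write(data); err != nil {
        return nil, err // drop the errored writer; do not pool it
    }
    if err := zw.Close(); err != nil {
        return nil, err // only pool the writer after a successful Close
    }
    zw.Reset(io.Discard) // detach from the pooled buffer before pooling the writer
    gzipWriterPool.Put(zw)

    // Copy out: the pooled buffer will be reused by the next caller.
    out := make([]byte, buf.Len())
    copy(out, buf.Bytes())
    return out, nil
}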

* feat(go): remove paginated list networks api;

* feat(go): use custom paginated response object;

* NM-271: Improve server scalability under high host count

- Replace stdlib compress/gzip with klauspost/compress at BestSpeed and
  pool gzip writers and buffers via sync.Pool to eliminate compression
  as the dominant CPU hotspot.

- Debounce peer update broadcasts with a 500ms resettable window capped
  at 3s max-wait, coalescing rapid-fire PublishPeerUpdate calls into a
  single broadcast cycle.

- Cache HostPeerInfo (batch-refreshed by debounce worker) and
  HostPeerUpdate (stored as side-effect of each publish) so the pull API
  and peer_info API serve from pre-computed maps instead of triggering
  expensive per-host computations under thundering herd conditions.

- Warm both caches synchronously at startup before the first publish
  cycle so early pull requests are served instantly.

- Bound concurrent MQTT publishes to 5 via semaphore to prevent
  broker TCP buffer overflows that caused broken pipe disconnects.

- Remove manual Disconnect+SetupMQTT from ConnectionLostHandler and
  rely on the paho client's built-in AutoReconnect; add a 5s retry
  wait in publish() to ride out brief reconnection windows.
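
The debounce and publish-bounding described in the bullets above can be sketched roughly as follows; peerUpdateSignal, publishSem, and the worker shape are hypothetical stand-ins for the PR's actual code:

package mq

import "time"

// Hypothetical coalescing signal: many PublishPeerUpdate calls collapse into
// one pending request.
var peerUpdateSignal = make(chan struct{}, 1)

func RequestPeerUpdate() {
    select {
    case peerUpdateSignal <- struct{}{}:
    default: // a broadcast is already pending; coalesce
    }
}

// peerUpdateWorker fires one broadcast when requests go quiet for 500ms,
// or after 3s at most if requests keep arriving.
func peerUpdateWorker(broadcast func()) {
    const (
        quiet   = 500 * time.Millisecond
        maxWait = 3 * time.Second
    )
    for range peerUpdateSignal {
        timer := time.NewTimer(quiet)
        deadline := time.NewTimer(maxWait)
    drain:
        for {
            select {
            case <-peerUpdateSignal:
                // another request arrived; reset the quiet window
                if !timer.Stop() {
                    <-timer.C
                }
                timer.Reset(quiet)
            case <-timer.C:
                break drain
            case <-deadline.C:
                break drain
            }
        }
        timer.Stop()
        deadline.Stop()
        broadcast() // one broadcast cycle for many rapid-fire requests
    }
}

// Hypothetical bound on concurrent MQTT publishes (the PR uses 5) to avoid
// overrunning the broker's TCP buffers.
var publishSem = make(chan struct{}, 5)

func publishBounded(publish func()) {
    publishSem <- struct{}{} // acquire
    defer func() { <-publishSem }()
    publish()
}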

* NM-271: Reduce server CPU contention under high concurrent load

- Cache ServerSettings with atomic.Value to eliminate repeated DB reads
  on every pull request (was 32+ goroutines blocked on read lock)
- Batch UpdateNodeCheckin writes in memory, flush every 30s to reduce
  per-checkin write lock contention (was 88+ goroutines blocked)
- Enable SQLite WAL mode + busy_timeout and remove global dbMutex;
  let SQLite handle concurrency natively (reads no longer block writes)
- Move ResetFailedOverPeer/ResetAutoRelayedPeer to async in pull()
  handler since results don't affect the cached response
- Skip no-op UpsertNode writes in failover/relay reset functions
  (early return when node has no failover/relay state)
- Remove CheckHostPorts from hostUpdateFallback hot path
- Switch to pure-Go SQLite driver (glebarez/sqlite), set CGO_ENABLED=0
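
A rough sketch of the atomic.Value settings cache from the first bullet, assuming the models.ServerSettings type used elsewhere in the codebase; the names and fetch callback are illustrative (a later commit in this PR stores a pointer for exactly the reason shown here):

package logic

import (
    "sync/atomic"

    "github.com/gravitl/netmaker/models"
)

// serverSettingsCache stores *models.ServerSettings so both the cached value
// and the "invalidated" state go through the same atomic.Value.
var serverSettingsCache atomic.Value

// GetServerSettingsCached returns the cached settings, falling back to the
// supplied fetch (a stand-in for the DB read) when the cache is empty.
func GetServerSettingsCached(fetch func() (models.ServerSettings, error)) models.ServerSettings {
    if v := serverSettingsCache.Load(); v != nil {
        if s, ok := v.(*models.ServerSettings); ok && s != nil {
            return *s
        }
    }
    settings, err := fetch()
    if err != nil {
        // Don't cache a zero value on error; retry on the next call.
        return settings
    }
    serverSettingsCache.Store(&settings)
    return settings
}

// InvalidateServerSettingsCache stores a nil pointer atomically instead of
// reassigning the atomic.Value variable itself, which would race with readers.
func InvalidateServerSettingsCache() {
    serverSettingsCache.Store((*models.ServerSettings)(nil))
}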

* fix(go): ensure default values for page and per_page are used when not passed;

* fix(go): rename v1.6.0 to v1.5.1;

* fix(go): check for gorm.ErrRecordNotFound instead of database.IsEmptyRecord;

* fix(go): use host id, not pending host id;

* NM-271: Revert pure-Go SQLite and FIPS disable to verify impact

Revert to CGO-based mattn/go-sqlite3 driver and re-enable FIPS to
isolate whether these changes are still needed now that the global
dbMutex has been removed and WAL mode is enabled. Keep WAL mode
pragma with mattn-compatible DSN format.

* feat(go): add filters to paginated apis;

* feat(go): add filters to paginated apis;

* feat(go): remove check for max username length;

* feat(go): add filters to count as well;

* feat(go): use library to check email address validity;

* feat(go): ignore pagination if params not passed;

* fix(go): pagination issues;

* fix(go): check exists before using;

* fix(go): remove debug log;

* NM-271: rm debug logs

* NM-271: check if caching is enabled

* NM-271: add server sync mq topic for HA mode

* NM-271: fix build

* NM-271: push metrics in batch to exporter over api

* NM-271: use basic auth for exporter metrics api

* fix(go): use gorm err record not found;

* NM-271: Add monitoring stack on demand

* NM-271: -m arg for install script should only add monitoring stack

* fix(go): use gorm err record not found;

* NM-271: update docker compose file for prometheus

* NM-271: update docker compose file for prometheus

* fix(go): use user principal name when creating pending user;

* fix(go): use schema package for consts;

* NM-236: rm duplicate network hook

* NM-271: add server topic to reset idp hooks on master node

* fix(go): prevent disabling superadmin user;

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* fix(go): swap is admin and is superadmin;

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* fix(go): remove dead code block;

https://github.com/gravitl/netmaker/pull/3910#discussion_r2928837937

* fix(go): incorrect message when trying to disable self;

https://github.com/gravitl/netmaker/pull/3910#discussion_r2928837934

* NM-271: fix stale peers on reset_failovered pull and add HTTP timeout to metrics exporter

Run the failover/relay reset synchronously in the pull handler so the
response reflects post-reset topology instead of serving stale cached
peers. Add a 30s timeout to the metrics exporter HTTP client to prevent
PushAllMetricsToExporter from blocking the Keepalive loop.
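
The exporter timeout amounts to giving the HTTP client a hard deadline, roughly like this sketch (client name, endpoint handling, and auth are illustrative assumptions):

package mq

import (
    "bytes"
    "net/http"
    "time"
)

// A hard 30s deadline means a slow or unreachable exporter cannot block the
// Keepalive loop that calls PushAllMetricsToExporter.
var exporterClient = &http.Client{Timeout: 30 * time.Second}

func pushMetrics(url string, payload []byte) error {
    resp, err := exporterClient.Post(url, "application/json", bytes.NewReader(payload))
    if err != nil {
        return err
    }
    defer resp.Body.Close()
    return nil
}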

* NM-271: fix gzip pool corruption, MQTT topic mismatch, stale settings cache, and reduce redundant DB fetches

- Only return gzip.Writer to pool after successful Close to prevent
  silently malformed MQTT payloads from a previously errored writer.
- Fix serversync subscription to exact topic match since syncType is
  now in the message payload, not the topic path.
- Prevent zero-value ServerSettings from being cached indefinitely
  when the DB record is missing or unmarshal fails on startup.
- Return fetched hosts/nodes from RefreshHostPeerInfoCache so
  warmPeerCaches reuses them instead of querying the DB twice.
- Compute fresh HostPeerUpdate on reset_failovered pull instead of
  serving stale cache, and store result back for subsequent requests.
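
For the serversync topic fix, an exact-match subscription that reads the sync type from the payload might look like this sketch; the topic name, payload shape, and handler are assumptions, using the eclipse paho client API:

package mq

import (
    "encoding/json"

    mqtt "github.com/eclipse/paho.mqtt.golang"
)

// serverSyncMsg is a hypothetical payload shape: the sync type now travels in
// the message body, so the subscription can match the topic exactly instead of
// using a wildcard segment.
type serverSyncMsg struct {
    SyncType string `json:"sync_type"`
}

func subscribeServerSync(client mqtt.Client, topic string, handle func(syncType string)) error {
    token := client.Subscribe(topic, 2, func(_ mqtt.Client, m mqtt.Message) {
        var msg serverSyncMsg
        if err := json.Unmarshal(m.Payload(), &msg); err != nil {
            return
        }
        handle(msg.SyncType) // dispatch on the payload field, not the topic path
    })
    token.Wait()
    return token.Error()
}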

* NM-271: fix gzip writer pool leak, log checkin flush errors, and fix master pod ordinal parsing

- Reset gzip.Writer to io.Discard before returning to pool so errored
  writers are never leaked or silently reused with corrupt state.
- Track and log failed DB inserts in FlushNodeCheckins so operators
  have visibility when check-in timestamps are lost.
- Parse StatefulSet pod ordinal as integer instead of using HasSuffix
  to prevent netmaker-10 from being misidentified as master pod.
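
The ordinal check in the last bullet can be illustrated as below; the helper name, package, and hostname source are assumptions:

package servercfg

import (
    "os"
    "strconv"
    "strings"
)

// isMasterPod reports whether this replica is ordinal 0 of the StatefulSet.
// Parsing the ordinal as an integer avoids the HasSuffix("-0") trap where
// "netmaker-10" would also match.
func isMasterPod() bool {
    hostname, err := os.Hostname()
    if err != nil {
        return false
    }
    idx := strings.LastIndex(hostname, "-")
    if idx < 0 {
        return false
    }
    ordinal, err := strconv.Atoi(hostname[idx+1:])
    if err != nil {
        return false
    }
    return ordinal == 0
}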

* NM-271: simplify masterpod logic

* fix(go): use correct header;

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* fix(go): return after error response;

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* fix(go): use correct order of params;

https://github.com/gravitl/netmaker/pull/3910#discussion_r2929593036

* fix(go): set default values for page and page size; use v2 instead of /list;

* NM-271: use host name

* Update mq/serversync.go

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* NM-271: fix duplicate serversync case

* NM-271: streamline gw updates

* Update logic/auth.go

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* Update schema/user_roles.go

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* fix(go): syntax error;

* fix(go): set default values when page and per_page are not passed or 0;

* fix(go): use uuid.Parse instead of uuid.MustParse;

* fix(go): review errors;

* fix(go): review errors;

* Update controllers/user.go

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* Update controllers/user.go

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* NM-163: fix errors;

* Update db/types/options.go

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* fix(go): persist return user in event;

* Update db/types/options.go

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* NM-271: signal pull on ip changes

* NM-163: duplicate lines of code

* NM-163: fix(go): fix missing return and filter parsing in user controller

- Add missing return after error response in updateUserAccountStatus
  to prevent double-response and spurious ext-client side-effects
- Use switch statements in listUsers to skip unrecognized
  account_status and mfa_status filter values

* NM-271: signal pull req on node ip change

* fix(go): check for both min and max page size;

* NM-271: refresh node object before update

* fix(go): enclose transfer superadmin in transaction;

* fix(go): review errors;

* fix(go): remove free tier checks;

* fix(go): review fixes;

* NM-271: streamline ip pool ops

* NM-271: fix tests, set max idle conns

* NM-271: fix(go): fix data races in settings cache and peer update worker

- Use pointer type in atomic.Value for serverSettingsCache to avoid
  replacing the variable non-atomically in InvalidateServerSettingsCache
- Swap peerUpdateReplace flag before draining the channel to prevent
  a concurrent replacePeers=true from being consumed by the wrong cycle
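
A sketch of the flag-swap ordering from the second bullet, with hypothetical names; the point is that the Swap happens before the signal channel is drained:

package mq

import "sync/atomic"

// Hypothetical signal state for the debounced peer-update worker.
var (
    peerUpdateSignal  = make(chan struct{}, 1)
    peerUpdateReplace atomic.Bool
)

// requestPeerUpdate records whether any coalesced request needs replacePeers
// and wakes the worker.
func requestPeerUpdate(replacePeers bool) {
    if replacePeers {
        peerUpdateReplace.Store(true)
    }
    select {
    case peerUpdateSignal <- struct{}{}:
    default:
    }
}

// beginCycle swaps the flag to false *before* draining pending signals, so a
// request that sets the flag after the swap is handled by the next cycle
// rather than silently consumed by this one.
func beginCycle() (replacePeers bool) {
    replacePeers = peerUpdateReplace.Swap(false)
    for {
        select {
        case <-peerUpdateSignal:
        default:
            return replacePeers
        }
    }
}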

---------

Co-authored-by: VishalDalwadi <dalwadivishal26@gmail.com>
Co-authored-by: Vishal Dalwadi <51291657+VishalDalwadi@users.noreply.github.com>
Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>
2026-03-18 00:24:54 +05:30

package logic

import (
    "context"
    "errors"
    "fmt"
    "net"
    "net/netip"
    "sync"
    "time"

    "github.com/google/uuid"
    "github.com/gravitl/netmaker/database"
    "github.com/gravitl/netmaker/db"
    "github.com/gravitl/netmaker/logger"
    "github.com/gravitl/netmaker/logic/acls/nodeacls"
    "github.com/gravitl/netmaker/models"
    "github.com/gravitl/netmaker/schema"
    "github.com/gravitl/netmaker/servercfg"
    "golang.org/x/exp/slices"
    "golang.org/x/exp/slog"
    "golang.zx2c4.com/wireguard/wgctrl/wgtypes"
)

var (
    // ResetFailOver - function to reset failOvered peers on this node
    ResetFailOver = func(failOverNode *models.Node) error {
        return nil
    }
    // ResetFailedOverPeer - removes failed over node from network peers
    ResetFailedOverPeer = func(failedOverNode *models.Node) error {
        return nil
    }
    // FailOverExists - check if a failover node exists for the network
    FailOverExists = func(network string) (failOverNode models.Node, exists bool) {
        return failOverNode, exists
    }
    // GetFailOverPeerIps - gets failover peerips
    GetFailOverPeerIps = func(peer, node *models.Node) []net.IPNet {
        return []net.IPNet{}
    }
    // CreateFailOver - creates failover in a network
    CreateFailOver = func(node models.Node) error {
        return nil
    }
)

var (
    // ResetAutoRelay - function to reset autorelayed peers on this node
    ResetAutoRelay = func(autoRelayNode *models.Node) error {
        return nil
    }
    // ResetAutoRelayedPeer - removes relayed peers for node
    ResetAutoRelayedPeer = func(failedOverNode *models.Node) error {
        return nil
    }
    // GetAutoRelayPeerIps - gets autorelay peerips
    GetAutoRelayPeerIps = func(peer, node *models.Node) []net.IPNet {
        return []net.IPNet{}
    }
    // SetAutoRelay - sets autorelay flag on the node
    SetAutoRelay = func(node *models.Node) {
        node.IsAutoRelay = false
    }
)

var (
    hostPeerInfoCache   map[string]models.HostPeerInfo
    hostPeerInfoCacheMu sync.RWMutex
)

var (
    hostPeerUpdateCache   map[string]models.HostPeerUpdate
    hostPeerUpdateCacheMu sync.RWMutex
)

// InvalidateHostPeerCaches clears both hostPeerInfoCache and
// hostPeerUpdateCache so they are rebuilt on next access or refresh.
func InvalidateHostPeerCaches() {
    hostPeerInfoCacheMu.Lock()
    hostPeerInfoCache = nil
    hostPeerInfoCacheMu.Unlock()
    hostPeerUpdateCacheMu.Lock()
    hostPeerUpdateCache = nil
    hostPeerUpdateCacheMu.Unlock()
}

// StoreHostPeerUpdate - caches a computed HostPeerUpdate for a host.
// Called as a side-effect of PublishSingleHostPeerUpdate during broadcast.
func StoreHostPeerUpdate(hostID string, peerUpdate models.HostPeerUpdate) {
    hostPeerUpdateCacheMu.Lock()
    if hostPeerUpdateCache == nil {
        hostPeerUpdateCache = make(map[string]models.HostPeerUpdate)
    }
    hostPeerUpdateCache[hostID] = peerUpdate
    hostPeerUpdateCacheMu.Unlock()
}

// GetCachedHostPeerUpdate - returns a cached HostPeerUpdate if available.
func GetCachedHostPeerUpdate(hostID string) (models.HostPeerUpdate, bool) {
    hostPeerUpdateCacheMu.RLock()
    defer hostPeerUpdateCacheMu.RUnlock()
    if hostPeerUpdateCache == nil {
        return models.HostPeerUpdate{}, false
    }
    hpu, ok := hostPeerUpdateCache[hostID]
    return hpu, ok
}

// GetHostPeerInfo - returns cached peer info for a host.
// Falls back to on-demand computation if the cache is not yet populated.
func GetHostPeerInfo(host *schema.Host) (models.HostPeerInfo, error) {
    hostID := host.ID.String()
    hostPeerInfoCacheMu.RLock()
    if hostPeerInfoCache != nil {
        if info, ok := hostPeerInfoCache[hostID]; ok {
            hostPeerInfoCacheMu.RUnlock()
            return info, nil
        }
    }
    hostPeerInfoCacheMu.RUnlock()
    return computeHostPeerInfo(host, nil, models.ServerConfig{})
}

// RefreshHostPeerInfoCache - batch pre-computes peer info for all hosts
// and stores the results in the cache. Returns the fetched hosts and
// nodes so callers can reuse them without redundant DB queries.
func RefreshHostPeerInfoCache() ([]schema.Host, []models.Node) {
    hosts, err := (&schema.Host{}).ListAll(db.WithContext(context.TODO()))
    if err != nil {
        slog.Error("failed to refresh host peer info cache", "error", err)
        return nil, nil
    }
    allNodes, err := GetAllNodes()
    if err != nil {
        slog.Error("failed to refresh host peer info cache", "error", err)
        return nil, nil
    }
    serverInfo := GetServerInfo()
    newCache := make(map[string]models.HostPeerInfo, len(hosts))
    for i := range hosts {
        info, err := computeHostPeerInfo(&hosts[i], allNodes, serverInfo)
        if err != nil {
            continue
        }
        newCache[hosts[i].ID.String()] = info
    }
    hostPeerInfoCacheMu.Lock()
    hostPeerInfoCache = newCache
    hostPeerInfoCacheMu.Unlock()
    return hosts, allNodes
}

// computeHostPeerInfo - computes peer info for a single host.
// If allNodes is nil or serverInfo is zero-value, fetches them fresh.
func computeHostPeerInfo(host *schema.Host, allNodes []models.Node, serverInfo models.ServerConfig) (models.HostPeerInfo, error) {
    peerInfo := models.HostPeerInfo{
        NetworkPeerIDs: make(map[schema.NetworkID]models.PeerMap),
    }
    var err error
    if allNodes == nil {
        allNodes, err = GetAllNodes()
        if err != nil {
            return peerInfo, err
        }
    }
    if serverInfo.Server == "" {
        serverInfo = GetServerInfo()
    }
    for _, nodeID := range host.Nodes {
        nodeID := nodeID
        node, err := GetNodeByID(nodeID)
        if err != nil {
            continue
        }
        if !node.Connected || node.PendingDelete || node.Action == models.NODE_DELETE {
            continue
        }
        networkPeersInfo := make(models.PeerMap)
        defaultDevicePolicy, _ := GetDefaultPolicy(schema.NetworkID(node.Network), models.DevicePolicy)
        currentPeers := GetNetworkNodesMemory(allNodes, node.Network)
        for _, peer := range currentPeers {
            peer := peer
            if peer.ID.String() == node.ID.String() {
                continue
            }
            peerHost := &schema.Host{
                ID: peer.HostID,
            }
            err := peerHost.Get(db.WithContext(context.TODO()))
            if err != nil {
                logger.Log(4, "no peer host", peer.HostID.String(), err.Error())
                continue
            }
            var allowedToComm bool
            if defaultDevicePolicy.Enabled {
                allowedToComm = true
            } else {
                allowedToComm = IsPeerAllowed(node, peer, false)
            }
            if peer.Action != models.NODE_DELETE &&
                !peer.PendingDelete &&
                peer.Connected &&
                (!serverInfo.OldAClsSupport || nodeacls.AreNodesAllowed(nodeacls.NetworkID(node.Network), nodeacls.NodeID(node.ID.String()), nodeacls.NodeID(peer.ID.String()))) &&
                (allowedToComm) {
                networkPeersInfo[peerHost.PublicKey.String()] = models.IDandAddr{
                    ID: peer.ID.String(),
                    HostID: peerHost.ID.String(),
                    Address: peer.PrimaryAddress(),
                    Address4: peer.Address.IP.String(),
                    Address6: peer.Address6.IP.String(),
                    Name: peerHost.Name,
                    Network: peer.Network,
                    ListenPort: peerHost.ListenPort,
                }
            }
        }
        var extPeerIDAndAddrs []models.IDandAddr
        if node.IsIngressGateway {
            _, extPeerIDAndAddrs, _, err = GetExtPeers(&node, &node, make(map[string]models.PeerIdentity))
            if err == nil {
                for _, extPeerIdAndAddr := range extPeerIDAndAddrs {
                    networkPeersInfo[extPeerIdAndAddr.ID] = extPeerIdAndAddr
                }
            }
        }
        peerInfo.NetworkPeerIDs[schema.NetworkID(node.Network)] = networkPeersInfo
    }
    return peerInfo, nil
}

// GetPeerUpdateForHost - gets the consolidated peer update for the host from all networks
func GetPeerUpdateForHost(network string, host *schema.Host, allNodes []models.Node,
    deletedNode *models.Node, deletedClients []models.ExtClient) (models.HostPeerUpdate, error) {
    if host == nil {
        return models.HostPeerUpdate{}, errors.New("host is nil")
    }
    // track which nodes are deleted
    // after peer calculation, if peer not in list, add delete config of peer
    hostPeerUpdate := models.HostPeerUpdate{
        Host: *host,
        Server: servercfg.GetServer(),
        ServerVersion: servercfg.GetVersion(),
        ServerAddrs: []models.ServerAddr{},
        FwUpdate: models.FwUpdate{
            AllowAll: true,
            EgressInfo: make(map[string]models.EgressInfo),
            IngressInfo: make(map[string]models.IngressInfo),
            AclRules: make(map[string]models.AclRule),
        },
        PeerIDs: make(models.PeerMap, 0),
        Peers: []wgtypes.PeerConfig{},
        NodePeers: []wgtypes.PeerConfig{},
        HostNetworkInfo: models.HostInfoMap{},
        ServerConfig: GetServerInfo(),
        DnsNameservers: GetNameserversForHost(host),
        AutoRelayNodes: make(map[schema.NetworkID][]models.Node),
        GwNodes: make(map[schema.NetworkID][]models.Node),
        AddressIdentityMap: make(map[string]models.PeerIdentity),
    }
    if host.DNS == "no" {
        hostPeerUpdate.ManageDNS = false
    }
    if !GetFeatureFlags().EnableFlowLogs || !GetServerSettings().EnableFlowLogs {
        host.EnableFlowLogs = false
    }
    defer func() {
        if !hostPeerUpdate.FwUpdate.AllowAll {
            if len(hostPeerUpdate.FwUpdate.AllowedNetworks) > 0 {
                hostPeerUpdate.FwUpdate.EgressInfo["allowed-network-rules"] = models.EgressInfo{
                    EgressID: "allowed-network-rules",
                    EgressFwRules: make(map[string]models.AclRule),
                }
            }
            for _, aclRule := range hostPeerUpdate.FwUpdate.AllowedNetworks {
                hostPeerUpdate.FwUpdate.AclRules[aclRule.ID] = aclRule
                hostPeerUpdate.FwUpdate.EgressInfo["allowed-network-rules"].EgressFwRules[aclRule.ID] = aclRule
            }
        }
    }()
    slog.Debug("peer update for host", "hostId", host.ID.String())
    peerIndexMap := make(map[string]int)
    for _, nodeID := range host.Nodes {
        networkAllowAll := true
        nodeID := nodeID
        if nodeID == uuid.Nil.String() {
            continue
        }
        node, err := GetNodeByID(nodeID)
        if err != nil {
            continue
        }
        if !node.Connected || node.PendingDelete || node.Action == models.NODE_DELETE ||
            (!node.LastCheckIn.IsZero() && time.Since(node.LastCheckIn) > time.Hour) {
            if deletedNode == nil || deletedNode.ID != node.ID {
                continue
            }
        }
        if host.EnableFlowLogs {
            if node.Address.IP != nil {
                hostPeerUpdate.AddressIdentityMap[node.Address.IP.String()+"/32"] = models.PeerIdentity{
                    ID: node.ID.String(),
                    Type: models.PeerType_Node,
                    Name: host.Name,
                }
            }
            if node.Address6.IP != nil {
                hostPeerUpdate.AddressIdentityMap[node.Address6.IP.String()+"/128"] = models.PeerIdentity{
                    ID: node.ID.String(),
                    Type: models.PeerType_Node,
                    Name: host.Name,
                }
            }
        }
        hostPeerUpdate.Nodes = append(hostPeerUpdate.Nodes, node)
        acls, _ := ListAclsByNetwork(schema.NetworkID(node.Network))
        eli, _ := (&schema.Egress{Network: node.Network}).ListByNetwork(db.WithContext(context.TODO()))
        GetNodeEgressInfo(&node, eli, acls)
        egsWithDomain := ListAllByRoutingNodeWithDomain(eli, node.ID.String())
        if len(egsWithDomain) > 0 {
            hostPeerUpdate.EgressWithDomains = append(hostPeerUpdate.EgressWithDomains, egsWithDomain...)
        }
        hostPeerUpdate = SetDefaultGw(node, hostPeerUpdate)
        if !hostPeerUpdate.IsInternetGw {
            hostPeerUpdate.IsInternetGw = IsInternetGw(node)
        }
        hostPeerUpdate.DnsNameservers = append(hostPeerUpdate.DnsNameservers, GetEgressDomainNSForNode(&node)...)
        defaultUserPolicy, _ := GetDefaultPolicy(schema.NetworkID(node.Network), models.UserPolicy)
        defaultDevicePolicy, _ := GetDefaultPolicy(schema.NetworkID(node.Network), models.DevicePolicy)
        if (defaultDevicePolicy.Enabled && defaultUserPolicy.Enabled) ||
            (!CheckIfAnyPolicyisUniDirectional(node, acls) &&
                !(node.EgressDetails.IsEgressGateway && len(node.EgressDetails.EgressGatewayRanges) > 0)) {
            aclRule := models.AclRule{
                ID: fmt.Sprintf("%s-allowed-network-rules", node.ID.String()),
                AllowedProtocol: models.ALL,
                Direction: models.TrafficDirectionBi,
                Allowed: true,
                IPList: []net.IPNet{node.NetworkRange},
                IP6List: []net.IPNet{node.NetworkRange6},
            }
            if !(defaultDevicePolicy.Enabled && defaultUserPolicy.Enabled) {
                aclRule.Dst = []net.IPNet{node.NetworkRange}
                aclRule.Dst6 = []net.IPNet{node.NetworkRange6}
            }
            hostPeerUpdate.FwUpdate.AllowedNetworks = append(hostPeerUpdate.FwUpdate.AllowedNetworks, aclRule)
        } else {
            networkAllowAll = false
            hostPeerUpdate.FwUpdate.AllowAll = false
            rules := GetAclRulesForNode(&node)
            if len(hostPeerUpdate.FwUpdate.AclRules) == 0 {
                hostPeerUpdate.FwUpdate.AclRules = rules
            } else {
                for aclID, rule := range rules {
                    hostPeerUpdate.FwUpdate.AclRules[aclID] = rule
                }
            }
        }
        currentPeers := GetNetworkNodesMemory(allNodes, node.Network)
        for _, peer := range currentPeers {
            if peer.ID.String() == node.ID.String() {
                // skip yourself
                continue
            }
            peerHost := &schema.Host{
                ID: peer.HostID,
            }
            err := peerHost.Get(db.WithContext(context.TODO()))
            if err != nil {
                logger.Log(4, "no peer host", peer.HostID.String(), err.Error())
                continue
            }
            peerConfig := wgtypes.PeerConfig{
                PublicKey: peerHost.PublicKey.Key,
                PersistentKeepaliveInterval: &peerHost.PersistentKeepalive,
                ReplaceAllowedIPs: true,
            }
            GetNodeEgressInfo(&peer, eli, acls)
            if peer.EgressDetails.IsEgressGateway {
                AddEgressInfoToPeerByAccess(&node, &peer, eli, acls, defaultDevicePolicy.Enabled)
            }
            if node.Mutex != nil {
                node.Mutex.Lock()
            }
            _, isFailOverPeer := node.FailOverPeers[peer.ID.String()]
            peerAutoRelayID, isAutoRelayPeer := node.AutoRelayedPeers[peer.ID.String()]
            if node.Mutex != nil {
                node.Mutex.Unlock()
            }
            if peer.EgressDetails.IsEgressGateway {
                peerKey := peerHost.PublicKey.String()
                if isFailOverPeer && peer.FailedOverBy.String() != node.ID.String() {
                    // get relay host
                    failOverNode, err := GetNodeByID(peer.FailedOverBy.String())
                    if err == nil {
                        relayHost := &schema.Host{
                            ID: failOverNode.HostID,
                        }
                        err := relayHost.Get(db.WithContext(context.TODO()))
                        if err == nil {
                            peerKey = relayHost.PublicKey.String()
                        }
                    }
                }
                if isAutoRelayPeer && peerAutoRelayID != node.ID.String() {
                    // get relay host
                    autoRelayNode, err := GetNodeByID(peerAutoRelayID)
                    if err == nil {
                        relayHost := &schema.Host{
                            ID: autoRelayNode.HostID,
                        }
                        err = relayHost.Get(db.WithContext(context.TODO()))
                        if err == nil {
                            peerKey = relayHost.PublicKey.String()
                        }
                    }
                }
                if peer.IsRelayed && (peer.RelayedBy != node.ID.String()) {
                    // get relay host
                    relayNode, err := GetNodeByID(peer.RelayedBy)
                    if err == nil {
                        relayHost := &schema.Host{
                            ID: relayNode.HostID,
                        }
                        err := relayHost.Get(db.WithContext(context.TODO()))
                        if err == nil {
                            peerKey = relayHost.PublicKey.String()
                        }
                    }
                }
                hostPeerUpdate.EgressRoutes = append(hostPeerUpdate.EgressRoutes, models.EgressNetworkRoutes{
                    PeerKey: peerKey,
                    EgressGwAddr: peer.Address,
                    EgressGwAddr6: peer.Address6,
                    NodeAddr: node.Address,
                    NodeAddr6: node.Address6,
                    EgressRanges: filterConflictingEgressRoutes(node, peer),
                    EgressRangesWithMetric: filterConflictingEgressRoutesWithMetric(node, peer),
                    Network: peer.Network,
                })
            }
            if peer.IsIngressGateway {
                hostPeerUpdate.EgressRoutes = append(hostPeerUpdate.EgressRoutes, getExtpeersExtraRoutes(node)...)
            }
            var allowedToComm bool
            if defaultDevicePolicy.Enabled {
                allowedToComm = true
            } else {
                allowedToComm = IsPeerAllowed(node, peer, false)
            }
            if allowedToComm {
                if peer.IsAutoRelay {
                    hostPeerUpdate.AutoRelayNodes[schema.NetworkID(peer.Network)] = append(hostPeerUpdate.AutoRelayNodes[schema.NetworkID(peer.Network)],
                        peer)
                }
                if node.AutoAssignGateway && peer.IsGw {
                    hostPeerUpdate.GwNodes[schema.NetworkID(peer.Network)] = append(hostPeerUpdate.GwNodes[schema.NetworkID(peer.Network)],
                        peer)
                }
            }
            if (node.IsRelayed && node.RelayedBy != peer.ID.String()) ||
                (peer.IsRelayed && peer.RelayedBy != node.ID.String()) || isFailOverPeer || isAutoRelayPeer {
                // if node is relayed and peer is not the relay, set remove to true
                if _, ok := peerIndexMap[peerHost.PublicKey.String()]; ok {
                    continue
                }
                peerConfig.Remove = true
                hostPeerUpdate.Peers = append(hostPeerUpdate.Peers, peerConfig)
                peerIndexMap[peerHost.PublicKey.String()] = len(hostPeerUpdate.Peers) - 1
                continue
            }
            if node.IsRelayed && node.RelayedBy == peer.ID.String() {
                hostPeerUpdate = SetDefaultGwForRelayedUpdate(node, peer, hostPeerUpdate)
            }
            uselocal := false
            if host.EndpointIP.String() == peerHost.EndpointIP.String() {
                // peer is on same network
                // set to localaddress
                uselocal = true
                if node.LocalAddress.IP == nil {
                    // use public endpoint
                    uselocal = false
                }
                if node.LocalAddress.String() == peer.LocalAddress.String() {
                    uselocal = false
                }
            }
            // 1. check currHost has ipv4 endpoint and peerhost has ipv4 then set ipv4 endpoint for peer
            // 2. check currHost has ipv6 endpoint and peerhost has ipv6 then set ipv6 endpoint for peer
            // if host is ipv4 only or ipv4+ipv6, set the peer endpoint to ipv4 address, if host is ipv6 only, set the peer endpoint to ipv6 address
            var peerEndpoint net.IP
            if host.EndpointIP != nil && peerHost.EndpointIP != nil {
                peerEndpoint = peerHost.EndpointIP
            } else if host.EndpointIPv6 != nil && peerHost.EndpointIPv6 != nil {
                peerEndpoint = peerHost.EndpointIPv6
            }
            if host.EndpointIP == nil && peerEndpoint == nil {
                if peerHost.EndpointIP != nil {
                    peerEndpoint = peerHost.EndpointIP
                }
            }
            if host.EndpointIPv6 == nil && peerEndpoint == nil {
                if peerHost.EndpointIPv6 != nil {
                    peerEndpoint = peerHost.EndpointIPv6
                }
            }
            if node.IsRelay && peer.RelayedBy == node.ID.String() && peer.InternetGwID == "" && !peer.IsStatic {
                // don't set endpoint on relayed peer
                peerEndpoint = nil
            }
            if isFailOverPeer && peer.FailedOverBy == node.ID && !peer.IsStatic {
                peerEndpoint = nil
            }
            if isAutoRelayPeer && peerAutoRelayID == node.ID.String() && !peer.IsStatic {
                peerEndpoint = nil
            }
            peerConfig.Endpoint = &net.UDPAddr{
                IP: peerEndpoint,
                Port: GetPeerListenPort(peerHost),
            }
            if uselocal {
                peerConfig.Endpoint.Port = peerHost.ListenPort
            }
            if peer.Action != models.NODE_DELETE &&
                !peer.PendingDelete &&
                peer.Connected &&
                (!hostPeerUpdate.ServerConfig.OldAClsSupport || nodeacls.AreNodesAllowed(nodeacls.NetworkID(node.Network), nodeacls.NodeID(node.ID.String()), nodeacls.NodeID(peer.ID.String()))) &&
                (allowedToComm) &&
                (deletedNode == nil || (peer.ID.String() != deletedNode.ID.String())) {
                peerConfig.AllowedIPs = GetAllowedIPs(&node, &peer, nil) // only append allowed IPs if valid connection
            }
            var nodePeer wgtypes.PeerConfig
            if _, ok := peerIndexMap[peerHost.PublicKey.String()]; !ok {
                hostPeerUpdate.Peers = append(hostPeerUpdate.Peers, peerConfig)
                peerIndexMap[peerHost.PublicKey.String()] = len(hostPeerUpdate.Peers) - 1
                hostPeerUpdate.HostNetworkInfo[peerHost.PublicKey.String()] = models.HostNetworkInfo{
                    Interfaces: peerHost.Interfaces,
                    ListenPort: peerHost.ListenPort,
                    IsStaticPort: peerHost.IsStaticPort,
                    IsStatic: peerHost.IsStatic,
                }
                nodePeer = peerConfig
            } else {
                peerAllowedIPs := hostPeerUpdate.Peers[peerIndexMap[peerHost.PublicKey.String()]].AllowedIPs
                peerAllowedIPs = append(peerAllowedIPs, peerConfig.AllowedIPs...)
                hostPeerUpdate.Peers[peerIndexMap[peerHost.PublicKey.String()]].AllowedIPs = peerAllowedIPs
                hostPeerUpdate.Peers[peerIndexMap[peerHost.PublicKey.String()]].Remove = false
                hostPeerUpdate.Peers[peerIndexMap[peerHost.PublicKey.String()]].Endpoint = peerConfig.Endpoint
                hostPeerUpdate.HostNetworkInfo[peerHost.PublicKey.String()] = models.HostNetworkInfo{
                    Interfaces: peerHost.Interfaces,
                    ListenPort: peerHost.ListenPort,
                    IsStaticPort: peerHost.IsStaticPort,
                    IsStatic: peerHost.IsStatic,
                }
                nodePeer = hostPeerUpdate.Peers[peerIndexMap[peerHost.PublicKey.String()]]
            }
            if node.Network == network && !peerConfig.Remove && len(peerConfig.AllowedIPs) > 0 { // add to peers map for metrics
                hostPeerUpdate.PeerIDs[peerHost.PublicKey.String()] = models.IDandAddr{
                    ID: peer.ID.String(),
                    HostID: peerHost.ID.String(),
                    Address: peer.PrimaryAddress(),
                    Name: peerHost.Name,
                    Network: peer.Network,
                    ListenPort: peerHost.ListenPort,
                }
                hostPeerUpdate.NodePeers = append(hostPeerUpdate.NodePeers, nodePeer)
            }
            if host.EnableFlowLogs {
                if peer.Address.IP != nil {
                    hostPeerUpdate.AddressIdentityMap[peer.Address.IP.String()+"/32"] = models.PeerIdentity{
                        ID: peer.ID.String(),
                        Type: models.PeerType_Node,
                        Name: peerHost.Name,
                    }
                }
                if peer.Address6.IP != nil {
                    hostPeerUpdate.AddressIdentityMap[peer.Address6.IP.String()+"/128"] = models.PeerIdentity{
                        ID: peer.ID.String(),
                        Type: models.PeerType_Node,
                        Name: peerHost.Name,
                    }
                }
            }
        }
        var extPeers []wgtypes.PeerConfig
        var extPeerIDAndAddrs []models.IDandAddr
        var egressRoutes []models.EgressNetworkRoutes
        if node.IsIngressGateway {
            hostPeerUpdate.FwUpdate.IsIngressGw = true
            extPeers, extPeerIDAndAddrs, egressRoutes, err = GetExtPeers(&node, &node, hostPeerUpdate.AddressIdentityMap)
            if err == nil {
                if !defaultDevicePolicy.Enabled || !defaultUserPolicy.Enabled {
                    ingFwUpdate := models.IngressInfo{
                        IngressID: node.ID.String(),
                        Network: node.NetworkRange,
                        Network6: node.NetworkRange6,
                        StaticNodeIps: GetStaticNodeIps(node),
                        Rules: GetFwRulesOnIngressGateway(node),
                    }
                    ingFwUpdate.EgressRanges, ingFwUpdate.EgressRanges6 = getExtpeerEgressRanges(node)
                    hostPeerUpdate.FwUpdate.IngressInfo[node.ID.String()] = ingFwUpdate
                }
                hostPeerUpdate.EgressRoutes = append(hostPeerUpdate.EgressRoutes, egressRoutes...)
                hostPeerUpdate.Peers = append(hostPeerUpdate.Peers, extPeers...)
                for _, extPeerIdAndAddr := range extPeerIDAndAddrs {
                    extPeerIdAndAddr := extPeerIdAndAddr
                    if node.Network == network {
                        hostPeerUpdate.PeerIDs[extPeerIdAndAddr.ID] = extPeerIdAndAddr
                        hostPeerUpdate.NodePeers = append(hostPeerUpdate.NodePeers, extPeers...)
                    }
                }
            } else if !database.IsEmptyRecord(err) {
                logger.Log(1, "error retrieving external clients:", err.Error())
            }
        }
        if node.EgressDetails.IsEgressGateway && len(node.EgressDetails.EgressGatewayRequest.Ranges) > 0 {
            hostPeerUpdate.FwUpdate.IsEgressGw = true
            hostPeerUpdate.FwUpdate.EgressInfo[node.ID.String()] = models.EgressInfo{
                EgressID: node.ID.String(),
                Network: node.PrimaryNetworkRange(),
                EgressGwAddr: net.IPNet{
                    IP: net.ParseIP(node.PrimaryAddress()),
                    Mask: getCIDRMaskFromAddr(node.PrimaryAddress()),
                },
                Network6: node.NetworkRange6,
                EgressGwAddr6: net.IPNet{
                    IP: node.Address6.IP,
                    Mask: getCIDRMaskFromAddr(node.Address6.IP.String()),
                },
                EgressGWCfg: node.EgressDetails.EgressGatewayRequest,
                EgressFwRules: make(map[string]models.AclRule),
            }
            if host.EnableFlowLogs {
                for _, egressRange := range node.EgressDetails.EgressGatewayRequest.RangesWithMetric {
                    if egressRange.EgressID != "" {
                        hostPeerUpdate.AddressIdentityMap[egressRange.Network] = models.PeerIdentity{
                            ID: egressRange.EgressID,
                            Type: models.PeerType_EgressRoute,
                            Name: egressRange.EgressName,
                        }
                    }
                }
            }
        }
        if node.EgressDetails.IsEgressGateway {
            if !networkAllowAll {
                egressInfo := hostPeerUpdate.FwUpdate.EgressInfo[node.ID.String()]
                if egressInfo.EgressFwRules == nil {
                    egressInfo.EgressFwRules = make(map[string]models.AclRule)
                }
                egressInfo.EgressFwRules = GetEgressRulesForNode(node)
                hostPeerUpdate.FwUpdate.EgressInfo[node.ID.String()] = egressInfo
            }
        }
        if IsInternetGw(node) {
            hostPeerUpdate.FwUpdate.IsEgressGw = true
            egressrange := []string{"0.0.0.0/0"}
            if node.Address6.IP != nil {
                egressrange = append(egressrange, "::/0")
            }
            rangeWithMetric := []models.EgressRangeMetric{}
            for _, rangeI := range egressrange {
                rangeWithMetric = append(rangeWithMetric, models.EgressRangeMetric{
                    Network: rangeI,
                    RouteMetric: 256,
                    Nat: true,
                    Mode: schema.DirectNAT,
                })
            }
            inetEgressInfo := models.EgressInfo{
                EgressID: fmt.Sprintf("%s-%s", node.ID.String(), "inet"),
                Network: node.PrimaryAddressIPNet(),
                EgressGwAddr: net.IPNet{
                    IP: net.ParseIP(node.PrimaryAddress()),
                    Mask: getCIDRMaskFromAddr(node.PrimaryAddress()),
                },
                Network6: node.NetworkRange6,
                EgressGwAddr6: net.IPNet{
                    IP: node.Address6.IP,
                    Mask: getCIDRMaskFromAddr(node.Address6.IP.String()),
                },
                EgressGWCfg: models.EgressGatewayRequest{
                    NodeID: fmt.Sprintf("%s-%s", node.ID.String(), "inet"),
                    NetID: node.Network,
                    NatEnabled: "yes",
                    Ranges: egressrange,
                    RangesWithMetric: rangeWithMetric,
                },
            }
            if !networkAllowAll {
                inetEgressInfo.EgressFwRules = GetAclRuleForInetGw(node)
            }
            hostPeerUpdate.FwUpdate.EgressInfo[fmt.Sprintf("%s-%s", node.ID.String(), "inet")] = inetEgressInfo
        }
    }
    // == post peer calculations ==
    // indicate removal if no allowed IPs were calculated
    for i := range hostPeerUpdate.Peers {
        peer := hostPeerUpdate.Peers[i]
        if len(peer.AllowedIPs) == 0 {
            peer.Remove = true
        }
        hostPeerUpdate.Peers[i] = peer
    }
    if deletedNode != nil && host.OS != models.OS_Types.IoT {
        peerHost := &schema.Host{
            ID: deletedNode.HostID,
        }
        err := peerHost.Get(db.WithContext(context.TODO()))
        if err == nil && host.ID != peerHost.ID {
            if _, ok := peerIndexMap[peerHost.PublicKey.String()]; !ok {
                hostPeerUpdate.Peers = append(hostPeerUpdate.Peers, wgtypes.PeerConfig{
                    PublicKey: peerHost.PublicKey.Key,
                    Remove: true,
                })
            }
        }
    }
    for i := range hostPeerUpdate.NodePeers {
        peer := hostPeerUpdate.NodePeers[i]
        if len(peer.AllowedIPs) == 0 {
            peer.Remove = true
        }
        hostPeerUpdate.NodePeers[i] = peer
    }
    if len(deletedClients) > 0 {
        for i := range deletedClients {
            deletedClient := deletedClients[i]
            key, err := wgtypes.ParseKey(deletedClient.PublicKey)
            if err == nil {
                hostPeerUpdate.Peers = append(hostPeerUpdate.Peers, wgtypes.PeerConfig{
                    PublicKey: key,
                    Remove: true,
                })
            }
        }
    }
    return hostPeerUpdate, nil
}

// GetPeerListenPort - given a host, retrieve its appropriate listening port
func GetPeerListenPort(host *schema.Host) int {
    peerPort := host.ListenPort
    if !host.IsStaticPort && host.WgPublicListenPort != 0 {
        peerPort = host.WgPublicListenPort
    }
    return peerPort
}

// filterConflictingEgressRoutes - removes from the peer's egress ranges any
// range that the node itself already serves as an egress gateway.
func filterConflictingEgressRoutes(node, peer models.Node) []string {
    egressIPs := slices.Clone(peer.EgressDetails.EgressGatewayRanges)
    if node.EgressDetails.IsEgressGateway {
        // filter conflicting addrs
        nodeEgressMap := make(map[string]struct{})
        for _, rangeI := range node.EgressDetails.EgressGatewayRanges {
            nodeEgressMap[rangeI] = struct{}{}
        }
        for i := len(egressIPs) - 1; i >= 0; i-- {
            if _, ok := nodeEgressMap[egressIPs[i]]; ok {
                egressIPs = append(egressIPs[:i], egressIPs[i+1:]...)
            }
        }
    }
    return UniqueStrings(egressIPs)
}

// filterConflictingEgressRoutesWithMetric - same as filterConflictingEgressRoutes,
// but operates on ranges carrying route metrics and NAT settings.
func filterConflictingEgressRoutesWithMetric(node, peer models.Node) []models.EgressRangeMetric {
    egressIPs := slices.Clone(peer.EgressDetails.EgressGatewayRequest.RangesWithMetric)
    if node.EgressDetails.IsEgressGateway {
        // filter conflicting addrs
        nodeEgressMap := make(map[string]struct{})
        for _, rangeI := range node.EgressDetails.EgressGatewayRanges {
            nodeEgressMap[rangeI] = struct{}{}
        }
        for i := len(egressIPs) - 1; i >= 0; i-- {
            // Use virtual network range for conflict detection when virtual NAT is enabled
            checkRange := egressIPs[i].Network
            if egressIPs[i].Nat && egressIPs[i].VirtualNetwork != "" {
                checkRange = egressIPs[i].VirtualNetwork
            }
            if _, ok := nodeEgressMap[checkRange]; ok {
                egressIPs = append(egressIPs[:i], egressIPs[i+1:]...)
            }
        }
    }
    return egressIPs
}

// GetAllowedIPs - calculates the wireguard allowedip field for a peer of a node based on the peer and node settings
func GetAllowedIPs(node, peer *models.Node, metrics *models.Metrics) []net.IPNet {
    var allowedips []net.IPNet
    allowedips = getNodeAllowedIPs(peer, node)
    if peer.IsInternetGateway && node.InternetGwID == peer.ID.String() {
        allowedips = append(allowedips, GetAllowedIpForInetNodeClient(node, peer)...)
        return allowedips
    }
    if node.IsRelayed && node.RelayedBy == peer.ID.String() {
        allowedips = append(allowedips, GetAllowedIpsForRelayed(node, peer)...)
        if peer.InternetGwID != "" {
            return allowedips
        }
    }
    // handle ingress gateway peers
    if peer.IsIngressGateway {
        extPeers, _, _, err := GetExtPeers(peer, node, make(map[string]models.PeerIdentity))
        if err != nil {
            logger.Log(2, "could not retrieve ext peers for ", peer.ID.String(), err.Error())
        }
        for _, extPeer := range extPeers {
            allowedips = append(allowedips, extPeer.AllowedIPs...)
        }
    }
    return allowedips
}

// GetEgressIPs - returns the allowed egress CIDRs for an egress-gateway peer,
// skipping ranges that would overlap the peer's own endpoint or local address.
func GetEgressIPs(peer *models.Node) []net.IPNet {
    peerHost := &schema.Host{
        ID: peer.HostID,
    }
    err := peerHost.Get(db.WithContext(context.TODO()))
    if err != nil {
        logger.Log(0, "error retrieving host for peer", peer.ID.String(), "host id", peer.HostID.String(), err.Error())
    }
    // check for internet gateway
    internetGateway := false
    if slices.Contains(peer.EgressDetails.EgressGatewayRanges, "0.0.0.0/0") || slices.Contains(peer.EgressDetails.EgressGatewayRanges, "::/0") {
        internetGateway = true
    }
    allowedips := []net.IPNet{}
    for _, iprange := range peer.EgressDetails.EgressGatewayRanges { // go through each cidr for egress gateway
        _, ipnet, err := net.ParseCIDR(iprange) // confirming it's a valid cidr
        if err != nil {
            logger.Log(1, "could not parse gateway IP range. Not adding ", iprange)
            continue // if can't parse CIDR
        }
        // ensure the egress gateway range does not contain the node's public endpoint
        if ipnet.Contains(peerHost.EndpointIP) && !internetGateway {
            logger.Log(2, "egress IP range of ", iprange, " overlaps with ", peerHost.EndpointIP.String(), ", omitting")
            continue // skip adding egress range if it overlaps with the node's ip
        }
        // TODO: Could put in a lot of great logic to avoid conflicts / bad routes
        // ensure the egress gateway range does not contain the node's local address
        if ipnet.Contains(peer.LocalAddress.IP) && !internetGateway {
            logger.Log(2, "egress IP range of ", iprange, " overlaps with ", peer.LocalAddress.String(), ", omitting")
            continue // skip adding egress range if it overlaps with the node's local ip
        }
        allowedips = append(allowedips, *ipnet)
    }
    return allowedips
}

// getNodeAllowedIPs - builds the allowed-IP list for a peer from its addresses
// and its egress, relay, failover, and auto-relay roles.
func getNodeAllowedIPs(peer, node *models.Node) []net.IPNet {
    var allowedips = []net.IPNet{}
    if peer.Address.IP != nil {
        allowed := net.IPNet{
            IP: peer.Address.IP,
            Mask: net.CIDRMask(32, 32),
        }
        allowedips = append(allowedips, allowed)
    }
    if peer.Address6.IP != nil {
        allowed := net.IPNet{
            IP: peer.Address6.IP,
            Mask: net.CIDRMask(128, 128),
        }
        allowedips = append(allowedips, allowed)
    }
    // handle egress gateway peers
    if peer.EgressDetails.IsEgressGateway {
        egressIPs := GetEgressIPs(peer)
        if node.EgressDetails.IsEgressGateway {
            // filter conflicting addrs
            nodeEgressMap := make(map[string]struct{})
            for _, rangeI := range node.EgressDetails.EgressGatewayRequest.RangesWithMetric {
                if rangeI.Nat {
                    nodeEgressMap[rangeI.VirtualNetwork] = struct{}{}
                } else {
                    nodeEgressMap[rangeI.Network] = struct{}{}
                }
            }
            for i := len(egressIPs) - 1; i >= 0; i-- {
                if _, ok := nodeEgressMap[egressIPs[i].String()]; ok {
                    egressIPs = append(egressIPs[:i], egressIPs[i+1:]...)
                }
            }
        }
        allowedips = append(allowedips, egressIPs...)
    }
    if peer.IsRelay {
        allowedips = append(allowedips, RelayedAllowedIPs(peer, node)...)
    }
    if peer.IsFailOver {
        allowedips = append(allowedips, GetFailOverPeerIps(peer, node)...)
    }
    if peer.IsAutoRelay {
        allowedips = append(allowedips, GetAutoRelayPeerIps(peer, node)...)
    }
    return allowedips
}

// getCIDRMaskFromAddr - returns a /32 mask for IPv4 addresses and a /128 mask
// for IPv6 addresses, defaulting to /32 when the address cannot be parsed.
func getCIDRMaskFromAddr(addr string) net.IPMask {
    cidr := net.CIDRMask(32, 32)
    ipAddr, err := netip.ParseAddr(addr)
    if err != nil {
        return cidr
    }
    if ipAddr.Is6() {
        cidr = net.CIDRMask(128, 128)
    }
    return cidr
}