Files
netmaker/pro/initialize.go
T
Abhishek Kondur 292af315dd NM-271: Scalability Improvements (#3921)
* feat(go): add user schema;

* feat(go): migrate to user schema;

* feat(go): add audit fields;

* feat(go): remove unused fields from the network model;

* feat(go): add network schema;

* feat(go): migrate to network schema;

* refactor(go): add comment to clarify migration logic;

* fix(go): test failures;

* fix(go): test failures;

* feat(go): change membership table to store memberships at all scopes;

* feat(go): add schema for access grants;

* feat(go): remove nameservers from new networks table; ensure db passed for schema functions;

* feat(go): set max conns for sqlite to 1;

* fix(go): issues updating user account status;

* NM-236: streamline operations in HA mode

* NM-236: only master pod should subscribe to updates from clients

* refactor(go): remove converters and access grants;

* refactor(go): add json tags in schema models;

* refactor(go): rename file to migrate_v1_6_0.go;

* refactor(go): add user groups and user roles tables; use schema tables;

* refactor(go): inline get and list from schema package;

* refactor(go): inline get network and list users from schema package;

* fix(go): staticcheck issues;

* fix(go): remove test not in use; fix test case;

* fix(go): validate network;

* fix(go): resolve static checks;

* fix(go): new models errors;

* fix(go): test errors;

* fix(go): handle no records;

* fix(go): add validations for user object;

* fix(go): set correct extclient status;

* fix(go): test error;

* feat(go): make schema the base package;

* feat(go): add host schema;

* feat(go): use schema host everywhere;

* feat(go): inline get host, list hosts and delete host;

* feat(go): use non-ptr value;

* feat(go): use save to upsert all fields;

* feat(go): use save to upsert all fields;

* feat(go): save turn endpoint as string;

* feat(go): check for gorm error record not found;

* fix(go): test failures;

* fix(go): update all network fields;

* fix(go): update all network fields;

* feat(go): add paginated list networks api;

* feat(go): add paginated list users api;

* feat(go): add paginated list hosts api;

* feat(go): add pagination to list groups api;

* fix(go): comment;

* fix(go): implement marshal and unmarshal text for custom types;

* fix(go): implement marshal and unmarshal json for custom types;

* fix(go): just use the old model for unmarshalling;

* fix(go): implement marshal and unmarshal json for custom types;

* NM-271:Import swap: compress/gzip replaced with github.com/klauspost/compress/gzip (2-4x faster, wire-compatible output). Added sync import.
Two sync.Pool variables (gzipWriterPool, bufferPool): reuse gzip.Writer and bytes.Buffer across calls instead of allocating fresh ones per publish.
compressPayload rewritten: pulls writer + buffer from pools, resets them, compresses at gzip.BestSpeed (level 1), copies the result out of the pooled buffer, and returns both objects to the pools.

* feat(go): remove paginated list networks api;

* feat(go): use custom paginated response object;

* NM-271: Improve server scalability under high host count

- Replace stdlib compress/gzip with klauspost/compress at BestSpeed and
  pool gzip writers and buffers via sync.Pool to eliminate compression
  as the dominant CPU hotspot.

- Debounce peer update broadcasts with a 500ms resettable window capped
  at 3s max-wait, coalescing rapid-fire PublishPeerUpdate calls into a
  single broadcast cycle.

- Cache HostPeerInfo (batch-refreshed by debounce worker) and
  HostPeerUpdate (stored as side-effect of each publish) so the pull API
  and peer_info API serve from pre-computed maps instead of triggering
  expensive per-host computations under thundering herd conditions.

- Warm both caches synchronously at startup before the first publish
  cycle so early pull requests are served instantly.

- Bound concurrent MQTT publishes to 5 via semaphore to prevent
  broker TCP buffer overflows that caused broken pipe disconnects.

- Remove manual Disconnect+SetupMQTT from ConnectionLostHandler and
  rely on the paho client's built-in AutoReconnect; add a 5s retry
  wait in publish() to ride out brief reconnection windows.

* NM-271: Reduce server CPU contention under high concurrent load

- Cache ServerSettings with atomic.Value to eliminate repeated DB reads
  on every pull request (was 32+ goroutines blocked on read lock)
- Batch UpdateNodeCheckin writes in memory, flush every 30s to reduce
  per-checkin write lock contention (was 88+ goroutines blocked)
- Enable SQLite WAL mode + busy_timeout and remove global dbMutex;
  let SQLite handle concurrency natively (reads no longer block writes)
- Move ResetFailedOverPeer/ResetAutoRelayedPeer to async in pull()
  handler since results don't affect the cached response
- Skip no-op UpsertNode writes in failover/relay reset functions
  (early return when node has no failover/relay state)
- Remove CheckHostPorts from hostUpdateFallback hot path
- Switch to pure-Go SQLite driver (glebarez/sqlite), set CGO_ENABLED=0

* fix(go): ensure default values for page and per_page are used when not passed;

* fix(go): rename v1.6.0 to v1.5.1;

* fix(go): check for gorm.ErrRecordNotFound instead of database.IsEmptyRecord;

* fix(go): use host id, not pending host id;

* NM-271: Revert pure-Go SQLite and FIPS disable to verify impact

Revert to CGO-based mattn/go-sqlite3 driver and re-enable FIPS to
isolate whether these changes are still needed now that the global
dbMutex has been removed and WAL mode is enabled. Keep WAL mode
pragma with mattn-compatible DSN format.

* feat(go): add filters to paginated apis;

* feat(go): add filters to paginated apis;

* feat(go): remove check for max username length;

* feat(go): add filters to count as well;

* feat(go): use library to check email address validity;

* feat(go): ignore pagination if params not passed;

* fix(go): pagination issues;

* fix(go): check exists before using;

* fix(go): remove debug log;

* NM-271: rm debug logs

* NM-271: check if caching is enabled

* NM-271: add server sync mq topic for HA mode

* NM-271: fix build

* NM-271: push metrics in batch to exporter over api

* NM-271: use basic auth for exporter metrics api

* fix(go): use gorm err record not found;

* NM-271: Add monitoring stack on demand

* NM-271: -m arg for install script should only add monitoring stack

* fix(go): use gorm err record not found;

* NM-271: update docker compose file for prometheus

* NM-271: update docker compose file for prometheus

* fix(go): use user principal name when creating pending user;

* fix(go): use schema package for consts;

* NM-236: rm duplicate network hook

* NM-271: add server topic to reset idp hooks on master node

* fix(go): prevent disabling superadmin user;

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* fix(go): swap is admin and is superadmin;

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* fix(go): remove dead code block;

https://github.com/gravitl/netmaker/pull/3910#discussion_r2928837937

* fix(go): incorrect message when trying to disable self;

https://github.com/gravitl/netmaker/pull/3910#discussion_r2928837934

* NM-271: fix stale peers on reset_failovered pull and add HTTP timeout to metrics exporter

Run the failover/relay reset synchronously in the pull handler so the
response reflects post-reset topology instead of serving stale cached
peers. Add a 30s timeout to the metrics exporter HTTP client to prevent
PushAllMetricsToExporter from blocking the Keepalive loop.

* NM-271: fix gzip pool corruption, MQTT topic mismatch, stale settings cache, and reduce redundant DB fetches

- Only return gzip.Writer to pool after successful Close to prevent
  silently malformed MQTT payloads from a previously errored writer.
- Fix serversync subscription to exact topic match since syncType is
  now in the message payload, not the topic path.
- Prevent zero-value ServerSettings from being cached indefinitely
  when the DB record is missing or unmarshal fails on startup.
- Return fetched hosts/nodes from RefreshHostPeerInfoCache so
  warmPeerCaches reuses them instead of querying the DB twice.
- Compute fresh HostPeerUpdate on reset_failovered pull instead of
  serving stale cache, and store result back for subsequent requests.

* NM-271: fix gzip writer pool leak, log checkin flush errors, and fix master pod ordinal parsing

- Reset gzip.Writer to io.Discard before returning to pool so errored
  writers are never leaked or silently reused with corrupt state.
- Track and log failed DB inserts in FlushNodeCheckins so operators
  have visibility when check-in timestamps are lost.
- Parse StatefulSet pod ordinal as integer instead of using HasSuffix
  to prevent netmaker-10 from being misidentified as master pod.

* NM-271: simplify masterpod logic

* fix(go): use correct header;

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* fix(go): return after error response;

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* fix(go): use correct order of params;

https://github.com/gravitl/netmaker/pull/3910#discussion_r2929593036

* fix(go): set default values for page and page size; use v2 instead of /list;

* NM-271: use host name

* Update mq/serversync.go

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* NM-271: fix duplicate serversync case

* NM-271: streamline gw updates

* Update logic/auth.go

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* Update schema/user_roles.go

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* fix(go): syntax error;

* fix(go): set default values when page and per_page are not passed or 0;

* fix(go): use uuid.parse instead of uuid.must parse;

* fix(go): review errors;

* fix(go): review errors;

* Update controllers/user.go

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* Update controllers/user.go

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* NM-163: fix errors:

* Update db/types/options.go

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* fix(go): persist return user in event;

* Update db/types/options.go

Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>

* NM-271: signal pull on ip changes

* NM-163: duplicate lines of code

* NM-163: fix(go): fix missing return and filter parsing in user controller

- Add missing return after error response in updateUserAccountStatus
  to prevent double-response and spurious ext-client side-effects
- Use switch statements in listUsers to skip unrecognized
  account_status and mfa_status filter values

* NM-271: signal pull req on node ip change

* fix(go): check for both min and max page size;

* NM-271: refresh node object before update

* fix(go): enclose transfer superadmin in transaction;

* fix(go): review errors;

* fix(go): remove free tier checks;

* fix(go): review fixes;

* NM-271: streamline ip pool ops

* NM-271: fix tests, set max idle conns

* NM-271: fix(go): fix data races in settings cache and peer update worker

- Use pointer type in atomic.Value for serverSettingsCache to avoid
  replacing the variable non-atomically in InvalidateServerSettingsCache
- Swap peerUpdateReplace flag before draining the channel to prevent
  a concurrent replacePeers=true from being consumed by the wrong cycle

---------

Co-authored-by: VishalDalwadi <dalwadivishal26@gmail.com>
Co-authored-by: Vishal Dalwadi <51291657+VishalDalwadi@users.noreply.github.com>
Co-authored-by: tenki-reviewer[bot] <262613592+tenki-reviewer[bot]@users.noreply.github.com>
2026-03-18 00:24:54 +05:30

299 lines
11 KiB
Go

//go:build ee
// +build ee
package pro
import (
"context"
"sync"
"time"
ch "github.com/gravitl/netmaker/clickhouse"
controller "github.com/gravitl/netmaker/controllers"
"github.com/gravitl/netmaker/db"
"github.com/gravitl/netmaker/logger"
"github.com/gravitl/netmaker/logic"
"github.com/gravitl/netmaker/models"
"github.com/gravitl/netmaker/mq"
"github.com/gravitl/netmaker/pro/auth"
proControllers "github.com/gravitl/netmaker/pro/controllers"
"github.com/gravitl/netmaker/pro/email"
proLogic "github.com/gravitl/netmaker/pro/logic"
"github.com/gravitl/netmaker/schema"
"github.com/gravitl/netmaker/servercfg"
"golang.org/x/exp/slog"
)
// InitPro - Initialize Pro Logic.
//
// Called once at server startup for EE builds. It:
//   - flags the server as Pro and installs the Pro logo,
//   - registers Pro HTTP middlewares and handlers,
//   - registers an enterprise check func that handles licensing and starts
//     background workers (some only on the master pod in HA setups),
//   - wires ~65 package-level function pointers in logic/mq to their
//     Pro implementations.
func InitPro() {
	servercfg.IsPro = true
	models.SetLogo(retrieveProLogo())
	controller.HttpMiddlewares = append(
		controller.HttpMiddlewares,
		// TODO: try to add clickhouse middleware only if needed.
		ch.Middleware,
		proControllers.OnlyServerAPIWhenUnlicensedMiddleware,
	)
	controller.HttpHandlers = append(
		controller.HttpHandlers,
		proControllers.MetricHandlers,
		proControllers.UserHandlers,
		proControllers.FailOverHandlers,
		proControllers.RacHandlers,
		proControllers.EventHandlers,
		proControllers.TagHandlers,
		proControllers.NetworkHandlers,
		proControllers.AutoRelayHandlers,
		// TODO: try to add flow handler only if flow logs are enabled.
		proControllers.FlowHandlers,
		proControllers.PostureCheckHandlers,
		proControllers.JITHandlers,
	)
	controller.ListRoles = proControllers.ListRoles
	logic.EnterpriseCheckFuncs = append(logic.EnterpriseCheckFuncs, func(ctx context.Context, wg *sync.WaitGroup) {
		// == License Handling ==
		// NOTE(review): enableLicenseHook is hard-coded to true, so the trial
		// branch below is currently unreachable; the commented-out code shows
		// the original key/tenant-based gating. Confirm this is intentional.
		enableLicenseHook := true
		// licenseKeyValue := servercfg.GetLicenseKey()
		// netmakerTenantID := servercfg.GetNetmakerTenantID()
		// if licenseKeyValue != "" && netmakerTenantID != "" {
		// enableLicenseHook = true
		// }
		if !enableLicenseHook {
			// Trial mode: initialize the trial and fall back to the license
			// hook if the trial cannot be started or has already ended.
			err := initTrial()
			if err != nil {
				logger.Log(0, "failed to init trial", err.Error())
				enableLicenseHook = true
			}
			trialEndDate, err := getTrialEndDate()
			if err != nil {
				slog.Error("failed to get trial end date", "error", err)
				enableLicenseHook = true
			} else {
				// check if trial ended
				if time.Now().After(trialEndDate) {
					// trial ended already
					enableLicenseHook = true
				}
			}
		}
		if enableLicenseHook {
			logger.Log(0, "starting license checker")
			ClearLicenseCache()
			if err := ValidateLicense(); err != nil {
				// Abort the whole enterprise check func on an invalid license:
				// none of the Pro workers below are started in that case.
				slog.Error(err.Error())
				return
			}
			logger.Log(0, "proceeding with Paid Tier license")
			logic.SetFreeTierForTelemetry(false)
			// == End License Handling ==
			// License validation runs on all pods to avoid audit issues
			AddLicenseHooks()
		} else {
			logger.Log(0, "starting trial license hook")
			addTrialLicenseHook()
		}
		//AddUnauthorisedUserNodeHooks()
		var authProvider = auth.InitializeAuthProvider()
		if authProvider != "" {
			slog.Info("OAuth provider,", authProvider+",", "initialized")
		} else {
			slog.Error("no OAuth provider found or not configured, continuing without OAuth")
		}
		// Warm Pro-specific caches before serving traffic.
		proLogic.LoadNodeMetricsToCache()
		proLogic.InitFailOverCache()
		if servercfg.CacheEnabled() {
			proLogic.InitAutoRelayCache()
		}
		// Only run singleton operations on master pod in HA setup
		// These include IDP sync, posture checks, JIT expiry, and flow cleanup
		if servercfg.IsMasterPod() {
			auth.ResetIDPSyncHook()
			proLogic.AddPostureCheckHook()
			// Register JIT expiry hook with email notifications
			addJitExpiryHookWithEmail()
			// Flow logs require both the feature flag and the server setting.
			if proLogic.GetFeatureFlags().EnableFlowLogs && logic.GetServerSettings().EnableFlowLogs {
				err := ch.Initialize()
				if err != nil {
					logger.FatalLog("error connecting to clickhouse:", err.Error())
				}
				proLogic.StartFlowCleanupLoop()
				wg.Add(1)
				// Shut down the flow cleanup loop and the clickhouse
				// connection when the server context is cancelled.
				go func(ctx context.Context, wg *sync.WaitGroup) {
					<-ctx.Done()
					proLogic.StopFlowCleanupLoop()
					ch.Close()
					wg.Done()
				}(ctx, wg)
			}
		}
		// These can run on all pods
		email.Init()
		go proLogic.EventWatcher()
		logic.GetMetricsMonitor().Start()
	})
	// Failover / auto-relay wiring.
	logic.ResetFailOver = proLogic.ResetFailOver
	logic.ResetFailedOverPeer = proLogic.ResetFailedOverPeer
	logic.FailOverExists = proLogic.FailOverExists
	logic.CreateFailOver = proLogic.CreateFailOver
	logic.GetFailOverPeerIps = proLogic.GetFailOverPeerIps
	logic.ResetAutoRelay = proLogic.ResetAutoRelay
	logic.ResetAutoRelayedPeer = proLogic.ResetAutoRelayedPeer
	logic.SetAutoRelay = proLogic.SetAutoRelay
	logic.GetAutoRelayPeerIps = proLogic.GetAutoRelayPeerIps
	// Ext-client access control wiring.
	logic.DenyClientNodeAccess = proLogic.DenyClientNode
	logic.IsClientNodeAllowed = proLogic.IsClientNodeAllowed
	logic.AllowClientNodeAccess = proLogic.RemoveDeniedNodeFromClient
	logic.SetClientDefaultACLs = proLogic.SetClientDefaultACLs
	logic.SetClientACLs = proLogic.SetClientACLs
	logic.UpdateProNodeACLs = proLogic.UpdateProNodeACLs
	// Metrics wiring (HTTP and MQ paths).
	logic.GetMetrics = proLogic.GetMetrics
	logic.UpdateMetrics = proLogic.UpdateMetrics
	logic.DeleteMetrics = proLogic.DeleteMetrics
	logic.GetTrialEndDate = getTrialEndDate
	mq.UpdateMetrics = proLogic.MQUpdateMetrics
	mq.UpdateMetricsFallBack = proLogic.MQUpdateMetricsFallBack
	// User / role / group wiring.
	logic.GetFilteredNodesByUserAccess = proLogic.GetFilteredNodesByUserAccess
	logic.DeleteRole = proLogic.DeleteRole
	logic.NetworkPermissionsCheck = proLogic.NetworkPermissionsCheck
	logic.GlobalPermissionsCheck = proLogic.GlobalPermissionsCheck
	logic.DeleteNetworkRoles = proLogic.DeleteNetworkRoles
	logic.CreateDefaultNetworkRolesAndGroups = proLogic.CreateDefaultNetworkRolesAndGroups
	logic.FilterNetworksByRole = proLogic.FilterNetworksByRole
	logic.IsGroupsValid = proLogic.IsGroupsValid
	logic.InitialiseRoles = proLogic.UserRolesInit
	logic.UpdateUserGwAccess = proLogic.UpdateUserGwAccess
	logic.CreateDefaultUserPolicies = proLogic.CreateDefaultUserPolicies
	logic.IntialiseGroups = proLogic.UserGroupsInit
	logic.AddGlobalNetRolesToAdmins = proLogic.AddGlobalNetRolesToAdmins
	logic.GetUserGroup = proLogic.GetUserGroup
	logic.GetNodeStatus = proLogic.GetNodeStatus
	// OAuth / IDP / email wiring.
	logic.IsOAuthConfigured = auth.IsOAuthConfigured
	logic.ResetAuthProvider = auth.ResetAuthProvider
	logic.ResetIDPSyncHook = auth.ResetIDPSyncHook
	logic.EmailInit = email.Init
	logic.LogEvent = proLogic.LogEvent
	// ACL / tag / firewall-rule wiring.
	logic.RemoveUserFromAclPolicy = proLogic.RemoveUserFromAclPolicy
	logic.IsUserAllowedToCommunicate = proLogic.IsUserAllowedToCommunicate
	logic.DeleteAllNetworkTags = proLogic.DeleteAllNetworkTags
	logic.CreateDefaultTags = proLogic.CreateDefaultTags
	logic.IsPeerAllowed = proLogic.IsPeerAllowed
	logic.IsAclPolicyValid = proLogic.IsAclPolicyValid
	logic.GetEgressUserRulesForNode = proLogic.GetEgressUserRulesForNode
	logic.GetTagMapWithNodesByNetwork = proLogic.GetTagMapWithNodesByNetwork
	logic.GetUserAclRulesForNode = proLogic.GetUserAclRulesForNode
	logic.CheckIfAnyPolicyisUniDirectional = proLogic.CheckIfAnyPolicyisUniDirectional
	logic.MigrateToGws = proLogic.MigrateToGws
	logic.GetFwRulesForNodeAndPeerOnGw = proLogic.GetFwRulesForNodeAndPeerOnGw
	logic.GetFwRulesForUserNodesOnGw = proLogic.GetFwRulesForUserNodesOnGw
	// Feature-flag / DNS / egress / posture / flow wiring.
	logic.GetFeatureFlags = proLogic.GetFeatureFlags
	logic.GetDeploymentMode = proLogic.GetDeploymentMode
	logic.GetNameserversForHost = proLogic.GetNameserversForHost
	logic.GetNameserversForNode = proLogic.GetNameserversForNode
	logic.ValidateNameserverReq = proLogic.ValidateNameserverReq
	logic.ValidateEgressReq = proLogic.ValidateEgressReq
	logic.CheckPostureViolations = proLogic.CheckPostureViolations
	logic.GetPostureCheckDeviceInfoByNode = proLogic.GetPostureCheckDeviceInfoByNode
	logic.StartFlowCleanupLoop = proLogic.StartFlowCleanupLoop
	logic.StopFlowCleanupLoop = proLogic.StopFlowCleanupLoop
	// Expose JIT functions
	logic.CheckJITAccess = proLogic.CheckJITAccess
	logic.AssignVirtualRangeToEgress = proLogic.AssignVirtualRangeToEgress
}
// addJitExpiryHookWithEmail - registers a periodic hook that expires JIT
// grants and sends email notifications. No-op when the JIT feature flag is
// disabled.
func addJitExpiryHookWithEmail() {
	if !proLogic.GetFeatureFlags().EnableJIT {
		return
	}
	// The hook manager runs this hook every 5 minutes.
	expiryHook := models.HookDetails{
		ID:       "jit-expiry-hook",
		Hook:     logic.WrapHook(expireJITGrantsWithEmail),
		Interval: 5 * time.Minute,
	}
	logic.HookManagerCh <- expiryHook
}
// expireJITGrantsWithEmail - expires JIT grants and sends email notifications
// for grants that expired within the last 5 minutes.
//
// The expired grants are listed BEFORE calling ExpireJITGrants so the
// grant/request/network details needed for the emails are still available.
// Listing or email failures are logged but never fail the hook; only an
// ExpireJITGrants error is returned.
func expireJITGrantsWithEmail() error {
	ctx := db.WithContext(context.Background())

	// Get grants that are about to expire or just expired (within last 5 minutes)
	// We check before expiring so we can send emails
	grant := schema.JITGrant{}
	allGrants, err := grant.ListExpired(ctx)
	if err != nil {
		slog.Warn("failed to list expired grants for email notification", "error", err)
		// Continue with expiration even if listing fails
	}

	// Track grants that need email before we expire them
	var grantsToEmail []struct {
		Grant   *schema.JITGrant
		Request *schema.JITRequest
		Network *schema.Network
	}
	fiveMinutesAgo := time.Now().Add(-5 * time.Minute)
	// Index into allGrants instead of taking the address of the range
	// variable: &rangeVar would alias a single variable under pre-Go-1.22
	// loop semantics, making every email reference the last grant.
	for i := range allGrants {
		expiredGrant := &allGrants[i]
		// Only send email for grants that expired recently (within last 5 minutes)
		if expiredGrant.ExpiresAt.After(fiveMinutesAgo) && expiredGrant.RequestID != "" {
			request := schema.JITRequest{ID: expiredGrant.RequestID}
			if err := request.Get(ctx); err == nil {
				network := &schema.Network{Name: expiredGrant.NetworkID}
				// Reuse the existing ctx rather than building a second
				// db.WithContext(context.TODO()).
				if err := network.Get(ctx); err == nil {
					grantsToEmail = append(grantsToEmail, struct {
						Grant   *schema.JITGrant
						Request *schema.JITRequest
						Network *schema.Network
					}{expiredGrant, &request, network})
				}
			}
		}
	}

	// First, expire the grants (this handles the business logic)
	if err := proLogic.ExpireJITGrants(); err != nil {
		return err
	}

	// Then, send email notifications for the grants we tracked
	for _, item := range grantsToEmail {
		if err := email.SendJITExpirationEmail(item.Grant, item.Request, item.Network, false, ""); err != nil {
			slog.Warn("failed to send expiration email", "grant_id", item.Grant.ID, "user", item.Request.UserName, "error", err)
		}
	}
	return nil
}
// retrieveProLogo - returns the ASCII-art banner installed as the server logo
// for Pro builds (see models.SetLogo in InitPro). The raw string content is
// intentionally preserved verbatim.
func retrieveProLogo() string {
	return `
 __ __ ______ ______ __ __ ______ __ __ ______ ______
/\ "-.\ \ /\ ___\ /\__ _\ /\ "-./ \ /\ __ \ /\ \/ / /\ ___\ /\ == \
\ \ \-. \ \ \ __\ \/_/\ \/ \ \ \-./\ \ \ \ __ \ \ \ _"-. \ \ __\ \ \ __<
 \ \_\\"\_\ \ \_____\ \ \_\ \ \_\ \ \_\ \ \_\ \_\ \ \_\ \_\ \ \_____\ \ \_\ \_\
 \/_/ \/_/ \/_____/ \/_/ \/_/ \/_/ \/_/\/_/ \/_/\/_/ \/_____/ \/_/ /_/
 ___ ___ ____
 ____ ____ ____ / _ \ / _ \ / __ \ ____ ____ ____
/___/ /___/ /___/ / ___/ / , _// /_/ / /___/ /___/ /___/
 /___/ /___/ /___/ /_/ /_/|_| \____/ /___/ /___/ /___/
`
}