From f420773b0e882e08cb44fee7ed0bb0422cd06e10 Mon Sep 17 00:00:00 2001 From: Austin Zhai Date: Fri, 3 Apr 2026 09:25:47 +0800 Subject: [PATCH] Feat/container multi os (#98) * Add comprehensive in-process test framework Add unit tests for exchange layer, E2E integration tests, security tests (race + fuzz), and Go benchmark tests replacing the old shell-script-based bench programs. All tests run in-process without requiring an external frontier process. Suppress klog and armorigo log noise in all test files. Co-Authored-By: Claude Sonnet 4.6 * Update build configs, Dockerfiles and dependencies Update Makefile with new targets, consolidate frontier_all.yaml config, bump base image versions in Dockerfiles, and update go.mod/go.sum. Co-Authored-By: Claude Sonnet 4.6 * Revert etc/frontier_all.yaml to previous version Co-Authored-By: Claude Sonnet 4.6 --------- Co-authored-by: Claude Sonnet 4.6 --- AGENTS.md | 130 ++ Makefile | 14 + docs/frontier-technical.md | 1876 ++++++++++++++++++++++++ etc/frontlas.yaml | 2 +- go.mod | 2 +- go.sum | 4 +- images/Dockerfile.build | 2 +- images/Dockerfile.controlplane-api | 2 +- images/Dockerfile.example_iclm_service | 2 +- images/Dockerfile.frontier | 2 +- images/Dockerfile.frontlas | 2 +- pkg/frontier/exchange/exchange_test.go | 320 ++++ test/TEST_PLAN.md | 373 +++++ test/bench/benchmark_test.go | 338 +++++ test/e2e/conn_test.go | 107 ++ test/e2e/main_test.go | 131 ++ test/e2e/message_test.go | 192 +++ test/e2e/resource_test.go | 90 ++ test/e2e/rpc_test.go | 125 ++ test/e2e/stream_test.go | 259 ++++ test/run_tests.sh | 274 ++++ test/security/fuzz_test.go | 88 ++ test/security/main_test.go | 95 ++ test/security/race_test.go | 124 ++ 24 files changed, 4545 insertions(+), 9 deletions(-) create mode 100644 AGENTS.md create mode 100644 docs/frontier-technical.md create mode 100644 pkg/frontier/exchange/exchange_test.go create mode 100644 test/TEST_PLAN.md create mode 100644 test/bench/benchmark_test.go create mode 100644 
test/e2e/conn_test.go create mode 100644 test/e2e/main_test.go create mode 100644 test/e2e/message_test.go create mode 100644 test/e2e/resource_test.go create mode 100644 test/e2e/rpc_test.go create mode 100644 test/e2e/stream_test.go create mode 100755 test/run_tests.sh create mode 100644 test/security/fuzz_test.go create mode 100644 test/security/main_test.go create mode 100644 test/security/race_test.go diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..00d9bad --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,130 @@ + + + + +## Available Skills + + + +When users ask you to perform tasks, check if any of the available skills below can help complete the task more effectively. Skills provide specialized capabilities and domain knowledge. + +How to use skills: +- Invoke: `npx openskills read ` (run in your shell) + - For multiple: `npx openskills read skill-one,skill-two` +- The skill content will load with detailed instructions on how to complete the task +- Base directory provided in output for resolving bundled resources (references/, scripts/, assets/) + +Usage notes: +- Only use skills listed in below +- Do not invoke a skill that is already loaded in your context +- Each skill invocation is stateless + + + + + +algorithmic-art +Creating algorithmic art using p5.js with seeded randomness and interactive parameter exploration. Use this when users request creating art using code, generative art, algorithmic art, flow fields, or particle systems. Create original algorithmic art rather than copying existing artists' work to avoid copyright violations. +global + + + +brand-guidelines +Applies Anthropic's official brand colors and typography to any sort of artifact that may benefit from having Anthropic's look-and-feel. Use it when brand colors or style guidelines, visual formatting, or company design standards apply. +global + + + +canvas-design +Create beautiful visual art in .png and .pdf documents using design philosophy. 
You should use this skill when the user asks to create a poster, piece of art, design, or other static piece. Create original visual designs, never copying existing artists' work to avoid copyright violations. +global + + + +doc-coauthoring +Guide users through a structured workflow for co-authoring documentation. Use when user wants to write documentation, proposals, technical specs, decision docs, or similar structured content. This workflow helps users efficiently transfer context, refine content through iteration, and verify the doc works for readers. Trigger when user mentions writing docs, creating proposals, drafting specs, or similar documentation tasks. +global + + + +docx +"Use this skill whenever the user wants to create, read, edit, or manipulate Word documents (.docx files). Triggers include: any mention of \"Word doc\", \"word document\", \".docx\", or requests to produce professional documents with formatting like tables of contents, headings, page numbers, or letterheads. Also use when extracting or reorganizing content from .docx files, inserting or replacing images in documents, performing find-and-replace in Word files, working with tracked changes or comments, or converting content into a polished Word document. If the user asks for a \"report\", \"memo\", \"letter\", \"template\", or similar deliverable as a Word or .docx file, use this skill. Do NOT use for PDFs, spreadsheets, Google Docs, or general coding tasks unrelated to document generation." +global + + + +frontend-design +Create distinctive, production-grade frontend interfaces with high design quality. Use this skill when the user asks to build web components, pages, artifacts, posters, or applications (examples include websites, landing pages, dashboards, React components, HTML/CSS layouts, or when styling/beautifying any web UI). Generates creative, polished code and UI design that avoids generic AI aesthetics. 
+global + + + +internal-comms +A set of resources to help me write all kinds of internal communications, using the formats that my company likes to use. Claude should use this skill whenever asked to write some sort of internal communications (status reports, leadership updates, 3P updates, company newsletters, FAQs, incident reports, project updates, etc.). +global + + + +mcp-builder +Guide for creating high-quality MCP (Model Context Protocol) servers that enable LLMs to interact with external services through well-designed tools. Use when building MCP servers to integrate external APIs or services, whether in Python (FastMCP) or Node/TypeScript (MCP SDK). +global + + + +pdf +Use this skill whenever the user wants to do anything with PDF files. This includes reading or extracting text/tables from PDFs, combining or merging multiple PDFs into one, splitting PDFs apart, rotating pages, adding watermarks, creating new PDFs, filling PDF forms, encrypting/decrypting PDFs, extracting images, and OCR on scanned PDFs to make them searchable. If the user mentions a .pdf file or asks to produce one, use this skill. +global + + + +pptx +"Use this skill any time a .pptx file is involved in any way — as input, output, or both. This includes: creating slide decks, pitch decks, or presentations; reading, parsing, or extracting text from any .pptx file (even if the extracted content will be used elsewhere, like in an email or summary); editing, modifying, or updating existing presentations; combining or splitting slide files; working with templates, layouts, speaker notes, or comments. Trigger whenever the user mentions \"deck,\" \"slides,\" \"presentation,\" or references a .pptx filename, regardless of what they plan to do with the content afterward. If a .pptx file needs to be opened, created, or touched, use this skill." +global + + + +skill-creator +Guide for creating effective skills. 
This skill should be used when users want to create a new skill (or update an existing skill) that extends Claude's capabilities with specialized knowledge, workflows, or tool integrations. +global + + + +slack-gif-creator +Knowledge and utilities for creating animated GIFs optimized for Slack. Provides constraints, validation tools, and animation concepts. Use when users request animated GIFs for Slack like "make me a GIF of X doing Y for Slack." +global + + + +template +Replace with description of the skill and when Claude should use it. +global + + + +theme-factory +Toolkit for styling artifacts with a theme. These artifacts can be slides, docs, reportings, HTML landing pages, etc. There are 10 pre-set themes with colors/fonts that you can apply to any artifact that has been creating, or can generate a new theme on-the-fly. +global + + + +web-artifacts-builder +Suite of tools for creating elaborate, multi-component claude.ai HTML artifacts using modern frontend web technologies (React, Tailwind CSS, shadcn/ui). Use for complex artifacts requiring state management, routing, or shadcn/ui components - not for simple single-file HTML/JSX artifacts. +global + + + +webapp-testing +Toolkit for interacting with and testing local web applications using Playwright. Supports verifying frontend functionality, debugging UI behavior, capturing browser screenshots, and viewing browser logs. +global + + + +xlsx +"Use this skill any time a spreadsheet file is the primary input or output. This means any task where the user wants to: open, read, edit, or fix an existing .xlsx, .xlsm, .csv, or .tsv file (e.g., adding columns, computing formulas, formatting, charting, cleaning messy data); create a new spreadsheet from scratch or from other data sources; or convert between tabular file formats. Trigger especially when the user references a spreadsheet file by name or path — even casually (like \"the xlsx in my downloads\") — and wants something done to it or produced from it. 
Also trigger for cleaning or restructuring messy tabular data files (malformed rows, misplaced headers, junk data) into proper spreadsheets. The deliverable must be a spreadsheet file. Do NOT trigger when the primary deliverable is a Word document, HTML report, standalone Python script, database pipeline, or Google Sheets API integration, even if tabular data is involved." +global + + + + + + diff --git a/Makefile b/Makefile index afc2826..aa5b45f 100644 --- a/Makefile +++ b/Makefile @@ -52,6 +52,12 @@ help: @echo " make frontier-windows-amd64 - Build frontier for Windows amd64" @echo " make frontier-windows-arm64 - Build frontier for Windows arm64" @echo "" + @echo "Docker images (REGISTRY=, VERSION=):" + @echo " make image-frontier - Build frontier image (current platform)" + @echo " make image-frontier-linux - Build frontier image for linux/amd64" + @echo " make image-frontlas - Build frontlas image (current platform)" + @echo " make image-frontlas-linux - Build frontlas image for linux/amd64" + @echo "" @echo "Other targets:" @echo " make clean - Clean local build artifacts" @echo " make clean-dist - Clean cross-compilation artifacts" @@ -419,10 +425,18 @@ uninstall-systemd: image-frontier: docker buildx build -t ${REGISTRY}/frontier:${VERSION} -f images/Dockerfile.frontier . +.PHONY: image-frontier-linux +image-frontier-linux: + docker buildx build --platform linux/amd64 -t ${REGISTRY}/frontier:${VERSION} -f images/Dockerfile.frontier . + .PHONY: image-frontlas image-frontlas: docker buildx build -t ${REGISTRY}/frontlas:${VERSION} -f images/Dockerfile.frontlas . +.PHONY: image-frontlas-linux +image-frontlas-linux: + docker buildx build --platform linux/amd64 -t ${REGISTRY}/frontlas:${VERSION} -f images/Dockerfile.frontlas . + .PHONY: image-gen-api image-gen-api: docker buildx build -t image-gen-api:${VERSION} -f images/Dockerfile.controlplane-api . 
diff --git a/docs/frontier-technical.md b/docs/frontier-technical.md new file mode 100644 index 0000000..8b8a7f9 --- /dev/null +++ b/docs/frontier-technical.md @@ -0,0 +1,1876 @@ +# Frontier 技术原理 + +## 目录 + +1. [客户端认证](#客户端认证) +2. [客户端上下线](#客户端上下线) +3. [Exchange 原理](#exchange-原理) + +--- + +## 客户端认证 + +### 概述 + +Frontier 支持两种类型的客户端连接: +- **Service(服务端)**:后端服务,负责处理业务逻辑 +- **Edge(边缘端)**:客户端,通常是终端设备或用户应用 + +### Service 客户端认证 + +Service 客户端在连接时通过 `Meta` 结构体传递认证信息: + +```go +type Meta struct { + Service string `json:"service"` // 服务名称 + Topics []string `json:"topics"` // 订阅的主题列表 +} +``` + +**连接流程:** + +1. Service 客户端建立 TCP 连接 +2. 通过 Geminio 协议握手,在 `Meta` 中携带服务名称和订阅的 Topics +3. Frontier 解析 `Meta` 并分配 `ServiceID` +4. 注册服务到内存缓存和数据库 +5. 注册服务声明的 Topics 和 RPCs + +**关键代码位置:** +- `pkg/frontier/servicebound/service_manager.go:handleConn()` - 处理连接 +- `pkg/frontier/servicebound/service_manager.go:online()` - 上线处理 +- `pkg/frontier/servicebound/service_onoff.go:GetClientID()` - ID 分配 + +### Edge 客户端认证 + +Edge 客户端连接时通过 `Meta` 传递元数据(通常是用户标识等信息): + +**连接流程:** + +1. Edge 客户端建立 TCP 连接 +2. 通过 Geminio 协议握手,携带 `Meta` 信息 +3. Frontier 通过 Exchange 向 Service 请求分配 `EdgeID` + - 如果 Service 不在线,根据配置决定是否自动分配 ID +4. 注册 Edge 到内存缓存和数据库 + +**EdgeID 分配机制:** + +```go +func (em *edgeManager) GetClientID(_ uint64, meta []byte) (uint64, error) { + // 优先从 Exchange 获取 EdgeID(通过 Service) + if em.exchange != nil { + edgeID, err := em.exchange.GetEdgeID(meta) + if err == nil { + return edgeID, nil + } + } + + // 如果 Service 不在线,根据配置决定是否自动分配 + if em.conf.Edgebound.EdgeIDAllocWhenNoIDServiceOn { + return em.idFactory.GetID(), nil + } + + return 0, err +} +``` + +**关键代码位置:** +- `pkg/frontier/edgebound/edge_manager.go:handleConn()` - 处理连接 +- `pkg/frontier/edgebound/edge_manager.go:online()` - 上线处理 +- `pkg/frontier/edgebound/edge_onoff.go:GetClientID()` - ID 分配 + +### 认证特点 + +1. **基于 Geminio 协议**:使用 Geminio 作为底层通信协议 +2. **Meta 信息传递**:通过连接时的 Meta 字段传递认证和配置信息 +3. 
**ID 分配策略**: + - Service:支持指定 ID 或自动分配 + - Edge:优先从 Service 获取,支持降级自动分配 +4. **连接复用**:同一 ServiceID/EdgeID 的旧连接会被新连接踢下线 + +--- + +## 客户端上下线 + +### Service 上线流程 + +**时序图:** + +``` +Service Client Servicebound Manager + | | + |---- TCP Connect -------->| + | | + |---- Geminio Handshake -->| + | | + |<--- Parse Meta ----------| + | | + |---- Allocate ServiceID ->| + | | + |---- Register Service -->| + | | + |---- Register Topics ----->| + | | + |---- Register RPCs ------>| + | | + |---- Add to MQM --------->| + | | + |---- ConnOnline Event --->| + | | + |---- Forward Setup ------->| +``` + +**详细步骤:** + +1. **连接建立** (`handleConn`) + - 接受 TCP 连接 + - 创建 Geminio End + - 解析 Meta 信息 + +2. **注册 Topics** (`remoteReceiveClaim`) + - 将服务声明的 Topics 注册到数据库 + - 创建 `ServiceTopic` 记录 + +3. **添加到 MQM** (Message Queue Manager) + - 将 End 添加到消息队列管理器 + - 用于后续消息路由 + +4. **上线处理** (`online`) + - 检查是否存在旧连接,如果存在则关闭 + - 添加到内存缓存 `services[serviceID] = end` + - 创建数据库记录 `Service` + - 触发 `ConnOnline` 事件 + +5. **设置转发** (`forward`) + - 调用 Exchange 设置 Service -> Edge 的转发 + +**关键代码:** + +```go +func (sm *serviceManager) handleConn(conn net.Conn) error { + // 创建 Geminio End + end, err := server.NewEndWithConn(conn, opt) + + // 解析 Meta + meta := &apis.Meta{} + json.Unmarshal(end.Meta(), meta) + + // 注册 Topics + sm.remoteReceiveClaim(end.ClientID(), meta.Topics) + + // 添加到 MQM + sm.mqm.AddMQByEnd(meta.Topics, end) + + // 上线处理 + sm.online(end, meta) + + // 设置转发 + sm.forward(meta, end) +} +``` + +### Service 下线流程 + +**触发时机:** +- 客户端主动关闭连接 +- 网络断开 +- 连接超时 + +**处理步骤:** + +1. **ConnOffline 事件** + - Geminio 协议层检测到连接断开 + - 触发 `ConnOffline` 回调 + +2. **清理缓存** (`offline`) + - 从内存缓存 `services` 中删除 + - 验证地址匹配,避免误删 + +3. **清理数据库** + - 删除 `Service` 记录 + - 删除 `ServiceRPCs` 记录 + - 删除 `ServiceTopics` 记录 + +4. **清理 MQM** + - 从消息队列管理器中移除 + +5. 
**通知其他组件** + - 触发 `ServiceOffline` 事件 + - 通知 Informer + +**关键代码:** + +```go +func (sm *serviceManager) ConnOffline(d delegate.ConnDescriber) error { + serviceID := d.ClientID() + addr := d.RemoteAddr() + + // 清理缓存和数据库 + err := sm.offline(serviceID, addr) + + // 通知其他组件 + if sm.informer != nil { + sm.informer.ServiceOffline(serviceID, meta, addr) + } + + return nil +} +``` + +### Edge 上线流程 + +**时序图:** + +``` +Edge Client Edgebound Manager Exchange Service + | | | | + |---- TCP Connect ---->| | | + | | | | + |---- Geminio -------->| | | + | Handshake | | | + | | | | + |<--- Get EdgeID ------| | | + | |---- GetEdgeID ------->| | + | | |---- RPC ------->| + | | |<--- EdgeID -----| + | |<--- EdgeID ----------| | + |<--- EdgeID ----------| | | + | | | | + | |---- Register Edge --->| | + | |---- ConnOnline ------>| | + | |---- Forward Setup --->| | +``` + +**详细步骤:** + +1. **连接建立** (`handleConn`) + - 接受 TCP 连接 + - 创建 Geminio End + +2. **EdgeID 分配** (`GetClientID`) + - 优先通过 Exchange 向 Service 请求 EdgeID + - 如果 Service 不在线,根据配置决定是否自动分配 + +3. **上线处理** (`online`) + - 检查是否存在旧连接,如果存在则关闭 + - 添加到内存缓存 `edges[edgeID] = end` + - 创建数据库记录 `Edge` + - 触发 `ConnOnline` 事件 + - 通知 Exchange Edge 上线 + +4. **设置转发** (`forward`) + - 调用 Exchange 设置 Edge -> Service 的转发 + +**关键代码:** + +```go +func (em *edgeManager) handleConn(conn net.Conn) error { + // 创建 Geminio End + end, err := server.NewEndWithConn(conn, opt) + + // 上线处理(内部会分配 EdgeID) + em.online(end) + + // 设置转发 + em.forward(end) +} +``` + +### Edge 下线流程 + +**处理步骤:** + +1. **ConnOffline 事件** + - Geminio 协议层检测到连接断开 + +2. **清理缓存** (`offline`) + - 从内存缓存 `edges` 中删除 + - 验证地址匹配 + +3. **清理数据库** + - 删除 `Edge` 记录 + - 删除 `EdgeRPCs` 记录 + +4. 
**通知其他组件** + - 触发 `EdgeOffline` 事件 + - 通知 Exchange Edge 下线 + - 通知 Informer + +**关键代码:** + +```go +func (em *edgeManager) ConnOffline(d delegate.ConnDescriber) error { + edgeID := d.ClientID() + meta := d.Meta() + addr := d.RemoteAddr() + + // 清理缓存和数据库 + err := em.offline(edgeID, meta, addr) + + return nil +} +``` + +### 上下线特性 + +1. **并发安全** + - 使用 `sync.RWMutex` 保护内存缓存 + - 使用 `synchub.SyncHub` 同步旧连接清理 + +2. **幂等性** + - 通过地址匹配避免误删 + - 支持同一 ID 的重复连接(会踢掉旧连接) + +3. **数据一致性** + - 内存缓存和数据库同步更新 + - 使用事务保证数据一致性(TODO) + +4. **事件通知** + - 通过 Informer 通知其他组件 + - 支持自定义事件处理 + +--- + +## Exchange 原理 + +### 概述 + +Exchange 是 Frontier 的核心组件,负责在 Service 和 Edge 之间转发消息、RPC 调用和 Stream。它实现了 Service 和 Edge 的解耦,使得两者可以独立扩展。 + +### 架构设计 + +``` + Exchange + | + +--------------+--------------+ + | | + Edgebound Servicebound + | | + Edge Clients Service Clients +``` + +**核心组件:** + +- `Edgebound`: 管理所有 Edge 连接 +- `Servicebound`: 管理所有 Service 连接 +- `MQM`: 消息队列管理器,负责消息路由 +- `Exchange`: 转发引擎 + +### Service -> Edge 转发 + +#### 消息转发 (Message Forwarding) + +**流程:** + +1. Service 发送消息,在 `Custom` 字段末尾携带目标 `EdgeID`(8字节) +2. Exchange 截取 `EdgeID` 并查找对应的 Edge +3. 如果 Edge 在线,将消息转发给 Edge +4. 
如果 Edge 不在线,返回错误 + +**时序图:** + +``` +Service Exchange Edgebound Edge + | | | | + |--Publish------->| | | + | | | | + | |--GetEdgeByID--->| | + | | | | + | |<--Edge End------| | + | | | | + | |-----------------|--Publish---->| + | | | | + | |<----------------|--Done--------| + | | | | + |<--Done----------| | | + | | | | + + 说明: Publish Message 携带 Custom+EdgeID + Exchange 提取 EdgeID 后查找 Edge + 如果 Edge 在线,转发消息并返回 Done + 如果 Edge 不在线,返回 Error(ErrEdgeNotOnline) + + 或者(Edge不在线): + +Service Exchange Edgebound Edge + | | | | + |--Publish------->| | | + | | | | + | |--GetEdgeByID--->| | + | | | | + | |<--nil-----------| | + | | | | + |<--Error----------| | | +``` + +**关键代码:** + +```go +func (ex *exchange) forwardMessageToEdge(end geminio.End) { + serviceID := end.ClientID() + go func() { + for { + msg, err := end.Receive(context.TODO()) + if err != nil { + return + } + + // 从 Custom 末尾提取 EdgeID + custom := msg.Custom() + edgeID := binary.BigEndian.Uint64(custom[len(custom)-8:]) + msg.SetCustom(custom[:len(custom)-8]) + + // 查找 Edge + edge := ex.Edgebound.GetEdgeByID(edgeID) + if edge == nil { + msg.Error(apis.ErrEdgeNotOnline) + return + } + + // 转发消息 + mopt := options.NewMessage() + mopt.SetCustom(msg.Custom()) + mopt.SetTopic(msg.Topic()) + newmsg := edge.NewMessage(msg.Data(), mopt) + edge.Publish(context.TODO(), newmsg, popt) + msg.Done() + } + }() +} +``` + +#### RPC 转发 (RPC Forwarding) + +**流程:** + +1. Service 发起 RPC 调用,在 `Custom` 字段末尾携带目标 `EdgeID` +2. Exchange 拦截 RPC(通过 Hijack) +3. 提取 `EdgeID` 并查找 Edge +4. 转发 RPC 调用到 Edge +5. 
将响应返回给 Service,并在 `Custom` 中携带 `EdgeID` + +**时序图:** + +``` +Service Exchange Edgebound Edge + | | | | + |--Call RPC------>| | | + | | | | + | |--GetEdgeByID--->| | + | | | | + | |<--Edge End------| | + | | | | + | |-----------------|--Call RPC-->| + | | | | + | |<----------------|--Response----| + | | | | + |<--Response-------| | | + | | | | + + 说明: Call RPC 携带 Custom+EdgeID + Exchange Hijack 拦截并提取 EdgeID + 如果 Edge 在线,转发 RPC 并返回 Response(Data+Custom+EdgeID) + 如果 Edge 不在线,返回 Error(ErrEdgeNotOnline) + + 或者(Edge不在线): + +Service Exchange Edgebound Edge + | | | | + |--Call RPC------>| | | + | | | | + | |--GetEdgeByID--->| | + | | | | + | |<--nil-----------| | + | | | | + |<--Error----------| | | +``` + +**关键代码:** + +```go +func (ex *exchange) forwardRPCToEdge(end geminio.End) { + end.Hijack(func(ctx context.Context, method string, r1 geminio.Request, r2 geminio.Response) { + serviceID := end.ClientID() + + // 提取 EdgeID + custom := r1.Custom() + edgeID := binary.BigEndian.Uint64(custom[len(custom)-8:]) + r1.SetCustom(custom[:len(custom)-8]) + + // 查找 Edge + edge := ex.Edgebound.GetEdgeByID(edgeID) + if edge == nil { + r2.SetError(apis.ErrEdgeNotOnline) + return + } + + // 转发 RPC + r3 := edge.NewRequest(r1.Data(), ropt) + r4, err := edge.Call(ctx, method, r3, copt) + + // 返回响应,携带 EdgeID + tail := make([]byte, 8) + binary.BigEndian.PutUint64(tail, edgeID) + r2.SetCustom(append(r4.Custom(), tail...)) + r2.SetData(r4.Data()) + }) +} +``` + +### Edge -> Service 转发 + +#### 消息转发 (Message Forwarding) + +**流程:** + +1. Edge 发送消息到指定 Topic +2. Exchange 接收消息 +3. 通过 MQM 将消息投递到订阅该 Topic 的 Service +4. 
MQM 负责消息路由和负载均衡 + +**时序图:** + +``` +Edge Exchange MQM Servicebound Service + | | | | | + |--Publish---->| | | | + | | | | | + | |--Produce---->| | | + | | | | | + | | |--GetServices>| | + | | | | | + | | |<--ServiceList| | + | | | | | + | | |--Deliver---->| | + | | | | | + | | |<--Done-------| | + | | | | | + | |<--Success----| | | + | | | | | + |<--Done-------| | | | + + 说明: Edge 发送消息到指定 Topic + Exchange 通过 MQM 投递消息 + MQM 查找订阅 Topic 的 Services 并负载均衡投递 +``` + +**关键代码:** + +```go +func (ex *exchange) forwardMessageToService(end geminio.End) { + edgeID := end.ClientID() + go func() { + for { + msg, err := end.Receive(context.TODO()) + if err != nil { + return + } + + topic := msg.Topic() + + // 通过 MQM 投递消息 + err = ex.MQM.Produce(topic, msg.Data(), + apis.WithOrigin(msg), + apis.WithEdgeID(edgeID), + apis.WithAddr(end.RemoteAddr())) + + if err != nil { + msg.Error(err) + continue + } + msg.Done() + } + }() +} +``` + +#### RPC 转发 (RPC Forwarding) + +**流程:** + +1. Edge 发起 RPC 调用,指定 RPC 方法名 +2. Exchange 拦截 RPC +3. 根据 RPC 方法名查找提供该方法的 Service(可能有多个) +4. 使用哈希算法选择目标 Service(负载均衡) +5. 转发 RPC 调用,在 `Custom` 中携带 `EdgeID` +6. 
将响应返回给 Edge + +**时序图:** + +``` +Edge Exchange Servicebound Service + | | | | + |--Call RPC-->| | | + | | | | + | |--GetServicesByRPC| | + | | | | + | |<--ServiceList----| | + | | | | + | |------------------|--Call RPC-->| + | | | | + | |<-----------------|--Response---| + | | | | + |<--Response----| | | + | | | | + + 说明: Edge 发起 RPC 调用指定 method + Exchange Hijack 拦截并查找提供该 RPC 的 Services + 使用哈希算法 Hash(edgeID, addr) 选择 Service + 在 Custom 中追加 EdgeID 后转发 RPC + 返回 Response(Data+Custom) +``` + +**负载均衡策略:** + +```go +// 使用哈希算法选择 Service +index := misc.Hash(ex.conf.Exchange.HashBy, len(svcs), edgeID, addr) +svc := svcs[index] +``` + +**关键代码:** + +```go +func (ex *exchange) forwardRPCToService(end geminio.End) { + edgeID := end.ClientID() + addr := end.RemoteAddr() + + end.Hijack(func(ctx context.Context, method string, r1 geminio.Request, r2 geminio.Response) { + // 查找提供该 RPC 的 Services + svcs, err := ex.Servicebound.GetServicesByRPC(method) + if err != nil { + r2.SetError(err) + return + } + + // 负载均衡选择 Service + index := misc.Hash(ex.conf.Exchange.HashBy, len(svcs), edgeID, addr) + svc := svcs[index] + + // 在 Custom 中携带 EdgeID + tail := make([]byte, 8) + binary.BigEndian.PutUint64(tail, edgeID) + custom := append(r1.Custom(), tail...) + + // 转发 RPC + r3 := svc.NewRequest(r1.Data(), ropt) + r4, err := svc.Call(ctx, method, r3, copt) + + r2.SetData(r4.Data()) + r2.SetCustom(r4.Custom()) + }) +} +``` + +### Stream 转发 + +Stream 用于建立 Service 和 Edge 之间的双向数据流。 + +#### Service -> Edge Stream + +**流程:** + +1. Service 创建 Stream,在 `Peer` 字段中指定目标 `EdgeID` +2. Exchange 解析 `EdgeID` +3. 查找对应的 Edge +4. 在 Edge 端创建对应的 Stream +5. 
双向转发 Stream 数据(Raw、Message、RPC) + +**时序图:** + +``` +Service Exchange Edgebound Edge + | | | | + |--OpenStream---->| | | + | | | | + | |--GetEdgeByID--->| | + | | | | + | |<--Edge End------| | + | | | | + | |-----------------|--OpenStream->| + | | | | + | |<----------------|--EdgeStream--| + | | | | + |<--Connected-----| | | + | | | | + |<==============Bidirectional Data==========>| + | | | | + + 说明: Service 创建 Stream,Peer=EdgeID + Exchange 解析 EdgeID 并查找 Edge + 如果 Edge 在线,建立双向 Stream 并转发数据 + 如果 Edge 不在线,关闭 Stream + + 或者(Edge不在线): + +Service Exchange Edgebound Edge + | | | | + |--OpenStream---->| | | + | | | | + | |--GetEdgeByID--->| | + | | | | + | |<--nil-----------| | + | | | | + |<--Close Stream--| | | +``` + +**关键代码:** + +```go +func (ex *exchange) StreamToEdge(serviceStream geminio.Stream) { + // 从 Peer 中解析 EdgeID + peer := serviceStream.Peer() + edgeID, err := strconv.ParseUint(peer, 10, 64) + + // 查找 Edge + edge := ex.Edgebound.GetEdgeByID(edgeID) + if edge == nil { + serviceStream.Close() + return + } + + // 创建 Edge Stream + edgeStream, err := edge.OpenStream() + + // 双向转发 + ex.streamForward(serviceStream, edgeStream) +} +``` + +#### Edge -> Service Stream + +**流程:** + +1. Edge 创建 Stream,在 `Peer` 字段中指定目标 Service 名称 +2. Exchange 根据 Service 名称查找 Service +3. 在 Service 端创建对应的 Stream +4. 
双向转发 Stream 数据 + +**时序图:** + +``` +Edge Exchange Servicebound Service + | | | | + |--OpenStream->| | | + | | | | + | |--GetServiceByName| | + | | | | + | |<--Service End----| | + | | | | + | |------------------|--OpenStream->| + | | | | + | |<-----------------|--ServiceStream| + | | | | + |<--Connected---| | | + | | | | + |<============Bidirectional Data==========>| + | | | | + + 说明: Edge 创建 Stream,Peer=ServiceName + Exchange 解析 Service 名称并查找 Service + 如果 Service 在线,建立双向 Stream 并转发数据 + 如果 Service 不在线,关闭 Stream + + 或者(Service不在线): + +Edge Exchange Servicebound Service + | | | | + |--OpenStream->| | | + | | | | + | |--GetServiceByName| | + | | | | + | |<--Error-----------| | + | | | | + |<--Close Stream| | | +``` + +**关键代码:** + +```go +func (ex *exchange) StreamToService(edgeStream geminio.Stream) { + // 从 Peer 中解析 Service 名称 + peer := edgeStream.Peer() + svc, err := ex.Servicebound.GetServiceByName(peer) + + // 创建 Service Stream + serviceStream, err := svc.OpenStream() + + // 双向转发 + ex.streamForward(edgeStream, serviceStream) +} +``` + +#### Stream 数据转发 + +Stream 支持三种数据类型的转发: + +1. **Raw 数据**:原始字节流双向转发 +2. **Message**:消息双向转发 +3. **RPC**:RPC 调用双向转发 + +**时序图:** + +``` +Stream A Exchange Stream B + | | | + |--Raw Data---->| | + | | | + | |--Raw Data---->| + | | | + |<--Raw Data----| | + | | | + | |<--Raw Data----| + | | | + |--Message----->| | + | | | + | |--Publish Msg->| + | | | + |<--Message-----| | + | | | + | |<--Message------| + | | | + |--RPC Request->| | + | | | + | |--Call RPC----->| + | | | + | |<--RPC Response-| + | | | + |<--RPC Response| | + + 说明: Stream 支持三种数据类型的双向转发 + - Raw 数据: 原始字节流双向转发 + - Message: 消息双向转发 + - RPC: RPC 调用双向转发 +``` + +**关键代码:** + +```go +func (ex *exchange) streamForward(left, right geminio.Stream) { + // Raw 数据转发 + ex.streamForwardRaw(left, right) + // Message 转发 + ex.streamForwardMessage(left, right) + // RPC 转发 + ex.streamForwardRPC(left, right) +} +``` + +### Exchange 特性 + +1. 
**透明转发** + - Service 和 Edge 无需知道对方的具体位置 + - 通过 ID 或名称进行路由 + +2. **负载均衡** + - RPC 调用支持多 Service 负载均衡 + - 使用哈希算法保证相同 Edge 的请求路由到同一 Service + +3. **错误处理** + - 目标不在线时返回明确错误 + - 支持超时控制(默认 30 秒) + +4. **Custom 字段传递** + - 通过 Custom 字段传递路由信息(EdgeID) + - 保持原始 Custom 数据,仅在末尾追加路由信息 + +5. **异步处理** + - 消息转发使用 goroutine 异步处理 + - RPC 转发同步等待响应 + +### 数据流向总结 + +``` +Service -> Edge: + - Message: Service 指定 EdgeID -> Exchange 转发 -> Edge + - RPC: Service 指定 EdgeID -> Exchange 转发 -> Edge -> 响应返回 + - Stream: Service 指定 EdgeID -> Exchange 建立双向流 + +Edge -> Service: + - Message: Edge 指定 Topic -> Exchange -> MQM -> 订阅的 Service + - RPC: Edge 指定方法名 -> Exchange 负载均衡 -> Service -> 响应返回 + - Stream: Edge 指定 Service 名称 -> Exchange 建立双向流 +``` + +--- + +--- + +## Frontier + Frontlas 集群模式 + +### 概述 + +Frontier 集群模式通过引入 Frontlas(Frontier Atlas)组件实现多 Frontier 实例的协调管理。Frontlas 是一个无状态的集群管理组件,使用 Redis 存储 Frontier、Service 和 Edge 的元数据信息。 + +**架构图:** + +``` + Frontlas (集群管理) + | + +------------+------------+ + | | + Redis (元数据存储) gRPC/REST API + | | + +-------+-------+ +-------+-------+ + | | | | +Frontier-1 Frontier-2 Service-1 Service-2 + | | | | +Edge-1 Edge-2 | | + ... ... +``` + +**核心组件:** + +- **Frontier**: 无状态的数据平面组件,可以水平扩展 +- **Frontlas**: 无状态的集群管理组件,使用 Redis 存储元数据 +- **Redis**: 存储 Frontier、Service、Edge 的元数据和存活信息 + +### 多 Frontier 下的连接管理 + +#### Service 连接管理 + +在集群模式下,Service 通过 `clusterServiceEnd` 管理多个 Frontier 连接。 + +**连接池管理:** + +```go +type clusterServiceEnd struct { + cc clusterv1.ClusterServiceClient // gRPC 客户端,连接 Frontlas + + edgefrontiers *mapmap.BiMap // EdgeID <-> FrontierID 双向映射 + frontiers sync.Map // FrontierID -> frontierNend 连接池 +} +``` + +**关键机制:** + +1. **定期更新 Frontier 列表** + - 每 10 秒通过 gRPC 调用 `ListFrontiers` 获取最新的 Frontier 列表 + - 对比当前连接池,识别新增和删除的 Frontier + +2. 
**动态连接管理** + - **新增 Frontier**: 自动创建连接并加入连接池 + - **删除 Frontier**: 关闭连接并从连接池移除 + - **连接漂移**: 当 Frontier 地址变化时,关闭旧连接,创建新连接 + +**时序图:** + +``` +Service Frontlas Frontier-1 Frontier-2 + | | | | + |--ListFrontiers>| | | + | | | | + |<--FrontierList-| | | + | | | | + |--Compare Pool->| | | + | | | | + |--New Frontier->| | | + | | | | + |----------------|-----------------|--Connect---->| + | | | | + |<----------------|-----------------|--Connected---| + | | | | + |--Old Frontier->| | | + | | | | + |----------------|-----------------|--Close------>| + | | | | +``` + +**流程:** + +1. Service 启动时连接 Frontlas(gRPC) +2. 调用 `ListFrontiers` 获取所有 Frontier 列表 +3. 为每个 Frontier 创建 `serviceEnd` 连接 +4. 定期(10秒)更新 Frontier 列表 +5. 对比差异: + - **新增**: 创建新连接 + - **删除**: 关闭旧连接,清理 EdgeID 映射 + - **变更**: 关闭旧连接,创建新连接 + +**关键代码:** + +```go +func (end *clusterServiceEnd) update() error { + // 获取最新 Frontier 列表 + rsp, err := end.cc.ListFrontiers(context.TODO(), &clusterv1.ListFrontiersRequest{}) + + // 对比当前连接池 + keeps := []string{} + removes := []*frontierNend{} + news := []*clusterv1.Frontier{} + + // 识别需要删除的 Frontier + end.frontiers.Range(func(key, value interface{}) bool { + // 如果不在新列表中,标记为删除 + if !foundInNewList { + removes = append(removes, frontierNend) + } + return true + }) + + // 识别新增的 Frontier + for _, frontier := range rsp.Frontiers { + if !foundInKeeps { + news = append(news, frontier) + } + } + + // 异步处理连接变更 + go func() { + // 关闭旧连接 + for _, remove := range removes { + remove.end.Close() + end.edgefrontiers.DelValue(remove.frontier.FrontierId) + } + // 创建新连接 + for _, new := range news { + serviceEnd, err := end.newServiceEnd(new.AdvertisedSbAddr) + end.frontiers.Swap(new.FrontierId, &frontierNend{ + frontier: new, + end: serviceEnd, + }) + } + }() +} +``` + +#### Edge 路由查找 + +当 Service 需要与特定 Edge 通信时,需要查找该 Edge 所在的 Frontier。 + +**查找机制:** + +1. **缓存查找**: 首先从 `edgefrontiers` 双向映射中查找 EdgeID 对应的 FrontierID +2. **Frontlas 查询**: 如果缓存未命中,调用 `GetFrontierByEdge` 查询 Frontlas +3. 
**连接获取**: 从连接池中获取对应的 Frontier 连接 +4. **连接创建**: 如果连接池中没有,动态创建连接 + +**时序图:** + +``` +Service Frontlas Redis Frontier + | | | | + |--Publish(edgeID)>| | | + | | | | + |--Check Cache-->| | | + | | | | + |<--Cache Miss----| | | + | | | | + |--GetFrontierByEdge>| | | + | | | | + | |--GetEdge------->| | + | | | | + | |<--Edge Info----| | + | | | | + | |--GetFrontier-->| | + | | | | + | |<--Frontier Info| | + | | | | + |<--Frontier Info-| | | + | | | | + |--Get Connection>| | | + | | | | + |----------------|----------------|--Publish---->| + | | | | +``` + +**关键代码:** + +```go +func (end *clusterServiceEnd) lookup(edgeID uint64) (string, *serviceEnd, error) { + // 1. 从缓存查找 + frontierID, ok := end.edgefrontiers.GetValue(edgeID) + if !ok { + // 2. 从 Frontlas 查询 + rsp, err := end.cc.GetFrontierByEdge(context.TODO(), &clusterv1.GetFrontierByEdgeIDRequest{ + EdgeId: edgeID, + }) + frontierID = rsp.Fontier.FrontierId + // 3. 更新缓存 + end.edgefrontiers.Set(edgeID, frontierID) + } + + // 4. 从连接池获取连接 + fe, ok := end.frontiers.Load(frontierID) + if !ok { + // 5. 动态创建连接 + serviceEnd, err := end.newServiceEnd(frontier.AdvertisedSbAddr) + end.frontiers.Swap(frontierID, &frontierNend{ + frontier: frontier, + end: serviceEnd, + }) + } + + return frontierID, serviceEnd, nil +} +``` + +#### 连接漂移处理 + +**场景:** + +1. **Frontier 实例重启**: IP 地址可能变化(K8s Pod) +2. **Frontier 实例迁移**: 节点故障导致 Pod 迁移 +3. **Frontier 配置变更**: 端口或地址配置变化 + +**处理机制:** + +1. **定期更新检测**: 每 10 秒更新 Frontier 列表,检测地址变化 +2. **连接对比**: 通过 `frontierEqual` 比较 FrontierID 和地址 +3. 
**优雅切换**: + - 先创建新连接 + - 再关闭旧连接 + - 使用 `Swap` 保证原子性 + +**关键代码:** + +```go +func frontierEqual(a, b *clusterv1.Frontier) bool { + return a.AdvertisedSbAddr == b.AdvertisedSbAddr && + a.FrontierId == b.FrontierId +} + +// 在 update() 中 +prev, ok := end.frontiers.Swap(new.FrontierId, &frontierNend{ + frontier: new, + end: serviceEnd, +}) +if ok { + // 关闭旧连接 + prev.(*frontierNend).end.Close() +} +``` + +#### Edge 连接管理 + +在集群模式下,Edge 直接连接到 Frontier 实例。Edge 可以连接到任意 Frontier,当连接失败时可以重试或切换到其他 Frontier。 + +**连接流程:** + +1. **Edge 初始连接** + - Edge 通过 Dialer 连接到指定的 Frontier 地址 + - Frontier 接受连接并分配 EdgeID + - Edge 上线处理 + +2. **Edge 上线通知** + - Frontier 在本地注册 Edge + - Frontier 通过 Exchange 通知 Service(如果 Service 在线) + - Frontier 向 Frontlas 报告 Edge 上线 + +3. **心跳续期** + - Edge 每 30 秒向 Frontier 发送心跳 + - Frontier 转发心跳到 Frontlas + - Frontlas 续期 Edge 的存活标记 + +4. **连接失败处理** + - Edge 连接失败时,根据配置决定是否重试 + - 使用 `NewRetryEdge` 时,会自动重连 + - 可以配置连接到不同的 Frontier 地址 + +**时序图:** + +``` +Edge Frontier-1 Exchange Frontlas Redis + | | | | | + |--Connect------->| | | | + | | | | | + | |--Allocate EdgeID>| | | + | | | | | + | |<--EdgeID--------| | | + | | | | | + |<--Connected-----| | | | + | | | | | + | |--EdgeOnline---->| | | + | | | | | + | | |--EdgeOnline---->| | + | | | | | + | | | |--SetEdgeAndAlive>| + | | | | | + | | | |<--Success-----| + | | | | | + | | |<--Success------| | + | | | | | + | |<--Success-------| | | + | | | | | + |--Heartbeat------| | | | + | | | | | + | |--EdgeHeartbeat->| | | + | | | | | + | | |--EdgeHeartbeat>| | + | | | | | + | | | |--ExpireEdge-->| + | | | | | + | | | |<--Success-----| + | | | | | + | | |<--Success------| | + | | | | | + | |<--Success-------| | | + | | | | | +``` + +**Edge 重连场景:** + +当 Edge 连接失败或 Frontier 故障时,Edge 可以重连到其他 Frontier。 + +**时序图:** + +``` +Edge Frontier-1 Frontier-2 Frontlas Redis + | | | | | + |--Connect------->| | | | + | | | | | + | |<--Connection Failed| | | + | | | | | + |--Retry Connect->| | | | + | | | | | + | |<--Connection Failed| | | + | | | 
| | + |--Connect--------|---------------->| | | + | | | | | + | | |--Allocate EdgeID>| | + | | | | | + | | |<--EdgeID--------| | + | | | | | + |<--Connected-----| | | | + | | | | | + | | |--EdgeOnline---->| | + | | | | | + | | | |--SetEdgeAndAlive>| + | | | | | + | | | |<--Success-----| + | | | | | + | | |<--Success------| | + | | | | | + | | |--Delete Old Edge>| | + | | | | | + | | | |--DeleteEdge-->| + | | | | | + | | | |<--Success-----| + | | | | | + | | |<--Success------| | +``` + +**Edge 下线流程:** + +**时序图:** + +``` +Edge Frontier Exchange Frontlas Redis + | | | | | + |--Disconnect---->| | | | + | | | | | + | |--EdgeOffline-->| | | + | | | | | + | | |--EdgeOffline-->| | + | | | | | + | | | |--DeleteEdge->| + | | | | | + | | | |<--Success-----| + | | | | | + | | |<--Success------| | + | | | | | + | |<--Success-------| | | + | | | | | + | |--Clean Local---->| | | + | | | | | +``` + +**关键机制:** + +1. **EdgeID 分配** + - Edge 连接时,Frontier 通过 Exchange 向 Service 请求 EdgeID + - 如果 Service 不在线,根据配置决定是否自动分配 EdgeID + - EdgeID 在集群中全局唯一 + +2. **连接选择** + - Edge 可以连接到任意 Frontier 实例 + - 通常通过负载均衡器或 DNS 选择 Frontier + - 支持配置多个 Frontier 地址进行重试 + +3. **状态同步** + - Edge 上线/下线时,Frontier 同步状态到 Frontlas + - Frontlas 更新 Redis 中的 Edge 元数据 + - Service 通过 Frontlas 查询 Edge 所在的 Frontier + +4. **心跳机制** + - Edge 每 30 秒发送心跳到 Frontier + - Frontier 转发心跳到 Frontlas + - Frontlas 续期 Redis 中的存活标记(TTL 30秒) + +**关键代码:** + +```go +// Edge 上线处理 +func (em *edgeManager) online(end geminio.End) error { + // 1. 检查是否存在旧连接 + old, ok := em.edges[end.ClientID()] + if ok { + oldend.Close() // 关闭旧连接 + } + + // 2. 添加到本地缓存 + em.edges[end.ClientID()] = end + + // 3. 创建数据库记录 + edge := &model.Edge{ + EdgeID: end.ClientID(), + Meta: string(end.Meta()), + Addr: end.RemoteAddr().String(), + } + em.repo.CreateEdge(edge) + + // 4. 
通知 Exchange + if em.exchange != nil { + em.exchange.EdgeOnline(end.ClientID(), end.Meta(), end.RemoteAddr()) + } +} + +// Edge 上线通知 Frontlas +func (fm *FrontierManager) EdgeOnline(ctx context.Context, req geminio.Request, rsp geminio.Response) { + edgeOnline := &gapis.EdgeOnline{} + json.Unmarshal(req.Data(), edgeOnline) + + // 更新 Redis + fm.repo.SetEdgeAndAlive(edgeOnline.EdgeID, &repo.Edge{ + FrontierID: edgeOnline.FrontierID, + Addr: edgeOnline.Addr, + }, edgeHeartbeatInterval) +} +``` + +### 水平扩展原理 + +#### Frontier 无状态设计 + +Frontier 实例是无状态的,所有状态信息存储在: +- **内存**: 当前连接的 Service 和 Edge 信息(重启后丢失) +- **Redis**: 通过 Frontlas 持久化的元数据 + +**无状态特性:** + +1. **无本地存储**: Frontier 不存储任何持久化数据 +2. **无会话绑定**: Service 和 Edge 可以连接到任意 Frontier 实例 +3. **动态路由**: 通过 Frontlas 查询 Edge 所在的 Frontier + +#### 水平扩展流程 + +**扩展步骤:** + +1. **添加 Frontier 实例** + - 新 Frontier 启动并连接 Frontlas + - 在 Redis 中注册 Frontier 信息(FrontierID、地址等) + - 设置存活标记(TTL 30秒) + +2. **Service 发现新 Frontier** + - Service 定期调用 `ListFrontiers` 获取最新列表 + - 检测到新 Frontier,自动创建连接 + - 新连接加入连接池 + +3. **Edge 连接分配** + - 新 Edge 可以连接到任意 Frontier 实例 + - 通过负载均衡或随机选择 + - Edge 信息记录到 Redis(关联 FrontierID) + +4. 
**流量自动分配** + - 新 Edge 的请求自动路由到对应的 Frontier + - Service 通过 `lookup` 查找 Edge 所在的 Frontier + - 实现负载均衡 + +**时序图:** + +``` +NewFrontier Frontlas Redis Service + | | | | + |--Connect------>| | | + | (geminio) | | | + | | | | + |--ConnOnline--->| | | + | (FrontierID, | | | + | Addr) | | | + | | | | + | |--SetFrontierAndAlive>| | + | | (Hash + TTL) | | + | | | | + | |<--Success------| | + | | | | + |<--Registered---| | | + | | | | + | |<--ListFrontiers| | + | | (gRPC) | | + | | | | + | |--GetAllFrontiers>| | + | | | | + | |<--FrontierList-| | + | | | | + | |--FrontierList->| | + | | | | + | | |--Compare Pool>| + | | | | + | | |--New Frontier>| + | | | | + | | |--Connect----->| + | | | (geminio) | + | | | | + | | |<--Connected---| +``` + +#### 负载均衡策略 + +**Edge 连接分配:** + +- **随机分配**: Edge 随机选择 Frontier 连接 +- **负载均衡**: 根据 Frontier 的 Edge 数量分配(需要额外实现) + +**Service 请求路由:** + +- **精确路由**: 通过 EdgeID 查找对应的 Frontier +- **缓存优化**: EdgeID -> FrontierID 映射缓存,减少 Frontlas 查询 + +#### 高可用性 + +**Frontier 故障处理:** + +1. **心跳检测**: Frontier 每 30 秒向 Frontlas 发送心跳 +2. **TTL 过期**: Redis 中的存活标记过期后,Frontier 被视为离线 +3. **自动清理**: Frontlas 清理过期的 Frontier 信息 +4. **连接重建**: Service 检测到 Frontier 离线,关闭连接并清理缓存 + +**Frontlas 高可用:** + +- **无状态设计**: Frontlas 实例无状态,可以部署多个 +- **Redis 高可用**: 使用 Redis Sentinel 或 Cluster 模式 +- **负载均衡**: 多个 Frontlas 实例通过负载均衡器提供服务 + +### Redis 数据模型 + +Frontlas 使用 Redis 存储 Frontier、Service 和 Edge 的元数据和存活信息。所有数据通过 TTL(Time To Live)机制管理生命周期。 + +#### Frontier 数据模型 + +**存储结构:** + +1. **元数据(Hash)** + - Key: `frontlas:frontiers:{frontierID}` + - Type: Hash + - Fields: + - `advertised_sb_addr`: Servicebound 地址 + - `advertised_eb_addr`: Edgebound 地址 + - `edge_count`: Edge 数量 + - `service_count`: Service 数量 + - TTL: 由配置的 `service_meta` 决定(默认 30 秒) + +2. 
**存活标记(String)** + - Key: `frontlas:alive:frontiers:{frontierID}` + - Type: String + - Value: `1` + - TTL: 30 秒(通过心跳续期) + +**示例:** + +``` +# Frontier 元数据 +frontlas:frontiers:frontier01 + advertised_sb_addr: "192.168.1.10:30011" + advertised_eb_addr: "192.168.1.10:30012" + edge_count: "100" + service_count: "5" + +# Frontier 存活标记 +frontlas:alive:frontiers:frontier01 = "1" (TTL: 30s) +``` + +**操作:** + +- **创建**: `SetFrontierAndAlive()` - 使用 Lua 脚本原子性创建元数据和存活标记 +- **更新**: `ExpireFrontier()` - 更新存活标记的 TTL +- **删除**: `DeleteFrontier()` - 删除存活标记,保留元数据(edge_count 设为 0) + +#### Service 数据模型 + +**存储结构:** + +1. **元数据(String/JSON)** + - Key: `frontlas:services:{serviceID}` + - Type: String + - Value: JSON 格式 + ```json + { + "service": "user-service", + "frontier_id": "frontier01", + "addr": "192.168.1.20:54321", + "update_time": 1234567890 + } + ``` + - TTL: 由配置的 `service_meta` 决定(默认 30 秒) + +2. **存活标记(String)** + - Key: `frontlas:alive:services:{serviceID}` + - Type: String + - Value: `1` + - TTL: 30 秒(通过心跳续期) + +**示例:** + +``` +# Service 元数据 +frontlas:services:12345 = '{"service":"user-service","frontier_id":"frontier01","addr":"192.168.1.20:54321","update_time":1234567890}' (TTL: 30s) + +# Service 存活标记 +frontlas:alive:services:12345 = "1" (TTL: 30s) +``` + +**操作:** + +- **创建**: `SetServiceAndAlive()` - 使用 Lua 脚本原子性创建元数据和存活标记,并更新 Frontier 的 service_count +- **更新**: `ExpireService()` - 更新元数据和存活标记的 TTL +- **删除**: `DeleteService()` - 使用 Lua 脚本删除存活标记,更新 Frontier 的 service_count + +#### Edge 数据模型 + +**存储结构:** + +1. **元数据(String/JSON)** + - Key: `frontlas:edges:{edgeID}` + - Type: String + - Value: JSON 格式 + ```json + { + "frontier_id": "frontier01", + "addr": "192.168.1.30:54322", + "update_time": 1234567890 + } + ``` + - TTL: 由配置的 `edge_meta` 决定(默认 30 秒) + +2. 
**存活标记(String)** + - Key: `frontlas:alive:edges:{edgeID}` + - Type: String + - Value: `1` + - TTL: 30 秒(通过心跳续期) + +**示例:** + +``` +# Edge 元数据 +frontlas:edges:67890 = '{"frontier_id":"frontier01","addr":"192.168.1.30:54322","update_time":1234567890}' (TTL: 30s) + +# Edge 存活标记 +frontlas:alive:edges:67890 = "1" (TTL: 30s) +``` + +**操作:** + +- **创建**: `SetEdgeAndAlive()` - 使用 Pipeline 原子性创建元数据和存活标记,并更新 Frontier 的 edge_count +- **更新**: `ExpireEdge()` - 更新元数据和存活标记的 TTL +- **删除**: `DeleteEdge()` - 使用 Lua 脚本删除存活标记,更新 Frontier 的 edge_count + +#### 数据关系 + +**Frontier <-> Service 关系:** + +- Service 元数据中包含 `frontier_id` 字段 +- Frontier 的 Hash 中包含 `service_count` 字段 +- 通过 `frontier_id` 可以查找 Service 所在的 Frontier + +**Frontier <-> Edge 关系:** + +- Edge 元数据中包含 `frontier_id` 字段 +- Frontier 的 Hash 中包含 `edge_count` 字段 +- 通过 `frontier_id` 可以查找 Edge 所在的 Frontier + +**查询路径:** + +``` +EdgeID -> frontlas:edges:{edgeID} -> frontier_id -> frontlas:frontiers:{frontierID} -> advertised_sb_addr +``` + +#### TTL 和心跳机制 + +**TTL 策略:** + +1. **元数据 TTL**: 配置项控制(默认 30 秒),用于清理长期不活跃的数据 +2. **存活标记 TTL**: 固定 30 秒,通过心跳续期 + +**心跳机制:** + +1. **Frontier 心跳**: 每 30 秒向 Frontlas 发送心跳,续期 `frontlas:alive:frontiers:{frontierID}` +2. **Service 心跳**: 每 30 秒向 Frontier 发送心跳,Frontier 转发给 Frontlas,续期 `frontlas:alive:services:{serviceID}` +3. **Edge 心跳**: 每 30 秒向 Frontier 发送心跳,Frontier 转发给 Frontlas,续期 `frontlas:alive:edges:{edgeID}` + +**过期处理:** + +- 当存活标记过期时,对应的资源被视为离线 +- Frontlas 会清理过期的存活标记 +- Service 通过定期查询检测到 Frontier 离线,自动清理连接 + +#### 数据一致性 + +**原子性操作:** + +- 使用 Redis Lua 脚本保证创建/删除操作的原子性 +- 使用 Pipeline 批量操作保证一致性 + +**关键 Lua 脚本:** + +1. **frontier_create.lua**: 创建 Frontier 元数据和存活标记 +2. **service_create.lua**: 创建 Service 元数据、存活标记,并更新 Frontier 计数 +3. **service_delete.lua**: 删除 Service 存活标记,并更新 Frontier 计数 +4. 
**edge_delete.lua**: 删除 Edge 存活标记,并更新 Frontier 计数 + +### CRD 部分原理 + +#### CRD 定义 + +Frontier 使用 Kubernetes Operator 模式,通过 CRD(Custom Resource Definition)定义集群配置。 + +**CRD 结构:** + +```yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: frontierclusters.frontier.singchia.io +spec: + group: frontier.singchia.io + names: + kind: FrontierCluster + plural: frontierclusters + scope: Namespaced + versions: + - name: v1alpha1 +``` + +**CRD Spec 定义:** + +```go +type FrontierClusterSpec struct { + Frontier Frontier `json:"frontier"` + Frontlas Frontlas `json:"frontlas"` +} + +type Frontier struct { + Replicas int `json:"replicas,omitempty"` + Servicebound Servicebound `json:"servicebound"` + Edgebound Edgebound `json:"edgebound"` + Image string `json:"image,omitempty"` + NodeAffinity corev1.NodeAffinity `json:"nodeAffinity,omitempty"` +} + +type Frontlas struct { + Replicas int `json:"replicas,omitempty"` + ControlPlane ControlPlane `json:"controlplane,omitempty"` + Redis Redis `json:"redis"` + Image string `json:"image,omitempty"` +} +``` + +#### Controller 工作原理 + +**Reconcile 循环:** + +Controller 通过 Reconcile 函数实现声明式配置管理。 + +**工作流程:** + +1. **监听 CRD 变更**: Controller 监听 `FrontierCluster` 资源的创建、更新、删除 +2. **Reconcile 触发**: 当 CRD 变更时,触发 Reconcile 函数 +3. **状态对比**: 对比期望状态(Spec)和实际状态(Status) +4. **资源创建/更新**: 创建或更新 Deployment、Service 等资源 +5. 
**状态更新**: 更新 CRD 的 Status 字段 + +**时序图:** + +``` +用户 K8s API Controller Deployment Service + | | | | | + |--Apply CRD---->| | | | + | | | | | + | |--Event-------->| | | + | | | | | + | | |--Reconcile-->| | + | | | | | + | | |--Ensure Service>| | + | | | | | + | | |--Create/Update>| | + | | | | | + | |<--Create Service| | | + | | | | | + | | |--Ensure Deployment>| | + | | | | | + | | |--Create/Update>| | + | | | | | + | |<--Create Deployment| | | + | | | | | + | | |--Check Ready->| | + | | | | | + | | |<--Ready--------| | + | | | | | + | | |--Update Status>| | + | | | | | + | |<--Status Update| | | +``` + +**关键代码:** + +```go +func (r *FrontierClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + // 1. 获取 CRD + frontiercluster := frontierv1alpha1.FrontierCluster{} + r.Get(ctx, req.NamespacedName, &frontiercluster) + + // 2. 确保 Service 存在 + r.ensureService(ctx, frontiercluster) + + // 3. 确保 TLS Secret 存在 + r.ensureTLS(ctx, frontiercluster) + + // 4. 确保 Deployment 存在 + ready, err := r.ensureDeployment(ctx, frontiercluster) + + // 5. 更新状态 + status.Update(ctx, r.client.Status(), &frontiercluster, statusOptions(). + withMessage(Info, "Good to go!"). + withRunningPhase()) +} +``` + +#### 资源管理 + +**Deployment 创建:** + +Controller 根据 CRD Spec 创建 Frontier 和 Frontlas 的 Deployment。 + +**关键配置:** + +1. **环境变量注入**: + - Frontier 地址和端口 + - Frontlas 地址 + - Redis 配置 + +2. **Pod 反亲和性**: + - 确保 Frontier Pod 分布在不同节点 + - 提高可用性 + +3. **资源限制**: + - 可配置 CPU 和内存限制 + +**关键代码:** + +```go +func (r *FrontierClusterReconciler) ensureFrontierDeployment(ctx context.Context, fc v1alpha1.FrontierCluster) error { + // 构建容器配置 + container := container.Builder(). + SetName("frontier"). + SetImage(image). 
+ SetEnvs([]corev1.EnvVar{ + {Name: FrontierServiceboundPortEnv, Value: strconv.Itoa(sbport)}, + {Name: FrontierEdgeboundPortEnv, Value: strconv.Itoa(ebport)}, + {Name: FrontlasAddrEnv, Value: frontlasAddr}, + }) + + // 构建 Pod 模板 + podTemplateSpec := podtemplatespec.Builder(). + SetPodAntiAffinity(&corev1.PodAntiAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{ + {TopologyKey: "kubernetes.io/hostname"}, + }, + }) + + // 创建 Deployment + deploy := deployment.Builder(). + SetReplicas(fc.FrontierReplicas()). + SetPodTemplateSpec(podTemplateSpec). + Build() + + deployment.CreateOrUpdate(ctx, r.client, deploy) +} +``` + +#### 自动扩缩容 + +**水平扩缩容:** + +1. **修改 Replicas**: 用户修改 CRD 中的 `replicas` 字段 +2. **Controller 检测**: Controller 检测到 Spec 变更 +3. **更新 Deployment**: 更新 Deployment 的 Replicas +4. **K8s 调度**: Kubernetes 自动创建或删除 Pod +5. **状态同步**: Controller 更新 CRD Status + +**垂直扩缩容:** + +- 通过修改 Deployment 的资源限制实现 +- 需要重启 Pod,影响较大 + +#### CRD Status 管理 + +**Status 字段:** + +```go +type FrontierClusterStatus struct { + Phase Phase `json:"phase"` // Running, Failed, Pending + Message string `json:"message"` // 状态描述 +} +``` + +**状态转换:** + +- **Pending**: Deployment 未就绪 +- **Running**: 所有 Deployment 就绪 +- **Failed**: 创建或更新资源失败 + +**关键代码:** + +```go +func (r *FrontierClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + // 检查 Deployment 就绪状态 + frontierIsReady := deployment.IsReady(currentFrontierDeployment, fc.FrontierReplicas()) + frontlasIsReady := deployment.IsReady(currentFrontlasDeployment, fc.FrontlasReplicas()) + + if !frontierIsReady || !frontlasIsReady { + // 更新为 Pending 状态 + status.Update(ctx, r.client.Status(), &frontiercluster, statusOptions(). + withMessage(Info, "Deployment is not yet ready"). + withPendingPhase(10)) + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil + } + + // 更新为 Running 状态 + status.Update(ctx, r.client.Status(), &frontiercluster, statusOptions(). 
+ withMessage(Info, "Good to go!"). + withRunningPhase()) +} +``` + +### 集群模式特性总结 + +1. **无状态设计** + - Frontier 和 Frontlas 都是无状态的 + - 状态信息存储在 Redis 中 + - 支持水平扩展 + +2. **自动发现和路由** + - Service 自动发现所有 Frontier 实例 + - 通过 Frontlas 查询 Edge 所在的 Frontier + - 支持连接漂移和自动重连 + +3. **高可用性** + - 多 Frontier 实例提供冗余 + - Frontlas 支持多实例部署 + - Redis 支持高可用模式 + +4. **声明式管理** + - 通过 CRD 声明集群配置 + - Controller 自动管理资源 + - 支持自动扩缩容 + +--- + +## 总结 + +Frontier 通过以下机制实现了高效的客户端管理和消息转发: + +1. **灵活的认证机制**:支持 Meta 信息传递和灵活的 ID 分配策略 +2. **可靠的上下线管理**:保证数据一致性和并发安全 +3. **高效的 Exchange 转发**:实现 Service 和 Edge 的解耦,支持消息、RPC 和 Stream 的透明转发 +4. **集群模式支持**:通过 Frontlas 实现多 Frontier 实例的协调管理,支持水平扩展和高可用 + +这些设计使得 Frontier 能够支持大规模、高并发的边缘计算场景。 diff --git a/etc/frontlas.yaml b/etc/frontlas.yaml index b563d0a..247a4cc 100644 --- a/etc/frontlas.yaml +++ b/etc/frontlas.yaml @@ -21,5 +21,5 @@ redis: mode: standalone standalone: network: tcp - addr: redis:6379 + addr: 127.0.0.1:6379 db: 0 diff --git a/go.mod b/go.mod index d11c471..197d85f 100644 --- a/go.mod +++ b/go.mod @@ -12,7 +12,7 @@ require ( github.com/nats-io/nats.go v1.33.1 github.com/nsqio/go-nsq v1.1.0 github.com/rabbitmq/amqp091-go v1.9.0 - github.com/singchia/geminio v1.2.2-rc.2 + github.com/singchia/geminio v1.2.3-rc.1 github.com/singchia/go-timer/v2 v2.2.1 github.com/singchia/joy4 v0.0.0-20240621074108-53a2b0132ec6 github.com/soheilhy/cmux v0.1.5 diff --git a/go.sum b/go.sum index 894cebb..3b25e44 100644 --- a/go.sum +++ b/go.sum @@ -146,8 +146,8 @@ github.com/redis/go-redis/v9 v9.5.5 h1:51VEyMF8eOO+NUHFm8fpg+IOc1xFuFOhxs3R+kPu1 github.com/redis/go-redis/v9 v9.5.5/go.mod h1:hdY0cQFCN4fnSYT6TkisLufl/4W5UIXyv0b/CLO2V2M= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= -github.com/singchia/geminio v1.2.2-rc.2 h1:3cAb2GrgxCp1tQwy0ZwLR2b0HsjwY4NfAMO28QEQm+M= -github.com/singchia/geminio v1.2.2-rc.2/go.mod 
h1:b6bld5o0aofg/kuAdc5uAnaTJYvd6YaJxYDtH9b+NzY= +github.com/singchia/geminio v1.2.3-rc.1 h1:UbMMsxNe5i7vUFNjKmtj0XryqUV9HNgRZ6HSPOrTvGg= +github.com/singchia/geminio v1.2.3-rc.1/go.mod h1:b6bld5o0aofg/kuAdc5uAnaTJYvd6YaJxYDtH9b+NzY= github.com/singchia/go-timer/v2 v2.0.3/go.mod h1:PgkEQc6io8slCUiT5rHzWKU4/P2HXHWk3WWfijZXAf4= github.com/singchia/go-timer/v2 v2.2.1 h1:gJucmL99fkuNzGk2AfNPFpa1X3/4+aGO21KkjFAG624= github.com/singchia/go-timer/v2 v2.2.1/go.mod h1:PgkEQc6io8slCUiT5rHzWKU4/P2HXHWk3WWfijZXAf4= diff --git a/images/Dockerfile.build b/images/Dockerfile.build index ddce684..9062380 100644 --- a/images/Dockerfile.build +++ b/images/Dockerfile.build @@ -15,7 +15,7 @@ ARG CGO_ENABLED=1 # Linux: Enable CGO (native build in Linux container) # Windows/macOS: Disable CGO (cross-compilation from Linux) ENV GO111MODULE=on \ - GOPROXY=https://goproxy.io,direct + GOPROXY=https://goproxy.cn,https://goproxy.io,https://proxy.golang.org,direct WORKDIR /build diff --git a/images/Dockerfile.controlplane-api b/images/Dockerfile.controlplane-api index d8c1574..0bdb849 100644 --- a/images/Dockerfile.controlplane-api +++ b/images/Dockerfile.controlplane-api @@ -4,7 +4,7 @@ FROM golang:1.18-alpine RUN apk add --no-cache curl unzip protoc protobuf-dev ENV GO111MODULE=on \ - GOPROXY=https://goproxy.io,direct + GOPROXY=https://goproxy.cn,https://goproxy.io,https://proxy.golang.org,direct RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@latest \ && go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest \ diff --git a/images/Dockerfile.example_iclm_service b/images/Dockerfile.example_iclm_service index f973b79..c4eee7e 100644 --- a/images/Dockerfile.example_iclm_service +++ b/images/Dockerfile.example_iclm_service @@ -6,7 +6,7 @@ ARG TARGETOS ARG TARGETARCH ENV GO111MODULE=on \ - GOPROXY=https://goproxy.io,direct + GOPROXY=https://goproxy.cn,https://goproxy.io,https://proxy.golang.org,direct WORKDIR /go/src/github.com/singchia/frontier RUN 
--mount=type=bind,readwrite,target=/go/src/github.com/singchia/frontier \ diff --git a/images/Dockerfile.frontier b/images/Dockerfile.frontier index 8ff94cf..ec7f091 100644 --- a/images/Dockerfile.frontier +++ b/images/Dockerfile.frontier @@ -6,7 +6,7 @@ ARG TARGETOS ARG TARGETARCH ENV GO111MODULE=on \ - GOPROXY=https://goproxy.io,direct + GOPROXY=https://goproxy.cn,https://goproxy.io,https://proxy.golang.org,direct WORKDIR /go/src/github.com/singchia/frontier RUN --mount=type=bind,readwrite,target=/go/src/github.com/singchia/frontier \ diff --git a/images/Dockerfile.frontlas b/images/Dockerfile.frontlas index ac3d720..6a29496 100644 --- a/images/Dockerfile.frontlas +++ b/images/Dockerfile.frontlas @@ -6,7 +6,7 @@ ARG TARGETOS ARG TARGETARCH ENV GO111MODULE=on \ - GOPROXY=https://goproxy.io,direct + GOPROXY=https://goproxy.cn,https://goproxy.io,https://proxy.golang.org,direct WORKDIR /go/src/github.com/singchia/frontier RUN --mount=type=bind,readwrite,target=/go/src/github.com/singchia/frontier \ diff --git a/pkg/frontier/exchange/exchange_test.go b/pkg/frontier/exchange/exchange_test.go new file mode 100644 index 0000000..af57620 --- /dev/null +++ b/pkg/frontier/exchange/exchange_test.go @@ -0,0 +1,320 @@ +package exchange + +import ( + "context" + "flag" + "io" + "net" + "testing" + "time" + + "github.com/jumboframes/armorigo/log" + "github.com/singchia/frontier/api/dataplane/v1/edge" + "github.com/singchia/frontier/api/dataplane/v1/service" + gconfig "github.com/singchia/frontier/pkg/config" + "github.com/singchia/frontier/pkg/frontier/config" + "github.com/singchia/frontier/pkg/frontier/edgebound" + "github.com/singchia/frontier/pkg/frontier/mq" + "github.com/singchia/frontier/pkg/frontier/repo" + "github.com/singchia/frontier/pkg/frontier/servicebound" + "github.com/singchia/geminio" + "github.com/singchia/go-timer/v2" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "k8s.io/klog/v2" +) + +func init() { + klog.InitFlags(nil) + 
flag.Set("v", "0") + flag.Set("logtostderr", "false") + flag.Set("stderrthreshold", "FATAL") + + log.SetLevel(log.LevelFatal) + log.SetOutput(io.Discard) +} + +const ( + testNetwork = "tcp" + edgeboundAddr = "127.0.0.1:13300" + serviceboundAddr = "127.0.0.1:13301" +) + +// exchangeHarness starts an in-process exchange + edgebound + servicebound. +type exchangeHarness struct { + eb interface{ Close() error } + sb interface{ Close() error } + r interface{ Close() error } + mqm interface{ Close() error } + tmr timer.Timer +} + +func newHarness(t *testing.T) *exchangeHarness { + t.Helper() + conf := &config.Configuration{ + Edgebound: config.Edgebound{ + Listen: gconfig.Listen{Network: testNetwork, Addr: edgeboundAddr}, + EdgeIDAllocWhenNoIDServiceOn: true, + }, + Servicebound: config.Servicebound{ + Listen: gconfig.Listen{Network: testNetwork, Addr: serviceboundAddr}, + }, + } + r, err := repo.NewRepo(conf) + require.NoError(t, err) + + mqm, err := mq.NewMQM(conf) + require.NoError(t, err) + + tmr := timer.NewTimer() + ex := NewExchange(conf, mqm) + + sb, err := servicebound.NewServicebound(conf, r, nil, ex, mqm, tmr) + require.NoError(t, err) + + eb, err := edgebound.NewEdgebound(conf, r, nil, ex, tmr) + require.NoError(t, err) + + go sb.Serve() + go eb.Serve() + time.Sleep(30 * time.Millisecond) + + h := &exchangeHarness{eb: eb, sb: sb, r: r, mqm: mqm, tmr: tmr} + t.Cleanup(func() { + eb.Close() + sb.Close() + r.Close() + mqm.Close() + tmr.Close() + }) + return h +} + +func edgeDial() edge.Dialer { + return func() (net.Conn, error) { return net.Dial(testNetwork, edgeboundAddr) } +} +func svcDial() service.Dialer { + return func() (net.Conn, error) { return net.Dial(testNetwork, serviceboundAddr) } +} + +// UNIT-EXCH-001: RPC from Edge forwarded to Service +func TestExchangeForwardRPCToService(t *testing.T) { + newHarness(t) + + svc, err := service.NewService(svcDial(), service.OptionServiceName("rpc-svc")) + require.NoError(t, err) + defer svc.Close() + 
require.NoError(t, svc.Register(context.TODO(), "echo", func(_ context.Context, req geminio.Request, resp geminio.Response) { + resp.SetData(req.Data()) + })) + time.Sleep(20 * time.Millisecond) + + e, err := edge.NewEdge(edgeDial()) + require.NoError(t, err) + defer e.Close() + + req := e.NewRequest([]byte("ping")) + resp, err := e.Call(context.TODO(), "echo", req) + require.NoError(t, err) + assert.Equal(t, []byte("ping"), resp.Data()) +} + +// UNIT-EXCH-002: Message from Edge forwarded to Service via topic +func TestExchangeForwardMessageToService(t *testing.T) { + newHarness(t) + + const topic = "news" + svc, err := service.NewService(svcDial(), + service.OptionServiceName("msg-svc"), + service.OptionServiceReceiveTopics([]string{topic}), + ) + require.NoError(t, err) + defer svc.Close() + + received := make(chan []byte, 1) + go func() { + msg, err := svc.Receive(context.TODO()) + if err == nil { + received <- msg.Data() + msg.Done() + } + }() + time.Sleep(20 * time.Millisecond) + + e, err := edge.NewEdge(edgeDial()) + require.NoError(t, err) + defer e.Close() + + msg := e.NewMessage([]byte("headline")) + require.NoError(t, e.Publish(context.TODO(), topic, msg)) + + select { + case data := <-received: + assert.Equal(t, []byte("headline"), data) + case <-time.After(3 * time.Second): + t.Fatal("timed out") + } +} + +// UNIT-EXCH-003: RPC from Service forwarded to specific Edge +func TestExchangeForwardRPCToEdge(t *testing.T) { + newHarness(t) + + e, err := edge.NewEdge(edgeDial()) + require.NoError(t, err) + defer e.Close() + require.NoError(t, e.Register(context.TODO(), "greet", func(_ context.Context, req geminio.Request, resp geminio.Response) { + resp.SetData([]byte("hello-from-edge")) + })) + time.Sleep(20 * time.Millisecond) + + svc, err := service.NewService(svcDial(), service.OptionServiceName("rpc-caller")) + require.NoError(t, err) + defer svc.Close() + + req := svc.NewRequest([]byte("")) + resp, err := svc.Call(context.TODO(), e.EdgeID(), "greet", req) 
+ require.NoError(t, err) + assert.Equal(t, []byte("hello-from-edge"), resp.Data()) +} + +// UNIT-EXCH-004: Message from Service delivered to specific Edge +func TestExchangeForwardMessageToEdge(t *testing.T) { + newHarness(t) + + e, err := edge.NewEdge(edgeDial()) + require.NoError(t, err) + defer e.Close() + + received := make(chan []byte, 1) + go func() { + msg, err := e.Receive(context.TODO()) + if err == nil { + received <- msg.Data() + msg.Done() + } + }() + time.Sleep(20 * time.Millisecond) + + svc, err := service.NewService(svcDial(), service.OptionServiceName("msg-pub")) + require.NoError(t, err) + defer svc.Close() + + msg := svc.NewMessage([]byte("push-to-edge")) + require.NoError(t, svc.Publish(context.TODO(), e.EdgeID(), msg)) + + select { + case data := <-received: + assert.Equal(t, []byte("push-to-edge"), data) + case <-time.After(3 * time.Second): + t.Fatal("timed out") + } +} + +// UNIT-EXCH-005: Stream opened from Edge transparently forwarded to Service +func TestExchangeStreamToService(t *testing.T) { + newHarness(t) + + accepted := make(chan geminio.Stream, 1) + svc, err := service.NewService(svcDial(), service.OptionServiceName("stream-svc")) + require.NoError(t, err) + defer svc.Close() + go func() { + if st, err := svc.AcceptStream(); err == nil { + accepted <- st + } + }() + time.Sleep(20 * time.Millisecond) + + e, err := edge.NewEdge(edgeDial()) + require.NoError(t, err) + defer e.Close() + + st, err := e.OpenStream("stream-svc") + require.NoError(t, err) + defer st.Close() + + select { + case serverSt := <-accepted: + assert.NotNil(t, serverSt) + serverSt.Close() + case <-time.After(3 * time.Second): + t.Fatal("timed out waiting for stream on service side") + } +} + +// UNIT-EXCH-006: Stream opened from Service transparently forwarded to Edge +func TestExchangeStreamToEdge(t *testing.T) { + newHarness(t) + + accepted := make(chan geminio.Stream, 1) + e, err := edge.NewEdge(edgeDial()) + require.NoError(t, err) + defer e.Close() + go func() 
{ + if st, err := e.AcceptStream(); err == nil { + accepted <- st + } + }() + time.Sleep(20 * time.Millisecond) + + svc, err := service.NewService(svcDial(), service.OptionServiceName("stream-opener")) + require.NoError(t, err) + defer svc.Close() + + st, err := svc.OpenStream(context.TODO(), e.EdgeID()) + require.NoError(t, err) + defer st.Close() + + select { + case edgeSt := <-accepted: + assert.NotNil(t, edgeSt) + edgeSt.Close() + case <-time.After(3 * time.Second): + t.Fatal("timed out waiting for stream on edge side") + } +} + +// UNIT-EXCH-007: Edge online/offline events are forwarded to Service via control RPCs +func TestExchangeEdgeOnlineOffline(t *testing.T) { + newHarness(t) + + onlineCh := make(chan uint64, 1) + offlineCh := make(chan uint64, 1) + + svc, err := service.NewService(svcDial(), service.OptionServiceName("event-watcher")) + require.NoError(t, err) + defer svc.Close() + + require.NoError(t, svc.RegisterEdgeOnline(context.TODO(), func(edgeID uint64, meta []byte, addr net.Addr) error { + onlineCh <- edgeID + return nil + })) + require.NoError(t, svc.RegisterEdgeOffline(context.TODO(), func(edgeID uint64, meta []byte, addr net.Addr) error { + offlineCh <- edgeID + return nil + })) + + time.Sleep(20 * time.Millisecond) + + // connect then disconnect an edge + e, err := edge.NewEdge(edgeDial()) + require.NoError(t, err) + edgeID := e.EdgeID() + + select { + case id := <-onlineCh: + assert.Equal(t, edgeID, id) + case <-time.After(3 * time.Second): + t.Fatal("timed out waiting for EdgeOnline event") + } + + e.Close() + + select { + case id := <-offlineCh: + assert.Equal(t, edgeID, id) + case <-time.After(3 * time.Second): + t.Fatal("timed out waiting for EdgeOffline event") + } +} diff --git a/test/TEST_PLAN.md b/test/TEST_PLAN.md new file mode 100644 index 0000000..8842133 --- /dev/null +++ b/test/TEST_PLAN.md @@ -0,0 +1,373 @@ +# Frontier 测试计划 + +**文档版本:** 1.1 +**创建日期:** 2026-04-01 +**测试执行:** Claude Code + +--- + +## 目录 + +1. 
[项目概述与测试范围](#一项目概述与测试范围) +2. [测试分类与编号规则](#二测试分类与编号规则) +3. [单元测试](#三单元测试) +4. [基准测试](#四基准测试) +5. [端到端测试](#五端到端测试) +6. [安全测试](#六安全测试) +7. [测试覆盖矩阵](#七测试覆盖矩阵) +8. [执行命令速查](#八执行命令速查) + +--- + +## 一、项目概述与测试范围 + +### 架构简述 + +Frontier 是一个面向边缘节点的反向代理与消息总线,核心数据流如下: + +``` +Edge (边缘节点) + │ TCP/TLS + ▼ +┌─────────────────────────────────┐ +│ Frontier │ +│ ┌───────────┐ ┌────────────┐ │ +│ │ Edgebound │ │Servicebound│ │ +│ └─────┬─────┘ └─────┬──────┘ │ +│ └──────┬────────┘ │ +│ ┌────▼────┐ │ +│ │Exchange │ │ +│ └─────────┘ │ +└─────────────────────────────────┘ + │ TCP/TLS + ▼ +Service (业务服务) +``` + +**核心能力(测试重点):** +- **Edgebound**:接受 Edge 接入,管理 Edge 连接生命周期 +- **Servicebound**:接受 Service 接入,管理 Service 注册与路由 +- **Exchange**:Edge ↔ Service 之间的 RPC 转发、消息转发、Stream 透传 +- **Repo(DAO)**:内存数据库(buntdb / sqlite)存储 Edge/Service 元数据 + +### 测试范围 + +| 包含 | 不包含 | +|------|--------| +| frontier 核心数据面(edgebound / servicebound / exchange) | frontlas(集群控制面) | +| Repo DAO(membuntdb / memsqlite) | Kubernetes Operator | +| 配置加载(config) | MQ 外部依赖集成(Kafka/NATS/NSQ 等) | +| 基准测试(bench / batch) | 控制面 REST/gRPC API | + +--- + +## 二、测试分类与编号规则 + +### 2.1 测试类别 + +| 类别编码 | 类别名称 | 测试工具 | 目录 | +|---------|---------|---------|------| +| UNIT | 单元测试 | `go test` | `pkg/frontier/...` | +| BENCH | 基准测试 | `go test -bench` / 独立二进制 | `test/bench/`, `test/batch/` | +| E2E | 端到端测试 | `go test` + 本地 frontier 实例 | `test/e2e/`(待创建)| +| SEC | 安全测试 | `go test -race` / `go test -fuzz` | `test/security/`(待创建)| + +### 2.2 编号规则 + +格式:`[类别]-[模块]-[序号]` + +| 缩写 | 模块 | +|------|------| +| EDGE | Edgebound | +| SVC | Servicebound | +| EXCH | Exchange | +| REPO | Repo/DAO | +| CONF | Config | +| CONN | 连接管理 | +| RPC | RPC 转发 | +| MSG | 消息转发 | +| STRM | Stream 透传 | + +--- + +## 三、单元测试 + +### 3.1 已有测试(`pkg/`) + +| 编号 | 测试名称 | 文件 | 验证点 | +|------|---------|------|--------| +| UNIT-CONF-001 | TestGenDefaultConfig | `pkg/frontier/config/config_test.go` | 默认配置序列化到 YAML | +| UNIT-CONF-002 | TestGenAllConfig | 
`pkg/frontier/config/config_test.go` | 完整配置序列化到 YAML | +| UNIT-EDGE-001 | TestEdgeManager | `pkg/frontier/edgebound/edge_manager_test.go` | Edge 接入→在线→断开完整流程 | +| UNIT-EDGE-002 | TestEdgeManagerStream | `pkg/frontier/edgebound/edge_dataplane_test.go` | Edge 批量创建 Stream(1000条) | +| UNIT-SVC-001 | TestServiceManager | `pkg/frontier/servicebound/service_manager_test.go` | Service 接入→在线→断开完整流程 | +| UNIT-REPO-001 | TestListEdges(buntdb) | `pkg/frontier/repo/dao/membuntdb/dao_edge_test.go` | Edge 列表按地址前缀/时间范围查询 | +| UNIT-REPO-002 | TestListEdgeRPCs(buntdb) | `pkg/frontier/repo/dao/membuntdb/dao_edge_test.go` | EdgeRPC 多条件查询 | +| UNIT-REPO-003 | TestListServices(buntdb) | `pkg/frontier/repo/dao/membuntdb/dao_service_test.go` | Service 列表查询及分页 | +| UNIT-REPO-004 | TestDeleteService(buntdb) | `pkg/frontier/repo/dao/membuntdb/dao_service_test.go` | Service 删除后数量校验 | +| UNIT-REPO-005 | TestListServiceRPCs(buntdb) | `pkg/frontier/repo/dao/membuntdb/dao_service_test.go` | ServiceRPC 按 ID/时间查询 | +| UNIT-REPO-006 | TestListServiceTopics(buntdb) | `pkg/frontier/repo/dao/membuntdb/dao_service_test.go` | ServiceTopic 多条件查询 | +| UNIT-REPO-007 | TestCreateEdge(sqlite) | `pkg/frontier/repo/dao/memsqlite/dao_edge_test.go` | Edge 写入 sqlite | +| UNIT-REPO-008 | TestCountEdges(sqlite) | `pkg/frontier/repo/dao/memsqlite/dao_edge_test.go` | 批量写入后计数校验(10000条)| +| UNIT-REPO-009 | BenchmarkCreateEdge(sqlite) | `pkg/frontier/repo/dao/memsqlite/dao_edge_test.go` | Edge 写入并发性能基线 | +| UNIT-REPO-010 | BenchmarkGetEdge(sqlite) | `pkg/frontier/repo/dao/memsqlite/dao_edge_test.go` | Edge 读取并发性能基线 | +| UNIT-REPO-011 | BenchmarkListEdges(sqlite) | `pkg/frontier/repo/dao/memsqlite/dao_edge_test.go` | 10万条数据分页查询性能 | +| UNIT-REPO-012 | TestListServices(sqlite) | `pkg/frontier/repo/dao/memsqlite/dao_service_test.go` | Service 与 RPC/Topic 联合查询 | + +### 3.2 待补充测试 + +| 编号 | 建议测试名称 | 目标文件 | 验证点 | +|------|------------|---------|--------| +| UNIT-EXCH-001 | TestExchangeForwardRPCToService | 
`pkg/frontier/exchange/` | RPC 从 Edge 转发到 Service 全流程 | +| UNIT-EXCH-002 | TestExchangeForwardMessageToService | `pkg/frontier/exchange/` | 消息从 Edge 转发到 Service 全流程 | +| UNIT-EXCH-003 | TestExchangeForwardRPCToEdge | `pkg/frontier/exchange/` | RPC 从 Service 转发到指定 Edge | +| UNIT-EXCH-004 | TestExchangeForwardMessageToEdge | `pkg/frontier/exchange/` | 消息从 Service 投递到指定 Edge | +| UNIT-EXCH-005 | TestExchangeStreamToService | `pkg/frontier/exchange/` | Stream 从 Edge 透传到 Service | +| UNIT-EXCH-006 | TestExchangeStreamToEdge | `pkg/frontier/exchange/` | Stream 从 Service 透传到 Edge | +| UNIT-EXCH-007 | TestExchangeEdgeOnlineOffline | `pkg/frontier/exchange/` | Edge 上下线事件通知 Service | +| UNIT-EDGE-003 | TestEdgeManagerMultiple | `pkg/frontier/edgebound/` | 多 Edge 并发接入,ID 分配唯一性 | +| UNIT-SVC-002 | TestServiceManagerRouting | `pkg/frontier/servicebound/` | 按 RPC/Topic/Name 查找 Service | + +--- + +## 四、基准测试 + +基准测试为**独立二进制**,需先启动一个本地 frontier 实例,再运行对应客户端程序。 + +### 4.1 已有基准测试 + +| 编号 | 测试方法 | 文件 | 场景 | +|------|---------|------|------| +| BENCH-CALL-001 | `BenchmarkEdgeCallService` | `test/bench/benchmark_test.go` | Edge 端通过 Frontier 调用 Service 的 RPC 的并发吞吐 (QPS) | +| BENCH-PUB-001 | `BenchmarkEdgePublishMessage` | `test/bench/benchmark_test.go` | Edge 端通过 Frontier 发布消息的并发吞吐 (QPS) | +| BENCH-OPEN-001 | `BenchmarkEdgeOpenStream` | `test/bench/benchmark_test.go` | Edge 端通过 Frontier 打开并关闭 Stream 的并发吞吐 (QPS) | +| BENCH-EDGE-001 | `edges` (独立二进制) | `test/batch/edges/edges.go` | 大规模边缘节点长连接模拟 | + +### 4.2 待补充基准测试 + +| 编号 | 建议测试名称 | 目录 | 场景 | +|------|------------|------|------| +| BENCH-CONN-001 | `BenchmarkConnect` | `test/bench/benchmark_test.go` | 测量每秒可接入 Edge 连接数(TPS) | +| BENCH-STRM-001 | `BenchmarkStreamTransfer` | `test/bench/benchmark_test.go` | Stream 双向数据传输带��测试 | + +### 4.3 基准测试执行方式 + +```bash +# 运行所有性能基准测试并打印内存分配情况 +go test -bench=. -benchmem -v ./test/bench/... + +# 运行特定模块基准测试并设定测试时间(如 10s) +go test -bench=BenchmarkEdgeCallService -benchtime=10s ./test/bench/... 
+ +# 大规模连接模拟(独立二进制) +cd test/batch/edges && make +./edges --address 127.0.0.1:30011 --count 10000 --nseconds 30 +``` + +--- + +## 五、端到端测试 + +E2E 测试在进程内启动 frontier(不依赖外部进程),验证 Edge → Frontier → Service 的完整链路。 + +### 5.1 测试目录结构(待创建) + +``` +test/e2e/ +├── main_test.go # TestMain:启动/停止嵌入式 frontier +├── helper.go # 公共 dialer、frontier 启动工具函数 +├── conn_test.go # 连接生命周期测试 +├── rpc_test.go # RPC 转发测试 +├── message_test.go # 消息转发测试 +└── stream_test.go # Stream 透传测试 +``` + +### 5.2 E2E 测试用例 + +#### 连接管理(CONN) + +| 编号 | 测试名称 | 验证点 | +|------|---------|--------| +| E2E-CONN-001 | TestEdgeConnect | Edge 成功接入 frontier,edgeID 非零 | +| E2E-CONN-002 | TestEdgeConnectAndClose | Edge 正常关闭,frontier 侧资源清理完毕 | +| E2E-CONN-003 | TestEdgeConnectWithMeta | Edge 携带 meta 接入,Service 侧通过 `EdgeOnline` 回调获得正确 meta | +| E2E-CONN-004 | TestMultiEdgeConnect | 100 个 Edge 并发接入,全部成功且 edgeID 唯一 | +| E2E-CONN-005 | TestServiceConnect | Service 成功接入并注册 RPC/Topic | +| E2E-CONN-006 | TestServiceConnectAndClose | Service 下线后,Edge 侧 RPC 调用返回 `ErrServiceNotOnline` | + +#### RPC 转发(RPC) + +| 编号 | 测试名称 | 验证点 | +|------|---------|--------| +| E2E-RPC-001 | TestEdgeCallService | Edge 通过 frontier 调用 Service 注册的 RPC,返回正确响应 | +| E2E-RPC-002 | TestServiceCallEdge | Service 通过 frontier 调用 Edge 注册的 RPC,指定 edgeID | +| E2E-RPC-003 | TestRPCEdgeIDCarry | Service 调用 Edge RPC 时,frontier 正确在 Custom 字段附加 edgeID | +| E2E-RPC-004 | TestRPCTargetEdgeOffline | 目标 Edge 已下线时,Service 调用返回 `ErrEdgeNotOnline` | +| E2E-RPC-005 | TestRPCTargetRPCNotFound | Service 调用不存在的 RPC 方法时,Edge 返回错误 | +| E2E-RPC-006 | TestRPCConcurrent | 10 个 Edge 同时调用 Service RPC,无错误,响应数据一致 | + +#### 消息转发(MSG) + +| 编号 | 测试名称 | 验证点 | +|------|---------|--------| +| E2E-MSG-001 | TestEdgePublishToService | Edge Publish 消息,Service 通过已注册 Topic 正确 Receive | +| E2E-MSG-002 | TestServicePublishToEdge | Service Publish 消息到指定 edgeID,Edge 正确 Receive | +| E2E-MSG-003 | TestMessageTopicRoute | 多个 Service 注册不同 Topic,Edge 消息按 Topic 路由到正确 Service | +| E2E-MSG-004 | 
TestMessageTopicNotFound | Edge 发布不存在 Topic 的消息,返回 `ErrTopicNotOnline` | +| E2E-MSG-005 | TestMessageConcurrent | 10 个 Edge 并发 Publish,消息不丢失,数量一致 | + +#### Stream 透传(STRM) + +| 编号 | 测试名称 | 验证点 | +|------|---------|--------| +| E2E-STRM-001 | TestEdgeOpenStreamToService | Edge OpenStream 到指定 Service,Service AcceptStream 收到 | +| E2E-STRM-002 | TestServiceOpenStreamToEdge | Service OpenStream 到指定 edgeID,Edge AcceptStream 收到 | +| E2E-STRM-003 | TestStreamRawDataForward | Stream 内 Raw IO 双向传输,数据内容完整一致 | +| E2E-STRM-004 | TestStreamMessageForward | Stream 内 Message 双向转发,数据内容完整一致 | +| E2E-STRM-005 | TestStreamRPCForward | Stream 内 RPC 双向调用,返回值正确 | +| E2E-STRM-006 | TestStreamClose | Stream 一端关闭,另一端收到 EOF,资源正确清理 | +| E2E-STRM-007 | TestStreamTargetEdgeOffline | 目标 Edge 不在线时,Service OpenStream 返回错误 | + +#### 资源管理(RES) + +| 编号 | 测试名称 | 验证点 | +|------|---------|--------| +| E2E-RES-001 | TestResourceCleanupOnEdgeClose | Edge 关闭后,Repo 中 Edge 及其 RPC 记录被删除 | +| E2E-RES-002 | TestResourceCleanupOnServiceClose | Service 关闭后,Repo 中 Service 及其 RPC/Topic 记录被删除 | +| E2E-RES-003 | TestGoroutineNoLeak | 100 次 Edge 接入/断开循环后,goroutine 数量回落到基线 | + +### 5.3 E2E 执行方式 + +```bash +# 运行所有 E2E 测试 +go test -v -timeout 5m ./test/e2e/ + +# 带竞态检测 +go test -race -v -timeout 5m ./test/e2e/ + +# 运行单个用例 +go test -v -run TestEdgeCallService ./test/e2e/ +``` + +--- + +## 六、安全测试 + +### 6.1 测试目录结构(待创建) + +``` +test/security/ +├── main_test.go +├── input_test.go # 输入合法性验证 +├── boundary_test.go # 边界值测试 +├── race_test.go # 并发竞态测试 +└── fuzz_test.go # 模糊测试 +``` + +### 6.2 安全测试用例 + +#### 输入合法性(INPUT) + +| 编号 | 测试名称 | 验证点 | +|------|---------|--------| +| SEC-INPUT-001 | TestLargePayloadRPC | RPC 请求携带 64MB payload,frontier 不崩溃,返回正常错误或正常响应 | +| SEC-INPUT-002 | TestEmptyPayloadRPC | RPC/消息 payload 为空(nil / 0字节),frontier 正常处理 | +| SEC-INPUT-003 | TestSpecialCharactersMeta | Edge meta 包含特殊字符(换行、空字节、Unicode),frontier 正常接受 | +| SEC-INPUT-004 | TestNilMessageData | Edge 发送 nil data 的消息,frontier 不 panic | + +#### 
边界值(BOUND) + +| 编号 | 测试名称 | 验证点 | +|------|---------|--------| +| SEC-BOUND-001 | TestMaxEdgeConnections | 超大量 Edge 并发接入(如 65535),系统不崩溃,超出限制时返回可预期错误 | +| SEC-BOUND-002 | TestMaxStreamsPerEdge | 单个 Edge 打开 10000 个 Stream,frontier 不崩溃,资源可释放 | +| SEC-BOUND-003 | TestEdgeIDOverflow | edgeID 为 0 / MaxUint64 等边界值,frontier 正确拒绝或处理 | + +#### 并发竞态(RACE) + +| 编号 | 测试名称 | 执行方式 | 验证点 | +|------|---------|---------|--------| +| SEC-RACE-001 | TestRaceEdgeConnectClose | `-race` | 并发 Connect 和 Close,无 data race | +| SEC-RACE-002 | TestRaceMultipleEdgeClose | `-race` | 同一 Edge 被多个 goroutine 并发 Close,无 panic / data race | +| SEC-RACE-003 | TestRaceServiceRegisterUnregister | `-race` | Service 并发注册/注销 RPC,无 data race | +| SEC-RACE-004 | TestRaceForwardAndClose | `-race` | Edge 正在转发 RPC 时同时 Close,无 panic | + +#### 模糊测试(FUZZ,Go 1.18+) + +| 编号 | 测试名称 | 验证点 | +|------|---------|--------| +| SEC-FUZZ-001 | FuzzEdgeMeta | 随机 meta 字节序列作为 Edge 接入 meta,frontier 不 panic | +| SEC-FUZZ-002 | FuzzRPCPayload | 随机 payload 通过 RPC 调用,frontier 不 panic | +| SEC-FUZZ-003 | FuzzMessagePayload | 随机 payload 通过 Publish 发送,frontier 不 panic | + +### 6.3 安全测试执行方式 + +```bash +# 带竞态检测运行安全测试 +go test -race -v ./test/security/ + +# 运行 fuzzing(至少跑 60 秒) +go test -fuzz=FuzzEdgeMeta -fuzztime=60s ./test/security/ +go test -fuzz=FuzzRPCPayload -fuzztime=60s ./test/security/ +go test -fuzz=FuzzMessagePayload -fuzztime=60s ./test/security/ +``` + +--- + +## 七、测试覆盖矩阵 + +| 功能模块 | 单元测试 | 基准测试 | E2E测试 | 安全测试 | +|---------|:-------:|:-------:|:------:|:-------:| +| Edgebound(连接接入)| ✅ 已有 | ✅ 已有 | 🔲 待建 | 🔲 待建 | +| Servicebound(连接接入)| ✅ 已有 | ✅ 已有 | 🔲 待建 | 🔲 待建 | +| Exchange RPC 转发 | 🔲 待建 | ✅ 已有 | 🔲 待建 | 🔲 待建 | +| Exchange 消息转发 | 🔲 待建 | ✅ 已有 | 🔲 待建 | 🔲 待建 | +| Exchange Stream 透传 | 🔲 待建 | ✅ 已有 | 🔲 待建 | 🔲 待建 | +| Exchange 上下线通知 | 🔲 待建 | — | 🔲 待建 | — | +| Repo / DAO(buntdb)| ✅ 已有 | — | — | — | +| Repo / DAO(sqlite)| ✅ 已有 | ✅ 已有 | — | — | +| Config 加载 | ✅ 已有 | — | — | — | +| 竞态安全 | — | — | — | 🔲 待建 | +| 边界/模糊 | — | — | — | 🔲 待建 | + 
+--- + +## 八、执行命令速查 + +```bash +# ── 单元测试 ────────────────────────────────────────────── +# 运行所有单元测试 +go test ./pkg/frontier/... + +# 带竞态检测 +go test -race ./pkg/frontier/... + +# 带覆盖率 +go test -coverprofile=coverage.out ./pkg/frontier/... +go tool cover -html=coverage.out -o coverage.html + +# ── 基准测试(go test bench 方式)─────────────── +go test -bench=. -benchmem -v ./test/bench/... + +# 大规模连接模拟 +cd test/batch/edges && make && ./edges --count 10000 --nseconds 30 + +# ── E2E 测试(待创建)──────────────────────────────────── +go test -v -timeout 5m ./test/e2e/ +go test -race -v -timeout 5m ./test/e2e/ + +# ── 安全测试(待创建)──────────────────────────────────── +go test -race -v ./test/security/ +go test -fuzz=FuzzEdgeMeta -fuzztime=60s ./test/security/ +``` + +--- + +## 附录 + +### 文档更新记录 + +| 日期 | 版本 | 修改内容 | +|-----|------|---------| +| 2026-04-01 | 1.0 | 初始版本 | +| 2026-04-01 | 1.1 | 去除 frontlas / Operator,聚焦 frontier 数据面;细化 E2E 和安全测试用例 | diff --git a/test/bench/benchmark_test.go b/test/bench/benchmark_test.go new file mode 100644 index 0000000..2593260 --- /dev/null +++ b/test/bench/benchmark_test.go @@ -0,0 +1,338 @@ +package bench + +import ( + "context" + "flag" + "fmt" + "io" + "net" + "sync/atomic" + "testing" + "time" + + "github.com/jumboframes/armorigo/log" + "github.com/singchia/frontier/api/dataplane/v1/edge" + "github.com/singchia/frontier/api/dataplane/v1/service" + gconfig "github.com/singchia/frontier/pkg/config" + "github.com/singchia/frontier/pkg/frontier/config" + "github.com/singchia/frontier/pkg/frontier/edgebound" + "github.com/singchia/frontier/pkg/frontier/exchange" + "github.com/singchia/frontier/pkg/frontier/mq" + "github.com/singchia/frontier/pkg/frontier/repo" + "github.com/singchia/frontier/pkg/frontier/servicebound" + "github.com/singchia/geminio" + "github.com/singchia/go-timer/v2" + "github.com/stretchr/testify/require" + "k8s.io/klog/v2" +) + +func init() { + // Set klog to only show fatal errors + klog.InitFlags(nil) + flag.Set("v", "0") + 
flag.Set("logtostderr", "false") + flag.Set("alsologtostderr", "false") + flag.Set("stderrthreshold", "FATAL") + + // Set armorigo log to only show fatal errors + log.SetLevel(log.LevelFatal) + log.SetOutput(io.Discard) +} + +var benchPortCounter int32 = 15000 + +// benchFrontier holds the in-process frontier addresses. +type benchFrontier struct { + edgeAddr string + svcAddr string +} + +// allocatePorts allocates two consecutive ports for a benchmark +func allocatePorts() (edgeAddr, svcAddr string) { + port := atomic.AddInt32(&benchPortCounter, 20) // Use 20-port spacing to avoid conflicts + edgeAddr = fmt.Sprintf("127.0.0.1:%d", port-19) + svcAddr = fmt.Sprintf("127.0.0.1:%d", port-18) + return +} + +// startFrontier spins up an in-process frontier and +// registers b.Cleanup to shut it down. +func startFrontier(b *testing.B) *benchFrontier { + b.Helper() + + edgeAddr, svcAddr := allocatePorts() + + conf := &config.Configuration{ + Edgebound: config.Edgebound{ + Listen: gconfig.Listen{Network: "tcp", Addr: edgeAddr}, + EdgeIDAllocWhenNoIDServiceOn: true, + }, + Servicebound: config.Servicebound{ + Listen: gconfig.Listen{Network: "tcp", Addr: svcAddr}, + }, + } + + r, err := repo.NewRepo(conf) + require.NoError(b, err) + mqm, err := mq.NewMQM(conf) + require.NoError(b, err) + tmr := timer.NewTimer() + ex := exchange.NewExchange(conf, mqm) + + sb, err := servicebound.NewServicebound(conf, r, nil, ex, mqm, tmr) + require.NoError(b, err) + eb, err := edgebound.NewEdgebound(conf, r, nil, ex, tmr) + require.NoError(b, err) + + go sb.Serve() + go eb.Serve() + time.Sleep(30 * time.Millisecond) + + b.Cleanup(func() { + eb.Close() + sb.Close() + r.Close() + mqm.Close() + tmr.Close() + }) + + return &benchFrontier{edgeAddr: edgeAddr, svcAddr: svcAddr} +} + +// dialEdge opens a new Edge connection and registers cleanup. 
+func (f *benchFrontier) dialEdge(b *testing.B, opts ...edge.EdgeOption) edge.Edge { + b.Helper() + dialer := func() (net.Conn, error) { return net.Dial("tcp", f.edgeAddr) } + e, err := edge.NewEdge(dialer, opts...) + require.NoError(b, err) + b.Cleanup(func() { e.Close() }) + return e +} + +// dialService opens a new Service connection and registers cleanup. +func (f *benchFrontier) dialService(b *testing.B, name string, opts ...service.ServiceOption) service.Service { + b.Helper() + dialer := func() (net.Conn, error) { return net.Dial("tcp", f.svcAddr) } + opts = append([]service.ServiceOption{service.OptionServiceName(name)}, opts...) + svc, err := service.NewService(dialer, opts...) + require.NoError(b, err) + b.Cleanup(func() { svc.Close() }) + return svc +} + +// BENCH-CALL-001: Edge → Frontier → Service RPC 吞吐 (QPS) +func BenchmarkEdgeCallService(b *testing.B) { + f := startFrontier(b) + + svc := f.dialService(b, "bench-rpc-svc") + require.NoError(b, svc.Register(context.TODO(), "echo", + func(_ context.Context, req geminio.Request, resp geminio.Response) { + resp.SetData(req.Data()) + }, + )) + time.Sleep(300 * time.Millisecond) + + // verify echo works before benchmark + e0 := f.dialEdge(b) + req0 := e0.NewRequest([]byte("test")) + _, err := e0.Call(context.TODO(), "echo", req0) + require.NoError(b, err, "pre-bench verification failed") + + payload := []byte("ping") + + // pre-create edges to avoid timing issues with RPC routing + const numWorkers = 10 + edges := make([]edge.Edge, numWorkers) + for i := 0; i < numWorkers; i++ { + edges[i] = f.dialEdge(b) + } + time.Sleep(100 * time.Millisecond) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + i := 0 + for pb.Next() { + e := edges[i%numWorkers] + i++ + req := e.NewRequest(payload) + if _, err := e.Call(context.TODO(), "echo", req); err != nil { + b.Error(err) + } + } + }) + b.StopTimer() + + qps := float64(b.N) / b.Elapsed().Seconds() + b.ReportMetric(qps, "qps") +} + +// BENCH-CALL-002: Service 
→ Frontier → Edge RPC 吞吐 (QPS) +func BenchmarkServiceCallEdge(b *testing.B) { + f := startFrontier(b) + + e := f.dialEdge(b) + require.NoError(b, e.Register(context.TODO(), "echo", + func(_ context.Context, req geminio.Request, resp geminio.Response) { + resp.SetData(req.Data()) + }, + )) + edgeID := e.EdgeID() + time.Sleep(300 * time.Millisecond) + + s0 := f.dialService(b, "bench-verify") + req0 := s0.NewRequest([]byte("test")) + _, err := s0.Call(context.TODO(), edgeID, "echo", req0) + require.NoError(b, err, "pre-bench verification failed") + + payload := []byte("pong") + + const numWorkers = 10 + svcs := make([]service.Service, numWorkers) + for i := 0; i < numWorkers; i++ { + svcs[i] = f.dialService(b, fmt.Sprintf("bench-caller-%d", i)) + } + time.Sleep(100 * time.Millisecond) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + i := 0 + for pb.Next() { + svc := svcs[i%numWorkers] + i++ + req := svc.NewRequest(payload) + if _, err := svc.Call(context.TODO(), edgeID, "echo", req); err != nil { + b.Error(err) + } + } + }) + b.StopTimer() + + qps := float64(b.N) / b.Elapsed().Seconds() + b.ReportMetric(qps, "qps") +} + +// BENCH-MSG-001: Edge → Frontier → Service 消息吞吐 (QPS) +func BenchmarkEdgePublishMessage(b *testing.B) { + f := startFrontier(b) + + svc := f.dialService(b, "bench-msg-svc", service.OptionServiceReceiveTopics([]string{"bench-topic"})) + go func() { + for { + msg, err := svc.Receive(context.TODO()) + if err != nil { + return + } + msg.Done() + } + }() + time.Sleep(300 * time.Millisecond) + + payload := []byte("message") + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + e := f.dialEdge(b) + for pb.Next() { + msg := e.NewMessage(payload) + e.Publish(context.TODO(), "bench-topic", msg) + } + }) + b.StopTimer() + + qps := float64(b.N) / b.Elapsed().Seconds() + b.ReportMetric(qps, "qps") +} + +// BENCH-STRM-001: Edge → Frontier → Service 流建立吞吐 (QPS) +// Note: This benchmark may occasionally panic in geminio when run repeatedly +// due to 
a race condition in stream cleanup. Run with -count=1 if issues occur. +func BenchmarkEdgeOpenStream(b *testing.B) { + f := startFrontier(b) + + svc := f.dialService(b, "bench-stream-svc") + go func() { + for { + st, err := svc.AcceptStream() + if err != nil { + return + } + go st.Close() + } + }() + time.Sleep(300 * time.Millisecond) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + e := f.dialEdge(b) + for pb.Next() { + st, err := e.OpenStream("bench-stream-svc") + if err != nil { + continue + } + st.Close() + } + }) + b.StopTimer() + + qps := float64(b.N) / b.Elapsed().Seconds() + b.ReportMetric(qps, "qps") +} + +// BENCH-CONN-001: Edge 连接建立与断开吞吐 (QPS / TPS) +func BenchmarkEdgeConnectDisconnect(b *testing.B) { + // Skip this in parallel runs because it exhausts ports + if !testing.Short() { + b.Skip("Skipping connect/disconnect benchmark in non-short mode to avoid port exhaustion") + } + + edgeAddr, svcAddr := allocatePorts() + + conf := &config.Configuration{ + Edgebound: config.Edgebound{ + Listen: gconfig.Listen{Network: "tcp", Addr: edgeAddr}, + EdgeIDAllocWhenNoIDServiceOn: true, + }, + Servicebound: config.Servicebound{ + Listen: gconfig.Listen{Network: "tcp", Addr: svcAddr}, + }, + } + + r, err := repo.NewRepo(conf) + require.NoError(b, err) + mqm, err := mq.NewMQM(conf) + require.NoError(b, err) + tmr := timer.NewTimer() + ex := exchange.NewExchange(conf, mqm) + + sb, err := servicebound.NewServicebound(conf, r, nil, ex, mqm, tmr) + require.NoError(b, err) + eb, err := edgebound.NewEdgebound(conf, r, nil, ex, tmr) + require.NoError(b, err) + + go sb.Serve() + go eb.Serve() + time.Sleep(30 * time.Millisecond) + + b.Cleanup(func() { + eb.Close() + sb.Close() + r.Close() + mqm.Close() + tmr.Close() + }) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + dialer := func() (net.Conn, error) { return net.Dial("tcp", edgeAddr) } + for pb.Next() { + e, err := edge.NewEdge(dialer) + if err != nil { + continue + } + e.Close() + } + }) + 
b.StopTimer() + + qps := float64(b.N) / b.Elapsed().Seconds() + b.ReportMetric(qps, "qps") +} diff --git a/test/e2e/conn_test.go b/test/e2e/conn_test.go new file mode 100644 index 0000000..9820898 --- /dev/null +++ b/test/e2e/conn_test.go @@ -0,0 +1,107 @@ +package e2e + +import ( + "context" + "net" + "sync" + "testing" + "time" + + "github.com/singchia/frontier/api/dataplane/v1/edge" + "github.com/singchia/frontier/api/dataplane/v1/service" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// E2E-CONN-001 +func TestEdgeConnect(t *testing.T) { + e := newEdge(t) + assert.NotZero(t, e.EdgeID()) +} + +// E2E-CONN-002 +func TestEdgeConnectAndClose(t *testing.T) { + done := make(chan struct{}) + e, err := edge.NewEdge(edgeDialer()) + require.NoError(t, err) + go func() { + defer close(done) + e.Close() + }() + waitTimeout(t, done, 3*time.Second) +} + +// E2E-CONN-003: Edge carries meta, Service receives it via EdgeOnline callback +func TestEdgeConnectWithMeta(t *testing.T) { + meta := []byte("hello-frontier") + gotMeta := make(chan []byte, 1) + + svc := newService(t, + service.OptionServiceName("meta-checker"), + service.OptionServiceReceiveTopics([]string{}), + ) + err := svc.RegisterEdgeOnline(context.Background(), func(edgeID uint64, m []byte, addr net.Addr) error { + gotMeta <- m + return nil + }) + require.NoError(t, err) + + time.Sleep(20 * time.Millisecond) + _ = newEdge(t, edge.OptionEdgeMeta(meta)) + + select { + case m := <-gotMeta: + assert.Equal(t, meta, m) + case <-time.After(3 * time.Second): + t.Fatal("timed out waiting for EdgeOnline callback") + } +} + +// E2E-CONN-004: 100 edges connect concurrently, all succeed with unique IDs +func TestMultiEdgeConnect(t *testing.T) { + const n = 100 + ids := make(chan uint64, n) + var wg sync.WaitGroup + wg.Add(n) + + for i := 0; i < n; i++ { + go func() { + defer wg.Done() + e := newEdge(t) + ids <- e.EdgeID() + }() + } + wg.Wait() + close(ids) + + seen := 
make(map[uint64]struct{}, n) + for id := range ids { + assert.NotZero(t, id) + _, dup := seen[id] + assert.False(t, dup, "duplicate edgeID: %d", id) + seen[id] = struct{}{} + } + assert.Len(t, seen, n) +} + +// E2E-CONN-005: Service connects and registers successfully +func TestServiceConnect(t *testing.T) { + svc := newService(t, service.OptionServiceName("my-service")) + assert.NotNil(t, svc) +} + +// E2E-CONN-006: After Service disconnects, Edge RPC call returns an error +func TestServiceConnectAndClose(t *testing.T) { + svc, err := service.NewService(serviceDialer(), + service.OptionServiceName("gone-service"), + ) + require.NoError(t, err) + svc.Close() + + time.Sleep(50 * time.Millisecond) + + e := newEdge(t) + req := e.NewRequest([]byte("ping")) + _, err = e.Call(context.Background(), "anything", req) + assert.Error(t, err) +} diff --git a/test/e2e/main_test.go b/test/e2e/main_test.go new file mode 100644 index 0000000..40410bf --- /dev/null +++ b/test/e2e/main_test.go @@ -0,0 +1,131 @@ +package e2e + +import ( + "flag" + "io" + "net" + "os" + "testing" + "time" + + "github.com/jumboframes/armorigo/log" + "github.com/singchia/frontier/api/dataplane/v1/edge" + "github.com/singchia/frontier/api/dataplane/v1/service" + gconfig "github.com/singchia/frontier/pkg/config" + "github.com/singchia/frontier/pkg/frontier/config" + "github.com/singchia/frontier/pkg/frontier/edgebound" + "github.com/singchia/frontier/pkg/frontier/exchange" + "github.com/singchia/frontier/pkg/frontier/mq" + "github.com/singchia/frontier/pkg/frontier/repo" + "github.com/singchia/frontier/pkg/frontier/servicebound" + "github.com/singchia/go-timer/v2" + "k8s.io/klog/v2" +) + +func init() { + klog.InitFlags(nil) + flag.Set("v", "0") + flag.Set("logtostderr", "false") + flag.Set("stderrthreshold", "FATAL") + + log.SetLevel(log.LevelFatal) + log.SetOutput(io.Discard) +} + +const ( + edgeboundAddr = "127.0.0.1:13100" + serviceboundAddr = "127.0.0.1:13101" + network = "tcp" +) + +// TestMain 
starts one shared frontier instance for the whole test binary. +func TestMain(m *testing.M) { + conf := &config.Configuration{ + Edgebound: config.Edgebound{ + Listen: gconfig.Listen{Network: network, Addr: edgeboundAddr}, + EdgeIDAllocWhenNoIDServiceOn: true, + }, + Servicebound: config.Servicebound{ + Listen: gconfig.Listen{Network: network, Addr: serviceboundAddr}, + }, + } + + r, err := repo.NewRepo(conf) + if err != nil { + panic("new repo: " + err.Error()) + } + mqm, err := mq.NewMQM(conf) + if err != nil { + panic("new mqm: " + err.Error()) + } + tmr := timer.NewTimer() + ex := exchange.NewExchange(conf, mqm) + + sb, err := servicebound.NewServicebound(conf, r, nil, ex, mqm, tmr) + if err != nil { + panic("new servicebound: " + err.Error()) + } + eb, err := edgebound.NewEdgebound(conf, r, nil, ex, tmr) + if err != nil { + panic("new edgebound: " + err.Error()) + } + + go sb.Serve() + go eb.Serve() + time.Sleep(30 * time.Millisecond) + + code := m.Run() + + eb.Close() + sb.Close() + r.Close() + mqm.Close() + tmr.Close() + os.Exit(code) +} + +// edgeDialer returns a Dialer that connects to the edgebound. +func edgeDialer() edge.Dialer { + return func() (net.Conn, error) { + return net.Dial(network, edgeboundAddr) + } +} + +// serviceDialer returns a Dialer that connects to the servicebound. +func serviceDialer() service.Dialer { + return func() (net.Conn, error) { + return net.Dial(network, serviceboundAddr) + } +} + +// newEdge creates an Edge connected to the shared test frontier. +func newEdge(t *testing.T, opts ...edge.EdgeOption) edge.Edge { + t.Helper() + e, err := edge.NewEdge(edgeDialer(), opts...) + if err != nil { + t.Fatalf("new edge: %v", err) + } + t.Cleanup(func() { e.Close() }) + return e +} + +// newService creates a Service connected to the shared test frontier. +func newService(t *testing.T, opts ...service.ServiceOption) service.Service { + t.Helper() + svc, err := service.NewService(serviceDialer(), opts...) 
+ if err != nil { + t.Fatalf("new service: %v", err) + } + t.Cleanup(func() { svc.Close() }) + return svc +} + +// waitTimeout waits for done to be closed, failing the test if deadline is exceeded. +func waitTimeout(t *testing.T, done <-chan struct{}, d time.Duration) { + t.Helper() + select { + case <-done: + case <-time.After(d): + t.Fatal("timed out waiting") + } +} diff --git a/test/e2e/message_test.go b/test/e2e/message_test.go new file mode 100644 index 0000000..1e43362 --- /dev/null +++ b/test/e2e/message_test.go @@ -0,0 +1,192 @@ +package e2e + +import ( + "context" + "sync" + "testing" + "time" + + "github.com/singchia/frontier/api/dataplane/v1/service" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// E2E-MSG-001: Edge publishes to a topic, Service registered on that topic receives it +func TestEdgePublishToService(t *testing.T) { + const topic = "news" + received := make(chan []byte, 1) + + svc := newService(t, + service.OptionServiceName("subscriber"), + service.OptionServiceReceiveTopics([]string{topic}), + ) + go func() { + msg, err := svc.Receive(context.TODO()) + if err == nil { + received <- msg.Data() + msg.Done() + } + }() + + time.Sleep(30 * time.Millisecond) + + e := newEdge(t) + payload := []byte("breaking-news") + msg := e.NewMessage(payload) + err := e.Publish(context.TODO(), topic, msg) + require.NoError(t, err) + + select { + case data := <-received: + assert.Equal(t, payload, data) + case <-time.After(3 * time.Second): + t.Fatal("timed out waiting for message") + } +} + +// E2E-MSG-002: Service publishes a message to a specific edgeID, Edge receives it +func TestServicePublishToEdge(t *testing.T) { + received := make(chan []byte, 1) + + e := newEdge(t) + go func() { + msg, err := e.Receive(context.TODO()) + if err == nil { + received <- msg.Data() + msg.Done() + } + }() + + time.Sleep(30 * time.Millisecond) + + svc := newService(t, service.OptionServiceName("publisher")) + payload := 
[]byte("hello-edge") + msg := svc.NewMessage(payload) + err := svc.Publish(context.TODO(), e.EdgeID(), msg) + require.NoError(t, err) + + select { + case data := <-received: + assert.Equal(t, payload, data) + case <-time.After(3 * time.Second): + t.Fatal("timed out waiting for message") + } +} + +// E2E-MSG-003: Multiple services on different topics; messages route correctly +func TestMessageTopicRoute(t *testing.T) { + topics := []string{"topic-a", "topic-b"} + receivedA := make(chan []byte, 1) + receivedB := make(chan []byte, 1) + + svcA := newService(t, + service.OptionServiceName("svc-a"), + service.OptionServiceReceiveTopics([]string{topics[0]}), + ) + svcB := newService(t, + service.OptionServiceName("svc-b"), + service.OptionServiceReceiveTopics([]string{topics[1]}), + ) + go func() { + if msg, err := svcA.Receive(context.TODO()); err == nil { + receivedA <- msg.Data() + msg.Done() + } + }() + go func() { + if msg, err := svcB.Receive(context.TODO()); err == nil { + receivedB <- msg.Data() + msg.Done() + } + }() + + time.Sleep(30 * time.Millisecond) + + e := newEdge(t) + msgA := e.NewMessage([]byte("for-a")) + msgB := e.NewMessage([]byte("for-b")) + require.NoError(t, e.Publish(context.TODO(), topics[0], msgA)) + require.NoError(t, e.Publish(context.TODO(), topics[1], msgB)) + + for _, ch := range []struct { + ch chan []byte + want string + timeout time.Duration + }{ + {receivedA, "for-a", 3 * time.Second}, + {receivedB, "for-b", 3 * time.Second}, + } { + select { + case data := <-ch.ch: + assert.Equal(t, []byte(ch.want), data) + case <-time.After(ch.timeout): + t.Fatalf("timed out waiting for message on topic") + } + } +} + +// E2E-MSG-004: Edge publishes to a topic with no subscriber => error +func TestMessageTopicNotFound(t *testing.T) { + e := newEdge(t) + msg := e.NewMessage([]byte("orphan")) + err := e.Publish(context.TODO(), "no-such-topic", msg) + assert.Error(t, err) +} + +// E2E-MSG-005: 10 edges publish concurrently, service receives all messages 
+func TestMessageConcurrent(t *testing.T) { + const ( + topic = "concurrent-topic" + workers = 10 + ) + var mu sync.Mutex + received := 0 + allDone := make(chan struct{}) + + svc := newService(t, + service.OptionServiceName("concurrent-sub"), + service.OptionServiceReceiveTopics([]string{topic}), + ) + go func() { + for { + msg, err := svc.Receive(context.TODO()) + if err != nil { + return + } + msg.Done() + mu.Lock() + received++ + if received == workers { + close(allDone) + } + mu.Unlock() + } + }() + + time.Sleep(30 * time.Millisecond) + + var wg sync.WaitGroup + wg.Add(workers) + for i := 0; i < workers; i++ { + go func() { + defer wg.Done() + e := newEdge(t) + msg := e.NewMessage([]byte("concurrent")) + if err := e.Publish(context.TODO(), topic, msg); err != nil { + t.Errorf("publish error: %v", err) + } + }() + } + wg.Wait() + + select { + case <-allDone: + mu.Lock() + assert.Equal(t, workers, received) + mu.Unlock() + case <-time.After(5 * time.Second): + mu.Lock() + t.Fatalf("timed out: only received %d/%d messages", received, workers) + mu.Unlock() + } +} diff --git a/test/e2e/resource_test.go b/test/e2e/resource_test.go new file mode 100644 index 0000000..38f5eaf --- /dev/null +++ b/test/e2e/resource_test.go @@ -0,0 +1,90 @@ +package e2e + +import ( + "context" + "runtime" + "testing" + "time" + + "github.com/singchia/frontier/api/dataplane/v1/edge" + "github.com/singchia/frontier/api/dataplane/v1/service" + "github.com/singchia/geminio" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// E2E-RES-001: After edge closes, frontier side resources are cleaned up (no panic/hang) +func TestResourceCleanupOnEdgeClose(t *testing.T) { + e, err := edge.NewEdge(edgeDialer()) + require.NoError(t, err) + + // open a few streams to ensure there is something to clean up + svc := newService(t, service.OptionServiceName("cleanup-sink")) + go func() { + for { + st, err := svc.AcceptStream() + if err != nil { + return + } + st.Close() + } 
+ }() + time.Sleep(20 * time.Millisecond) + + for i := 0; i < 5; i++ { + st, err := e.OpenStream("cleanup-sink") + if err == nil { + st.Close() + } + } + time.Sleep(20 * time.Millisecond) + + // close the edge — frontier must not panic or deadlock + e.Close() + time.Sleep(100 * time.Millisecond) +} + +// E2E-RES-002: After service closes, subsequent edge RPC calls return an error +func TestResourceCleanupOnServiceClose(t *testing.T) { + // start a service, register an RPC, then close it + svc, err := service.NewService(serviceDialer(), service.OptionServiceName("gone-svc")) + require.NoError(t, err) + err = svc.Register(context.TODO(), "probe", func(_ context.Context, req geminio.Request, resp geminio.Response) { + resp.SetData(req.Data()) + }) + require.NoError(t, err) + time.Sleep(20 * time.Millisecond) + + svc.Close() + time.Sleep(50 * time.Millisecond) + + // now an edge should get an error calling the gone service + e := newEdge(t) + req := e.NewRequest([]byte("hello")) + _, err = e.Call(context.TODO(), "probe", req) + assert.Error(t, err, "expected error after service closed") +} + +// E2E-RES-003: goroutine count does not grow unboundedly after repeated edge connect/close +func TestGoroutineNoLeak(t *testing.T) { + // let the frontier settle, then record baseline + runtime.GC() + time.Sleep(100 * time.Millisecond) + baseline := runtime.NumGoroutine() + + const iterations = 30 + for i := 0; i < iterations; i++ { + e, err := edge.NewEdge(edgeDialer()) + require.NoError(t, err) + e.Close() + } + + // allow goroutines to wind down + time.Sleep(500 * time.Millisecond) + runtime.GC() + + after := runtime.NumGoroutine() + // Leak threshold: must not grow by more than iterations goroutines above baseline + assert.Less(t, after, baseline+iterations, + "possible goroutine leak: baseline=%d after=%d", baseline, after) +} diff --git a/test/e2e/rpc_test.go b/test/e2e/rpc_test.go new file mode 100644 index 0000000..9e41489 --- /dev/null +++ b/test/e2e/rpc_test.go @@ -0,0 
+1,125 @@ +package e2e + +import ( + "context" + "sync" + "testing" + "time" + + "github.com/singchia/frontier/api/dataplane/v1/edge" + "github.com/singchia/frontier/api/dataplane/v1/service" + "github.com/singchia/geminio" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// E2E-RPC-001: Edge calls a method registered by Service via frontier +func TestEdgeCallService(t *testing.T) { + svc := newService(t, service.OptionServiceName("echo-service")) + err := svc.Register(context.TODO(), "echo", func(ctx context.Context, req geminio.Request, resp geminio.Response) { + resp.SetData(req.Data()) + }) + require.NoError(t, err) + + // give servicebound time to index the RPC + time.Sleep(30 * time.Millisecond) + + e := newEdge(t) + payload := []byte("hello") + req := e.NewRequest(payload) + resp, err := e.Call(context.TODO(), "echo", req) + require.NoError(t, err) + assert.Equal(t, payload, resp.Data()) +} + +// E2E-RPC-002: Service calls a method registered by Edge via frontier (specifying edgeID) +func TestServiceCallEdge(t *testing.T) { + e := newEdge(t) + err := e.Register(context.TODO(), "ping", func(ctx context.Context, req geminio.Request, resp geminio.Response) { + resp.SetData([]byte("pong")) + }) + require.NoError(t, err) + + time.Sleep(30 * time.Millisecond) + + svc := newService(t, service.OptionServiceName("caller")) + req := svc.NewRequest([]byte("")) + resp, err := svc.Call(context.TODO(), e.EdgeID(), "ping", req) + require.NoError(t, err) + assert.Equal(t, []byte("pong"), resp.Data()) +} + +// E2E-RPC-003: RPC not found on edge returns an error (no matching RPC registered) +func TestRPCTargetRPCNotFound(t *testing.T) { + svc := newService(t, service.OptionServiceName("noop-service")) + // register a method so the service itself is reachable + _ = svc.Register(context.TODO(), "placeholder", func(_ context.Context, req geminio.Request, resp geminio.Response) {}) + + time.Sleep(30 * time.Millisecond) + + e := newEdge(t) + // 
call a method the edge never registered + req := e.NewRequest([]byte("x")) + _, err := e.Call(context.TODO(), "nonexistent-method", req) + assert.Error(t, err) +} + +// E2E-RPC-004: Service calls edge that is already offline => ErrEdgeNotOnline +func TestRPCTargetEdgeOffline(t *testing.T) { + // create an edge then close it immediately (without t.Cleanup so we control timing) + offlineEdge, err := edge.NewEdge(edgeDialer()) + require.NoError(t, err) + offlineID := offlineEdge.EdgeID() + offlineEdge.Close() + + time.Sleep(50 * time.Millisecond) + + svc := newService(t, service.OptionServiceName("caller2")) + req := svc.NewRequest([]byte("data")) + _, err = svc.Call(context.TODO(), offlineID, "any-method", req) + assert.Error(t, err) +} + +// E2E-RPC-005: 10 edges concurrently call the same Service RPC, all succeed +func TestRPCConcurrent(t *testing.T) { + svc := newService(t, service.OptionServiceName("concurrent-echo")) + err := svc.Register(context.TODO(), "echo", func(ctx context.Context, req geminio.Request, resp geminio.Response) { + resp.SetData(req.Data()) + }) + require.NoError(t, err) + + // create all edges first and wait for them to be indexed before calling + const n = 10 + edges := make([]edge.Edge, n) + for i := 0; i < n; i++ { + edges[i] = newEdge(t) + } + // give frontier time to propagate the RPC registration to all edges + time.Sleep(100 * time.Millisecond) + + var wg sync.WaitGroup + wg.Add(n) + errs := make(chan error, n) + + for i := 0; i < n; i++ { + e := edges[i] + go func() { + defer wg.Done() + payload := []byte("concurrent") + req := e.NewRequest(payload) + resp, err := e.Call(context.TODO(), "echo", req) + if err != nil { + errs <- err + return + } + if string(resp.Data()) != string(payload) { + errs <- assert.AnError + } + }() + } + wg.Wait() + close(errs) + for err := range errs { + assert.NoError(t, err) + } +} diff --git a/test/e2e/stream_test.go b/test/e2e/stream_test.go new file mode 100644 index 0000000..a00139c --- /dev/null +++ 
b/test/e2e/stream_test.go @@ -0,0 +1,259 @@ +package e2e + +import ( + "context" + "testing" + "time" + + "github.com/singchia/frontier/api/dataplane/v1/edge" + "github.com/singchia/frontier/api/dataplane/v1/service" + "github.com/singchia/geminio" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// E2E-STRM-001: Edge opens a stream to Service, Service accepts it +func TestEdgeOpenStreamToService(t *testing.T) { + accepted := make(chan geminio.Stream, 1) + svc := newService(t, service.OptionServiceName("stream-service")) + go func() { + st, err := svc.AcceptStream() + if err == nil { + accepted <- st + } + }() + + time.Sleep(30 * time.Millisecond) + + e := newEdge(t) + st, err := e.OpenStream("stream-service") + require.NoError(t, err) + t.Cleanup(func() { st.Close() }) + + select { + case serverSt := <-accepted: + assert.NotNil(t, serverSt) + serverSt.Close() + case <-time.After(3 * time.Second): + t.Fatal("timed out waiting for AcceptStream") + } +} + +// E2E-STRM-002: Service opens a stream to Edge, Edge accepts it +func TestServiceOpenStreamToEdge(t *testing.T) { + accepted := make(chan geminio.Stream, 1) + e := newEdge(t) + go func() { + st, err := e.AcceptStream() + if err == nil { + accepted <- st + } + }() + + time.Sleep(30 * time.Millisecond) + + svc := newService(t, service.OptionServiceName("stream-opener")) + st, err := svc.OpenStream(context.TODO(), e.EdgeID()) + require.NoError(t, err) + t.Cleanup(func() { st.Close() }) + + select { + case edgeSt := <-accepted: + assert.NotNil(t, edgeSt) + edgeSt.Close() + case <-time.After(3 * time.Second): + t.Fatal("timed out waiting for AcceptStream on edge") + } +} + +// E2E-STRM-003: Raw IO forwarded bidirectionally through the stream +func TestStreamRawDataForward(t *testing.T) { + serverRead := make(chan []byte, 1) + clientRead := make(chan []byte, 1) + + svc := newService(t, service.OptionServiceName("raw-echo")) + go func() { + st, err := svc.AcceptStream() + if err != nil 
{ + return + } + defer st.Close() + buf := make([]byte, 64) + n, _ := st.Read(buf) + serverRead <- buf[:n] + st.Write([]byte("server-reply")) + }() + + time.Sleep(30 * time.Millisecond) + + e := newEdge(t) + st, err := e.OpenStream("raw-echo") + require.NoError(t, err) + defer st.Close() + + _, err = st.Write([]byte("client-hello")) + require.NoError(t, err) + + buf := make([]byte, 64) + go func() { + n, _ := st.Read(buf) + clientRead <- buf[:n] + }() + + select { + case data := <-serverRead: + assert.Equal(t, []byte("client-hello"), data) + case <-time.After(3 * time.Second): + t.Fatal("timed out waiting for server read") + } + select { + case data := <-clientRead: + assert.Equal(t, []byte("server-reply"), data) + case <-time.After(3 * time.Second): + t.Fatal("timed out waiting for client read") + } +} + +// E2E-STRM-004: Message forwarded bidirectionally inside a stream +func TestStreamMessageForward(t *testing.T) { + const streamTopic = "stream-topic" + serverReceived := make(chan []byte, 1) + clientReceived := make(chan []byte, 1) + + svc := newService(t, service.OptionServiceName("msg-echo")) + go func() { + st, err := svc.AcceptStream() + if err != nil { + return + } + defer st.Close() + // receive from edge + msg, err := st.Receive(context.TODO()) + if err != nil { + return + } + serverReceived <- msg.Data() + msg.Done() + // reply back + reply := st.NewMessage([]byte("svc-msg-reply")) + _ = st.Publish(context.TODO(), reply) + }() + + time.Sleep(30 * time.Millisecond) + + e := newEdge(t) + st, err := e.OpenStream("msg-echo") + require.NoError(t, err) + defer st.Close() + + go func() { + msg, err := st.Receive(context.TODO()) + if err == nil { + clientReceived <- msg.Data() + msg.Done() + } + }() + + edgeMsg := st.NewMessage([]byte("edge-msg")) + err = st.Publish(context.TODO(), edgeMsg) + require.NoError(t, err) + + select { + case data := <-serverReceived: + assert.Equal(t, []byte("edge-msg"), data) + case <-time.After(3 * time.Second): + t.Fatal("timed out 
waiting for server message") + } + select { + case data := <-clientReceived: + assert.Equal(t, []byte("svc-msg-reply"), data) + case <-time.After(3 * time.Second): + t.Fatal("timed out waiting for client message") + } +} + +// E2E-STRM-005: RPC forwarded bidirectionally inside a stream +func TestStreamRPCForward(t *testing.T) { + svc := newService(t, service.OptionServiceName("rpc-echo")) + go func() { + st, err := svc.AcceptStream() + if err != nil { + return + } + defer st.Close() + _ = st.Register(context.TODO(), "echo", func(_ context.Context, req geminio.Request, resp geminio.Response) { + resp.SetData(req.Data()) + }) + // keep the stream alive while the test runs + time.Sleep(3 * time.Second) + }() + + time.Sleep(30 * time.Millisecond) + + e := newEdge(t) + st, err := e.OpenStream("rpc-echo") + require.NoError(t, err) + defer st.Close() + + time.Sleep(30 * time.Millisecond) + + payload := []byte("rpc-payload") + req := st.NewRequest(payload) + resp, err := st.Call(context.TODO(), "echo", req) + require.NoError(t, err) + assert.Equal(t, payload, resp.Data()) +} + +// E2E-STRM-006: Stream Close does not panic and can be called multiple times safely. +func TestStreamClose(t *testing.T) { + svc := newService(t, service.OptionServiceName("close-test")) + go func() { + for { + st, err := svc.AcceptStream() + if err != nil { + return + } + st.Close() + } + }() + + time.Sleep(30 * time.Millisecond) + + e := newEdge(t) + st, err := e.OpenStream("close-test") + require.NoError(t, err) + + // Close must not panic, even when called multiple times + assert.NotPanics(t, func() { st.Close() }) + assert.NotPanics(t, func() { st.Close() }) +} + +// E2E-STRM-007: Service opens a stream to an offline edge; the stream is returned +// but immediately closed by frontier (edge not found), so subsequent IO fails. 
+func TestStreamTargetEdgeOffline(t *testing.T) { + offlineEdge, err := edge.NewEdge(edgeDialer()) + require.NoError(t, err) + offlineID := offlineEdge.EdgeID() + offlineEdge.Close() + + time.Sleep(50 * time.Millisecond) + + svc := newService(t, service.OptionServiceName("opener")) + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + st, err := svc.OpenStream(ctx, offlineID) + // frontier may return an error immediately, or return a stream that is + // already closed — either way IO must fail. + if err != nil { + return // expected: direct error + } + defer st.Close() + // if a stream was returned, a write or receive must fail + _, writeErr := st.Write([]byte("probe")) + recvCtx, recvCancel := context.WithTimeout(context.Background(), 2*time.Second) + defer recvCancel() + _, recvErr := st.Receive(recvCtx) + if writeErr == nil && recvErr == nil { + t.Error("expected IO on stream to dead edge to fail, but both succeeded") + } +} diff --git a/test/run_tests.sh b/test/run_tests.sh new file mode 100755 index 0000000..39ec3ce --- /dev/null +++ b/test/run_tests.sh @@ -0,0 +1,274 @@ +#!/bin/bash + +# Frontier Comprehensive Test Suite Runner — see `test/run_tests.sh -h` + +# Benchmark -benchtime defaults +: "${BENCH_TIME_EACH:=3s}" +: "${BENCH_TIME_ALL:=10s}" +# go test -timeout: applies to the whole process (default: ~10m, too short for bench=. + long benchtime) +: "${BENCH_GO_TEST_TIMEOUT:=30m}" + +OUTPUT_FILE="" +run_unit=false +run_bench=false +run_e2e=false +run_security=false +run_race=false +run_cover=false +any_category=false + +while [[ $# -gt 0 ]]; do + case "$1" in + -o|--output) + if [[ -z "${2:-}" ]]; then + echo "Error: $1 requires a file path" >&2 + exit 1 + fi + OUTPUT_FILE="$2" + shift 2 + ;; + -h|--help) + cat <<'EOF' +Usage: test/run_tests.sh [options] [--category ...] 

Options:
  -o, --output FILE   Write full output to FILE (overwrite; also shown on terminal)
  -h, --help          Show this help

Categories (combine multiple; omit all to run everything):
  --unit       Unit tests (exchange): go test ./... -short
  --bench      Benchmarks under test/bench
  --e2e        End-to-end tests under test/e2e
  --security   Security tests under test/security (race, fuzz)
  --race       Race detector on unit tests
  --cover      Coverage (coverage.out, coverage.html)
  --all        Explicitly run all categories

Examples:
  test/run_tests.sh --unit
  test/run_tests.sh --e2e --security
  test/run_tests.sh -o run.log --bench

Env (benchmark section only):
  BENCH_TIME_EACH        Per-benchmark -benchtime (default: 3s)
  BENCH_TIME_ALL         Final bench=. -benchtime (default: 10s)
  BENCH_GO_TEST_TIMEOUT  go test -timeout for benchmarks (default: 30m; 0 = no limit)
EOF
      exit 0
      ;;
    --unit)
      run_unit=true
      any_category=true
      shift
      ;;
    --bench)
      run_bench=true
      any_category=true
      shift
      ;;
    --e2e)
      run_e2e=true
      any_category=true
      shift
      ;;
    --security)
      run_security=true
      any_category=true
      shift
      ;;
    --race)
      run_race=true
      any_category=true
      shift
      ;;
    --cover|--coverage)
      run_cover=true
      any_category=true
      shift
      ;;
    --all)
      run_unit=true
      run_bench=true
      run_e2e=true
      run_security=true
      run_race=true
      run_cover=true
      any_category=true
      shift
      ;;
    *)
      echo "Unknown option: $1 (try -h)" >&2
      exit 1
      ;;
  esac
done

# No explicit category means "run everything".
if ! $any_category; then
  run_unit=true
  run_bench=true
  run_e2e=true
  run_security=true
  run_race=true
  run_cover=true
fi

# Resolve relative log path to invocation cwd (before cd to project root)
if [[ -n "$OUTPUT_FILE" ]]; then
  if [[ "$OUTPUT_FILE" != /* ]]; then
    OUTPUT_FILE="$(pwd)/$OUTPUT_FILE"
  fi
  mkdir -p "$(dirname "$OUTPUT_FILE")"
  exec > >(tee "$OUTPUT_FILE") 2>&1
fi

set -e

echo "==================================="
echo "Frontier Comprehensive Test Suite"
echo "==================================="
if [[ -n "$OUTPUT_FILE" ]]; then
  echo "Full output also logged to: $OUTPUT_FILE"
fi
echo "Categories:"
$run_unit && echo "  - unit"
$run_bench && echo "  - bench"
$run_e2e && echo "  - e2e"
$run_security && echo "  - security (race, fuzz)"
$run_race && echo "  - race"
$run_cover && echo "  - cover"
echo ""

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Function to print section headers
print_header() {
  echo ""
  echo -e "${YELLOW}===================================${NC}"
  echo -e "${YELLOW}$1${NC}"
  echo -e "${YELLOW}===================================${NC}"
  echo ""
}

# Change to project root
cd "$(dirname "$0")/.."

# Install dependencies
echo "Installing dependencies..."
go mod download

if $run_unit; then
  print_header "Running Unit Tests (Exchange)"
  go test -v ./pkg/frontier/exchange/... -short -count=1 2>&1 | tail -50 || true
fi

if $run_bench; then
  print_header "BENCHMARK TESTS"

  echo "Running RPC Call Benchmarks..."
  go test -timeout="${BENCH_GO_TEST_TIMEOUT}" -bench=BenchmarkEdgeCall -benchmem -benchtime="${BENCH_TIME_EACH}" ./test/bench/... 2>&1 || true

  echo ""
  echo "Running Service Call Benchmarks..."
  go test -timeout="${BENCH_GO_TEST_TIMEOUT}" -bench=BenchmarkServiceCall -benchmem -benchtime="${BENCH_TIME_EACH}" ./test/bench/... 2>&1 || true

  echo ""
  echo "Running Message Publishing Benchmarks..."
  go test -timeout="${BENCH_GO_TEST_TIMEOUT}" -bench=BenchmarkEdgePublish -benchmem -benchtime="${BENCH_TIME_EACH}" ./test/bench/... 2>&1 || true

  echo ""
  echo "Running Stream Open Benchmarks..."
  go test -timeout="${BENCH_GO_TEST_TIMEOUT}" -bench=BenchmarkEdgeOpen -benchmem -benchtime="${BENCH_TIME_EACH}" ./test/bench/... 2>&1 || true

  echo ""
  echo "Running Edge Connect/Disconnect Benchmarks..."
  go test -timeout="${BENCH_GO_TEST_TIMEOUT}" -bench=BenchmarkEdgeConnect -benchmem -benchtime="${BENCH_TIME_EACH}" ./test/bench/... 2>&1 || true

  print_header "Running All Benchmarks (${BENCH_TIME_ALL} each)..."
  go test -timeout="${BENCH_GO_TEST_TIMEOUT}" -bench=. -benchmem -benchtime="${BENCH_TIME_ALL}" ./test/bench/... 2>&1 || true
fi

if $run_e2e; then
  print_header "E2E INTEGRATION TESTS"

  echo "Running Connection Tests..."
  go test -v -run TestConn ./test/e2e/... -count=1 2>&1 | tail -100 || true

  echo ""
  echo "Running RPC Tests..."
  go test -v -run TestRPC ./test/e2e/... -count=1 2>&1 | tail -100 || true

  echo ""
  echo "Running Message Tests..."
  go test -v -run TestMessage ./test/e2e/... -count=1 2>&1 | tail -100 || true

  echo ""
  echo "Running Stream Tests..."
  go test -v -run TestStream ./test/e2e/... -count=1 2>&1 | tail -100 || true

  echo ""
  echo "Running Resource Cleanup Tests..."
  go test -v -run TestResourceCleanup ./test/e2e/... -count=1 2>&1 | tail -50 || true

  print_header "Running All E2E Tests..."
  go test -v ./test/e2e/... -count=1 -timeout=10m 2>&1 | tail -100 || true
fi

if $run_security; then
  print_header "SECURITY TESTS (Race & Fuzz)"

  echo "Running Race Condition Tests..."
  go test -v -race ./test/security/... -count=1 -timeout=5m 2>&1 | tail -100 || true

  echo ""
  echo "Running Fuzz Tests..."
  if go version | grep -qE 'go1\.(1[89]|2[0-9]|[3-9][0-9])'; then
    # FIX: the fuzz targets are named Fuzz*, not TestFuzz*; `-run TestFuzz`
    # matched no tests, so the seed corpus was never executed.
    go test -v -run Fuzz ./test/security/... -count=1 2>&1 | tail -50 || true
    echo ""
    echo "Running native fuzz (~30 seconds total)..."
    # FIX: `go test -fuzz=Fuzz` refuses to run when the pattern matches more
    # than one fuzz target, and this package defines three — fuzz each
    # target separately so native fuzzing actually runs.
    for fz in FuzzEdgeMeta FuzzRPCPayload FuzzMessagePayload; do
      go test -fuzz="^${fz}\$" -fuzztime=10s ./test/security/ 2>&1 || true
    done
  else
    echo "Go version doesn't support fuzzing natively (requires Go 1.18+), skipping..."
  fi

  print_header "Running All Security Tests..."
  go test -v ./test/security/... -count=1 -timeout=10m 2>&1 | tail -100 || true
fi

if $run_race; then
  print_header "RACE DETECTION TESTS"

  echo "Running unit tests with race detector..."
  go test -race -short ./pkg/frontier/exchange/... 2>&1 | tail -100 || true
fi

if $run_cover; then
  print_header "CODE COVERAGE"

  echo "Generating coverage report..."
  go test -coverprofile=coverage.out ./pkg/frontier/... ./test/e2e/... 2>&1 || true
  go tool cover -func=coverage.out | tail -30 || true

  if command -v go &> /dev/null; then
    go tool cover -html=coverage.out -o coverage.html 2>&1 || true
    echo "Coverage report generated: coverage.html"
  fi
fi

print_header "TEST SUMMARY"

echo -e "${GREEN}Test execution completed!${NC}"
echo ""
echo "Test categories executed:"
$run_unit && echo "  - Unit Tests (Exchange)"
$run_bench && echo "  - Benchmark Tests"
$run_e2e && echo "  - E2E Integration Tests"
$run_security && echo "  - Security Tests (Race, Fuzz)"
$run_race && echo "  - Race Detection Tests"
$run_cover && echo "  - Code Coverage"
echo ""
echo "Check the output above for any failures or issues."
+echo "" diff --git a/test/security/fuzz_test.go b/test/security/fuzz_test.go new file mode 100644 index 0000000..d542de8 --- /dev/null +++ b/test/security/fuzz_test.go @@ -0,0 +1,88 @@ +package security + +import ( + "context" + "testing" + + "github.com/singchia/frontier/api/dataplane/v1/edge" + "github.com/singchia/frontier/api/dataplane/v1/service" + "github.com/singchia/geminio" + "github.com/stretchr/testify/require" +) + +// SEC-FUZZ-001: Random bytes as Edge meta must not crash frontier +func FuzzEdgeMeta(f *testing.F) { + // seed corpus + f.Add([]byte("normal-meta")) + f.Add([]byte{0x00}) + f.Add([]byte{0xff, 0xfe, 0xfd}) + f.Add([]byte("line1\nline2")) + + f.Fuzz(func(t *testing.T, meta []byte) { + e, err := edge.NewEdge(testEdgeDial, edge.OptionEdgeMeta(meta)) + if err != nil { + return // connection refused or rejected is acceptable + } + e.Close() + }) +} + +// SEC-FUZZ-002: Random bytes as RPC payload must not crash frontier +func FuzzRPCPayload(f *testing.F) { + f.Add([]byte("hello")) + f.Add([]byte{}) + f.Add([]byte{0x00, 0xff}) + + // set up a long-lived service that echoes RPCs + svc, err := service.NewService(testSvcDial, service.OptionServiceName("fuzz-rpc-svc")) + require.NoError(f, err) + f.Cleanup(func() { svc.Close() }) + require.NoError(f, svc.Register(context.TODO(), "fuzz", func(_ context.Context, req geminio.Request, resp geminio.Response) { + resp.SetData(req.Data()) + })) + + f.Fuzz(func(t *testing.T, payload []byte) { + e, err := edge.NewEdge(testEdgeDial) + if err != nil { + return + } + defer e.Close() + req := e.NewRequest(payload) + _, _ = e.Call(context.TODO(), "fuzz", req) + }) +} + +// SEC-FUZZ-003: Random bytes as Publish payload must not crash frontier +func FuzzMessagePayload(f *testing.F) { + f.Add([]byte("msg")) + f.Add([]byte{}) + f.Add([]byte{0x00, 0x01, 0x02}) + + const topic = "fuzz-topic" + svc, err := service.NewService(testSvcDial, + service.OptionServiceName("fuzz-msg-svc"), + 
service.OptionServiceReceiveTopics([]string{topic}), + ) + require.NoError(f, err) + f.Cleanup(func() { svc.Close() }) + // drain received messages silently + go func() { + for { + msg, err := svc.Receive(context.TODO()) + if err != nil { + return + } + msg.Done() + } + }() + + f.Fuzz(func(t *testing.T, payload []byte) { + e, err := edge.NewEdge(testEdgeDial) + if err != nil { + return + } + defer e.Close() + msg := e.NewMessage(payload) + _ = e.Publish(context.TODO(), topic, msg) + }) +} diff --git a/test/security/main_test.go b/test/security/main_test.go new file mode 100644 index 0000000..9b8af8f --- /dev/null +++ b/test/security/main_test.go @@ -0,0 +1,95 @@ +package security + +import ( + "flag" + "io" + "net" + "os" + "testing" + "time" + + "github.com/jumboframes/armorigo/log" + "github.com/singchia/frontier/api/dataplane/v1/edge" + "github.com/singchia/frontier/api/dataplane/v1/service" + gconfig "github.com/singchia/frontier/pkg/config" + "github.com/singchia/frontier/pkg/frontier/config" + "github.com/singchia/frontier/pkg/frontier/edgebound" + "github.com/singchia/frontier/pkg/frontier/exchange" + "github.com/singchia/frontier/pkg/frontier/mq" + "github.com/singchia/frontier/pkg/frontier/repo" + "github.com/singchia/frontier/pkg/frontier/servicebound" + "github.com/singchia/go-timer/v2" + "k8s.io/klog/v2" +) + +func init() { + klog.InitFlags(nil) + flag.Set("v", "0") + flag.Set("logtostderr", "false") + flag.Set("stderrthreshold", "FATAL") + + log.SetLevel(log.LevelFatal) + log.SetOutput(io.Discard) +} + +const ( + edgeboundAddr = "127.0.0.1:13200" + serviceboundAddr = "127.0.0.1:13201" + testNetwork = "tcp" +) + +var ( + testEdgeDial edge.Dialer + testSvcDial service.Dialer +) + +func TestMain(m *testing.M) { + conf := &config.Configuration{ + Edgebound: config.Edgebound{ + Listen: gconfig.Listen{Network: testNetwork, Addr: edgeboundAddr}, + EdgeIDAllocWhenNoIDServiceOn: true, + }, + Servicebound: config.Servicebound{ + Listen: gconfig.Listen{Network: 
testNetwork, Addr: serviceboundAddr}, + }, + } + + r, err := repo.NewRepo(conf) + if err != nil { + panic(err) + } + mqm, err := mq.NewMQM(conf) + if err != nil { + panic(err) + } + tmr := timer.NewTimer() + ex := exchange.NewExchange(conf, mqm) + + sb, err := servicebound.NewServicebound(conf, r, nil, ex, mqm, tmr) + if err != nil { + panic(err) + } + eb, err := edgebound.NewEdgebound(conf, r, nil, ex, tmr) + if err != nil { + panic(err) + } + go sb.Serve() + go eb.Serve() + time.Sleep(30 * time.Millisecond) + + testEdgeDial = func() (net.Conn, error) { + return net.Dial(testNetwork, edgeboundAddr) + } + testSvcDial = func() (net.Conn, error) { + return net.Dial(testNetwork, serviceboundAddr) + } + + code := m.Run() + + eb.Close() + sb.Close() + r.Close() + mqm.Close() + tmr.Close() + os.Exit(code) +} diff --git a/test/security/race_test.go b/test/security/race_test.go new file mode 100644 index 0000000..d33fa92 --- /dev/null +++ b/test/security/race_test.go @@ -0,0 +1,124 @@ +package security + +import ( + "context" + "sync" + "testing" + "time" + + "github.com/singchia/frontier/api/dataplane/v1/edge" + "github.com/singchia/frontier/api/dataplane/v1/service" + "github.com/singchia/geminio" + "github.com/stretchr/testify/require" +) + +// SEC-RACE-001: Concurrent Connect and Close on many edges — run with -race +func TestRaceEdgeConnectClose(t *testing.T) { + const n = 50 + var wg sync.WaitGroup + wg.Add(n) + for i := 0; i < n; i++ { + go func() { + defer wg.Done() + e, err := edge.NewEdge(testEdgeDial) + if err != nil { + return + } + e.Close() + }() + } + wg.Wait() +} + +// SEC-RACE-002: Same edge closed concurrently from multiple goroutines — must not panic +func TestRaceMultipleEdgeClose(t *testing.T) { + e, err := edge.NewEdge(testEdgeDial) + require.NoError(t, err) + + var wg sync.WaitGroup + const closers = 10 + wg.Add(closers) + for i := 0; i < closers; i++ { + go func() { + defer wg.Done() + e.Close() + }() + } + wg.Wait() +} + +// SEC-RACE-003: Service 
concurrently registers and the edge concurrently calls — no data race +func TestRaceServiceRegisterAndCall(t *testing.T) { + svc, err := service.NewService(testSvcDial, service.OptionServiceName("race-svc")) + require.NoError(t, err) + defer svc.Close() + + e, err := edge.NewEdge(testEdgeDial) + require.NoError(t, err) + defer e.Close() + + time.Sleep(20 * time.Millisecond) + + var wg sync.WaitGroup + const workers = 10 + + // goroutines registering RPCs + wg.Add(workers) + for i := 0; i < workers; i++ { + method := "method" + go func() { + defer wg.Done() + _ = svc.Register(context.TODO(), method, func(_ context.Context, req geminio.Request, resp geminio.Response) { + resp.SetData(req.Data()) + }) + }() + } + + // goroutines calling RPC from edge simultaneously + wg.Add(workers) + for i := 0; i < workers; i++ { + go func() { + defer wg.Done() + req := e.NewRequest([]byte("race")) + _, _ = e.Call(context.TODO(), "method", req) + }() + } + + wg.Wait() +} + +// SEC-RACE-004: Edge closes while its RPC is being forwarded — must not panic +func TestRaceForwardAndClose(t *testing.T) { + svc, err := service.NewService(testSvcDial, service.OptionServiceName("slow-svc")) + require.NoError(t, err) + defer svc.Close() + + // slow handler to ensure forwarding is in-flight when edge closes + err = svc.Register(context.TODO(), "slow", func(_ context.Context, req geminio.Request, resp geminio.Response) { + time.Sleep(50 * time.Millisecond) + resp.SetData(req.Data()) + }) + require.NoError(t, err) + + time.Sleep(20 * time.Millisecond) + + e, err := edge.NewEdge(testEdgeDial) + require.NoError(t, err) + + var wg sync.WaitGroup + wg.Add(2) + + go func() { + defer wg.Done() + req := e.NewRequest([]byte("x")) + _, _ = e.Call(context.TODO(), "slow", req) + }() + + go func() { + defer wg.Done() + time.Sleep(10 * time.Millisecond) + e.Close() + }() + + wg.Wait() +}