runc/libcontainer/seccomp/seccomp_linux.go

//go:build cgo && seccomp

package seccomp

import (
	"errors"
	"fmt"

	libseccomp "github.com/seccomp/libseccomp-golang"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"

	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/seccomp/patchbpf"
	"github.com/opencontainers/runtime-spec/specs-go"
)

var (
	// actTrace is the trace action carrying EPERM as its return code.
	// getAction returns it for configs.Trace when no explicit return
	// code is supplied.
	actTrace = libseccomp.ActTrace.SetReturnCode(int16(unix.EPERM))
	// actErrno is the errno action carrying EPERM as its return code.
	// getAction returns it for configs.Errno when no explicit return
	// code is supplied.
	actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM))
)

// syscallMaxArguments is the upper bound on the number of arguments a Linux
// system call can take (at most 6). matchCall uses it to size its
// per-argument bookkeeping.
const syscallMaxArguments int = 6

// InitSeccomp installs the seccomp filters to be used in the container as
// specified in config.
//
// It returns the seccomp file descriptor if any of the filters include a
// SCMP_ACT_NOTIFY action, otherwise it returns -1. Whenever a non-nil error
// is returned the descriptor is -1.
//
// An error is returned when config is nil, when config.Syscalls contains a
// nil entry, when the default or a per-syscall action is invalid, when
// SCMP_ACT_NOTIFY is requested for the write syscall or as the default
// action, or when building or loading the filter fails.
func InitSeccomp(config *configs.Seccomp) (int, error) {
	if config == nil {
		return -1, errors.New("cannot initialize Seccomp - nil config passed")
	}

	defaultAction, err := getAction(config.DefaultAction, config.DefaultErrnoRet)
	if err != nil {
		return -1, errors.New("error initializing seccomp - invalid default action")
	}

	// Ignore the error since pre-2.4 libseccomp is treated as API level 0.
	apiLevel, _ := libseccomp.GetAPI()
	for _, call := range config.Syscalls {
		// Reject nil entries before touching any field: dereferencing a nil
		// *configs.Syscall here would panic instead of reporting an error.
		if call == nil {
			return -1, errors.New("encountered nil syscall while initializing Seccomp")
		}
		if call.Action != configs.Notify {
			continue
		}

		if apiLevel < 6 {
			return -1, fmt.Errorf("seccomp notify unsupported: API level: got %d, want at least 6. Please try with libseccomp >= 2.5.0 and Linux >= 5.7", apiLevel)
		}

		// We can't allow the write syscall to notify to the seccomp agent.
		// After InitSeccomp() is called, we need to syncParentSeccomp() to write the seccomp fd plain
		// number, so the parent sends it to the seccomp agent. If we use SCMP_ACT_NOTIFY on write, we
		// never can write the seccomp fd to the parent and therefore the seccomp agent never receives
		// the seccomp fd and runc hangs during initialization.
		//
		// Note that read()/close(), that are also used in syncParentSeccomp(), _can_ use SCMP_ACT_NOTIFY.
		// Because we write the seccomp fd on the pipe to the parent, the parent is able to proceed and
		// send the seccomp fd to the agent (it is another process and not subject to the seccomp
		// filter). We will be blocked on read()/close() inside syncParentSeccomp() but if the seccomp
		// agent allows those syscalls to proceed, initialization works just fine and the agent can
		// handle future read()/close() syscalls as it wanted.
		if call.Name == "write" {
			return -1, errors.New("SCMP_ACT_NOTIFY cannot be used for the write syscall")
		}
	}

	// See comment on why write is not allowed. The same reason applies, as this can mean handling write too.
	if defaultAction == libseccomp.ActNotify {
		return -1, errors.New("SCMP_ACT_NOTIFY cannot be used as default action")
	}

	filter, err := libseccomp.NewFilter(defaultAction)
	if err != nil {
		return -1, fmt.Errorf("error creating filter: %w", err)
	}

	// Add extra architectures
	for _, arch := range config.Architectures {
		scmpArch, err := libseccomp.GetArchFromString(arch)
		if err != nil {
			return -1, fmt.Errorf("error validating Seccomp architecture: %w", err)
		}
		if err := filter.AddArch(scmpArch); err != nil {
			return -1, fmt.Errorf("error adding architecture to seccomp filter: %w", err)
		}
	}

	// Add extra flags.
	for _, flag := range config.Flags {
		if err := setFlag(filter, flag); err != nil {
			return -1, err
		}
	}

	// Enable libseccomp binary tree optimization for longer rulesets.
	//
	// The number below chosen semi-arbitrarily, considering the following:
	// 1. libseccomp <= 2.5.4 misbehaves when binary tree optimization
	// is enabled and there are 0 rules.
	// 2. All known libseccomp versions (2.5.0 to 2.5.4) generate a binary
	// tree with 4 syscalls per node.
	if len(config.Syscalls) > 32 {
		if err := filter.SetOptimize(2); err != nil {
			// The error is not fatal and probably means we have older libseccomp.
			logrus.Debugf("seccomp binary tree optimization not available: %v", err)
		}
	}

	// Unset no new privs bit
	if err := filter.SetNoNewPrivsBit(false); err != nil {
		return -1, fmt.Errorf("error setting no new privileges: %w", err)
	}

	// Add a rule for each syscall. Every entry was verified to be non-nil
	// in the validation loop above.
	for _, call := range config.Syscalls {
		if err := matchCall(filter, call, defaultAction); err != nil {
			return -1, err
		}
	}

	seccompFd, err := patchbpf.PatchAndLoad(config, filter)
	if err != nil {
		return -1, fmt.Errorf("error loading seccomp filter into kernel: %w", err)
	}

	return seccompFd, nil
}

// unknownFlagError is the error returned by setFlag for a seccomp flag that
// this package does not handle. It records the offending flag.
type unknownFlagError struct {
	flag specs.LinuxSeccompFlag
}

// Error implements the error interface, naming the unrecognized flag.
func (e *unknownFlagError) Error() string {
	name := string(e.flag)
	return "seccomp flag " + name + " is not known to runc"
}

// setFlag applies a single seccomp flag to filter.
//
// It returns nil on success, a wrapped libseccomp error if the flag could
// not be set, or *unknownFlagError if the flag is not one handled here.
func setFlag(filter *libseccomp.ScmpFilter, flag specs.LinuxSeccompFlag) error {
	var err error

	switch flag {
	case flagTsync:
		// libseccomp-golang always use filterAttrTsync when
		// possible so all goroutines will receive the same
		// rules, so there is nothing to do. It does not make
		// sense to apply the seccomp filter on only one
		// thread; other threads will be terminated after exec
		// anyway.
	case specs.LinuxSeccompFlagLog:
		if err = filter.SetLogBit(true); err != nil {
			err = fmt.Errorf("error adding log flag to seccomp filter: %w", err)
		}
	case specs.LinuxSeccompFlagSpecAllow:
		if err = filter.SetSSB(true); err != nil {
			err = fmt.Errorf("error adding SSB flag to seccomp filter: %w", err)
		}
	case specs.LinuxSeccompFlagWaitKillableRecv:
		if err = filter.SetWaitKill(true); err != nil {
			err = fmt.Errorf("error adding WaitKill flag to seccomp filter: %w", err)
		}
	default:
		// NOTE when adding more flags above, do not forget to also:
		// - add new flags to `flags` slice in config.go;
		// - add new flag values to flags_value() in tests/integration/seccomp.bats;
		// - modify func filterFlags in patchbpf/ accordingly.
		err = &unknownFlagError{flag: flag}
	}

	return err
}

// FlagSupported reports whether flag is known to runc and can be set with
// the libseccomp and kernel currently in use. It returns nil when the flag
// is usable and a descriptive error otherwise.
func FlagSupported(flag specs.LinuxSeccompFlag) error {
	// Try the flag against a throwaway, zero-valued filter; only the kind
	// of error that comes back matters.
	err := setFlag(&libseccomp.ScmpFilter{}, flag)

	var (
		// Returned by setFlag for flags runc does not know.
		unknown *unknownFlagError
		// Returned for flags that are known to runc and libseccomp-golang
		// but cannot be applied because either libseccomp or the kernel
		// is too old.
		tooOld *libseccomp.VersionError
	)
	if errors.As(err, &unknown) || errors.As(err, &tooOld) {
		return err
	}

	// Any other outcome means the flag is known and supported.
	return nil
}

// getAction translates a libcontainer action into the corresponding
// libseccomp ScmpAction.
//
// For configs.Errno and configs.Trace, a non-nil errnoRet supplies the
// action's return code; when it is nil the package-level actErrno/actTrace
// values are used. configs.Kill and configs.KillThread both map to
// libseccomp.ActKillThread. Any other value yields libseccomp.ActInvalid
// together with an error.
func getAction(act configs.Action, errnoRet *uint) (libseccomp.ScmpAction, error) {
	switch act {
	case configs.Allow:
		return libseccomp.ActAllow, nil
	case configs.Log:
		return libseccomp.ActLog, nil
	case configs.Notify:
		return libseccomp.ActNotify, nil
	case configs.Trap:
		return libseccomp.ActTrap, nil
	case configs.KillProcess:
		return libseccomp.ActKillProcess, nil
	case configs.Kill, configs.KillThread:
		return libseccomp.ActKillThread, nil
	case configs.Errno:
		if errnoRet == nil {
			return actErrno, nil
		}
		return libseccomp.ActErrno.SetReturnCode(int16(*errnoRet)), nil
	case configs.Trace:
		if errnoRet == nil {
			return actTrace, nil
		}
		return libseccomp.ActTrace.SetReturnCode(int16(*errnoRet)), nil
	}

	return libseccomp.ActInvalid, errors.New("invalid action, cannot use in rule")
}

// getOperator translates a libcontainer comparison operator into the
// corresponding libseccomp ScmpCompareOp. An unrecognized operator yields
// libseccomp.CompareInvalid together with an error.
func getOperator(op configs.Operator) (libseccomp.ScmpCompareOp, error) {
	var cmpOp libseccomp.ScmpCompareOp

	switch op {
	case configs.EqualTo:
		cmpOp = libseccomp.CompareEqual
	case configs.NotEqualTo:
		cmpOp = libseccomp.CompareNotEqual
	case configs.GreaterThan:
		cmpOp = libseccomp.CompareGreater
	case configs.GreaterThanOrEqualTo:
		cmpOp = libseccomp.CompareGreaterEqual
	case configs.LessThan:
		cmpOp = libseccomp.CompareLess
	case configs.LessThanOrEqualTo:
		cmpOp = libseccomp.CompareLessOrEqual
	case configs.MaskEqualTo:
		cmpOp = libseccomp.CompareMaskedEqual
	default:
		return libseccomp.CompareInvalid, errors.New("invalid operator, cannot use in rule")
	}

	return cmpOp, nil
}

// getCondition builds a libseccomp ScmpCondition from a libcontainer
// syscall argument. It returns an error for a nil arg, for an operator that
// getOperator rejects, or when libseccomp.MakeCondition fails.
func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) {
	if arg == nil {
		return libseccomp.ScmpCondition{}, errors.New("cannot convert nil to syscall condition")
	}

	cmpOp, err := getOperator(arg.Op)
	if err != nil {
		return libseccomp.ScmpCondition{}, err
	}

	return libseccomp.MakeCondition(arg.Index, cmpOp, arg.Value, arg.ValueTwo)
}

// matchCall adds the rule(s) for a single syscall to filter.
//
// A call whose action equals defAct is skipped, as is a call whose name
// libseccomp cannot resolve (the latter is logged at debug level). A call
// without argument conditions becomes one unconditional rule. A call with
// conditions becomes one rule holding all of them, unless two or more
// conditions refer to the same argument index, in which case each condition
// is added as its own rule.
func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall, defAct libseccomp.ScmpAction) error {
	if filter == nil || call == nil {
		return errors.New("cannot use nil as syscall to block")
	}
	if call.Name == "" {
		return errors.New("empty string is not a valid syscall")
	}

	// Translate the call's action into its libseccomp equivalent.
	action, err := getAction(call.Action, call.ErrnoRet)
	if err != nil {
		return fmt.Errorf("action in seccomp profile is invalid: %w", err)
	}
	if action == defAct {
		// Redundant rule: skip it silently, AddRule would reject it.
		return nil
	}

	// An unresolvable syscall is assumed to be unsupported by this kernel;
	// note it and carry on rather than failing.
	sysNum, err := libseccomp.GetSyscallFromName(call.Name)
	if err != nil {
		logrus.Debugf("unknown seccomp syscall %q ignored", call.Name)
		return nil
	}

	// No argument conditions: a plain unconditional rule is enough.
	if len(call.Args) == 0 {
		if err := filter.AddRule(sysNum, action); err != nil {
			return fmt.Errorf("error adding seccomp filter rule for syscall %s: %w", call.Name, err)
		}
		return nil
	}

	// Convert every condition, tracking how often each argument index is
	// referenced. The conversion runs first so that a nil or invalid
	// argument is reported as an error before its index is used.
	var perIndex [syscallMaxArguments]uint
	sharedIndex := false
	conds := make([]libseccomp.ScmpCondition, 0, len(call.Args))
	for _, arg := range call.Args {
		converted, err := getCondition(arg)
		if err != nil {
			return fmt.Errorf("error creating seccomp syscall condition for syscall %s: %w", call.Name, err)
		}

		perIndex[arg.Index]++
		if perIndex[arg.Index] > 1 {
			sharedIndex = true
		}
		conds = append(conds, converted)
	}

	if !sharedIndex {
		// Every condition targets a distinct argument: add them together
		// as one rule.
		if err := filter.AddRuleConditional(sysNum, action, conds); err != nil {
			return fmt.Errorf("error adding seccomp rule for syscall %s: %w", call.Name, err)
		}
		return nil
	}

	// Two or more conditions share an argument index: fall back to the old
	// behavior and attach each condition to a separate rule.
	for _, single := range conds {
		rule := []libseccomp.ScmpCondition{single}
		if err := filter.AddRuleConditional(sysNum, action, rule); err != nil {
			return fmt.Errorf("error adding seccomp rule for syscall %s: %w", call.Name, err)
		}
	}

	return nil
}

// Version reports the version of the libseccomp library in use as its
// major, minor, and micro components, in that order.
func Version() (uint, uint, uint) {
	major, minor, micro := libseccomp.GetLibraryVersion()
	return major, minor, micro
}

// Enabled is true if seccomp support is compiled in. This file carries the
// build constraint "cgo && seccomp", so this definition is only present
// when both are enabled.
const Enabled = true