simplify rootless

Signed-off-by: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
docker-18.09
Akihiro Suda 2018-10-16 13:30:02 +09:00
parent 8db1defa00
commit 048130d1d0
41 changed files with 40 additions and 4323 deletions

View File

@ -1,7 +1,7 @@
# Rootless mode (Experimental)
Requirements:
- runc `ecd55a4135e0a26de884ce436442914f945b1e76` (May 30, 2018) or later
- runc `a00bf0190895aa465a5fbed0268888e2c8ddfe85` (Oct 15, 2018) or later
- Some distros such as Debian (excluding Ubuntu) and Arch Linux require `echo 1 > /proc/sys/kernel/unprivileged_userns_clone`
- `newuidmap` and `newgidmap` need to be installed on the host. These commands are provided by the `uidmap` package.
- `/etc/subuid` and `/etc/subgid` should contain >= 65536 sub-IDs. e.g. `penguin:231072:65536`.
@ -45,6 +45,7 @@ unshared# buildkitd
* `overlayfs` snapshotter is not supported except Ubuntu-flavored kernel: http://kernel.ubuntu.com/git/ubuntu/ubuntu-artful.git/commit/fs/overlayfs?h=Ubuntu-4.13.0-25.29&id=0a414bdc3d01f3b61ed86cfe3ce8b63a9240eba7
* containerd worker is not supported ( pending PR: https://github.com/containerd/containerd/pull/2006 )
* Network namespace is not used at the moment.
* Cgroups is disabled.
### Terminal 2:
@ -61,7 +62,7 @@ $ docker run --name buildkitd -d --privileged -p 1234:1234 buildkit-rootless --
```
`docker run` requires `--privileged` but the BuildKit daemon is executed as a normal user.
See [`moby/moby#36597`](https://github.com/moby/moby/issues/36597, [`kubernetes/community#1934`](https://github.com/kubernetes/community/pull/1934) and [Jess's blog](https://blog.jessfraz.com/post/building-container-images-securely-on-kubernetes/) for the ongoing work to remove this requirement
See [`docker/cli#1347`](https://github.com/docker/cli/pull/1347) for the ongoing work to remove this requirement
```
$ docker exec buildkitd id

View File

@ -18,7 +18,7 @@ func main() {
var opt buildOpt
flag.BoolVar(&opt.withContainerd, "with-containerd", true, "enable containerd worker")
flag.StringVar(&opt.containerd, "containerd", "v1.2.0-rc.1", "containerd version")
flag.StringVar(&opt.runc, "runc", "dd56ece8236d6d9e5bed4ea0c31fe53c7b873ff4", "runc version")
flag.StringVar(&opt.runc, "runc", "a00bf0190895aa465a5fbed0268888e2c8ddfe85", "runc version")
flag.Parse()
bk := buildkit(opt)

View File

@ -18,7 +18,7 @@ func main() {
var opt buildOpt
flag.BoolVar(&opt.withContainerd, "with-containerd", true, "enable containerd worker")
flag.StringVar(&opt.containerd, "containerd", "v1.2.0-rc.1", "containerd version")
flag.StringVar(&opt.runc, "runc", "dd56ece8236d6d9e5bed4ea0c31fe53c7b873ff4", "runc version")
flag.StringVar(&opt.runc, "runc", "a00bf0190895aa465a5fbed0268888e2c8ddfe85", "runc version")
flag.Parse()
bk := buildkit(opt)

View File

@ -18,7 +18,7 @@ func main() {
var opt buildOpt
flag.BoolVar(&opt.withContainerd, "with-containerd", true, "enable containerd worker")
flag.StringVar(&opt.containerd, "containerd", "v1.2.0-rc.1", "containerd version")
flag.StringVar(&opt.runc, "runc", "dd56ece8236d6d9e5bed4ea0c31fe53c7b873ff4", "runc version")
flag.StringVar(&opt.runc, "runc", "a00bf0190895aa465a5fbed0268888e2c8ddfe85", "runc version")
flag.Parse()
bk := buildkit(opt)

View File

@ -19,7 +19,7 @@ func main() {
var opt buildOpt
flag.BoolVar(&opt.withContainerd, "with-containerd", true, "enable containerd worker")
flag.StringVar(&opt.containerd, "containerd", "v1.2.0-rc.1", "containerd version")
flag.StringVar(&opt.runc, "runc", "dd56ece8236d6d9e5bed4ea0c31fe53c7b873ff4", "runc version")
flag.StringVar(&opt.runc, "runc", "a00bf0190895aa465a5fbed0268888e2c8ddfe85", "runc version")
flag.StringVar(&opt.buildkit, "buildkit", "master", "buildkit version")
flag.Parse()

View File

@ -1604,10 +1604,6 @@ RUN ["ls"]
}
func testUser(t *testing.T, sb integration.Sandbox) {
if sb.Rootless() {
t.Skip("only for rootful worker, due to lack of support for additional gids (https://github.com/opencontainers/runc/issues/1835)")
}
f := getFrontend(t, sb)
dockerfile := []byte(`

View File

@ -1,4 +1,4 @@
ARG RUNC_VERSION=00dc70017d222b178a002ed30e9321b12647af2d
ARG RUNC_VERSION=a00bf0190895aa465a5fbed0268888e2c8ddfe85
ARG CONTAINERD_VERSION=v1.2.0-rc.1
# containerd v1.0 for integration tests
ARG CONTAINERD10_VERSION=v1.0.3

View File

@ -1,6 +1,6 @@
# syntax = tonistiigi/dockerfile:runmount20180925
ARG RUNC_VERSION=00dc70017d222b178a002ed30e9321b12647af2d
ARG RUNC_VERSION=a00bf0190895aa465a5fbed0268888e2c8ddfe85
ARG CONTAINERD_VERSION=v1.2.0-rc.1
# containerd v1.0 for integration tests
ARG CONTAINERD10_VERSION=v1.0.3

View File

@ -1,113 +1,40 @@
package specconv
import (
"os"
"sort"
"strings"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/user"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
)
// ToRootless converts spec to be compatible with "rootless" runc.
// * Adds userns (Note: since we are already in userns, ideally we should not need to do this. runc-side issue is tracked at https://github.com/opencontainers/runc/issues/1837)
// * Fix up mount flags (same as above)
// * Replace /sys with bind-mount (FIXME: we don't need to do this if netns is unshared)
// * Remove /sys mount
// * Remove cgroups
//
// See docs/rootless.md for the supported runc revision.
func ToRootless(spec *specs.Spec) error {
if !system.RunningInUserNS() {
return errors.New("needs to be in user namespace")
// Remove /sys mount because we can't mount /sys when the daemon netns
// is not unshared from the host.
//
// Instead, we could bind-mount /sys from the host, however, `rbind, ro`
// does not make /sys/fs/cgroup read-only (and we can't bind-mount /sys
// without rbind)
//
// PR for making /sys/fs/cgroup read-only is proposed, but it is very
// complicated: https://github.com/opencontainers/runc/pull/1869
//
// For buildkit usecase, we suppose we don't need to provide /sys to
// containers and remove /sys mount as a workaround.
var mounts []specs.Mount
for _, mount := range spec.Mounts {
if strings.HasPrefix(mount.Destination, "/sys") {
continue
}
mounts = append(mounts, mount)
}
uidMap, err := user.CurrentProcessUIDMap()
if err != nil && !os.IsNotExist(err) {
return err
}
gidMap, err := user.CurrentProcessUIDMap()
if err != nil && !os.IsNotExist(err) {
return err
}
return toRootless(spec, uidMap, gidMap)
}
spec.Mounts = mounts
// toRootless was forked from github.com/opencontainers/runc/libcontainer/specconv
func toRootless(spec *specs.Spec, uidMap, gidMap []user.IDMap) error {
if err := configureUserNS(spec, uidMap, gidMap); err != nil {
return err
}
if err := configureMounts(spec); err != nil {
return err
}
// Remove cgroup settings.
// Remove cgroups so as to avoid `container_linux.go:337: starting container process caused "process_linux.go:280: applying cgroup configuration for process caused \"mkdir /sys/fs/cgroup/cpuset/buildkit: permission denied\""`
spec.Linux.Resources = nil
spec.Linux.CgroupsPath = ""
return nil
}
// configureUserNS add suserns and the current ID map to the spec.
// Since we are already in userns, ideally we should not need to add userns.
// However, currently rootless runc always requires userns to be added.
// https://github.com/opencontainers/runc/issues/1837
func configureUserNS(spec *specs.Spec, uidMap, gidMap []user.IDMap) error {
spec.Linux.Namespaces = append(spec.Linux.Namespaces, specs.LinuxNamespace{
Type: specs.UserNamespace,
})
sort.Slice(uidMap, func(i, j int) bool { return uidMap[i].ID < uidMap[j].ID })
uNextContainerID := int64(0)
for _, u := range uidMap {
spec.Linux.UIDMappings = append(spec.Linux.UIDMappings,
specs.LinuxIDMapping{
HostID: uint32(u.ID),
ContainerID: uint32(uNextContainerID),
Size: uint32(u.Count),
})
uNextContainerID += int64(u.Count)
}
sort.Slice(gidMap, func(i, j int) bool { return gidMap[i].ID < gidMap[j].ID })
gNextContainerID := int64(0)
for _, g := range gidMap {
spec.Linux.GIDMappings = append(spec.Linux.GIDMappings,
specs.LinuxIDMapping{
HostID: uint32(g.ID),
ContainerID: uint32(gNextContainerID),
Size: uint32(g.Count),
})
gNextContainerID += int64(g.Count)
}
return nil
}
func configureMounts(spec *specs.Spec) error {
var mounts []specs.Mount
for _, mount := range spec.Mounts {
// Ignore all mounts that are under /sys, because we add /sys later.
if strings.HasPrefix(mount.Destination, "/sys") {
continue
}
// Remove all gid= and uid= mappings.
// Since we are already in userns, ideally we should not need to do this.
// https://github.com/opencontainers/runc/issues/1837
var options []string
for _, option := range mount.Options {
if !strings.HasPrefix(option, "gid=") && !strings.HasPrefix(option, "uid=") {
options = append(options, option)
}
}
mount.Options = options
mounts = append(mounts, mount)
}
// Add the sysfs mount as an rbind, because we can't mount /sys unless we have netns.
// TODO: keep original /sys mount when we have netns.
mounts = append(mounts, specs.Mount{
Source: "/sys",
Destination: "/sys",
Type: "none",
Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"},
})
spec.Mounts = mounts
return nil
}

View File

@ -1,42 +0,0 @@
package specconv
import (
"testing"
"github.com/opencontainers/runc/libcontainer/specconv"
"github.com/opencontainers/runc/libcontainer/user"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/stretchr/testify/require"
)
func TestToRootless(t *testing.T) {
spec := specconv.Example()
uidMap := []user.IDMap{
{
ID: 0,
ParentID: 4242,
Count: 1,
},
{
ID: 1,
ParentID: 231072,
Count: 65536,
},
}
gidMap := uidMap
expectedUIDMappings := []specs.LinuxIDMapping{
{
HostID: 0,
ContainerID: 0,
Size: 1,
},
{
HostID: 1,
ContainerID: 1,
Size: 65536,
},
}
err := toRootless(spec, uidMap, gidMap)
require.NoError(t, err)
require.EqualValues(t, expectedUIDMappings, spec.Linux.UIDMappings)
}

View File

@ -18,7 +18,7 @@ github.com/gogo/googleapis b23578765ee54ff6bceff57f397d833bf4ca6869
github.com/golang/protobuf v1.1.0
github.com/containerd/continuity bd77b46c8352f74eb12c85bdc01f4b90f69d66b4
github.com/opencontainers/image-spec v1.0.1
github.com/opencontainers/runc 00dc70017d222b178a002ed30e9321b12647af2d
github.com/opencontainers/runc a00bf0190895aa465a5fbed0268888e2c8ddfe85
github.com/Microsoft/go-winio v0.4.11
github.com/containerd/fifo 3d5202aec260678c48179c56f40e6f38a095738c
github.com/opencontainers/runtime-spec eba862dc2470385a233c7507392675cbeadf7353 # v1.0.1-45-geba862d
@ -67,6 +67,3 @@ github.com/opentracing-contrib/go-stdlib b1a47cfbdd7543e70e9ef3e73d0802ad306cc1c
# used by dockerfile tests
gotest.tools v2.1.0
github.com/google/go-cmp v0.2.0
# used by rootless spec conv test
github.com/seccomp/libseccomp-golang 32f571b70023028bd57d9288c20efbcb237f3ce0

View File

@ -1,61 +0,0 @@
package configs
import "fmt"
// blockIODevice holds major:minor format supported in blkio cgroup
type blockIODevice struct {
// Major is the device's major number
Major int64 `json:"major"`
// Minor is the device's minor number
Minor int64 `json:"minor"`
}
// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair
type WeightDevice struct {
blockIODevice
// Weight is the bandwidth rate for the device, range is from 10 to 1000
Weight uint16 `json:"weight"`
// LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
LeafWeight uint16 `json:"leafWeight"`
}
// NewWeightDevice returns a configured WeightDevice pointer
func NewWeightDevice(major, minor int64, weight, leafWeight uint16) *WeightDevice {
wd := &WeightDevice{}
wd.Major = major
wd.Minor = minor
wd.Weight = weight
wd.LeafWeight = leafWeight
return wd
}
// WeightString formats the struct to be writable to the cgroup specific file
func (wd *WeightDevice) WeightString() string {
return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.Weight)
}
// LeafWeightString formats the struct to be writable to the cgroup specific file
func (wd *WeightDevice) LeafWeightString() string {
return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.LeafWeight)
}
// ThrottleDevice struct holds a `major:minor rate_per_second` pair
type ThrottleDevice struct {
blockIODevice
// Rate is the IO rate limit per cgroup per device
Rate uint64 `json:"rate"`
}
// NewThrottleDevice returns a configured ThrottleDevice pointer
func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice {
td := &ThrottleDevice{}
td.Major = major
td.Minor = minor
td.Rate = rate
return td
}
// String formats the struct to be writable to the cgroup specific file
func (td *ThrottleDevice) String() string {
return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate)
}

View File

@ -1,122 +0,0 @@
package configs
type FreezerState string
const (
Undefined FreezerState = ""
Frozen FreezerState = "FROZEN"
Thawed FreezerState = "THAWED"
)
type Cgroup struct {
// Deprecated, use Path instead
Name string `json:"name,omitempty"`
// name of parent of cgroup or slice
// Deprecated, use Path instead
Parent string `json:"parent,omitempty"`
// Path specifies the path to cgroups that are created and/or joined by the container.
// The path is assumed to be relative to the host system cgroup mountpoint.
Path string `json:"path"`
// ScopePrefix describes prefix for the scope name
ScopePrefix string `json:"scope_prefix"`
// Paths represent the absolute cgroups paths to join.
// This takes precedence over Path.
Paths map[string]string
// Resources contains various cgroups settings to apply
*Resources
}
type Resources struct {
// If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list.
// Deprecated
AllowAllDevices *bool `json:"allow_all_devices,omitempty"`
// Deprecated
AllowedDevices []*Device `json:"allowed_devices,omitempty"`
// Deprecated
DeniedDevices []*Device `json:"denied_devices,omitempty"`
Devices []*Device `json:"devices"`
// Memory limit (in bytes)
Memory int64 `json:"memory"`
// Memory reservation or soft_limit (in bytes)
MemoryReservation int64 `json:"memory_reservation"`
// Total memory usage (memory + swap); set `-1` to enable unlimited swap
MemorySwap int64 `json:"memory_swap"`
// Kernel memory limit (in bytes)
KernelMemory int64 `json:"kernel_memory"`
// Kernel memory limit for TCP use (in bytes)
KernelMemoryTCP int64 `json:"kernel_memory_tcp"`
// CPU shares (relative weight vs. other containers)
CpuShares uint64 `json:"cpu_shares"`
// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
CpuQuota int64 `json:"cpu_quota"`
// CPU period to be used for hardcapping (in usecs). 0 to use system default.
CpuPeriod uint64 `json:"cpu_period"`
// How many time CPU will use in realtime scheduling (in usecs).
CpuRtRuntime int64 `json:"cpu_rt_quota"`
// CPU period to be used for realtime scheduling (in usecs).
CpuRtPeriod uint64 `json:"cpu_rt_period"`
// CPU to use
CpusetCpus string `json:"cpuset_cpus"`
// MEM to use
CpusetMems string `json:"cpuset_mems"`
// Process limit; set <= `0' to disable limit.
PidsLimit int64 `json:"pids_limit"`
// Specifies per cgroup weight, range is from 10 to 1000.
BlkioWeight uint16 `json:"blkio_weight"`
// Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
BlkioLeafWeight uint16 `json:"blkio_leaf_weight"`
// Weight per cgroup per device, can override BlkioWeight.
BlkioWeightDevice []*WeightDevice `json:"blkio_weight_device"`
// IO read rate limit per cgroup per device, bytes per second.
BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"`
// IO write rate limit per cgroup per device, bytes per second.
BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"`
// IO read rate limit per cgroup per device, IO per second.
BlkioThrottleReadIOPSDevice []*ThrottleDevice `json:"blkio_throttle_read_iops_device"`
// IO write rate limit per cgroup per device, IO per second.
BlkioThrottleWriteIOPSDevice []*ThrottleDevice `json:"blkio_throttle_write_iops_device"`
// set the freeze value for the process
Freezer FreezerState `json:"freezer"`
// Hugetlb limit (in bytes)
HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"`
// Whether to disable OOM Killer
OomKillDisable bool `json:"oom_kill_disable"`
// Tuning swappiness behaviour per cgroup
MemorySwappiness *uint64 `json:"memory_swappiness"`
// Set priority of network traffic for container
NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"`
// Set class identifier for container's network packets
NetClsClassid uint32 `json:"net_cls_classid_u"`
}

View File

@ -1,6 +0,0 @@
package configs
// TODO Windows: This can ultimately be entirely factored out on Windows as
// cgroups are a Unix-specific construct.
type Cgroup struct {
}

View File

@ -1,349 +0,0 @@
package configs
import (
"bytes"
"encoding/json"
"fmt"
"os/exec"
"time"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
)
type Rlimit struct {
Type int `json:"type"`
Hard uint64 `json:"hard"`
Soft uint64 `json:"soft"`
}
// IDMap represents UID/GID Mappings for User Namespaces.
type IDMap struct {
ContainerID int `json:"container_id"`
HostID int `json:"host_id"`
Size int `json:"size"`
}
// Seccomp represents syscall restrictions
// By default, only the native architecture of the kernel is allowed to be used
// for syscalls. Additional architectures can be added by specifying them in
// Architectures.
type Seccomp struct {
DefaultAction Action `json:"default_action"`
Architectures []string `json:"architectures"`
Syscalls []*Syscall `json:"syscalls"`
}
// Action is taken upon rule match in Seccomp
type Action int
const (
Kill Action = iota + 1
Errno
Trap
Allow
Trace
)
// Operator is a comparison operator to be used when matching syscall arguments in Seccomp
type Operator int
const (
EqualTo Operator = iota + 1
NotEqualTo
GreaterThan
GreaterThanOrEqualTo
LessThan
LessThanOrEqualTo
MaskEqualTo
)
// Arg is a rule to match a specific syscall argument in Seccomp
type Arg struct {
Index uint `json:"index"`
Value uint64 `json:"value"`
ValueTwo uint64 `json:"value_two"`
Op Operator `json:"op"`
}
// Syscall is a rule to match a syscall in Seccomp
type Syscall struct {
Name string `json:"name"`
Action Action `json:"action"`
Args []*Arg `json:"args"`
}
// TODO Windows. Many of these fields should be factored out into those parts
// which are common across platforms, and those which are platform specific.
// Config defines configuration options for executing a process inside a contained environment.
type Config struct {
// NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs
// This is a common option when the container is running in ramdisk
NoPivotRoot bool `json:"no_pivot_root"`
// ParentDeathSignal specifies the signal that is sent to the container's process in the case
// that the parent process dies.
ParentDeathSignal int `json:"parent_death_signal"`
// Path to a directory containing the container's root filesystem.
Rootfs string `json:"rootfs"`
// Readonlyfs will remount the container's rootfs as readonly where only externally mounted
// bind mounts are writtable.
Readonlyfs bool `json:"readonlyfs"`
// Specifies the mount propagation flags to be applied to /.
RootPropagation int `json:"rootPropagation"`
// Mounts specify additional source and destination paths that will be mounted inside the container's
// rootfs and mount namespace if specified
Mounts []*Mount `json:"mounts"`
// The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well!
Devices []*Device `json:"devices"`
MountLabel string `json:"mount_label"`
// Hostname optionally sets the container's hostname if provided
Hostname string `json:"hostname"`
// Namespaces specifies the container's namespaces that it should setup when cloning the init process
// If a namespace is not provided that namespace is shared from the container's parent process
Namespaces Namespaces `json:"namespaces"`
// Capabilities specify the capabilities to keep when executing the process inside the container
// All capabilities not specified will be dropped from the processes capability mask
Capabilities *Capabilities `json:"capabilities"`
// Networks specifies the container's network setup to be created
Networks []*Network `json:"networks"`
// Routes can be specified to create entries in the route table as the container is started
Routes []*Route `json:"routes"`
// Cgroups specifies specific cgroup settings for the various subsystems that the container is
// placed into to limit the resources the container has available
Cgroups *Cgroup `json:"cgroups"`
// AppArmorProfile specifies the profile to apply to the process running in the container and is
// change at the time the process is execed
AppArmorProfile string `json:"apparmor_profile,omitempty"`
// ProcessLabel specifies the label to apply to the process running in the container. It is
// commonly used by selinux
ProcessLabel string `json:"process_label,omitempty"`
// Rlimits specifies the resource limits, such as max open files, to set in the container
// If Rlimits are not set, the container will inherit rlimits from the parent process
Rlimits []Rlimit `json:"rlimits,omitempty"`
// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
// for a process. Valid values are between the range [-1000, '1000'], where processes with
// higher scores are preferred for being killed. If it is unset then we don't touch the current
// value.
// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
OomScoreAdj *int `json:"oom_score_adj,omitempty"`
// UidMappings is an array of User ID mappings for User Namespaces
UidMappings []IDMap `json:"uid_mappings"`
// GidMappings is an array of Group ID mappings for User Namespaces
GidMappings []IDMap `json:"gid_mappings"`
// MaskPaths specifies paths within the container's rootfs to mask over with a bind
// mount pointing to /dev/null as to prevent reads of the file.
MaskPaths []string `json:"mask_paths"`
// ReadonlyPaths specifies paths within the container's rootfs to remount as read-only
// so that these files prevent any writes.
ReadonlyPaths []string `json:"readonly_paths"`
// Sysctl is a map of properties and their values. It is the equivalent of using
// sysctl -w my.property.name value in Linux.
Sysctl map[string]string `json:"sysctl"`
// Seccomp allows actions to be taken whenever a syscall is made within the container.
// A number of rules are given, each having an action to be taken if a syscall matches it.
// A default action to be taken if no rules match is also given.
Seccomp *Seccomp `json:"seccomp"`
// NoNewPrivileges controls whether processes in the container can gain additional privileges.
NoNewPrivileges bool `json:"no_new_privileges,omitempty"`
// Hooks are a collection of actions to perform at various container lifecycle events.
// CommandHooks are serialized to JSON, but other hooks are not.
Hooks *Hooks
// Version is the version of opencontainer specification that is supported.
Version string `json:"version"`
// Labels are user defined metadata that is stored in the config and populated on the state
Labels []string `json:"labels"`
// NoNewKeyring will not allocated a new session keyring for the container. It will use the
// callers keyring in this case.
NoNewKeyring bool `json:"no_new_keyring"`
// Rootless specifies whether the container is a rootless container.
Rootless bool `json:"rootless"`
// IntelRdt specifies settings for Intel RDT/CAT group that the container is placed into
// to limit the resources (e.g., L3 cache) the container has available
IntelRdt *IntelRdt `json:"intel_rdt,omitempty"`
}
type Hooks struct {
// Prestart commands are executed after the container namespaces are created,
// but before the user supplied command is executed from init.
Prestart []Hook
// Poststart commands are executed after the container init process starts.
Poststart []Hook
// Poststop commands are executed after the container init process exits.
Poststop []Hook
}
type Capabilities struct {
// Bounding is the set of capabilities checked by the kernel.
Bounding []string
// Effective is the set of capabilities checked by the kernel.
Effective []string
// Inheritable is the capabilities preserved across execve.
Inheritable []string
// Permitted is the limiting superset for effective capabilities.
Permitted []string
// Ambient is the ambient set of capabilities that are kept.
Ambient []string
}
func (hooks *Hooks) UnmarshalJSON(b []byte) error {
var state struct {
Prestart []CommandHook
Poststart []CommandHook
Poststop []CommandHook
}
if err := json.Unmarshal(b, &state); err != nil {
return err
}
deserialize := func(shooks []CommandHook) (hooks []Hook) {
for _, shook := range shooks {
hooks = append(hooks, shook)
}
return hooks
}
hooks.Prestart = deserialize(state.Prestart)
hooks.Poststart = deserialize(state.Poststart)
hooks.Poststop = deserialize(state.Poststop)
return nil
}
func (hooks Hooks) MarshalJSON() ([]byte, error) {
serialize := func(hooks []Hook) (serializableHooks []CommandHook) {
for _, hook := range hooks {
switch chook := hook.(type) {
case CommandHook:
serializableHooks = append(serializableHooks, chook)
default:
logrus.Warnf("cannot serialize hook of type %T, skipping", hook)
}
}
return serializableHooks
}
return json.Marshal(map[string]interface{}{
"prestart": serialize(hooks.Prestart),
"poststart": serialize(hooks.Poststart),
"poststop": serialize(hooks.Poststop),
})
}
// HookState is the payload provided to a hook on execution.
type HookState specs.State
type Hook interface {
// Run executes the hook with the provided state.
Run(HookState) error
}
// NewFunctionHook will call the provided function when the hook is run.
func NewFunctionHook(f func(HookState) error) FuncHook {
return FuncHook{
run: f,
}
}
type FuncHook struct {
run func(HookState) error
}
func (f FuncHook) Run(s HookState) error {
return f.run(s)
}
type Command struct {
Path string `json:"path"`
Args []string `json:"args"`
Env []string `json:"env"`
Dir string `json:"dir"`
Timeout *time.Duration `json:"timeout"`
}
// NewCommandHook will execute the provided command when the hook is run.
func NewCommandHook(cmd Command) CommandHook {
return CommandHook{
Command: cmd,
}
}
type CommandHook struct {
Command
}
func (c Command) Run(s HookState) error {
b, err := json.Marshal(s)
if err != nil {
return err
}
var stdout, stderr bytes.Buffer
cmd := exec.Cmd{
Path: c.Path,
Args: c.Args,
Env: c.Env,
Stdin: bytes.NewReader(b),
Stdout: &stdout,
Stderr: &stderr,
}
if err := cmd.Start(); err != nil {
return err
}
errC := make(chan error, 1)
go func() {
err := cmd.Wait()
if err != nil {
err = fmt.Errorf("error running hook: %v, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
}
errC <- err
}()
var timerCh <-chan time.Time
if c.Timeout != nil {
timer := time.NewTimer(*c.Timeout)
defer timer.Stop()
timerCh = timer.C
}
select {
case err := <-errC:
return err
case <-timerCh:
cmd.Process.Kill()
cmd.Wait()
return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
}
}

View File

@ -1,61 +0,0 @@
package configs
import "fmt"
// HostUID gets the translated uid for the process on host which could be
// different when user namespaces are enabled.
func (c Config) HostUID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if c.UidMappings == nil {
return -1, fmt.Errorf("User namespaces enabled, but no uid mappings found.")
}
id, found := c.hostIDFromMapping(containerId, c.UidMappings)
if !found {
return -1, fmt.Errorf("User namespaces enabled, but no user mapping found.")
}
return id, nil
}
// Return unchanged id.
return containerId, nil
}
// HostRootUID gets the root uid for the process on host which could be non-zero
// when user namespaces are enabled.
func (c Config) HostRootUID() (int, error) {
return c.HostUID(0)
}
// HostGID gets the translated gid for the process on host which could be
// different when user namespaces are enabled.
func (c Config) HostGID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if c.GidMappings == nil {
return -1, fmt.Errorf("User namespaces enabled, but no gid mappings found.")
}
id, found := c.hostIDFromMapping(containerId, c.GidMappings)
if !found {
return -1, fmt.Errorf("User namespaces enabled, but no group mapping found.")
}
return id, nil
}
// Return unchanged id.
return containerId, nil
}
// HostRootGID gets the root gid for the process on host which could be non-zero
// when user namespaces are enabled.
func (c Config) HostRootGID() (int, error) {
return c.HostGID(0)
}
// Utility function that gets a host ID for a container ID from user namespace map
// if that ID is present in the map.
func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) {
for _, m := range uMap {
if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) {
hostID := m.HostID + (containerID - m.ContainerID)
return hostID, true
}
}
return -1, false
}

View File

@ -1,57 +0,0 @@
package configs
import (
"fmt"
"os"
)
const (
Wildcard = -1
)
// TODO Windows: This can be factored out in the future
type Device struct {
// Device type, block, char, etc.
Type rune `json:"type"`
// Path to the device.
Path string `json:"path"`
// Major is the device's major number.
Major int64 `json:"major"`
// Minor is the device's minor number.
Minor int64 `json:"minor"`
// Cgroup permissions format, rwm.
Permissions string `json:"permissions"`
// FileMode permission bits for the device.
FileMode os.FileMode `json:"file_mode"`
// Uid of the device.
Uid uint32 `json:"uid"`
// Gid of the device.
Gid uint32 `json:"gid"`
// Write the file to the allowed list
Allow bool `json:"allow"`
}
func (d *Device) CgroupString() string {
return fmt.Sprintf("%c %s:%s %s", d.Type, deviceNumberString(d.Major), deviceNumberString(d.Minor), d.Permissions)
}
func (d *Device) Mkdev() int {
return int((d.Major << 8) | (d.Minor & 0xff) | ((d.Minor & 0xfff00) << 12))
}
// deviceNumberString converts the device number to a string return result.
func deviceNumberString(number int64) string {
if number == Wildcard {
return "*"
}
return fmt.Sprint(number)
}

View File

@ -1,111 +0,0 @@
// +build linux
package configs
var (
// DefaultSimpleDevices are devices that are to be both allowed and created.
DefaultSimpleDevices = []*Device{
// /dev/null and zero
{
Path: "/dev/null",
Type: 'c',
Major: 1,
Minor: 3,
Permissions: "rwm",
FileMode: 0666,
},
{
Path: "/dev/zero",
Type: 'c',
Major: 1,
Minor: 5,
Permissions: "rwm",
FileMode: 0666,
},
{
Path: "/dev/full",
Type: 'c',
Major: 1,
Minor: 7,
Permissions: "rwm",
FileMode: 0666,
},
// consoles and ttys
{
Path: "/dev/tty",
Type: 'c',
Major: 5,
Minor: 0,
Permissions: "rwm",
FileMode: 0666,
},
// /dev/urandom,/dev/random
{
Path: "/dev/urandom",
Type: 'c',
Major: 1,
Minor: 9,
Permissions: "rwm",
FileMode: 0666,
},
{
Path: "/dev/random",
Type: 'c',
Major: 1,
Minor: 8,
Permissions: "rwm",
FileMode: 0666,
},
}
DefaultAllowedDevices = append([]*Device{
// allow mknod for any device
{
Type: 'c',
Major: Wildcard,
Minor: Wildcard,
Permissions: "m",
},
{
Type: 'b',
Major: Wildcard,
Minor: Wildcard,
Permissions: "m",
},
{
Path: "/dev/console",
Type: 'c',
Major: 5,
Minor: 1,
Permissions: "rwm",
},
// /dev/pts/ - pts namespaces are "coming soon"
{
Path: "",
Type: 'c',
Major: 136,
Minor: Wildcard,
Permissions: "rwm",
},
{
Path: "",
Type: 'c',
Major: 5,
Minor: 2,
Permissions: "rwm",
},
// tuntap
{
Path: "",
Type: 'c',
Major: 10,
Minor: 200,
Permissions: "rwm",
},
}, DefaultSimpleDevices...)
DefaultAutoCreatedDevices = append([]*Device{}, DefaultSimpleDevices...)
)

View File

@ -1,9 +0,0 @@
package configs
type HugepageLimit struct {
// which type of hugepage to limit.
Pagesize string `json:"page_size"`
// usage limit for hugepage.
Limit uint64 `json:"limit"`
}

View File

@ -1,7 +0,0 @@
package configs
type IntelRdt struct {
// The schema for L3 cache id and capacity bitmask (CBM)
// Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
L3CacheSchema string `json:"l3_cache_schema,omitempty"`
}

View File

@ -1,14 +0,0 @@
package configs
import (
"fmt"
)
type IfPrioMap struct {
Interface string `json:"interface"`
Priority int64 `json:"priority"`
}
func (i *IfPrioMap) CgroupString() string {
return fmt.Sprintf("%s %d", i.Interface, i.Priority)
}

View File

@ -1,39 +0,0 @@
package configs
const (
// EXT_COPYUP is a directive to copy up the contents of a directory when
// a tmpfs is mounted over it.
EXT_COPYUP = 1 << iota
)
type Mount struct {
// Source path for the mount.
Source string `json:"source"`
// Destination path for the mount inside the container.
Destination string `json:"destination"`
// Device the mount is for.
Device string `json:"device"`
// Mount flags.
Flags int `json:"flags"`
// Propagation Flags
PropagationFlags []int `json:"propagation_flags"`
// Mount data applied to the mount.
Data string `json:"data"`
// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
Relabel string `json:"relabel"`
// Extensions are additional flags that are specific to runc.
Extensions int `json:"extensions"`
// Optional Command to be run before Source is mounted.
PremountCmds []Command `json:"premount_cmds"`
// Optional Command to be run after Source is mounted.
PostmountCmds []Command `json:"postmount_cmds"`
}

View File

@ -1,5 +0,0 @@
package configs
type NamespaceType string
type Namespaces []Namespace

View File

@ -1,122 +0,0 @@
package configs
import (
"fmt"
"os"
"sync"
)
const (
NEWNET NamespaceType = "NEWNET"
NEWPID NamespaceType = "NEWPID"
NEWNS NamespaceType = "NEWNS"
NEWUTS NamespaceType = "NEWUTS"
NEWIPC NamespaceType = "NEWIPC"
NEWUSER NamespaceType = "NEWUSER"
)
var (
nsLock sync.Mutex
supportedNamespaces = make(map[NamespaceType]bool)
)
// NsName converts the namespace type to its filename
func NsName(ns NamespaceType) string {
switch ns {
case NEWNET:
return "net"
case NEWNS:
return "mnt"
case NEWPID:
return "pid"
case NEWIPC:
return "ipc"
case NEWUSER:
return "user"
case NEWUTS:
return "uts"
}
return ""
}
// IsNamespaceSupported returns whether a namespace is available or
// not
func IsNamespaceSupported(ns NamespaceType) bool {
nsLock.Lock()
defer nsLock.Unlock()
supported, ok := supportedNamespaces[ns]
if ok {
return supported
}
nsFile := NsName(ns)
// if the namespace type is unknown, just return false
if nsFile == "" {
return false
}
_, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile))
// a namespace is supported if it exists and we have permissions to read it
supported = err == nil
supportedNamespaces[ns] = supported
return supported
}
func NamespaceTypes() []NamespaceType {
return []NamespaceType{
NEWUSER, // Keep user NS always first, don't move it.
NEWIPC,
NEWUTS,
NEWNET,
NEWPID,
NEWNS,
}
}
// Namespace defines configuration for each namespace. It specifies an
// alternate path that is able to be joined via setns.
type Namespace struct {
Type NamespaceType `json:"type"`
Path string `json:"path"`
}
func (n *Namespace) GetPath(pid int) string {
return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type))
}
func (n *Namespaces) Remove(t NamespaceType) bool {
i := n.index(t)
if i == -1 {
return false
}
*n = append((*n)[:i], (*n)[i+1:]...)
return true
}
func (n *Namespaces) Add(t NamespaceType, path string) {
i := n.index(t)
if i == -1 {
*n = append(*n, Namespace{Type: t, Path: path})
return
}
(*n)[i].Path = path
}
func (n *Namespaces) index(t NamespaceType) int {
for i, ns := range *n {
if ns.Type == t {
return i
}
}
return -1
}
func (n *Namespaces) Contains(t NamespaceType) bool {
return n.index(t) != -1
}
func (n *Namespaces) PathOf(t NamespaceType) string {
i := n.index(t)
if i == -1 {
return ""
}
return (*n)[i].Path
}

View File

@ -1,31 +0,0 @@
// +build linux
package configs
import "golang.org/x/sys/unix"
func (n *Namespace) Syscall() int {
return namespaceInfo[n.Type]
}
var namespaceInfo = map[NamespaceType]int{
NEWNET: unix.CLONE_NEWNET,
NEWNS: unix.CLONE_NEWNS,
NEWUSER: unix.CLONE_NEWUSER,
NEWIPC: unix.CLONE_NEWIPC,
NEWUTS: unix.CLONE_NEWUTS,
NEWPID: unix.CLONE_NEWPID,
}
// CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. This function returns flags only for new namespaces.
func (n *Namespaces) CloneFlags() uintptr {
var flag int
for _, v := range *n {
if v.Path != "" {
continue
}
flag |= namespaceInfo[v.Type]
}
return uintptr(flag)
}

View File

@ -1,13 +0,0 @@
// +build !linux,!windows
package configs
func (n *Namespace) Syscall() int {
panic("No namespace syscall support")
}
// CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. This function returns flags only for new namespaces.
func (n *Namespaces) CloneFlags() uintptr {
panic("No namespace syscall support")
}

View File

@ -1,8 +0,0 @@
// +build !linux
package configs
// Namespace defines configuration for each namespace. It specifies an
// alternate path that is able to be joined via setns.
type Namespace struct {
}

View File

@ -1,72 +0,0 @@
package configs
// Network defines configuration for a container's networking stack
//
// The network configuration can be omitted from a container causing the
// container to be setup with the host's networking stack
type Network struct {
// Type sets the networks type, commonly veth and loopback
Type string `json:"type"`
// Name of the network interface
Name string `json:"name"`
// The bridge to use.
Bridge string `json:"bridge"`
// MacAddress contains the MAC address to set on the network interface
MacAddress string `json:"mac_address"`
// Address contains the IPv4 and mask to set on the network interface
Address string `json:"address"`
// Gateway sets the gateway address that is used as the default for the interface
Gateway string `json:"gateway"`
// IPv6Address contains the IPv6 and mask to set on the network interface
IPv6Address string `json:"ipv6_address"`
// IPv6Gateway sets the ipv6 gateway address that is used as the default for the interface
IPv6Gateway string `json:"ipv6_gateway"`
// Mtu sets the mtu value for the interface and will be mirrored on both the host and
// container's interfaces if a pair is created, specifically in the case of type veth
// Note: This does not apply to loopback interfaces.
Mtu int `json:"mtu"`
// TxQueueLen sets the tx_queuelen value for the interface and will be mirrored on both the host and
// container's interfaces if a pair is created, specifically in the case of type veth
// Note: This does not apply to loopback interfaces.
TxQueueLen int `json:"txqueuelen"`
// HostInterfaceName is a unique name of a veth pair that resides on in the host interface of the
// container.
HostInterfaceName string `json:"host_interface_name"`
// HairpinMode specifies if hairpin NAT should be enabled on the virtual interface
// bridge port in the case of type veth
// Note: This is unsupported on some systems.
// Note: This does not apply to loopback interfaces.
HairpinMode bool `json:"hairpin_mode"`
}
// Routes can be specified to create entries in the route table as the container is started
//
// All of destination, source, and gateway should be either IPv4 or IPv6.
// One of the three options must be present, and omitted entries will use their
// IP family default for the route table. For IPv4 for example, setting the
// gateway to 1.2.3.4 and the interface to eth0 will set up a standard
// destination of 0.0.0.0(or *) when viewed in the route table.
type Route struct {
// Sets the destination and mask, should be a CIDR. Accepts IPv4 and IPv6
Destination string `json:"destination"`
// Sets the source and mask, should be a CIDR. Accepts IPv4 and IPv6
Source string `json:"source"`
// Sets the gateway. Accepts IPv4 and IPv6
Gateway string `json:"gateway"`
// The device to set this route up for, for example: eth0
InterfaceName string `json:"interface_name"`
}

View File

@ -82,7 +82,7 @@ struct nlconfig_t {
uint8_t is_setgroup;
/* Rootless container settings. */
uint8_t is_rootless;
uint8_t is_rootless_euid; /* boolean */
char *uidmappath;
size_t uidmappath_len;
char *gidmappath;
@ -100,7 +100,7 @@ struct nlconfig_t {
#define GIDMAP_ATTR 27284
#define SETGROUP_ATTR 27285
#define OOM_SCORE_ADJ_ATTR 27286
#define ROOTLESS_ATTR 27287
#define ROOTLESS_EUID_ATTR 27287
#define UIDMAPPATH_ATTR 27288
#define GIDMAPPATH_ATTR 27289
@ -419,8 +419,8 @@ static void nl_parse(int fd, struct nlconfig_t *config)
case CLONE_FLAGS_ATTR:
config->cloneflags = readint32(current);
break;
case ROOTLESS_ATTR:
config->is_rootless = readint8(current);
case ROOTLESS_EUID_ATTR:
config->is_rootless_euid = readint8(current); /* boolean */
break;
case OOM_SCORE_ADJ_ATTR:
config->oom_score_adj = current;
@ -687,7 +687,7 @@ void nsexec(void)
* newuidmap/newgidmap shall be used.
*/
if (config.is_rootless && !config.is_setgroup)
if (config.is_rootless_euid && !config.is_setgroup)
update_setgroups(child, SETGROUPS_DENY);
/* Set up mappings. */
@ -953,7 +953,7 @@ void nsexec(void)
if (setgid(0) < 0)
bail("setgid failed");
if (!config.is_rootless && config.is_setgroup) {
if (!config.is_rootless_euid && config.is_setgroup) {
if (setgroups(0, NULL) < 0)
bail("setgroups failed");
}

View File

@ -1,76 +0,0 @@
package seccomp
import (
"fmt"
"github.com/opencontainers/runc/libcontainer/configs"
)
var operators = map[string]configs.Operator{
"SCMP_CMP_NE": configs.NotEqualTo,
"SCMP_CMP_LT": configs.LessThan,
"SCMP_CMP_LE": configs.LessThanOrEqualTo,
"SCMP_CMP_EQ": configs.EqualTo,
"SCMP_CMP_GE": configs.GreaterThanOrEqualTo,
"SCMP_CMP_GT": configs.GreaterThan,
"SCMP_CMP_MASKED_EQ": configs.MaskEqualTo,
}
var actions = map[string]configs.Action{
"SCMP_ACT_KILL": configs.Kill,
"SCMP_ACT_ERRNO": configs.Errno,
"SCMP_ACT_TRAP": configs.Trap,
"SCMP_ACT_ALLOW": configs.Allow,
"SCMP_ACT_TRACE": configs.Trace,
}
var archs = map[string]string{
"SCMP_ARCH_X86": "x86",
"SCMP_ARCH_X86_64": "amd64",
"SCMP_ARCH_X32": "x32",
"SCMP_ARCH_ARM": "arm",
"SCMP_ARCH_AARCH64": "arm64",
"SCMP_ARCH_MIPS": "mips",
"SCMP_ARCH_MIPS64": "mips64",
"SCMP_ARCH_MIPS64N32": "mips64n32",
"SCMP_ARCH_MIPSEL": "mipsel",
"SCMP_ARCH_MIPSEL64": "mipsel64",
"SCMP_ARCH_MIPSEL64N32": "mipsel64n32",
"SCMP_ARCH_PPC": "ppc",
"SCMP_ARCH_PPC64": "ppc64",
"SCMP_ARCH_PPC64LE": "ppc64le",
"SCMP_ARCH_S390": "s390",
"SCMP_ARCH_S390X": "s390x",
}
// ConvertStringToOperator converts a string into a Seccomp comparison operator.
// Comparison operators use the names they are assigned by Libseccomp's header.
// Attempting to convert a string that is not a valid operator results in an
// error.
func ConvertStringToOperator(in string) (configs.Operator, error) {
if op, ok := operators[in]; ok == true {
return op, nil
}
return 0, fmt.Errorf("string %s is not a valid operator for seccomp", in)
}
// ConvertStringToAction converts a string into a Seccomp rule match action.
// Actions use the names they are assigned in Libseccomp's header, though some
// (notable, SCMP_ACT_TRACE) are not available in this implementation and will
// return errors.
// Attempting to convert a string that is not a valid action results in an
// error.
func ConvertStringToAction(in string) (configs.Action, error) {
if act, ok := actions[in]; ok == true {
return act, nil
}
return 0, fmt.Errorf("string %s is not a valid action for seccomp", in)
}
// ConvertStringToArch converts a string into a Seccomp comparison arch.
func ConvertStringToArch(in string) (string, error) {
if arch, ok := archs[in]; ok == true {
return arch, nil
}
return "", fmt.Errorf("string %s is not a valid arch for seccomp", in)
}

View File

@ -1,258 +0,0 @@
// +build linux,cgo,seccomp
package seccomp
import (
"bufio"
"fmt"
"os"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
libseccomp "github.com/seccomp/libseccomp-golang"
"golang.org/x/sys/unix"
)
var (
actAllow = libseccomp.ActAllow
actTrap = libseccomp.ActTrap
actKill = libseccomp.ActKill
actTrace = libseccomp.ActTrace.SetReturnCode(int16(unix.EPERM))
actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM))
)
const (
// Linux system calls can have at most 6 arguments
syscallMaxArguments int = 6
)
// Filters given syscalls in a container, preventing them from being used
// Started in the container init process, and carried over to all child processes
// Setns calls, however, require a separate invocation, as they are not children
// of the init until they join the namespace
func InitSeccomp(config *configs.Seccomp) error {
if config == nil {
return fmt.Errorf("cannot initialize Seccomp - nil config passed")
}
defaultAction, err := getAction(config.DefaultAction)
if err != nil {
return fmt.Errorf("error initializing seccomp - invalid default action")
}
filter, err := libseccomp.NewFilter(defaultAction)
if err != nil {
return fmt.Errorf("error creating filter: %s", err)
}
// Add extra architectures
for _, arch := range config.Architectures {
scmpArch, err := libseccomp.GetArchFromString(arch)
if err != nil {
return fmt.Errorf("error validating Seccomp architecture: %s", err)
}
if err := filter.AddArch(scmpArch); err != nil {
return fmt.Errorf("error adding architecture to seccomp filter: %s", err)
}
}
// Unset no new privs bit
if err := filter.SetNoNewPrivsBit(false); err != nil {
return fmt.Errorf("error setting no new privileges: %s", err)
}
// Add a rule for each syscall
for _, call := range config.Syscalls {
if call == nil {
return fmt.Errorf("encountered nil syscall while initializing Seccomp")
}
if err = matchCall(filter, call); err != nil {
return err
}
}
if err = filter.Load(); err != nil {
return fmt.Errorf("error loading seccomp filter into kernel: %s", err)
}
return nil
}
// IsEnabled returns if the kernel has been configured to support seccomp.
func IsEnabled() bool {
// Try to read from /proc/self/status for kernels > 3.8
s, err := parseStatusFile("/proc/self/status")
if err != nil {
// Check if Seccomp is supported, via CONFIG_SECCOMP.
if err := unix.Prctl(unix.PR_GET_SECCOMP, 0, 0, 0, 0); err != unix.EINVAL {
// Make sure the kernel has CONFIG_SECCOMP_FILTER.
if err := unix.Prctl(unix.PR_SET_SECCOMP, unix.SECCOMP_MODE_FILTER, 0, 0, 0); err != unix.EINVAL {
return true
}
}
return false
}
_, ok := s["Seccomp"]
return ok
}
// Convert Libcontainer Action to Libseccomp ScmpAction
func getAction(act configs.Action) (libseccomp.ScmpAction, error) {
switch act {
case configs.Kill:
return actKill, nil
case configs.Errno:
return actErrno, nil
case configs.Trap:
return actTrap, nil
case configs.Allow:
return actAllow, nil
case configs.Trace:
return actTrace, nil
default:
return libseccomp.ActInvalid, fmt.Errorf("invalid action, cannot use in rule")
}
}
// Convert Libcontainer Operator to Libseccomp ScmpCompareOp
func getOperator(op configs.Operator) (libseccomp.ScmpCompareOp, error) {
switch op {
case configs.EqualTo:
return libseccomp.CompareEqual, nil
case configs.NotEqualTo:
return libseccomp.CompareNotEqual, nil
case configs.GreaterThan:
return libseccomp.CompareGreater, nil
case configs.GreaterThanOrEqualTo:
return libseccomp.CompareGreaterEqual, nil
case configs.LessThan:
return libseccomp.CompareLess, nil
case configs.LessThanOrEqualTo:
return libseccomp.CompareLessOrEqual, nil
case configs.MaskEqualTo:
return libseccomp.CompareMaskedEqual, nil
default:
return libseccomp.CompareInvalid, fmt.Errorf("invalid operator, cannot use in rule")
}
}
// Convert Libcontainer Arg to Libseccomp ScmpCondition
func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) {
cond := libseccomp.ScmpCondition{}
if arg == nil {
return cond, fmt.Errorf("cannot convert nil to syscall condition")
}
op, err := getOperator(arg.Op)
if err != nil {
return cond, err
}
return libseccomp.MakeCondition(arg.Index, op, arg.Value, arg.ValueTwo)
}
// Add a rule to match a single syscall
func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error {
if call == nil || filter == nil {
return fmt.Errorf("cannot use nil as syscall to block")
}
if len(call.Name) == 0 {
return fmt.Errorf("empty string is not a valid syscall")
}
// If we can't resolve the syscall, assume it's not supported on this kernel
// Ignore it, don't error out
callNum, err := libseccomp.GetSyscallFromName(call.Name)
if err != nil {
return nil
}
// Convert the call's action to the libseccomp equivalent
callAct, err := getAction(call.Action)
if err != nil {
return fmt.Errorf("action in seccomp profile is invalid: %s", err)
}
// Unconditional match - just add the rule
if len(call.Args) == 0 {
if err = filter.AddRule(callNum, callAct); err != nil {
return fmt.Errorf("error adding seccomp filter rule for syscall %s: %s", call.Name, err)
}
} else {
// If two or more arguments have the same condition,
// Revert to old behavior, adding each condition as a separate rule
argCounts := make([]uint, syscallMaxArguments)
conditions := []libseccomp.ScmpCondition{}
for _, cond := range call.Args {
newCond, err := getCondition(cond)
if err != nil {
return fmt.Errorf("error creating seccomp syscall condition for syscall %s: %s", call.Name, err)
}
argCounts[cond.Index] += 1
conditions = append(conditions, newCond)
}
hasMultipleArgs := false
for _, count := range argCounts {
if count > 1 {
hasMultipleArgs = true
break
}
}
if hasMultipleArgs {
// Revert to old behavior
// Add each condition attached to a separate rule
for _, cond := range conditions {
condArr := []libseccomp.ScmpCondition{cond}
if err = filter.AddRuleConditional(callNum, callAct, condArr); err != nil {
return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err)
}
}
} else {
// No conditions share same argument
// Use new, proper behavior
if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil {
return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err)
}
}
}
return nil
}
func parseStatusFile(path string) (map[string]string, error) {
f, err := os.Open(path)
if err != nil {
return nil, err
}
defer f.Close()
s := bufio.NewScanner(f)
status := make(map[string]string)
for s.Scan() {
text := s.Text()
parts := strings.Split(text, ":")
if len(parts) <= 1 {
continue
}
status[parts[0]] = parts[1]
}
if err := s.Err(); err != nil {
return nil, err
}
return status, nil
}

View File

@ -1,24 +0,0 @@
// +build !linux !cgo !seccomp
package seccomp
import (
"errors"
"github.com/opencontainers/runc/libcontainer/configs"
)
var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")
// InitSeccomp does nothing because seccomp is not supported.
func InitSeccomp(config *configs.Seccomp) error {
if config != nil {
return ErrSeccompNotEnabled
}
return nil
}
// IsEnabled returns false, because it is not supported.
func IsEnabled() bool {
return false
}

View File

@ -1,221 +0,0 @@
package specconv
import (
"os"
"strings"
"github.com/opencontainers/runtime-spec/specs-go"
)
// Example returns an example spec file, with many options set so a user can
// see what a standard spec file looks like.
func Example() *specs.Spec {
return &specs.Spec{
Version: specs.Version,
Root: &specs.Root{
Path: "rootfs",
Readonly: true,
},
Process: &specs.Process{
Terminal: true,
User: specs.User{},
Args: []string{
"sh",
},
Env: []string{
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"TERM=xterm",
},
Cwd: "/",
NoNewPrivileges: true,
Capabilities: &specs.LinuxCapabilities{
Bounding: []string{
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE",
},
Permitted: []string{
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE",
},
Inheritable: []string{
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE",
},
Ambient: []string{
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE",
},
Effective: []string{
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE",
},
},
Rlimits: []specs.POSIXRlimit{
{
Type: "RLIMIT_NOFILE",
Hard: uint64(1024),
Soft: uint64(1024),
},
},
},
Hostname: "runc",
Mounts: []specs.Mount{
{
Destination: "/proc",
Type: "proc",
Source: "proc",
Options: nil,
},
{
Destination: "/dev",
Type: "tmpfs",
Source: "tmpfs",
Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"},
},
{
Destination: "/dev/pts",
Type: "devpts",
Source: "devpts",
Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"},
},
{
Destination: "/dev/shm",
Type: "tmpfs",
Source: "shm",
Options: []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"},
},
{
Destination: "/dev/mqueue",
Type: "mqueue",
Source: "mqueue",
Options: []string{"nosuid", "noexec", "nodev"},
},
{
Destination: "/sys",
Type: "sysfs",
Source: "sysfs",
Options: []string{"nosuid", "noexec", "nodev", "ro"},
},
{
Destination: "/sys/fs/cgroup",
Type: "cgroup",
Source: "cgroup",
Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
},
},
Linux: &specs.Linux{
MaskedPaths: []string{
"/proc/kcore",
"/proc/latency_stats",
"/proc/timer_list",
"/proc/timer_stats",
"/proc/sched_debug",
"/sys/firmware",
"/proc/scsi",
},
ReadonlyPaths: []string{
"/proc/asound",
"/proc/bus",
"/proc/fs",
"/proc/irq",
"/proc/sys",
"/proc/sysrq-trigger",
},
Resources: &specs.LinuxResources{
Devices: []specs.LinuxDeviceCgroup{
{
Allow: false,
Access: "rwm",
},
},
},
Namespaces: []specs.LinuxNamespace{
{
Type: "pid",
},
{
Type: "network",
},
{
Type: "ipc",
},
{
Type: "uts",
},
{
Type: "mount",
},
},
},
}
}
// ToRootless converts the given spec file into one that should work with
// rootless containers, by removing incompatible options and adding others that
// are needed.
func ToRootless(spec *specs.Spec) {
var namespaces []specs.LinuxNamespace
// Remove networkns from the spec.
for _, ns := range spec.Linux.Namespaces {
switch ns.Type {
case specs.NetworkNamespace, specs.UserNamespace:
// Do nothing.
default:
namespaces = append(namespaces, ns)
}
}
// Add userns to the spec.
namespaces = append(namespaces, specs.LinuxNamespace{
Type: specs.UserNamespace,
})
spec.Linux.Namespaces = namespaces
// Add mappings for the current user.
spec.Linux.UIDMappings = []specs.LinuxIDMapping{{
HostID: uint32(os.Geteuid()),
ContainerID: 0,
Size: 1,
}}
spec.Linux.GIDMappings = []specs.LinuxIDMapping{{
HostID: uint32(os.Getegid()),
ContainerID: 0,
Size: 1,
}}
// Fix up mounts.
var mounts []specs.Mount
for _, mount := range spec.Mounts {
// Ignore all mounts that are under /sys.
if strings.HasPrefix(mount.Destination, "/sys") {
continue
}
// Remove all gid= and uid= mappings.
var options []string
for _, option := range mount.Options {
if !strings.HasPrefix(option, "gid=") && !strings.HasPrefix(option, "uid=") {
options = append(options, option)
}
}
mount.Options = options
mounts = append(mounts, mount)
}
// Add the sysfs mount as an rbind.
mounts = append(mounts, specs.Mount{
Source: "/sys",
Destination: "/sys",
Type: "none",
Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"},
})
spec.Mounts = mounts
// Remove cgroup settings.
spec.Linux.Resources = nil
}

View File

@ -1,836 +0,0 @@
// +build linux
// Package specconv implements conversion of specifications to libcontainer
// configurations
package specconv
import (
"fmt"
"os"
"path/filepath"
"strings"
"time"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/seccomp"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
)
const wildcard = -1
var namespaceMapping = map[specs.LinuxNamespaceType]configs.NamespaceType{
specs.PIDNamespace: configs.NEWPID,
specs.NetworkNamespace: configs.NEWNET,
specs.MountNamespace: configs.NEWNS,
specs.UserNamespace: configs.NEWUSER,
specs.IPCNamespace: configs.NEWIPC,
specs.UTSNamespace: configs.NEWUTS,
}
var mountPropagationMapping = map[string]int{
"rprivate": unix.MS_PRIVATE | unix.MS_REC,
"private": unix.MS_PRIVATE,
"rslave": unix.MS_SLAVE | unix.MS_REC,
"slave": unix.MS_SLAVE,
"rshared": unix.MS_SHARED | unix.MS_REC,
"shared": unix.MS_SHARED,
"runbindable": unix.MS_UNBINDABLE | unix.MS_REC,
"unbindable": unix.MS_UNBINDABLE,
"": 0,
}
var allowedDevices = []*configs.Device{
// allow mknod for any device
{
Type: 'c',
Major: wildcard,
Minor: wildcard,
Permissions: "m",
Allow: true,
},
{
Type: 'b',
Major: wildcard,
Minor: wildcard,
Permissions: "m",
Allow: true,
},
{
Type: 'c',
Path: "/dev/null",
Major: 1,
Minor: 3,
Permissions: "rwm",
Allow: true,
},
{
Type: 'c',
Path: "/dev/random",
Major: 1,
Minor: 8,
Permissions: "rwm",
Allow: true,
},
{
Type: 'c',
Path: "/dev/full",
Major: 1,
Minor: 7,
Permissions: "rwm",
Allow: true,
},
{
Type: 'c',
Path: "/dev/tty",
Major: 5,
Minor: 0,
Permissions: "rwm",
Allow: true,
},
{
Type: 'c',
Path: "/dev/zero",
Major: 1,
Minor: 5,
Permissions: "rwm",
Allow: true,
},
{
Type: 'c',
Path: "/dev/urandom",
Major: 1,
Minor: 9,
Permissions: "rwm",
Allow: true,
},
{
Path: "/dev/console",
Type: 'c',
Major: 5,
Minor: 1,
Permissions: "rwm",
Allow: true,
},
// /dev/pts/ - pts namespaces are "coming soon"
{
Path: "",
Type: 'c',
Major: 136,
Minor: wildcard,
Permissions: "rwm",
Allow: true,
},
{
Path: "",
Type: 'c',
Major: 5,
Minor: 2,
Permissions: "rwm",
Allow: true,
},
// tuntap
{
Path: "",
Type: 'c',
Major: 10,
Minor: 200,
Permissions: "rwm",
Allow: true,
},
}
type CreateOpts struct {
CgroupName string
UseSystemdCgroup bool
NoPivotRoot bool
NoNewKeyring bool
Spec *specs.Spec
Rootless bool
}
// CreateLibcontainerConfig creates a new libcontainer configuration from a
// given specification and a cgroup name
func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
// runc's cwd will always be the bundle path
rcwd, err := os.Getwd()
if err != nil {
return nil, err
}
cwd, err := filepath.Abs(rcwd)
if err != nil {
return nil, err
}
spec := opts.Spec
if spec.Root == nil {
return nil, fmt.Errorf("Root must be specified")
}
rootfsPath := spec.Root.Path
if !filepath.IsAbs(rootfsPath) {
rootfsPath = filepath.Join(cwd, rootfsPath)
}
labels := []string{}
for k, v := range spec.Annotations {
labels = append(labels, fmt.Sprintf("%s=%s", k, v))
}
config := &configs.Config{
Rootfs: rootfsPath,
NoPivotRoot: opts.NoPivotRoot,
Readonlyfs: spec.Root.Readonly,
Hostname: spec.Hostname,
Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)),
NoNewKeyring: opts.NoNewKeyring,
Rootless: opts.Rootless,
}
exists := false
for _, m := range spec.Mounts {
config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, m))
}
if err := createDevices(spec, config); err != nil {
return nil, err
}
c, err := createCgroupConfig(opts)
if err != nil {
return nil, err
}
config.Cgroups = c
// set linux-specific config
if spec.Linux != nil {
if config.RootPropagation, exists = mountPropagationMapping[spec.Linux.RootfsPropagation]; !exists {
return nil, fmt.Errorf("rootfsPropagation=%v is not supported", spec.Linux.RootfsPropagation)
}
if config.NoPivotRoot && (config.RootPropagation&unix.MS_PRIVATE != 0) {
return nil, fmt.Errorf("rootfsPropagation of [r]private is not safe without pivot_root")
}
for _, ns := range spec.Linux.Namespaces {
t, exists := namespaceMapping[ns.Type]
if !exists {
return nil, fmt.Errorf("namespace %q does not exist", ns)
}
if config.Namespaces.Contains(t) {
return nil, fmt.Errorf("malformed spec file: duplicated ns %q", ns)
}
config.Namespaces.Add(t, ns.Path)
}
if config.Namespaces.Contains(configs.NEWNET) && config.Namespaces.PathOf(configs.NEWNET) == "" {
config.Networks = []*configs.Network{
{
Type: "loopback",
},
}
}
if config.Namespaces.Contains(configs.NEWUSER) {
if err := setupUserNamespace(spec, config); err != nil {
return nil, err
}
}
config.MaskPaths = spec.Linux.MaskedPaths
config.ReadonlyPaths = spec.Linux.ReadonlyPaths
config.MountLabel = spec.Linux.MountLabel
config.Sysctl = spec.Linux.Sysctl
if spec.Linux.Seccomp != nil {
seccomp, err := SetupSeccomp(spec.Linux.Seccomp)
if err != nil {
return nil, err
}
config.Seccomp = seccomp
}
}
if spec.Process.SelinuxLabel != "" {
config.ProcessLabel = spec.Process.SelinuxLabel
}
if spec.Process != nil {
config.OomScoreAdj = spec.Process.OOMScoreAdj
}
if spec.Process.Capabilities != nil {
config.Capabilities = &configs.Capabilities{
Bounding: spec.Process.Capabilities.Bounding,
Effective: spec.Process.Capabilities.Effective,
Permitted: spec.Process.Capabilities.Permitted,
Inheritable: spec.Process.Capabilities.Inheritable,
Ambient: spec.Process.Capabilities.Ambient,
}
}
createHooks(spec, config)
config.Version = specs.Version
if spec.Linux.IntelRdt != nil {
config.IntelRdt = &configs.IntelRdt{}
if spec.Linux.IntelRdt.L3CacheSchema != "" {
config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema
}
}
return config, nil
}
func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount {
flags, pgflags, data, ext := parseMountOptions(m.Options)
source := m.Source
device := m.Type
if flags&unix.MS_BIND != 0 {
if device == "" {
device = "bind"
}
if !filepath.IsAbs(source) {
source = filepath.Join(cwd, m.Source)
}
}
return &configs.Mount{
Device: device,
Source: source,
Destination: m.Destination,
Data: data,
Flags: flags,
PropagationFlags: pgflags,
Extensions: ext,
}
}
func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) {
var (
myCgroupPath string
spec = opts.Spec
useSystemdCgroup = opts.UseSystemdCgroup
name = opts.CgroupName
)
c := &configs.Cgroup{
Resources: &configs.Resources{},
}
if spec.Linux != nil && spec.Linux.CgroupsPath != "" {
myCgroupPath = libcontainerUtils.CleanPath(spec.Linux.CgroupsPath)
if useSystemdCgroup {
myCgroupPath = spec.Linux.CgroupsPath
}
}
if useSystemdCgroup {
if myCgroupPath == "" {
c.Parent = "system.slice"
c.ScopePrefix = "runc"
c.Name = name
} else {
// Parse the path from expected "slice:prefix:name"
// for e.g. "system.slice:docker:1234"
parts := strings.Split(myCgroupPath, ":")
if len(parts) != 3 {
return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups")
}
c.Parent = parts[0]
c.ScopePrefix = parts[1]
c.Name = parts[2]
}
} else {
if myCgroupPath == "" {
c.Name = name
}
c.Path = myCgroupPath
}
// In rootless containers, any attempt to make cgroup changes will fail.
// libcontainer will validate this and we shouldn't add any cgroup options
// the user didn't specify.
if !opts.Rootless {
c.Resources.AllowedDevices = allowedDevices
}
if spec.Linux != nil {
r := spec.Linux.Resources
if r == nil {
return c, nil
}
for i, d := range spec.Linux.Resources.Devices {
var (
t = "a"
major = int64(-1)
minor = int64(-1)
)
if d.Type != "" {
t = d.Type
}
if d.Major != nil {
major = *d.Major
}
if d.Minor != nil {
minor = *d.Minor
}
if d.Access == "" {
return nil, fmt.Errorf("device access at %d field cannot be empty", i)
}
dt, err := stringToCgroupDeviceRune(t)
if err != nil {
return nil, err
}
dd := &configs.Device{
Type: dt,
Major: major,
Minor: minor,
Permissions: d.Access,
Allow: d.Allow,
}
c.Resources.Devices = append(c.Resources.Devices, dd)
}
if r.Memory != nil {
if r.Memory.Limit != nil {
c.Resources.Memory = *r.Memory.Limit
}
if r.Memory.Reservation != nil {
c.Resources.MemoryReservation = *r.Memory.Reservation
}
if r.Memory.Swap != nil {
c.Resources.MemorySwap = *r.Memory.Swap
}
if r.Memory.Kernel != nil {
c.Resources.KernelMemory = *r.Memory.Kernel
}
if r.Memory.KernelTCP != nil {
c.Resources.KernelMemoryTCP = *r.Memory.KernelTCP
}
if r.Memory.Swappiness != nil {
c.Resources.MemorySwappiness = r.Memory.Swappiness
}
if r.Memory.DisableOOMKiller != nil {
c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller
}
}
if r.CPU != nil {
if r.CPU.Shares != nil {
c.Resources.CpuShares = *r.CPU.Shares
}
if r.CPU.Quota != nil {
c.Resources.CpuQuota = *r.CPU.Quota
}
if r.CPU.Period != nil {
c.Resources.CpuPeriod = *r.CPU.Period
}
if r.CPU.RealtimeRuntime != nil {
c.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime
}
if r.CPU.RealtimePeriod != nil {
c.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod
}
if r.CPU.Cpus != "" {
c.Resources.CpusetCpus = r.CPU.Cpus
}
if r.CPU.Mems != "" {
c.Resources.CpusetMems = r.CPU.Mems
}
}
if r.Pids != nil {
c.Resources.PidsLimit = r.Pids.Limit
}
if r.BlockIO != nil {
if r.BlockIO.Weight != nil {
c.Resources.BlkioWeight = *r.BlockIO.Weight
}
if r.BlockIO.LeafWeight != nil {
c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight
}
if r.BlockIO.WeightDevice != nil {
for _, wd := range r.BlockIO.WeightDevice {
var weight, leafWeight uint16
if wd.Weight != nil {
weight = *wd.Weight
}
if wd.LeafWeight != nil {
leafWeight = *wd.LeafWeight
}
weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight)
c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice)
}
}
if r.BlockIO.ThrottleReadBpsDevice != nil {
for _, td := range r.BlockIO.ThrottleReadBpsDevice {
rate := td.Rate
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice)
}
}
if r.BlockIO.ThrottleWriteBpsDevice != nil {
for _, td := range r.BlockIO.ThrottleWriteBpsDevice {
rate := td.Rate
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice)
}
}
if r.BlockIO.ThrottleReadIOPSDevice != nil {
for _, td := range r.BlockIO.ThrottleReadIOPSDevice {
rate := td.Rate
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice)
}
}
if r.BlockIO.ThrottleWriteIOPSDevice != nil {
for _, td := range r.BlockIO.ThrottleWriteIOPSDevice {
rate := td.Rate
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice)
}
}
}
for _, l := range r.HugepageLimits {
c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{
Pagesize: l.Pagesize,
Limit: l.Limit,
})
}
if r.Network != nil {
if r.Network.ClassID != nil {
c.Resources.NetClsClassid = *r.Network.ClassID
}
for _, m := range r.Network.Priorities {
c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{
Interface: m.Name,
Priority: int64(m.Priority),
})
}
}
}
if !opts.Rootless {
// append the default allowed devices to the end of the list
c.Resources.Devices = append(c.Resources.Devices, allowedDevices...)
}
return c, nil
}
func stringToCgroupDeviceRune(s string) (rune, error) {
switch s {
case "a":
return 'a', nil
case "b":
return 'b', nil
case "c":
return 'c', nil
default:
return 0, fmt.Errorf("invalid cgroup device type %q", s)
}
}
func stringToDeviceRune(s string) (rune, error) {
switch s {
case "p":
return 'p', nil
case "u":
return 'u', nil
case "b":
return 'b', nil
case "c":
return 'c', nil
default:
return 0, fmt.Errorf("invalid device type %q", s)
}
}
func createDevices(spec *specs.Spec, config *configs.Config) error {
// add whitelisted devices
config.Devices = []*configs.Device{
{
Type: 'c',
Path: "/dev/null",
Major: 1,
Minor: 3,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/random",
Major: 1,
Minor: 8,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/full",
Major: 1,
Minor: 7,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/tty",
Major: 5,
Minor: 0,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/zero",
Major: 1,
Minor: 5,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/urandom",
Major: 1,
Minor: 9,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
}
// merge in additional devices from the spec
if spec.Linux != nil {
for _, d := range spec.Linux.Devices {
var uid, gid uint32
var filemode os.FileMode = 0666
if d.UID != nil {
uid = *d.UID
}
if d.GID != nil {
gid = *d.GID
}
dt, err := stringToDeviceRune(d.Type)
if err != nil {
return err
}
if d.FileMode != nil {
filemode = *d.FileMode
}
device := &configs.Device{
Type: dt,
Path: d.Path,
Major: d.Major,
Minor: d.Minor,
FileMode: filemode,
Uid: uid,
Gid: gid,
}
config.Devices = append(config.Devices, device)
}
}
return nil
}
func setupUserNamespace(spec *specs.Spec, config *configs.Config) error {
create := func(m specs.LinuxIDMapping) configs.IDMap {
return configs.IDMap{
HostID: int(m.HostID),
ContainerID: int(m.ContainerID),
Size: int(m.Size),
}
}
if spec.Linux != nil {
for _, m := range spec.Linux.UIDMappings {
config.UidMappings = append(config.UidMappings, create(m))
}
for _, m := range spec.Linux.GIDMappings {
config.GidMappings = append(config.GidMappings, create(m))
}
}
rootUID, err := config.HostRootUID()
if err != nil {
return err
}
rootGID, err := config.HostRootGID()
if err != nil {
return err
}
for _, node := range config.Devices {
node.Uid = uint32(rootUID)
node.Gid = uint32(rootGID)
}
return nil
}
// parseMountOptions parses the string and returns the flags, propagation
// flags and any mount data that it contains.
func parseMountOptions(options []string) (int, []int, string, int) {
var (
flag int
pgflag []int
data []string
extFlags int
)
flags := map[string]struct {
clear bool
flag int
}{
"acl": {false, unix.MS_POSIXACL},
"async": {true, unix.MS_SYNCHRONOUS},
"atime": {true, unix.MS_NOATIME},
"bind": {false, unix.MS_BIND},
"defaults": {false, 0},
"dev": {true, unix.MS_NODEV},
"diratime": {true, unix.MS_NODIRATIME},
"dirsync": {false, unix.MS_DIRSYNC},
"exec": {true, unix.MS_NOEXEC},
"iversion": {false, unix.MS_I_VERSION},
"lazytime": {false, unix.MS_LAZYTIME},
"loud": {true, unix.MS_SILENT},
"mand": {false, unix.MS_MANDLOCK},
"noacl": {true, unix.MS_POSIXACL},
"noatime": {false, unix.MS_NOATIME},
"nodev": {false, unix.MS_NODEV},
"nodiratime": {false, unix.MS_NODIRATIME},
"noexec": {false, unix.MS_NOEXEC},
"noiversion": {true, unix.MS_I_VERSION},
"nolazytime": {true, unix.MS_LAZYTIME},
"nomand": {true, unix.MS_MANDLOCK},
"norelatime": {true, unix.MS_RELATIME},
"nostrictatime": {true, unix.MS_STRICTATIME},
"nosuid": {false, unix.MS_NOSUID},
"rbind": {false, unix.MS_BIND | unix.MS_REC},
"relatime": {false, unix.MS_RELATIME},
"remount": {false, unix.MS_REMOUNT},
"ro": {false, unix.MS_RDONLY},
"rw": {true, unix.MS_RDONLY},
"silent": {false, unix.MS_SILENT},
"strictatime": {false, unix.MS_STRICTATIME},
"suid": {true, unix.MS_NOSUID},
"sync": {false, unix.MS_SYNCHRONOUS},
}
propagationFlags := map[string]int{
"private": unix.MS_PRIVATE,
"shared": unix.MS_SHARED,
"slave": unix.MS_SLAVE,
"unbindable": unix.MS_UNBINDABLE,
"rprivate": unix.MS_PRIVATE | unix.MS_REC,
"rshared": unix.MS_SHARED | unix.MS_REC,
"rslave": unix.MS_SLAVE | unix.MS_REC,
"runbindable": unix.MS_UNBINDABLE | unix.MS_REC,
}
extensionFlags := map[string]struct {
clear bool
flag int
}{
"tmpcopyup": {false, configs.EXT_COPYUP},
}
for _, o := range options {
// If the option does not exist in the flags table or the flag
// is not supported on the platform,
// then it is a data value for a specific fs type
if f, exists := flags[o]; exists && f.flag != 0 {
if f.clear {
flag &= ^f.flag
} else {
flag |= f.flag
}
} else if f, exists := propagationFlags[o]; exists && f != 0 {
pgflag = append(pgflag, f)
} else if f, exists := extensionFlags[o]; exists && f.flag != 0 {
if f.clear {
extFlags &= ^f.flag
} else {
extFlags |= f.flag
}
} else {
data = append(data, o)
}
}
return flag, pgflag, strings.Join(data, ","), extFlags
}
func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) {
if config == nil {
return nil, nil
}
// No default action specified, no syscalls listed, assume seccomp disabled
if config.DefaultAction == "" && len(config.Syscalls) == 0 {
return nil, nil
}
newConfig := new(configs.Seccomp)
newConfig.Syscalls = []*configs.Syscall{}
if len(config.Architectures) > 0 {
newConfig.Architectures = []string{}
for _, arch := range config.Architectures {
newArch, err := seccomp.ConvertStringToArch(string(arch))
if err != nil {
return nil, err
}
newConfig.Architectures = append(newConfig.Architectures, newArch)
}
}
// Convert default action from string representation
newDefaultAction, err := seccomp.ConvertStringToAction(string(config.DefaultAction))
if err != nil {
return nil, err
}
newConfig.DefaultAction = newDefaultAction
// Loop through all syscall blocks and convert them to libcontainer format
for _, call := range config.Syscalls {
newAction, err := seccomp.ConvertStringToAction(string(call.Action))
if err != nil {
return nil, err
}
for _, name := range call.Names {
newCall := configs.Syscall{
Name: name,
Action: newAction,
Args: []*configs.Arg{},
}
// Loop through all the arguments of the syscall and convert them
for _, arg := range call.Args {
newOp, err := seccomp.ConvertStringToOperator(string(arg.Op))
if err != nil {
return nil, err
}
newArg := configs.Arg{
Index: arg.Index,
Value: arg.Value,
ValueTwo: arg.ValueTwo,
Op: newOp,
}
newCall.Args = append(newCall.Args, &newArg)
}
newConfig.Syscalls = append(newConfig.Syscalls, &newCall)
}
}
return newConfig, nil
}
func createHooks(rspec *specs.Spec, config *configs.Config) {
config.Hooks = &configs.Hooks{}
if rspec.Hooks != nil {
for _, h := range rspec.Hooks.Prestart {
cmd := createCommandHook(h)
config.Hooks.Prestart = append(config.Hooks.Prestart, configs.NewCommandHook(cmd))
}
for _, h := range rspec.Hooks.Poststart {
cmd := createCommandHook(h)
config.Hooks.Poststart = append(config.Hooks.Poststart, configs.NewCommandHook(cmd))
}
for _, h := range rspec.Hooks.Poststop {
cmd := createCommandHook(h)
config.Hooks.Poststop = append(config.Hooks.Poststop, configs.NewCommandHook(cmd))
}
}
}
func createCommandHook(h specs.Hook) configs.Command {
cmd := configs.Command{
Path: h.Path,
Args: h.Args,
Env: h.Env,
}
if h.Timeout != nil {
d := time.Duration(*h.Timeout) * time.Second
cmd.Timeout = &d
}
return cmd
}

View File

@ -1,93 +0,0 @@
// +build linux
package utils
/*
* Copyright 2016, 2017 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import (
"fmt"
"os"
"golang.org/x/sys/unix"
)
// MaxSendfdLen is the maximum length of the name of a file descriptor being
// sent using SendFd. The name of the file handle returned by RecvFd will never
// be larger than this value.
const MaxNameLen = 4096
// oobSpace is the size of the oob slice required to store a single FD. Note
// that unix.UnixRights appears to make the assumption that fd is always int32,
// so sizeof(fd) = 4.
var oobSpace = unix.CmsgSpace(4)
// RecvFd waits for a file descriptor to be sent over the given AF_UNIX
// socket. The file name of the remote file descriptor will be recreated
// locally (it is sent as non-auxiliary data in the same payload).
func RecvFd(socket *os.File) (*os.File, error) {
// For some reason, unix.Recvmsg uses the length rather than the capacity
// when passing the msg_controllen and other attributes to recvmsg. So we
// have to actually set the length.
name := make([]byte, MaxNameLen)
oob := make([]byte, oobSpace)
sockfd := socket.Fd()
n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0)
if err != nil {
return nil, err
}
if n >= MaxNameLen || oobn != oobSpace {
return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
}
// Truncate.
name = name[:n]
oob = oob[:oobn]
scms, err := unix.ParseSocketControlMessage(oob)
if err != nil {
return nil, err
}
if len(scms) != 1 {
return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
}
scm := scms[0]
fds, err := unix.ParseUnixRights(&scm)
if err != nil {
return nil, err
}
if len(fds) != 1 {
return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds))
}
fd := uintptr(fds[0])
return os.NewFile(fd, string(name)), nil
}
// SendFd sends a file descriptor over the given AF_UNIX socket. In
// addition, the file.Name() of the given file will also be sent as
// non-auxiliary data in the same payload (allowing to send contextual
// information for a file descriptor).
func SendFd(socket *os.File, name string, fd uintptr) error {
if len(name) >= MaxNameLen {
return fmt.Errorf("sendfd: filename too long: %s", name)
}
oob := unix.UnixRights(int(fd))
return unix.Sendmsg(int(socket.Fd()), []byte(name), oob, nil, 0)
}

View File

@ -1,112 +0,0 @@
package utils
import (
"encoding/json"
"io"
"os"
"path/filepath"
"strings"
"unsafe"
"golang.org/x/sys/unix"
)
const (
exitSignalOffset = 128
)
// ResolveRootfs ensures that the current working directory is
// not a symlink and returns the absolute path to the rootfs
func ResolveRootfs(uncleanRootfs string) (string, error) {
rootfs, err := filepath.Abs(uncleanRootfs)
if err != nil {
return "", err
}
return filepath.EvalSymlinks(rootfs)
}
// ExitStatus returns the correct exit status for a process based on if it
// was signaled or exited cleanly
func ExitStatus(status unix.WaitStatus) int {
if status.Signaled() {
return exitSignalOffset + int(status.Signal())
}
return status.ExitStatus()
}
// WriteJSON writes the provided struct v to w using standard json marshaling
func WriteJSON(w io.Writer, v interface{}) error {
data, err := json.Marshal(v)
if err != nil {
return err
}
_, err = w.Write(data)
return err
}
// CleanPath makes a path safe for use with filepath.Join. This is done by not
// only cleaning the path, but also (if the path is relative) adding a leading
// '/' and cleaning it (then removing the leading '/'). This ensures that a
// path resulting from prepending another path will always resolve to lexically
// be a subdirectory of the prefixed path. This is all done lexically, so paths
// that include symlinks won't be safe as a result of using CleanPath.
func CleanPath(path string) string {
// Deal with empty strings nicely.
if path == "" {
return ""
}
// Ensure that all paths are cleaned (especially problematic ones like
// "/../../../../../" which can cause lots of issues).
path = filepath.Clean(path)
// If the path isn't absolute, we need to do more processing to fix paths
// such as "../../../../<etc>/some/path". We also shouldn't convert absolute
// paths to relative ones.
if !filepath.IsAbs(path) {
path = filepath.Clean(string(os.PathSeparator) + path)
// This can't fail, as (by definition) all paths are relative to root.
path, _ = filepath.Rel(string(os.PathSeparator), path)
}
// Clean the path again for good measure.
return filepath.Clean(path)
}
// SearchLabels searches a list of key-value pairs for the provided key and
// returns the corresponding value. The pairs must be separated with '='.
func SearchLabels(labels []string, query string) string {
for _, l := range labels {
parts := strings.SplitN(l, "=", 2)
if len(parts) < 2 {
continue
}
if parts[0] == query {
return parts[1]
}
}
return ""
}
// Annotations returns the bundle path and user defined annotations from the
// libcontainer state. We need to remove the bundle because that is a label
// added by libcontainer.
func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
userAnnotations = make(map[string]string)
for _, l := range labels {
parts := strings.SplitN(l, "=", 2)
if len(parts) < 2 {
continue
}
if parts[0] == "bundle" {
bundle = parts[1]
} else {
userAnnotations[parts[0]] = parts[1]
}
}
return
}
func GetIntSize() int {
return int(unsafe.Sizeof(1))
}

View File

@ -1,44 +0,0 @@
// +build !windows
package utils
import (
"io/ioutil"
"os"
"strconv"
"golang.org/x/sys/unix"
)
func CloseExecFrom(minFd int) error {
fdList, err := ioutil.ReadDir("/proc/self/fd")
if err != nil {
return err
}
for _, fi := range fdList {
fd, err := strconv.Atoi(fi.Name())
if err != nil {
// ignore non-numeric file names
continue
}
if fd < minFd {
// ignore descriptors lower than our specified minimum
continue
}
// intentionally ignore errors from unix.CloseOnExec
unix.CloseOnExec(fd)
// the cases where this might fail are basically file descriptors that have already been closed (including and especially the one that was created when ioutil.ReadDir did the "opendir" syscall)
}
return nil
}
// NewSockPair returns a new unix socket pair
func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
if err != nil {
return nil, nil, err
}
return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
}

View File

@ -1,22 +0,0 @@
Copyright (c) 2015 Matthew Heon <mheon@redhat.com>
Copyright (c) 2015 Paul Moore <pmoore@redhat.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -1,26 +0,0 @@
libseccomp-golang: Go Language Bindings for the libseccomp Project
===============================================================================
https://github.com/seccomp/libseccomp-golang
https://github.com/seccomp/libseccomp
The libseccomp library provides an easy to use, platform independent, interface
to the Linux Kernel's syscall filtering mechanism. The libseccomp API is
designed to abstract away the underlying BPF based syscall filter language and
present a more conventional function-call based filtering interface that should
be familiar to, and easily adopted by, application developers.
The libseccomp-golang library provides a Go based interface to the libseccomp
library.
* Online Resources
The library source repository currently lives on GitHub at the following URLs:
-> https://github.com/seccomp/libseccomp-golang
-> https://github.com/seccomp/libseccomp
The project mailing list is currently hosted on Google Groups at the URL below,
please note that a Google account is not required to subscribe to the mailing
list.
-> https://groups.google.com/d/forum/libseccomp

View File

@ -1,857 +0,0 @@
// +build linux
// Public API specification for libseccomp Go bindings
// Contains public API for the bindings
// Package seccomp provides bindings for libseccomp, a library wrapping the Linux
// seccomp syscall. Seccomp enables an application to restrict system call use
// for itself and its children.
package seccomp
import (
"fmt"
"os"
"runtime"
"strings"
"sync"
"syscall"
"unsafe"
)
// C wrapping code
// #cgo pkg-config: libseccomp
// #include <stdlib.h>
// #include <seccomp.h>
import "C"
// Exported types
// ScmpArch represents a CPU architecture. Seccomp can restrict syscalls on a
// per-architecture basis.
type ScmpArch uint
// ScmpAction represents an action to be taken on a filter rule match in
// libseccomp
type ScmpAction uint
// ScmpCompareOp represents a comparison operator which can be used in a filter
// rule
type ScmpCompareOp uint
// ScmpCondition represents a rule in a libseccomp filter context
type ScmpCondition struct {
Argument uint `json:"argument,omitempty"`
Op ScmpCompareOp `json:"operator,omitempty"`
Operand1 uint64 `json:"operand_one,omitempty"`
Operand2 uint64 `json:"operand_two,omitempty"`
}
// ScmpSyscall represents a Linux System Call
type ScmpSyscall int32
// Exported Constants
const (
// Valid architectures recognized by libseccomp
// ARM64 and all MIPS architectures are unsupported by versions of the
// library before v2.2 and will return errors if used
// ArchInvalid is a placeholder to ensure uninitialized ScmpArch
// variables are invalid
ArchInvalid ScmpArch = iota
// ArchNative is the native architecture of the kernel
ArchNative ScmpArch = iota
// ArchX86 represents 32-bit x86 syscalls
ArchX86 ScmpArch = iota
// ArchAMD64 represents 64-bit x86-64 syscalls
ArchAMD64 ScmpArch = iota
// ArchX32 represents 64-bit x86-64 syscalls (32-bit pointers)
ArchX32 ScmpArch = iota
// ArchARM represents 32-bit ARM syscalls
ArchARM ScmpArch = iota
// ArchARM64 represents 64-bit ARM syscalls
ArchARM64 ScmpArch = iota
// ArchMIPS represents 32-bit MIPS syscalls
ArchMIPS ScmpArch = iota
// ArchMIPS64 represents 64-bit MIPS syscalls
ArchMIPS64 ScmpArch = iota
// ArchMIPS64N32 represents 64-bit MIPS syscalls (32-bit pointers)
ArchMIPS64N32 ScmpArch = iota
// ArchMIPSEL represents 32-bit MIPS syscalls (little endian)
ArchMIPSEL ScmpArch = iota
// ArchMIPSEL64 represents 64-bit MIPS syscalls (little endian)
ArchMIPSEL64 ScmpArch = iota
// ArchMIPSEL64N32 represents 64-bit MIPS syscalls (little endian,
// 32-bit pointers)
ArchMIPSEL64N32 ScmpArch = iota
// ArchPPC represents 32-bit POWERPC syscalls
ArchPPC ScmpArch = iota
// ArchPPC64 represents 64-bit POWER syscalls (big endian)
ArchPPC64 ScmpArch = iota
// ArchPPC64LE represents 64-bit POWER syscalls (little endian)
ArchPPC64LE ScmpArch = iota
// ArchS390 represents 31-bit System z/390 syscalls
ArchS390 ScmpArch = iota
// ArchS390X represents 64-bit System z/390 syscalls
ArchS390X ScmpArch = iota
)
const (
// Supported actions on filter match
// ActInvalid is a placeholder to ensure uninitialized ScmpAction
// variables are invalid
ActInvalid ScmpAction = iota
// ActKill kills the process
ActKill ScmpAction = iota
// ActTrap throws SIGSYS
ActTrap ScmpAction = iota
// ActErrno causes the syscall to return a negative error code. This
// code can be set with the SetReturnCode method
ActErrno ScmpAction = iota
// ActTrace causes the syscall to notify tracing processes with the
// given error code. This code can be set with the SetReturnCode method
ActTrace ScmpAction = iota
// ActAllow permits the syscall to continue execution
ActAllow ScmpAction = iota
)
const (
// These are comparison operators used in conditional seccomp rules
// They are used to compare the value of a single argument of a syscall
// against a user-defined constant
// CompareInvalid is a placeholder to ensure uninitialized ScmpCompareOp
// variables are invalid
CompareInvalid ScmpCompareOp = iota
// CompareNotEqual returns true if the argument is not equal to the
// given value
CompareNotEqual ScmpCompareOp = iota
// CompareLess returns true if the argument is less than the given value
CompareLess ScmpCompareOp = iota
// CompareLessOrEqual returns true if the argument is less than or equal
// to the given value
CompareLessOrEqual ScmpCompareOp = iota
// CompareEqual returns true if the argument is equal to the given value
CompareEqual ScmpCompareOp = iota
// CompareGreaterEqual returns true if the argument is greater than or
// equal to the given value
CompareGreaterEqual ScmpCompareOp = iota
// CompareGreater returns true if the argument is greater than the given
// value
CompareGreater ScmpCompareOp = iota
// CompareMaskedEqual returns true if the argument is equal to the given
// value, when masked (bitwise &) against the second given value
CompareMaskedEqual ScmpCompareOp = iota
)
// Helpers for types
// GetArchFromString returns an ScmpArch constant from a string representing an
// architecture
func GetArchFromString(arch string) (ScmpArch, error) {
switch strings.ToLower(arch) {
case "x86":
return ArchX86, nil
case "amd64", "x86-64", "x86_64", "x64":
return ArchAMD64, nil
case "x32":
return ArchX32, nil
case "arm":
return ArchARM, nil
case "arm64", "aarch64":
return ArchARM64, nil
case "mips":
return ArchMIPS, nil
case "mips64":
return ArchMIPS64, nil
case "mips64n32":
return ArchMIPS64N32, nil
case "mipsel":
return ArchMIPSEL, nil
case "mipsel64":
return ArchMIPSEL64, nil
case "mipsel64n32":
return ArchMIPSEL64N32, nil
case "ppc":
return ArchPPC, nil
case "ppc64":
return ArchPPC64, nil
case "ppc64le":
return ArchPPC64LE, nil
case "s390":
return ArchS390, nil
case "s390x":
return ArchS390X, nil
default:
return ArchInvalid, fmt.Errorf("cannot convert unrecognized string %s", arch)
}
}
// String returns a string representation of an architecture constant
func (a ScmpArch) String() string {
switch a {
case ArchX86:
return "x86"
case ArchAMD64:
return "amd64"
case ArchX32:
return "x32"
case ArchARM:
return "arm"
case ArchARM64:
return "arm64"
case ArchMIPS:
return "mips"
case ArchMIPS64:
return "mips64"
case ArchMIPS64N32:
return "mips64n32"
case ArchMIPSEL:
return "mipsel"
case ArchMIPSEL64:
return "mipsel64"
case ArchMIPSEL64N32:
return "mipsel64n32"
case ArchPPC:
return "ppc"
case ArchPPC64:
return "ppc64"
case ArchPPC64LE:
return "ppc64le"
case ArchS390:
return "s390"
case ArchS390X:
return "s390x"
case ArchNative:
return "native"
case ArchInvalid:
return "Invalid architecture"
default:
return "Unknown architecture"
}
}
// String returns a string representation of a comparison operator constant
func (a ScmpCompareOp) String() string {
switch a {
case CompareNotEqual:
return "Not equal"
case CompareLess:
return "Less than"
case CompareLessOrEqual:
return "Less than or equal to"
case CompareEqual:
return "Equal"
case CompareGreaterEqual:
return "Greater than or equal to"
case CompareGreater:
return "Greater than"
case CompareMaskedEqual:
return "Masked equality"
case CompareInvalid:
return "Invalid comparison operator"
default:
return "Unrecognized comparison operator"
}
}
// String returns a string representation of a seccomp match action
func (a ScmpAction) String() string {
switch a & 0xFFFF {
case ActKill:
return "Action: Kill Process"
case ActTrap:
return "Action: Send SIGSYS"
case ActErrno:
return fmt.Sprintf("Action: Return error code %d", (a >> 16))
case ActTrace:
return fmt.Sprintf("Action: Notify tracing processes with code %d",
(a >> 16))
case ActAllow:
return "Action: Allow system call"
default:
return "Unrecognized Action"
}
}
// SetReturnCode adds a return code to a supporting ScmpAction, clearing any
// existing code Only valid on ActErrno and ActTrace. Takes no action otherwise.
// Accepts 16-bit return code as argument.
// Returns a valid ScmpAction of the original type with the new error code set.
func (a ScmpAction) SetReturnCode(code int16) ScmpAction {
aTmp := a & 0x0000FFFF
if aTmp == ActErrno || aTmp == ActTrace {
return (aTmp | (ScmpAction(code)&0xFFFF)<<16)
}
return a
}
// GetReturnCode returns the return code of an ScmpAction
func (a ScmpAction) GetReturnCode() int16 {
return int16(a >> 16)
}
// General utility functions
// GetLibraryVersion returns the version of the library the bindings are built
// against.
// The version is formatted as follows: Major.Minor.Micro
func GetLibraryVersion() (major, minor, micro int) {
return verMajor, verMinor, verMicro
}
// Syscall functions
// GetName retrieves the name of a syscall from its number.
// Acts on any syscall number.
// Returns either a string containing the name of the syscall, or an error.
func (s ScmpSyscall) GetName() (string, error) {
return s.GetNameByArch(ArchNative)
}
// GetNameByArch retrieves the name of a syscall from its number for a given
// architecture.
// Acts on any syscall number.
// Accepts a valid architecture constant.
// Returns either a string containing the name of the syscall, or an error.
// if the syscall is unrecognized or an issue occurred.
func (s ScmpSyscall) GetNameByArch(arch ScmpArch) (string, error) {
if err := sanitizeArch(arch); err != nil {
return "", err
}
cString := C.seccomp_syscall_resolve_num_arch(arch.toNative(), C.int(s))
if cString == nil {
return "", fmt.Errorf("could not resolve syscall name")
}
defer C.free(unsafe.Pointer(cString))
finalStr := C.GoString(cString)
return finalStr, nil
}
// GetSyscallFromName returns the number of a syscall by name on the kernel's
// native architecture.
// Accepts a string containing the name of a syscall.
// Returns the number of the syscall, or an error if no syscall with that name
// was found.
func GetSyscallFromName(name string) (ScmpSyscall, error) {
cString := C.CString(name)
defer C.free(unsafe.Pointer(cString))
result := C.seccomp_syscall_resolve_name(cString)
if result == scmpError {
return 0, fmt.Errorf("could not resolve name to syscall")
}
return ScmpSyscall(result), nil
}
// GetSyscallFromNameByArch returns the number of a syscall by name for a given
// architecture's ABI.
// Accepts the name of a syscall and an architecture constant.
// Returns the number of the syscall, or an error if an invalid architecture is
// passed or a syscall with that name was not found.
func GetSyscallFromNameByArch(name string, arch ScmpArch) (ScmpSyscall, error) {
if err := sanitizeArch(arch); err != nil {
return 0, err
}
cString := C.CString(name)
defer C.free(unsafe.Pointer(cString))
result := C.seccomp_syscall_resolve_name_arch(arch.toNative(), cString)
if result == scmpError {
return 0, fmt.Errorf("could not resolve name to syscall")
}
return ScmpSyscall(result), nil
}
// MakeCondition creates and returns a new condition to attach to a filter rule.
// Associated rules will only match if this condition is true.
// Accepts the number the argument we are checking, and a comparison operator
// and value to compare to.
// The rule will match if argument $arg (zero-indexed) of the syscall is
// $COMPARE_OP the provided comparison value.
// Some comparison operators accept two values. Masked equals, for example,
// will mask $arg of the syscall with the second value provided (via bitwise
// AND) and then compare against the first value provided.
// For example, in the less than or equal case, if the syscall argument was
// 0 and the value provided was 1, the condition would match, as 0 is less
// than or equal to 1.
// Return either an error on bad argument or a valid ScmpCondition struct.
func MakeCondition(arg uint, comparison ScmpCompareOp, values ...uint64) (ScmpCondition, error) {
var condStruct ScmpCondition
if comparison == CompareInvalid {
return condStruct, fmt.Errorf("invalid comparison operator")
} else if arg > 5 {
return condStruct, fmt.Errorf("syscalls only have up to 6 arguments")
} else if len(values) > 2 {
return condStruct, fmt.Errorf("conditions can have at most 2 arguments")
} else if len(values) == 0 {
return condStruct, fmt.Errorf("must provide at least one value to compare against")
}
condStruct.Argument = arg
condStruct.Op = comparison
condStruct.Operand1 = values[0]
if len(values) == 2 {
condStruct.Operand2 = values[1]
} else {
condStruct.Operand2 = 0 // Unused
}
return condStruct, nil
}
// Utility Functions
// GetNativeArch returns architecture token representing the native kernel
// architecture
func GetNativeArch() (ScmpArch, error) {
arch := C.seccomp_arch_native()
return archFromNative(arch)
}
// Public Filter API
// ScmpFilter represents a filter context in libseccomp.
// A filter context is initially empty. Rules can be added to it, and it can
// then be loaded into the kernel.
type ScmpFilter struct {
filterCtx C.scmp_filter_ctx
valid bool
lock sync.Mutex
}
// NewFilter creates and returns a new filter context.
// Accepts a default action to be taken for syscalls which match no rules in
// the filter.
// Returns a reference to a valid filter context, or nil and an error if the
// filter context could not be created or an invalid default action was given.
func NewFilter(defaultAction ScmpAction) (*ScmpFilter, error) {
if err := sanitizeAction(defaultAction); err != nil {
return nil, err
}
fPtr := C.seccomp_init(defaultAction.toNative())
if fPtr == nil {
return nil, fmt.Errorf("could not create filter")
}
filter := new(ScmpFilter)
filter.filterCtx = fPtr
filter.valid = true
runtime.SetFinalizer(filter, filterFinalizer)
return filter, nil
}
// IsValid determines whether a filter context is valid to use.
// Some operations (Release and Merge) render filter contexts invalid and
// consequently prevent further use.
func (f *ScmpFilter) IsValid() bool {
f.lock.Lock()
defer f.lock.Unlock()
return f.valid
}
// Reset resets a filter context, removing all its existing state.
// Accepts a new default action to be taken for syscalls which do not match.
// Returns an error if the filter or action provided are invalid.
func (f *ScmpFilter) Reset(defaultAction ScmpAction) error {
f.lock.Lock()
defer f.lock.Unlock()
if err := sanitizeAction(defaultAction); err != nil {
return err
} else if !f.valid {
return errBadFilter
}
retCode := C.seccomp_reset(f.filterCtx, defaultAction.toNative())
if retCode != 0 {
return syscall.Errno(-1 * retCode)
}
return nil
}
// Release releases a filter context, freeing its memory. Should be called after
// loading into the kernel, when the filter is no longer needed.
// After calling this function, the given filter is no longer valid and cannot
// be used.
// Release() will be invoked automatically when a filter context is garbage
// collected, but can also be called manually to free memory.
func (f *ScmpFilter) Release() {
f.lock.Lock()
defer f.lock.Unlock()
if !f.valid {
return
}
f.valid = false
C.seccomp_release(f.filterCtx)
}
// Merge merges two filter contexts.
// The source filter src will be released as part of the process, and will no
// longer be usable or valid after this call.
// To be merged, filters must NOT share any architectures, and all their
// attributes (Default Action, Bad Arch Action, No New Privs and TSync bools)
// must match.
// The filter src will be merged into the filter this is called on.
// The architectures of the src filter not present in the destination, and all
// associated rules, will be added to the destination.
// Returns an error if merging the filters failed.
func (f *ScmpFilter) Merge(src *ScmpFilter) error {
f.lock.Lock()
defer f.lock.Unlock()
src.lock.Lock()
defer src.lock.Unlock()
if !src.valid || !f.valid {
return fmt.Errorf("one or more of the filter contexts is invalid or uninitialized")
}
// Merge the filters
retCode := C.seccomp_merge(f.filterCtx, src.filterCtx)
if syscall.Errno(-1*retCode) == syscall.EINVAL {
return fmt.Errorf("filters could not be merged due to a mismatch in attributes or invalid filter")
} else if retCode != 0 {
return syscall.Errno(-1 * retCode)
}
src.valid = false
return nil
}
// IsArchPresent checks if an architecture is present in a filter.
// If a filter contains an architecture, it uses its default action for
// syscalls which do not match rules in it, and its rules can match syscalls
// for that ABI.
// If a filter does not contain an architecture, all syscalls made to that
// kernel ABI will fail with the filter's default Bad Architecture Action
// (by default, killing the process).
// Accepts an architecture constant.
// Returns true if the architecture is present in the filter, false otherwise,
// and an error on an invalid filter context, architecture constant, or an
// issue with the call to libseccomp.
func (f *ScmpFilter) IsArchPresent(arch ScmpArch) (bool, error) {
f.lock.Lock()
defer f.lock.Unlock()
if err := sanitizeArch(arch); err != nil {
return false, err
} else if !f.valid {
return false, errBadFilter
}
retCode := C.seccomp_arch_exist(f.filterCtx, arch.toNative())
if syscall.Errno(-1*retCode) == syscall.EEXIST {
// -EEXIST is "arch not present"
return false, nil
} else if retCode != 0 {
return false, syscall.Errno(-1 * retCode)
}
return true, nil
}
// AddArch adds an architecture to the filter.
// Accepts an architecture constant.
// Returns an error on invalid filter context or architecture token, or an
// issue with the call to libseccomp.
func (f *ScmpFilter) AddArch(arch ScmpArch) error {
f.lock.Lock()
defer f.lock.Unlock()
if err := sanitizeArch(arch); err != nil {
return err
} else if !f.valid {
return errBadFilter
}
// Libseccomp returns -EEXIST if the specified architecture is already
// present. Succeed silently in this case, as it's not fatal, and the
// architecture is present already.
retCode := C.seccomp_arch_add(f.filterCtx, arch.toNative())
if retCode != 0 && syscall.Errno(-1*retCode) != syscall.EEXIST {
return syscall.Errno(-1 * retCode)
}
return nil
}
// RemoveArch removes an architecture from the filter.
// Accepts an architecture constant.
// Returns an error on invalid filter context or architecture token, or an
// issue with the call to libseccomp.
func (f *ScmpFilter) RemoveArch(arch ScmpArch) error {
f.lock.Lock()
defer f.lock.Unlock()
if err := sanitizeArch(arch); err != nil {
return err
} else if !f.valid {
return errBadFilter
}
// Similar to AddArch, -EEXIST is returned if the arch is not present
// Succeed silently in that case, this is not fatal and the architecture
// is not present in the filter after RemoveArch
retCode := C.seccomp_arch_remove(f.filterCtx, arch.toNative())
if retCode != 0 && syscall.Errno(-1*retCode) != syscall.EEXIST {
return syscall.Errno(-1 * retCode)
}
return nil
}
// Load loads a filter context into the kernel.
// Returns an error if the filter context is invalid or the syscall failed.
func (f *ScmpFilter) Load() error {
f.lock.Lock()
defer f.lock.Unlock()
if !f.valid {
return errBadFilter
}
if retCode := C.seccomp_load(f.filterCtx); retCode != 0 {
return syscall.Errno(-1 * retCode)
}
return nil
}
// GetDefaultAction returns the default action taken on a syscall which does not
// match a rule in the filter, or an error if an issue was encountered
// retrieving the value.
func (f *ScmpFilter) GetDefaultAction() (ScmpAction, error) {
action, err := f.getFilterAttr(filterAttrActDefault)
if err != nil {
return 0x0, err
}
return actionFromNative(action)
}
// GetBadArchAction returns the default action taken on a syscall for an
// architecture not in the filter, or an error if an issue was encountered
// retrieving the value.
func (f *ScmpFilter) GetBadArchAction() (ScmpAction, error) {
action, err := f.getFilterAttr(filterAttrActBadArch)
if err != nil {
return 0x0, err
}
return actionFromNative(action)
}
// GetNoNewPrivsBit returns the current state the No New Privileges bit will be set
// to on the filter being loaded, or an error if an issue was encountered
// retrieving the value.
// The No New Privileges bit tells the kernel that new processes run with exec()
// cannot gain more privileges than the process that ran exec().
// For example, a process with No New Privileges set would be unable to exec
// setuid/setgid executables.
func (f *ScmpFilter) GetNoNewPrivsBit() (bool, error) {
noNewPrivs, err := f.getFilterAttr(filterAttrNNP)
if err != nil {
return false, err
}
if noNewPrivs == 0 {
return false, nil
}
return true, nil
}
// GetTsyncBit returns whether Thread Synchronization will be enabled on the
// filter being loaded, or an error if an issue was encountered retrieving the
// value.
// Thread Sync ensures that all members of the thread group of the calling
// process will share the same Seccomp filter set.
// Tsync is a fairly recent addition to the Linux kernel and older kernels
// lack support. If the running kernel does not support Tsync and it is
// requested in a filter, Libseccomp will not enable TSync support and will
// proceed as normal.
// This function is unavailable before v2.2 of libseccomp and will return an
// error.
func (f *ScmpFilter) GetTsyncBit() (bool, error) {
tSync, err := f.getFilterAttr(filterAttrTsync)
if err != nil {
return false, err
}
if tSync == 0 {
return false, nil
}
return true, nil
}
// SetBadArchAction sets the default action taken on a syscall for an
// architecture not in the filter, or an error if an issue was encountered
// setting the value.
func (f *ScmpFilter) SetBadArchAction(action ScmpAction) error {
if err := sanitizeAction(action); err != nil {
return err
}
return f.setFilterAttr(filterAttrActBadArch, action.toNative())
}
// SetNoNewPrivsBit sets the state of the No New Privileges bit, which will be
// applied on filter load, or an error if an issue was encountered setting the
// value.
// Filters with No New Privileges set to 0 can only be loaded if the process
// has the CAP_SYS_ADMIN capability.
func (f *ScmpFilter) SetNoNewPrivsBit(state bool) error {
var toSet C.uint32_t = 0x0
if state {
toSet = 0x1
}
return f.setFilterAttr(filterAttrNNP, toSet)
}
// SetTsync sets whether Thread Synchronization will be enabled on the filter
// being loaded. Returns an error if setting Tsync failed, or the filter is
// invalid.
// Thread Sync ensures that all members of the thread group of the calling
// process will share the same Seccomp filter set.
// Tsync is a fairly recent addition to the Linux kernel and older kernels
// lack support. If the running kernel does not support Tsync and it is
// requested in a filter, Libseccomp will not enable TSync support and will
// proceed as normal.
// This function is unavailable before v2.2 of libseccomp and will return an
// error.
func (f *ScmpFilter) SetTsync(enable bool) error {
var toSet C.uint32_t = 0x0
if enable {
toSet = 0x1
}
return f.setFilterAttr(filterAttrTsync, toSet)
}
// SetSyscallPriority sets a syscall's priority.
// This provides a hint to the filter generator in libseccomp about the
// importance of this syscall. High-priority syscalls are placed
// first in the filter code, and incur less overhead (at the expense of
// lower-priority syscalls).
func (f *ScmpFilter) SetSyscallPriority(call ScmpSyscall, priority uint8) error {
f.lock.Lock()
defer f.lock.Unlock()
if !f.valid {
return errBadFilter
}
if retCode := C.seccomp_syscall_priority(f.filterCtx, C.int(call),
C.uint8_t(priority)); retCode != 0 {
return syscall.Errno(-1 * retCode)
}
return nil
}
// AddRule adds a single rule for an unconditional action on a syscall.
// Accepts the number of the syscall and the action to be taken on the call
// being made.
// Returns an error if an issue was encountered adding the rule.
func (f *ScmpFilter) AddRule(call ScmpSyscall, action ScmpAction) error {
return f.addRuleGeneric(call, action, false, nil)
}
// AddRuleExact adds a single rule for an unconditional action on a syscall.
// Accepts the number of the syscall and the action to be taken on the call
// being made.
// No modifications will be made to the rule, and it will fail to add if it
// cannot be applied to the current architecture without modification.
// The rule will function exactly as described, but it may not function identically
// (or be able to be applied to) all architectures.
// Returns an error if an issue was encountered adding the rule.
func (f *ScmpFilter) AddRuleExact(call ScmpSyscall, action ScmpAction) error {
return f.addRuleGeneric(call, action, true, nil)
}
// AddRuleConditional adds a single rule for a conditional action on a syscall.
// Returns an error if an issue was encountered adding the rule.
// All conditions must match for the rule to match.
// There is a bug in library versions below v2.2.1 which can, in some cases,
// cause conditions to be lost when more than one are used. Consequently,
// AddRuleConditional is disabled on library versions lower than v2.2.1
func (f *ScmpFilter) AddRuleConditional(call ScmpSyscall, action ScmpAction, conds []ScmpCondition) error {
return f.addRuleGeneric(call, action, false, conds)
}
// AddRuleConditionalExact adds a single rule for a conditional action on a
// syscall.
// No modifications will be made to the rule, and it will fail to add if it
// cannot be applied to the current architecture without modification.
// The rule will function exactly as described, but it may not function identically
// (or be able to be applied to) all architectures.
// Returns an error if an issue was encountered adding the rule.
// There is a bug in library versions below v2.2.1 which can, in some cases,
// cause conditions to be lost when more than one are used. Consequently,
// AddRuleConditionalExact is disabled on library versions lower than v2.2.1
func (f *ScmpFilter) AddRuleConditionalExact(call ScmpSyscall, action ScmpAction, conds []ScmpCondition) error {
return f.addRuleGeneric(call, action, true, conds)
}
// ExportPFC output PFC-formatted, human-readable dump of a filter context's
// rules to a file.
// Accepts file to write to (must be open for writing).
// Returns an error if writing to the file fails.
func (f *ScmpFilter) ExportPFC(file *os.File) error {
f.lock.Lock()
defer f.lock.Unlock()
fd := file.Fd()
if !f.valid {
return errBadFilter
}
if retCode := C.seccomp_export_pfc(f.filterCtx, C.int(fd)); retCode != 0 {
return syscall.Errno(-1 * retCode)
}
return nil
}
// ExportBPF outputs Berkeley Packet Filter-formatted, kernel-readable dump of a
// filter context's rules to a file.
// Accepts file to write to (must be open for writing).
// Returns an error if writing to the file fails.
func (f *ScmpFilter) ExportBPF(file *os.File) error {
f.lock.Lock()
defer f.lock.Unlock()
fd := file.Fd()
if !f.valid {
return errBadFilter
}
if retCode := C.seccomp_export_bpf(f.filterCtx, C.int(fd)); retCode != 0 {
return syscall.Errno(-1 * retCode)
}
return nil
}

View File

@ -1,506 +0,0 @@
// +build linux
// Internal functions for libseccomp Go bindings
// No exported functions
package seccomp
import (
"fmt"
"os"
"syscall"
)
// Unexported C wrapping code - provides the C-Golang interface
// Get the seccomp header in scope
// Need stdlib.h for free() on cstrings
// #cgo pkg-config: libseccomp
/*
#include <stdlib.h>
#include <seccomp.h>
#if SCMP_VER_MAJOR < 2
#error Minimum supported version of Libseccomp is v2.1.0
#elif SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR < 1
#error Minimum supported version of Libseccomp is v2.1.0
#endif
#define ARCH_BAD ~0
const uint32_t C_ARCH_BAD = ARCH_BAD;
#ifndef SCMP_ARCH_AARCH64
#define SCMP_ARCH_AARCH64 ARCH_BAD
#endif
#ifndef SCMP_ARCH_MIPS
#define SCMP_ARCH_MIPS ARCH_BAD
#endif
#ifndef SCMP_ARCH_MIPS64
#define SCMP_ARCH_MIPS64 ARCH_BAD
#endif
#ifndef SCMP_ARCH_MIPS64N32
#define SCMP_ARCH_MIPS64N32 ARCH_BAD
#endif
#ifndef SCMP_ARCH_MIPSEL
#define SCMP_ARCH_MIPSEL ARCH_BAD
#endif
#ifndef SCMP_ARCH_MIPSEL64
#define SCMP_ARCH_MIPSEL64 ARCH_BAD
#endif
#ifndef SCMP_ARCH_MIPSEL64N32
#define SCMP_ARCH_MIPSEL64N32 ARCH_BAD
#endif
#ifndef SCMP_ARCH_PPC
#define SCMP_ARCH_PPC ARCH_BAD
#endif
#ifndef SCMP_ARCH_PPC64
#define SCMP_ARCH_PPC64 ARCH_BAD
#endif
#ifndef SCMP_ARCH_PPC64LE
#define SCMP_ARCH_PPC64LE ARCH_BAD
#endif
#ifndef SCMP_ARCH_S390
#define SCMP_ARCH_S390 ARCH_BAD
#endif
#ifndef SCMP_ARCH_S390X
#define SCMP_ARCH_S390X ARCH_BAD
#endif
const uint32_t C_ARCH_NATIVE = SCMP_ARCH_NATIVE;
const uint32_t C_ARCH_X86 = SCMP_ARCH_X86;
const uint32_t C_ARCH_X86_64 = SCMP_ARCH_X86_64;
const uint32_t C_ARCH_X32 = SCMP_ARCH_X32;
const uint32_t C_ARCH_ARM = SCMP_ARCH_ARM;
const uint32_t C_ARCH_AARCH64 = SCMP_ARCH_AARCH64;
const uint32_t C_ARCH_MIPS = SCMP_ARCH_MIPS;
const uint32_t C_ARCH_MIPS64 = SCMP_ARCH_MIPS64;
const uint32_t C_ARCH_MIPS64N32 = SCMP_ARCH_MIPS64N32;
const uint32_t C_ARCH_MIPSEL = SCMP_ARCH_MIPSEL;
const uint32_t C_ARCH_MIPSEL64 = SCMP_ARCH_MIPSEL64;
const uint32_t C_ARCH_MIPSEL64N32 = SCMP_ARCH_MIPSEL64N32;
const uint32_t C_ARCH_PPC = SCMP_ARCH_PPC;
const uint32_t C_ARCH_PPC64 = SCMP_ARCH_PPC64;
const uint32_t C_ARCH_PPC64LE = SCMP_ARCH_PPC64LE;
const uint32_t C_ARCH_S390 = SCMP_ARCH_S390;
const uint32_t C_ARCH_S390X = SCMP_ARCH_S390X;
const uint32_t C_ACT_KILL = SCMP_ACT_KILL;
const uint32_t C_ACT_TRAP = SCMP_ACT_TRAP;
const uint32_t C_ACT_ERRNO = SCMP_ACT_ERRNO(0);
const uint32_t C_ACT_TRACE = SCMP_ACT_TRACE(0);
const uint32_t C_ACT_ALLOW = SCMP_ACT_ALLOW;
// If TSync is not supported, make sure it doesn't map to a supported filter attribute
// Don't worry about major version < 2, the minimum version checks should catch that case
#if SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR < 2
#define SCMP_FLTATR_CTL_TSYNC _SCMP_CMP_MIN
#endif
const uint32_t C_ATTRIBUTE_DEFAULT = (uint32_t)SCMP_FLTATR_ACT_DEFAULT;
const uint32_t C_ATTRIBUTE_BADARCH = (uint32_t)SCMP_FLTATR_ACT_BADARCH;
const uint32_t C_ATTRIBUTE_NNP = (uint32_t)SCMP_FLTATR_CTL_NNP;
const uint32_t C_ATTRIBUTE_TSYNC = (uint32_t)SCMP_FLTATR_CTL_TSYNC;
const int C_CMP_NE = (int)SCMP_CMP_NE;
const int C_CMP_LT = (int)SCMP_CMP_LT;
const int C_CMP_LE = (int)SCMP_CMP_LE;
const int C_CMP_EQ = (int)SCMP_CMP_EQ;
const int C_CMP_GE = (int)SCMP_CMP_GE;
const int C_CMP_GT = (int)SCMP_CMP_GT;
const int C_CMP_MASKED_EQ = (int)SCMP_CMP_MASKED_EQ;
const int C_VERSION_MAJOR = SCMP_VER_MAJOR;
const int C_VERSION_MINOR = SCMP_VER_MINOR;
const int C_VERSION_MICRO = SCMP_VER_MICRO;
typedef struct scmp_arg_cmp* scmp_cast_t;
// Wrapper to create an scmp_arg_cmp struct
void*
make_struct_arg_cmp(
unsigned int arg,
int compare,
uint64_t a,
uint64_t b
)
{
struct scmp_arg_cmp *s = malloc(sizeof(struct scmp_arg_cmp));
s->arg = arg;
s->op = compare;
s->datum_a = a;
s->datum_b = b;
return s;
}
*/
import "C"
// Nonexported types
type scmpFilterAttr uint32
// Nonexported constants
const (
filterAttrActDefault scmpFilterAttr = iota
filterAttrActBadArch scmpFilterAttr = iota
filterAttrNNP scmpFilterAttr = iota
filterAttrTsync scmpFilterAttr = iota
)
const (
// An error return from certain libseccomp functions
scmpError C.int = -1
// Comparison boundaries to check for architecture validity
archStart ScmpArch = ArchNative
archEnd ScmpArch = ArchS390X
// Comparison boundaries to check for action validity
actionStart ScmpAction = ActKill
actionEnd ScmpAction = ActAllow
// Comparison boundaries to check for comparison operator validity
compareOpStart ScmpCompareOp = CompareNotEqual
compareOpEnd ScmpCompareOp = CompareMaskedEqual
)
var (
// Error thrown on bad filter context
errBadFilter = fmt.Errorf("filter is invalid or uninitialized")
// Constants representing library major, minor, and micro versions
verMajor = int(C.C_VERSION_MAJOR)
verMinor = int(C.C_VERSION_MINOR)
verMicro = int(C.C_VERSION_MICRO)
)
// Nonexported functions
// Check if library version is greater than or equal to the given one
func checkVersionAbove(major, minor, micro int) bool {
return (verMajor > major) ||
(verMajor == major && verMinor > minor) ||
(verMajor == major && verMinor == minor && verMicro >= micro)
}
// Init function: Verify library version is appropriate
func init() {
if !checkVersionAbove(2, 1, 0) {
fmt.Fprintf(os.Stderr, "Libseccomp version too low: minimum supported is 2.1.0, detected %d.%d.%d", C.C_VERSION_MAJOR, C.C_VERSION_MINOR, C.C_VERSION_MICRO)
os.Exit(-1)
}
}
// Filter helpers
// Filter finalizer - ensure that kernel context for filters is freed
func filterFinalizer(f *ScmpFilter) {
f.Release()
}
// Get a raw filter attribute
func (f *ScmpFilter) getFilterAttr(attr scmpFilterAttr) (C.uint32_t, error) {
f.lock.Lock()
defer f.lock.Unlock()
if !f.valid {
return 0x0, errBadFilter
}
if !checkVersionAbove(2, 2, 0) && attr == filterAttrTsync {
return 0x0, fmt.Errorf("the thread synchronization attribute is not supported in this version of the library")
}
var attribute C.uint32_t
retCode := C.seccomp_attr_get(f.filterCtx, attr.toNative(), &attribute)
if retCode != 0 {
return 0x0, syscall.Errno(-1 * retCode)
}
return attribute, nil
}
// Set a raw filter attribute
func (f *ScmpFilter) setFilterAttr(attr scmpFilterAttr, value C.uint32_t) error {
f.lock.Lock()
defer f.lock.Unlock()
if !f.valid {
return errBadFilter
}
if !checkVersionAbove(2, 2, 0) && attr == filterAttrTsync {
return fmt.Errorf("the thread synchronization attribute is not supported in this version of the library")
}
retCode := C.seccomp_attr_set(f.filterCtx, attr.toNative(), value)
if retCode != 0 {
return syscall.Errno(-1 * retCode)
}
return nil
}
// DOES NOT LOCK OR CHECK VALIDITY
// Assumes caller has already done this
// Wrapper for seccomp_rule_add_... functions
func (f *ScmpFilter) addRuleWrapper(call ScmpSyscall, action ScmpAction, exact bool, cond C.scmp_cast_t) error {
var length C.uint
if cond != nil {
length = 1
} else {
length = 0
}
var retCode C.int
if exact {
retCode = C.seccomp_rule_add_exact_array(f.filterCtx, action.toNative(), C.int(call), length, cond)
} else {
retCode = C.seccomp_rule_add_array(f.filterCtx, action.toNative(), C.int(call), length, cond)
}
if syscall.Errno(-1*retCode) == syscall.EFAULT {
return fmt.Errorf("unrecognized syscall")
} else if syscall.Errno(-1*retCode) == syscall.EPERM {
return fmt.Errorf("requested action matches default action of filter")
} else if retCode != 0 {
return syscall.Errno(-1 * retCode)
}
return nil
}
// Generic add function for filter rules
func (f *ScmpFilter) addRuleGeneric(call ScmpSyscall, action ScmpAction, exact bool, conds []ScmpCondition) error {
f.lock.Lock()
defer f.lock.Unlock()
if !f.valid {
return errBadFilter
}
if len(conds) == 0 {
if err := f.addRuleWrapper(call, action, exact, nil); err != nil {
return err
}
} else {
// We don't support conditional filtering in library version v2.1
if !checkVersionAbove(2, 2, 1) {
return fmt.Errorf("conditional filtering requires libseccomp version >= 2.2.1")
}
for _, cond := range conds {
cmpStruct := C.make_struct_arg_cmp(C.uint(cond.Argument), cond.Op.toNative(), C.uint64_t(cond.Operand1), C.uint64_t(cond.Operand2))
defer C.free(cmpStruct)
if err := f.addRuleWrapper(call, action, exact, C.scmp_cast_t(cmpStruct)); err != nil {
return err
}
}
}
return nil
}
// Generic Helpers
// Helper - Sanitize Arch token input
func sanitizeArch(in ScmpArch) error {
if in < archStart || in > archEnd {
return fmt.Errorf("unrecognized architecture")
}
if in.toNative() == C.C_ARCH_BAD {
return fmt.Errorf("architecture is not supported on this version of the library")
}
return nil
}
func sanitizeAction(in ScmpAction) error {
inTmp := in & 0x0000FFFF
if inTmp < actionStart || inTmp > actionEnd {
return fmt.Errorf("unrecognized action")
}
if inTmp != ActTrace && inTmp != ActErrno && (in&0xFFFF0000) != 0 {
return fmt.Errorf("highest 16 bits must be zeroed except for Trace and Errno")
}
return nil
}
func sanitizeCompareOp(in ScmpCompareOp) error {
if in < compareOpStart || in > compareOpEnd {
return fmt.Errorf("unrecognized comparison operator")
}
return nil
}
func archFromNative(a C.uint32_t) (ScmpArch, error) {
switch a {
case C.C_ARCH_X86:
return ArchX86, nil
case C.C_ARCH_X86_64:
return ArchAMD64, nil
case C.C_ARCH_X32:
return ArchX32, nil
case C.C_ARCH_ARM:
return ArchARM, nil
case C.C_ARCH_NATIVE:
return ArchNative, nil
case C.C_ARCH_AARCH64:
return ArchARM64, nil
case C.C_ARCH_MIPS:
return ArchMIPS, nil
case C.C_ARCH_MIPS64:
return ArchMIPS64, nil
case C.C_ARCH_MIPS64N32:
return ArchMIPS64N32, nil
case C.C_ARCH_MIPSEL:
return ArchMIPSEL, nil
case C.C_ARCH_MIPSEL64:
return ArchMIPSEL64, nil
case C.C_ARCH_MIPSEL64N32:
return ArchMIPSEL64N32, nil
case C.C_ARCH_PPC:
return ArchPPC, nil
case C.C_ARCH_PPC64:
return ArchPPC64, nil
case C.C_ARCH_PPC64LE:
return ArchPPC64LE, nil
case C.C_ARCH_S390:
return ArchS390, nil
case C.C_ARCH_S390X:
return ArchS390X, nil
default:
return 0x0, fmt.Errorf("unrecognized architecture")
}
}
// Only use with sanitized arches, no error handling
func (a ScmpArch) toNative() C.uint32_t {
switch a {
case ArchX86:
return C.C_ARCH_X86
case ArchAMD64:
return C.C_ARCH_X86_64
case ArchX32:
return C.C_ARCH_X32
case ArchARM:
return C.C_ARCH_ARM
case ArchARM64:
return C.C_ARCH_AARCH64
case ArchMIPS:
return C.C_ARCH_MIPS
case ArchMIPS64:
return C.C_ARCH_MIPS64
case ArchMIPS64N32:
return C.C_ARCH_MIPS64N32
case ArchMIPSEL:
return C.C_ARCH_MIPSEL
case ArchMIPSEL64:
return C.C_ARCH_MIPSEL64
case ArchMIPSEL64N32:
return C.C_ARCH_MIPSEL64N32
case ArchPPC:
return C.C_ARCH_PPC
case ArchPPC64:
return C.C_ARCH_PPC64
case ArchPPC64LE:
return C.C_ARCH_PPC64LE
case ArchS390:
return C.C_ARCH_S390
case ArchS390X:
return C.C_ARCH_S390X
case ArchNative:
return C.C_ARCH_NATIVE
default:
return 0x0
}
}
// Only use with sanitized ops, no error handling
func (a ScmpCompareOp) toNative() C.int {
switch a {
case CompareNotEqual:
return C.C_CMP_NE
case CompareLess:
return C.C_CMP_LT
case CompareLessOrEqual:
return C.C_CMP_LE
case CompareEqual:
return C.C_CMP_EQ
case CompareGreaterEqual:
return C.C_CMP_GE
case CompareGreater:
return C.C_CMP_GT
case CompareMaskedEqual:
return C.C_CMP_MASKED_EQ
default:
return 0x0
}
}
func actionFromNative(a C.uint32_t) (ScmpAction, error) {
aTmp := a & 0xFFFF
switch a & 0xFFFF0000 {
case C.C_ACT_KILL:
return ActKill, nil
case C.C_ACT_TRAP:
return ActTrap, nil
case C.C_ACT_ERRNO:
return ActErrno.SetReturnCode(int16(aTmp)), nil
case C.C_ACT_TRACE:
return ActTrace.SetReturnCode(int16(aTmp)), nil
case C.C_ACT_ALLOW:
return ActAllow, nil
default:
return 0x0, fmt.Errorf("unrecognized action")
}
}
// Only use with sanitized actions, no error handling
func (a ScmpAction) toNative() C.uint32_t {
switch a & 0xFFFF {
case ActKill:
return C.C_ACT_KILL
case ActTrap:
return C.C_ACT_TRAP
case ActErrno:
return C.C_ACT_ERRNO | (C.uint32_t(a) >> 16)
case ActTrace:
return C.C_ACT_TRACE | (C.uint32_t(a) >> 16)
case ActAllow:
return C.C_ACT_ALLOW
default:
return 0x0
}
}
// Internal only, assumes safe attribute
func (a scmpFilterAttr) toNative() uint32 {
switch a {
case filterAttrActDefault:
return uint32(C.C_ATTRIBUTE_DEFAULT)
case filterAttrActBadArch:
return uint32(C.C_ATTRIBUTE_BADARCH)
case filterAttrNNP:
return uint32(C.C_ATTRIBUTE_NNP)
case filterAttrTsync:
return uint32(C.C_ATTRIBUTE_TSYNC)
default:
return 0x0
}
}