simplify rootless
Signed-off-by: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>docker-18.09
parent
8db1defa00
commit
048130d1d0
|
@ -1,7 +1,7 @@
|
|||
# Rootless mode (Experimental)
|
||||
|
||||
Requirements:
|
||||
- runc `ecd55a4135e0a26de884ce436442914f945b1e76` (May 30, 2018) or later
|
||||
- runc `a00bf0190895aa465a5fbed0268888e2c8ddfe85` (Oct 15, 2018) or later
|
||||
- Some distros such as Debian (excluding Ubuntu) and Arch Linux require `echo 1 > /proc/sys/kernel/unprivileged_userns_clone`
|
||||
- `newuidmap` and `newgidmap` need to be installed on the host. These commands are provided by the `uidmap` package.
|
||||
- `/etc/subuid` and `/etc/subgid` should contain >= 65536 sub-IDs. e.g. `penguin:231072:65536`.
|
||||
|
@ -45,6 +45,7 @@ unshared# buildkitd
|
|||
* The `overlayfs` snapshotter is not supported except on Ubuntu-flavored kernels: http://kernel.ubuntu.com/git/ubuntu/ubuntu-artful.git/commit/fs/overlayfs?h=Ubuntu-4.13.0-25.29&id=0a414bdc3d01f3b61ed86cfe3ce8b63a9240eba7
|
||||
* containerd worker is not supported ( pending PR: https://github.com/containerd/containerd/pull/2006 )
|
||||
* Network namespace is not used at the moment.
|
||||
* Cgroups are disabled.
|
||||
|
||||
### Terminal 2:
|
||||
|
||||
|
@ -61,7 +62,7 @@ $ docker run --name buildkitd -d --privileged -p 1234:1234 buildkit-rootless --
|
|||
```
|
||||
|
||||
`docker run` requires `--privileged` but the BuildKit daemon is executed as a normal user.
|
||||
See [`moby/moby#36597`](https://github.com/moby/moby/issues/36597), [`kubernetes/community#1934`](https://github.com/kubernetes/community/pull/1934) and [Jess's blog](https://blog.jessfraz.com/post/building-container-images-securely-on-kubernetes/) for the ongoing work to remove this requirement.
|
||||
See [`docker/cli#1347`](https://github.com/docker/cli/pull/1347) for the ongoing work to remove this requirement.
|
||||
|
||||
```
|
||||
$ docker exec buildkitd id
|
||||
|
|
|
@ -18,7 +18,7 @@ func main() {
|
|||
var opt buildOpt
|
||||
flag.BoolVar(&opt.withContainerd, "with-containerd", true, "enable containerd worker")
|
||||
flag.StringVar(&opt.containerd, "containerd", "v1.2.0-rc.1", "containerd version")
|
||||
flag.StringVar(&opt.runc, "runc", "dd56ece8236d6d9e5bed4ea0c31fe53c7b873ff4", "runc version")
|
||||
flag.StringVar(&opt.runc, "runc", "a00bf0190895aa465a5fbed0268888e2c8ddfe85", "runc version")
|
||||
flag.Parse()
|
||||
|
||||
bk := buildkit(opt)
|
||||
|
|
|
@ -18,7 +18,7 @@ func main() {
|
|||
var opt buildOpt
|
||||
flag.BoolVar(&opt.withContainerd, "with-containerd", true, "enable containerd worker")
|
||||
flag.StringVar(&opt.containerd, "containerd", "v1.2.0-rc.1", "containerd version")
|
||||
flag.StringVar(&opt.runc, "runc", "dd56ece8236d6d9e5bed4ea0c31fe53c7b873ff4", "runc version")
|
||||
flag.StringVar(&opt.runc, "runc", "a00bf0190895aa465a5fbed0268888e2c8ddfe85", "runc version")
|
||||
flag.Parse()
|
||||
|
||||
bk := buildkit(opt)
|
||||
|
|
|
@ -18,7 +18,7 @@ func main() {
|
|||
var opt buildOpt
|
||||
flag.BoolVar(&opt.withContainerd, "with-containerd", true, "enable containerd worker")
|
||||
flag.StringVar(&opt.containerd, "containerd", "v1.2.0-rc.1", "containerd version")
|
||||
flag.StringVar(&opt.runc, "runc", "dd56ece8236d6d9e5bed4ea0c31fe53c7b873ff4", "runc version")
|
||||
flag.StringVar(&opt.runc, "runc", "a00bf0190895aa465a5fbed0268888e2c8ddfe85", "runc version")
|
||||
flag.Parse()
|
||||
|
||||
bk := buildkit(opt)
|
||||
|
|
|
@ -19,7 +19,7 @@ func main() {
|
|||
var opt buildOpt
|
||||
flag.BoolVar(&opt.withContainerd, "with-containerd", true, "enable containerd worker")
|
||||
flag.StringVar(&opt.containerd, "containerd", "v1.2.0-rc.1", "containerd version")
|
||||
flag.StringVar(&opt.runc, "runc", "dd56ece8236d6d9e5bed4ea0c31fe53c7b873ff4", "runc version")
|
||||
flag.StringVar(&opt.runc, "runc", "a00bf0190895aa465a5fbed0268888e2c8ddfe85", "runc version")
|
||||
flag.StringVar(&opt.buildkit, "buildkit", "master", "buildkit version")
|
||||
flag.Parse()
|
||||
|
||||
|
|
|
@ -1604,10 +1604,6 @@ RUN ["ls"]
|
|||
}
|
||||
|
||||
func testUser(t *testing.T, sb integration.Sandbox) {
|
||||
if sb.Rootless() {
|
||||
t.Skip("only for rootful worker, due to lack of support for additional gids (https://github.com/opencontainers/runc/issues/1835)")
|
||||
}
|
||||
|
||||
f := getFrontend(t, sb)
|
||||
|
||||
dockerfile := []byte(`
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
ARG RUNC_VERSION=00dc70017d222b178a002ed30e9321b12647af2d
|
||||
ARG RUNC_VERSION=a00bf0190895aa465a5fbed0268888e2c8ddfe85
|
||||
ARG CONTAINERD_VERSION=v1.2.0-rc.1
|
||||
# containerd v1.0 for integration tests
|
||||
ARG CONTAINERD10_VERSION=v1.0.3
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# syntax = tonistiigi/dockerfile:runmount20180925
|
||||
|
||||
ARG RUNC_VERSION=00dc70017d222b178a002ed30e9321b12647af2d
|
||||
ARG RUNC_VERSION=a00bf0190895aa465a5fbed0268888e2c8ddfe85
|
||||
ARG CONTAINERD_VERSION=v1.2.0-rc.1
|
||||
# containerd v1.0 for integration tests
|
||||
ARG CONTAINERD10_VERSION=v1.0.3
|
||||
|
|
|
@ -1,113 +1,40 @@
|
|||
package specconv
|
||||
|
||||
import (
|
||||
"os"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/system"
|
||||
"github.com/opencontainers/runc/libcontainer/user"
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
"github.com/pkg/errors"
|
||||
)
|
||||
|
||||
// ToRootless converts spec to be compatible with "rootless" runc.
|
||||
// * Adds userns (Note: since we are already in userns, ideally we should not need to do this. runc-side issue is tracked at https://github.com/opencontainers/runc/issues/1837)
|
||||
// * Fix up mount flags (same as above)
|
||||
// * Replace /sys with bind-mount (FIXME: we don't need to do this if netns is unshared)
|
||||
// * Remove /sys mount
|
||||
// * Remove cgroups
|
||||
//
|
||||
// See docs/rootless.md for the supported runc revision.
|
||||
func ToRootless(spec *specs.Spec) error {
|
||||
if !system.RunningInUserNS() {
|
||||
return errors.New("needs to be in user namespace")
|
||||
// Remove /sys mount because we can't mount /sys when the daemon netns
|
||||
// is not unshared from the host.
|
||||
//
|
||||
// Instead, we could bind-mount /sys from the host; however, `rbind, ro`
|
||||
// does not make /sys/fs/cgroup read-only (and we can't bind-mount /sys
|
||||
// without rbind)
|
||||
//
|
||||
// A PR for making /sys/fs/cgroup read-only has been proposed, but it is very
|
||||
// complicated: https://github.com/opencontainers/runc/pull/1869
|
||||
//
|
||||
// For the buildkit use case, we assume we don't need to provide /sys to
|
||||
// containers and remove /sys mount as a workaround.
|
||||
var mounts []specs.Mount
|
||||
for _, mount := range spec.Mounts {
|
||||
if strings.HasPrefix(mount.Destination, "/sys") {
|
||||
continue
|
||||
}
|
||||
uidMap, err := user.CurrentProcessUIDMap()
|
||||
if err != nil && !os.IsNotExist(err) {
|
||||
return err
|
||||
}
|
||||
gidMap, err := user.CurrentProcessUIDMap()
|
||||
if err != nil && !os.IsNotExist(err) {
|
||||
return err
|
||||
}
|
||||
return toRootless(spec, uidMap, gidMap)
|
||||
mounts = append(mounts, mount)
|
||||
}
|
||||
spec.Mounts = mounts
|
||||
|
||||
// toRootless was forked from github.com/opencontainers/runc/libcontainer/specconv
|
||||
func toRootless(spec *specs.Spec, uidMap, gidMap []user.IDMap) error {
|
||||
if err := configureUserNS(spec, uidMap, gidMap); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := configureMounts(spec); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Remove cgroup settings.
|
||||
// Remove cgroups so as to avoid `container_linux.go:337: starting container process caused "process_linux.go:280: applying cgroup configuration for process caused \"mkdir /sys/fs/cgroup/cpuset/buildkit: permission denied\""`
|
||||
spec.Linux.Resources = nil
|
||||
spec.Linux.CgroupsPath = ""
|
||||
return nil
|
||||
}
|
||||
|
||||
// configureUserNS adds userns and the current ID map to the spec.
|
||||
// Since we are already in userns, ideally we should not need to add userns.
|
||||
// However, currently rootless runc always requires userns to be added.
|
||||
// https://github.com/opencontainers/runc/issues/1837
|
||||
func configureUserNS(spec *specs.Spec, uidMap, gidMap []user.IDMap) error {
|
||||
spec.Linux.Namespaces = append(spec.Linux.Namespaces, specs.LinuxNamespace{
|
||||
Type: specs.UserNamespace,
|
||||
})
|
||||
|
||||
sort.Slice(uidMap, func(i, j int) bool { return uidMap[i].ID < uidMap[j].ID })
|
||||
uNextContainerID := int64(0)
|
||||
for _, u := range uidMap {
|
||||
spec.Linux.UIDMappings = append(spec.Linux.UIDMappings,
|
||||
specs.LinuxIDMapping{
|
||||
HostID: uint32(u.ID),
|
||||
ContainerID: uint32(uNextContainerID),
|
||||
Size: uint32(u.Count),
|
||||
})
|
||||
uNextContainerID += int64(u.Count)
|
||||
}
|
||||
sort.Slice(gidMap, func(i, j int) bool { return gidMap[i].ID < gidMap[j].ID })
|
||||
gNextContainerID := int64(0)
|
||||
for _, g := range gidMap {
|
||||
spec.Linux.GIDMappings = append(spec.Linux.GIDMappings,
|
||||
specs.LinuxIDMapping{
|
||||
HostID: uint32(g.ID),
|
||||
ContainerID: uint32(gNextContainerID),
|
||||
Size: uint32(g.Count),
|
||||
})
|
||||
gNextContainerID += int64(g.Count)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func configureMounts(spec *specs.Spec) error {
|
||||
var mounts []specs.Mount
|
||||
for _, mount := range spec.Mounts {
|
||||
// Ignore all mounts that are under /sys, because we add /sys later.
|
||||
if strings.HasPrefix(mount.Destination, "/sys") {
|
||||
continue
|
||||
}
|
||||
|
||||
// Remove all gid= and uid= mappings.
|
||||
// Since we are already in userns, ideally we should not need to do this.
|
||||
// https://github.com/opencontainers/runc/issues/1837
|
||||
var options []string
|
||||
for _, option := range mount.Options {
|
||||
if !strings.HasPrefix(option, "gid=") && !strings.HasPrefix(option, "uid=") {
|
||||
options = append(options, option)
|
||||
}
|
||||
}
|
||||
mount.Options = options
|
||||
mounts = append(mounts, mount)
|
||||
}
|
||||
|
||||
// Add the sysfs mount as an rbind, because we can't mount /sys unless we have netns.
|
||||
// TODO: keep original /sys mount when we have netns.
|
||||
mounts = append(mounts, specs.Mount{
|
||||
Source: "/sys",
|
||||
Destination: "/sys",
|
||||
Type: "none",
|
||||
Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"},
|
||||
})
|
||||
spec.Mounts = mounts
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -1,42 +0,0 @@
|
|||
package specconv
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/specconv"
|
||||
"github.com/opencontainers/runc/libcontainer/user"
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestToRootless(t *testing.T) {
|
||||
spec := specconv.Example()
|
||||
uidMap := []user.IDMap{
|
||||
{
|
||||
ID: 0,
|
||||
ParentID: 4242,
|
||||
Count: 1,
|
||||
},
|
||||
{
|
||||
ID: 1,
|
||||
ParentID: 231072,
|
||||
Count: 65536,
|
||||
},
|
||||
}
|
||||
gidMap := uidMap
|
||||
expectedUIDMappings := []specs.LinuxIDMapping{
|
||||
{
|
||||
HostID: 0,
|
||||
ContainerID: 0,
|
||||
Size: 1,
|
||||
},
|
||||
{
|
||||
HostID: 1,
|
||||
ContainerID: 1,
|
||||
Size: 65536,
|
||||
},
|
||||
}
|
||||
err := toRootless(spec, uidMap, gidMap)
|
||||
require.NoError(t, err)
|
||||
require.EqualValues(t, expectedUIDMappings, spec.Linux.UIDMappings)
|
||||
}
|
|
@ -18,7 +18,7 @@ github.com/gogo/googleapis b23578765ee54ff6bceff57f397d833bf4ca6869
|
|||
github.com/golang/protobuf v1.1.0
|
||||
github.com/containerd/continuity bd77b46c8352f74eb12c85bdc01f4b90f69d66b4
|
||||
github.com/opencontainers/image-spec v1.0.1
|
||||
github.com/opencontainers/runc 00dc70017d222b178a002ed30e9321b12647af2d
|
||||
github.com/opencontainers/runc a00bf0190895aa465a5fbed0268888e2c8ddfe85
|
||||
github.com/Microsoft/go-winio v0.4.11
|
||||
github.com/containerd/fifo 3d5202aec260678c48179c56f40e6f38a095738c
|
||||
github.com/opencontainers/runtime-spec eba862dc2470385a233c7507392675cbeadf7353 # v1.0.1-45-geba862d
|
||||
|
@ -67,6 +67,3 @@ github.com/opentracing-contrib/go-stdlib b1a47cfbdd7543e70e9ef3e73d0802ad306cc1c
|
|||
# used by dockerfile tests
|
||||
gotest.tools v2.1.0
|
||||
github.com/google/go-cmp v0.2.0
|
||||
|
||||
# used by rootless spec conv test
|
||||
github.com/seccomp/libseccomp-golang 32f571b70023028bd57d9288c20efbcb237f3ce0
|
||||
|
|
|
@ -1,61 +0,0 @@
|
|||
package configs
|
||||
|
||||
import "fmt"
|
||||
|
||||
// blockIODevice holds major:minor format supported in blkio cgroup
|
||||
type blockIODevice struct {
|
||||
// Major is the device's major number
|
||||
Major int64 `json:"major"`
|
||||
// Minor is the device's minor number
|
||||
Minor int64 `json:"minor"`
|
||||
}
|
||||
|
||||
// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair
|
||||
type WeightDevice struct {
|
||||
blockIODevice
|
||||
// Weight is the bandwidth rate for the device, range is from 10 to 1000
|
||||
Weight uint16 `json:"weight"`
|
||||
// LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
|
||||
LeafWeight uint16 `json:"leafWeight"`
|
||||
}
|
||||
|
||||
// NewWeightDevice returns a configured WeightDevice pointer
|
||||
func NewWeightDevice(major, minor int64, weight, leafWeight uint16) *WeightDevice {
|
||||
wd := &WeightDevice{}
|
||||
wd.Major = major
|
||||
wd.Minor = minor
|
||||
wd.Weight = weight
|
||||
wd.LeafWeight = leafWeight
|
||||
return wd
|
||||
}
|
||||
|
||||
// WeightString formats the struct to be writable to the cgroup specific file
|
||||
func (wd *WeightDevice) WeightString() string {
|
||||
return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.Weight)
|
||||
}
|
||||
|
||||
// LeafWeightString formats the struct to be writable to the cgroup specific file
|
||||
func (wd *WeightDevice) LeafWeightString() string {
|
||||
return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.LeafWeight)
|
||||
}
|
||||
|
||||
// ThrottleDevice struct holds a `major:minor rate_per_second` pair
|
||||
type ThrottleDevice struct {
|
||||
blockIODevice
|
||||
// Rate is the IO rate limit per cgroup per device
|
||||
Rate uint64 `json:"rate"`
|
||||
}
|
||||
|
||||
// NewThrottleDevice returns a configured ThrottleDevice pointer
|
||||
func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice {
|
||||
td := &ThrottleDevice{}
|
||||
td.Major = major
|
||||
td.Minor = minor
|
||||
td.Rate = rate
|
||||
return td
|
||||
}
|
||||
|
||||
// String formats the struct to be writable to the cgroup specific file
|
||||
func (td *ThrottleDevice) String() string {
|
||||
return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate)
|
||||
}
|
|
@ -1,122 +0,0 @@
|
|||
package configs
|
||||
|
||||
type FreezerState string
|
||||
|
||||
const (
|
||||
Undefined FreezerState = ""
|
||||
Frozen FreezerState = "FROZEN"
|
||||
Thawed FreezerState = "THAWED"
|
||||
)
|
||||
|
||||
type Cgroup struct {
|
||||
// Deprecated, use Path instead
|
||||
Name string `json:"name,omitempty"`
|
||||
|
||||
// name of parent of cgroup or slice
|
||||
// Deprecated, use Path instead
|
||||
Parent string `json:"parent,omitempty"`
|
||||
|
||||
// Path specifies the path to cgroups that are created and/or joined by the container.
|
||||
// The path is assumed to be relative to the host system cgroup mountpoint.
|
||||
Path string `json:"path"`
|
||||
|
||||
// ScopePrefix describes prefix for the scope name
|
||||
ScopePrefix string `json:"scope_prefix"`
|
||||
|
||||
// Paths represent the absolute cgroups paths to join.
|
||||
// This takes precedence over Path.
|
||||
Paths map[string]string
|
||||
|
||||
// Resources contains various cgroups settings to apply
|
||||
*Resources
|
||||
}
|
||||
|
||||
type Resources struct {
|
||||
// If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list.
|
||||
// Deprecated
|
||||
AllowAllDevices *bool `json:"allow_all_devices,omitempty"`
|
||||
// Deprecated
|
||||
AllowedDevices []*Device `json:"allowed_devices,omitempty"`
|
||||
// Deprecated
|
||||
DeniedDevices []*Device `json:"denied_devices,omitempty"`
|
||||
|
||||
Devices []*Device `json:"devices"`
|
||||
|
||||
// Memory limit (in bytes)
|
||||
Memory int64 `json:"memory"`
|
||||
|
||||
// Memory reservation or soft_limit (in bytes)
|
||||
MemoryReservation int64 `json:"memory_reservation"`
|
||||
|
||||
// Total memory usage (memory + swap); set `-1` to enable unlimited swap
|
||||
MemorySwap int64 `json:"memory_swap"`
|
||||
|
||||
// Kernel memory limit (in bytes)
|
||||
KernelMemory int64 `json:"kernel_memory"`
|
||||
|
||||
// Kernel memory limit for TCP use (in bytes)
|
||||
KernelMemoryTCP int64 `json:"kernel_memory_tcp"`
|
||||
|
||||
// CPU shares (relative weight vs. other containers)
|
||||
CpuShares uint64 `json:"cpu_shares"`
|
||||
|
||||
// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
|
||||
CpuQuota int64 `json:"cpu_quota"`
|
||||
|
||||
// CPU period to be used for hardcapping (in usecs). 0 to use system default.
|
||||
CpuPeriod uint64 `json:"cpu_period"`
|
||||
|
||||
// How much time the CPU will use in realtime scheduling (in usecs).
|
||||
CpuRtRuntime int64 `json:"cpu_rt_quota"`
|
||||
|
||||
// CPU period to be used for realtime scheduling (in usecs).
|
||||
CpuRtPeriod uint64 `json:"cpu_rt_period"`
|
||||
|
||||
// CPU to use
|
||||
CpusetCpus string `json:"cpuset_cpus"`
|
||||
|
||||
// MEM to use
|
||||
CpusetMems string `json:"cpuset_mems"`
|
||||
|
||||
// Process limit; set <= `0` to disable limit.
|
||||
PidsLimit int64 `json:"pids_limit"`
|
||||
|
||||
// Specifies per cgroup weight, range is from 10 to 1000.
|
||||
BlkioWeight uint16 `json:"blkio_weight"`
|
||||
|
||||
// Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
|
||||
BlkioLeafWeight uint16 `json:"blkio_leaf_weight"`
|
||||
|
||||
// Weight per cgroup per device, can override BlkioWeight.
|
||||
BlkioWeightDevice []*WeightDevice `json:"blkio_weight_device"`
|
||||
|
||||
// IO read rate limit per cgroup per device, bytes per second.
|
||||
BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"`
|
||||
|
||||
// IO write rate limit per cgroup per device, bytes per second.
|
||||
BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"`
|
||||
|
||||
// IO read rate limit per cgroup per device, IO per second.
|
||||
BlkioThrottleReadIOPSDevice []*ThrottleDevice `json:"blkio_throttle_read_iops_device"`
|
||||
|
||||
// IO write rate limit per cgroup per device, IO per second.
|
||||
BlkioThrottleWriteIOPSDevice []*ThrottleDevice `json:"blkio_throttle_write_iops_device"`
|
||||
|
||||
// set the freeze value for the process
|
||||
Freezer FreezerState `json:"freezer"`
|
||||
|
||||
// Hugetlb limit (in bytes)
|
||||
HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"`
|
||||
|
||||
// Whether to disable OOM Killer
|
||||
OomKillDisable bool `json:"oom_kill_disable"`
|
||||
|
||||
// Tuning swappiness behaviour per cgroup
|
||||
MemorySwappiness *uint64 `json:"memory_swappiness"`
|
||||
|
||||
// Set priority of network traffic for container
|
||||
NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"`
|
||||
|
||||
// Set class identifier for container's network packets
|
||||
NetClsClassid uint32 `json:"net_cls_classid_u"`
|
||||
}
|
|
@ -1,6 +0,0 @@
|
|||
package configs
|
||||
|
||||
// TODO Windows: This can ultimately be entirely factored out on Windows as
|
||||
// cgroups are a Unix-specific construct.
|
||||
type Cgroup struct {
|
||||
}
|
|
@ -1,349 +0,0 @@
|
|||
package configs
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"time"
|
||||
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
|
||||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
type Rlimit struct {
|
||||
Type int `json:"type"`
|
||||
Hard uint64 `json:"hard"`
|
||||
Soft uint64 `json:"soft"`
|
||||
}
|
||||
|
||||
// IDMap represents UID/GID Mappings for User Namespaces.
|
||||
type IDMap struct {
|
||||
ContainerID int `json:"container_id"`
|
||||
HostID int `json:"host_id"`
|
||||
Size int `json:"size"`
|
||||
}
|
||||
|
||||
// Seccomp represents syscall restrictions
|
||||
// By default, only the native architecture of the kernel is allowed to be used
|
||||
// for syscalls. Additional architectures can be added by specifying them in
|
||||
// Architectures.
|
||||
type Seccomp struct {
|
||||
DefaultAction Action `json:"default_action"`
|
||||
Architectures []string `json:"architectures"`
|
||||
Syscalls []*Syscall `json:"syscalls"`
|
||||
}
|
||||
|
||||
// Action is taken upon rule match in Seccomp
|
||||
type Action int
|
||||
|
||||
const (
|
||||
Kill Action = iota + 1
|
||||
Errno
|
||||
Trap
|
||||
Allow
|
||||
Trace
|
||||
)
|
||||
|
||||
// Operator is a comparison operator to be used when matching syscall arguments in Seccomp
|
||||
type Operator int
|
||||
|
||||
const (
|
||||
EqualTo Operator = iota + 1
|
||||
NotEqualTo
|
||||
GreaterThan
|
||||
GreaterThanOrEqualTo
|
||||
LessThan
|
||||
LessThanOrEqualTo
|
||||
MaskEqualTo
|
||||
)
|
||||
|
||||
// Arg is a rule to match a specific syscall argument in Seccomp
|
||||
type Arg struct {
|
||||
Index uint `json:"index"`
|
||||
Value uint64 `json:"value"`
|
||||
ValueTwo uint64 `json:"value_two"`
|
||||
Op Operator `json:"op"`
|
||||
}
|
||||
|
||||
// Syscall is a rule to match a syscall in Seccomp
|
||||
type Syscall struct {
|
||||
Name string `json:"name"`
|
||||
Action Action `json:"action"`
|
||||
Args []*Arg `json:"args"`
|
||||
}
|
||||
|
||||
// TODO Windows. Many of these fields should be factored out into those parts
|
||||
// which are common across platforms, and those which are platform specific.
|
||||
|
||||
// Config defines configuration options for executing a process inside a contained environment.
|
||||
type Config struct {
|
||||
// NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs
|
||||
// This is a common option when the container is running in ramdisk
|
||||
NoPivotRoot bool `json:"no_pivot_root"`
|
||||
|
||||
// ParentDeathSignal specifies the signal that is sent to the container's process in the case
|
||||
// that the parent process dies.
|
||||
ParentDeathSignal int `json:"parent_death_signal"`
|
||||
|
||||
// Path to a directory containing the container's root filesystem.
|
||||
Rootfs string `json:"rootfs"`
|
||||
|
||||
// Readonlyfs will remount the container's rootfs as readonly where only externally mounted
|
||||
// bind mounts are writable.
|
||||
Readonlyfs bool `json:"readonlyfs"`
|
||||
|
||||
// Specifies the mount propagation flags to be applied to /.
|
||||
RootPropagation int `json:"rootPropagation"`
|
||||
|
||||
// Mounts specify additional source and destination paths that will be mounted inside the container's
|
||||
// rootfs and mount namespace if specified
|
||||
Mounts []*Mount `json:"mounts"`
|
||||
|
||||
// The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well!
|
||||
Devices []*Device `json:"devices"`
|
||||
|
||||
MountLabel string `json:"mount_label"`
|
||||
|
||||
// Hostname optionally sets the container's hostname if provided
|
||||
Hostname string `json:"hostname"`
|
||||
|
||||
// Namespaces specifies the container's namespaces that it should setup when cloning the init process
|
||||
// If a namespace is not provided that namespace is shared from the container's parent process
|
||||
Namespaces Namespaces `json:"namespaces"`
|
||||
|
||||
// Capabilities specify the capabilities to keep when executing the process inside the container
|
||||
// All capabilities not specified will be dropped from the processes capability mask
|
||||
Capabilities *Capabilities `json:"capabilities"`
|
||||
|
||||
// Networks specifies the container's network setup to be created
|
||||
Networks []*Network `json:"networks"`
|
||||
|
||||
// Routes can be specified to create entries in the route table as the container is started
|
||||
Routes []*Route `json:"routes"`
|
||||
|
||||
// Cgroups specifies specific cgroup settings for the various subsystems that the container is
|
||||
// placed into to limit the resources the container has available
|
||||
Cgroups *Cgroup `json:"cgroups"`
|
||||
|
||||
// AppArmorProfile specifies the profile to apply to the process running in the container and is
|
||||
// changed at the time the process is execed
|
||||
AppArmorProfile string `json:"apparmor_profile,omitempty"`
|
||||
|
||||
// ProcessLabel specifies the label to apply to the process running in the container. It is
|
||||
// commonly used by selinux
|
||||
ProcessLabel string `json:"process_label,omitempty"`
|
||||
|
||||
// Rlimits specifies the resource limits, such as max open files, to set in the container
|
||||
// If Rlimits are not set, the container will inherit rlimits from the parent process
|
||||
Rlimits []Rlimit `json:"rlimits,omitempty"`
|
||||
|
||||
// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
|
||||
// for a process. Valid values are in the range [-1000, 1000], where processes with
|
||||
// higher scores are preferred for being killed. If it is unset then we don't touch the current
|
||||
// value.
|
||||
// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
|
||||
OomScoreAdj *int `json:"oom_score_adj,omitempty"`
|
||||
|
||||
// UidMappings is an array of User ID mappings for User Namespaces
|
||||
UidMappings []IDMap `json:"uid_mappings"`
|
||||
|
||||
// GidMappings is an array of Group ID mappings for User Namespaces
|
||||
GidMappings []IDMap `json:"gid_mappings"`
|
||||
|
||||
// MaskPaths specifies paths within the container's rootfs to mask over with a bind
|
||||
// mount pointing to /dev/null as to prevent reads of the file.
|
||||
MaskPaths []string `json:"mask_paths"`
|
||||
|
||||
// ReadonlyPaths specifies paths within the container's rootfs to remount as read-only
|
||||
// so that these files prevent any writes.
|
||||
ReadonlyPaths []string `json:"readonly_paths"`
|
||||
|
||||
// Sysctl is a map of properties and their values. It is the equivalent of using
|
||||
// sysctl -w my.property.name value in Linux.
|
||||
Sysctl map[string]string `json:"sysctl"`
|
||||
|
||||
// Seccomp allows actions to be taken whenever a syscall is made within the container.
|
||||
// A number of rules are given, each having an action to be taken if a syscall matches it.
|
||||
// A default action to be taken if no rules match is also given.
|
||||
Seccomp *Seccomp `json:"seccomp"`
|
||||
|
||||
// NoNewPrivileges controls whether processes in the container can gain additional privileges.
|
||||
NoNewPrivileges bool `json:"no_new_privileges,omitempty"`
|
||||
|
||||
// Hooks are a collection of actions to perform at various container lifecycle events.
|
||||
// CommandHooks are serialized to JSON, but other hooks are not.
|
||||
Hooks *Hooks
|
||||
|
||||
// Version is the version of opencontainer specification that is supported.
|
||||
Version string `json:"version"`
|
||||
|
||||
// Labels are user defined metadata that is stored in the config and populated on the state
|
||||
Labels []string `json:"labels"`
|
||||
|
||||
// NoNewKeyring will not allocate a new session keyring for the container. It will use the
|
||||
// caller's keyring in this case.
|
||||
NoNewKeyring bool `json:"no_new_keyring"`
|
||||
|
||||
// Rootless specifies whether the container is a rootless container.
|
||||
Rootless bool `json:"rootless"`
|
||||
|
||||
// IntelRdt specifies settings for Intel RDT/CAT group that the container is placed into
|
||||
// to limit the resources (e.g., L3 cache) the container has available
|
||||
IntelRdt *IntelRdt `json:"intel_rdt,omitempty"`
|
||||
}
|
||||
|
||||
type Hooks struct {
|
||||
// Prestart commands are executed after the container namespaces are created,
|
||||
// but before the user supplied command is executed from init.
|
||||
Prestart []Hook
|
||||
|
||||
// Poststart commands are executed after the container init process starts.
|
||||
Poststart []Hook
|
||||
|
||||
// Poststop commands are executed after the container init process exits.
|
||||
Poststop []Hook
|
||||
}
|
||||
|
||||
type Capabilities struct {
|
||||
// Bounding is the set of capabilities checked by the kernel.
|
||||
Bounding []string
|
||||
// Effective is the set of capabilities checked by the kernel.
|
||||
Effective []string
|
||||
// Inheritable is the capabilities preserved across execve.
|
||||
Inheritable []string
|
||||
// Permitted is the limiting superset for effective capabilities.
|
||||
Permitted []string
|
||||
// Ambient is the ambient set of capabilities that are kept.
|
||||
Ambient []string
|
||||
}
|
||||
|
||||
func (hooks *Hooks) UnmarshalJSON(b []byte) error {
|
||||
var state struct {
|
||||
Prestart []CommandHook
|
||||
Poststart []CommandHook
|
||||
Poststop []CommandHook
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(b, &state); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
deserialize := func(shooks []CommandHook) (hooks []Hook) {
|
||||
for _, shook := range shooks {
|
||||
hooks = append(hooks, shook)
|
||||
}
|
||||
|
||||
return hooks
|
||||
}
|
||||
|
||||
hooks.Prestart = deserialize(state.Prestart)
|
||||
hooks.Poststart = deserialize(state.Poststart)
|
||||
hooks.Poststop = deserialize(state.Poststop)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (hooks Hooks) MarshalJSON() ([]byte, error) {
|
||||
serialize := func(hooks []Hook) (serializableHooks []CommandHook) {
|
||||
for _, hook := range hooks {
|
||||
switch chook := hook.(type) {
|
||||
case CommandHook:
|
||||
serializableHooks = append(serializableHooks, chook)
|
||||
default:
|
||||
logrus.Warnf("cannot serialize hook of type %T, skipping", hook)
|
||||
}
|
||||
}
|
||||
|
||||
return serializableHooks
|
||||
}
|
||||
|
||||
return json.Marshal(map[string]interface{}{
|
||||
"prestart": serialize(hooks.Prestart),
|
||||
"poststart": serialize(hooks.Poststart),
|
||||
"poststop": serialize(hooks.Poststop),
|
||||
})
|
||||
}
|
||||
|
||||
// HookState is the payload provided to a hook on execution.
|
||||
type HookState specs.State
|
||||
|
||||
type Hook interface {
|
||||
// Run executes the hook with the provided state.
|
||||
Run(HookState) error
|
||||
}
|
||||
|
||||
// NewFunctionHook will call the provided function when the hook is run.
|
||||
func NewFunctionHook(f func(HookState) error) FuncHook {
|
||||
return FuncHook{
|
||||
run: f,
|
||||
}
|
||||
}
|
||||
|
||||
type FuncHook struct {
|
||||
run func(HookState) error
|
||||
}
|
||||
|
||||
func (f FuncHook) Run(s HookState) error {
|
||||
return f.run(s)
|
||||
}
|
||||
|
||||
type Command struct {
|
||||
Path string `json:"path"`
|
||||
Args []string `json:"args"`
|
||||
Env []string `json:"env"`
|
||||
Dir string `json:"dir"`
|
||||
Timeout *time.Duration `json:"timeout"`
|
||||
}
|
||||
|
||||
// NewCommandHook will execute the provided command when the hook is run.
|
||||
func NewCommandHook(cmd Command) CommandHook {
|
||||
return CommandHook{
|
||||
Command: cmd,
|
||||
}
|
||||
}
|
||||
|
||||
type CommandHook struct {
|
||||
Command
|
||||
}
|
||||
|
||||
func (c Command) Run(s HookState) error {
|
||||
b, err := json.Marshal(s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
var stdout, stderr bytes.Buffer
|
||||
cmd := exec.Cmd{
|
||||
Path: c.Path,
|
||||
Args: c.Args,
|
||||
Env: c.Env,
|
||||
Stdin: bytes.NewReader(b),
|
||||
Stdout: &stdout,
|
||||
Stderr: &stderr,
|
||||
}
|
||||
if err := cmd.Start(); err != nil {
|
||||
return err
|
||||
}
|
||||
errC := make(chan error, 1)
|
||||
go func() {
|
||||
err := cmd.Wait()
|
||||
if err != nil {
|
||||
err = fmt.Errorf("error running hook: %v, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
|
||||
}
|
||||
errC <- err
|
||||
}()
|
||||
var timerCh <-chan time.Time
|
||||
if c.Timeout != nil {
|
||||
timer := time.NewTimer(*c.Timeout)
|
||||
defer timer.Stop()
|
||||
timerCh = timer.C
|
||||
}
|
||||
select {
|
||||
case err := <-errC:
|
||||
return err
|
||||
case <-timerCh:
|
||||
cmd.Process.Kill()
|
||||
cmd.Wait()
|
||||
return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
|
||||
}
|
||||
}
|
|
@ -1,61 +0,0 @@
|
|||
package configs
|
||||
|
||||
import "fmt"
|
||||
|
||||
// HostUID gets the translated uid for the process on host which could be
|
||||
// different when user namespaces are enabled.
|
||||
func (c Config) HostUID(containerId int) (int, error) {
|
||||
if c.Namespaces.Contains(NEWUSER) {
|
||||
if c.UidMappings == nil {
|
||||
return -1, fmt.Errorf("User namespaces enabled, but no uid mappings found.")
|
||||
}
|
||||
id, found := c.hostIDFromMapping(containerId, c.UidMappings)
|
||||
if !found {
|
||||
return -1, fmt.Errorf("User namespaces enabled, but no user mapping found.")
|
||||
}
|
||||
return id, nil
|
||||
}
|
||||
// Return unchanged id.
|
||||
return containerId, nil
|
||||
}
|
||||
|
||||
// HostRootUID gets the root uid for the process on host which could be non-zero
// when user namespaces are enabled. It is HostUID applied to container uid 0.
func (c Config) HostRootUID() (int, error) {
	return c.HostUID(0)
}
|
||||
|
||||
// HostGID gets the translated gid for the process on host which could be
|
||||
// different when user namespaces are enabled.
|
||||
func (c Config) HostGID(containerId int) (int, error) {
|
||||
if c.Namespaces.Contains(NEWUSER) {
|
||||
if c.GidMappings == nil {
|
||||
return -1, fmt.Errorf("User namespaces enabled, but no gid mappings found.")
|
||||
}
|
||||
id, found := c.hostIDFromMapping(containerId, c.GidMappings)
|
||||
if !found {
|
||||
return -1, fmt.Errorf("User namespaces enabled, but no group mapping found.")
|
||||
}
|
||||
return id, nil
|
||||
}
|
||||
// Return unchanged id.
|
||||
return containerId, nil
|
||||
}
|
||||
|
||||
// HostRootGID gets the root gid for the process on host which could be non-zero
// when user namespaces are enabled. It is HostGID applied to container gid 0.
func (c Config) HostRootGID() (int, error) {
	return c.HostGID(0)
}
|
||||
|
||||
// Utility function that gets a host ID for a container ID from user namespace map
|
||||
// if that ID is present in the map.
|
||||
func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) {
|
||||
for _, m := range uMap {
|
||||
if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) {
|
||||
hostID := m.HostID + (containerID - m.ContainerID)
|
||||
return hostID, true
|
||||
}
|
||||
}
|
||||
return -1, false
|
||||
}
|
|
@ -1,57 +0,0 @@
|
|||
package configs
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
)
|
||||
|
||||
const (
	// Wildcard is the device number that deviceNumberString renders as "*".
	Wildcard = -1
)
|
||||
|
||||
// TODO Windows: This can be factored out in the future
|
||||
|
||||
// Device describes a device node together with its cgroup permissions.
type Device struct {
	// Device type, block, char, etc.
	Type rune `json:"type"`

	// Path to the device.
	Path string `json:"path"`

	// Major is the device's major number.
	Major int64 `json:"major"`

	// Minor is the device's minor number.
	Minor int64 `json:"minor"`

	// Cgroup permissions format, rwm.
	Permissions string `json:"permissions"`

	// FileMode permission bits for the device.
	FileMode os.FileMode `json:"file_mode"`

	// Uid of the device.
	Uid uint32 `json:"uid"`

	// Gid of the device.
	Gid uint32 `json:"gid"`

	// Write the file to the allowed list
	Allow bool `json:"allow"`
}
|
||||
|
||||
// CgroupString formats the device as "<type> <major>:<minor> <permissions>".
// Major and minor go through deviceNumberString, so Wildcard appears as "*".
func (d *Device) CgroupString() string {
	return fmt.Sprintf("%c %s:%s %s", d.Type, deviceNumberString(d.Major), deviceNumberString(d.Minor), d.Permissions)
}
|
||||
|
||||
// Mkdev packs Major and Minor into a single device number: Major shifted left
// by 8 bits, the low 8 bits of Minor in place, and bits 8-19 of Minor shifted
// left by a further 12 bits.
// NOTE(review): Major is not masked and Wildcard values are not special-cased
// here; confirm callers only pass concrete device numbers.
func (d *Device) Mkdev() int {
	return int((d.Major << 8) | (d.Minor & 0xff) | ((d.Minor & 0xfff00) << 12))
}
|
||||
|
||||
// deviceNumberString converts the device number to a string return result.
|
||||
func deviceNumberString(number int64) string {
|
||||
if number == Wildcard {
|
||||
return "*"
|
||||
}
|
||||
return fmt.Sprint(number)
|
||||
}
|
111
vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go
generated
vendored
111
vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go
generated
vendored
|
@ -1,111 +0,0 @@
|
|||
// +build linux
|
||||
|
||||
package configs
|
||||
|
||||
var (
	// DefaultSimpleDevices are devices that are to be both allowed and created.
	DefaultSimpleDevices = []*Device{
		// /dev/null and zero
		{
			Path:        "/dev/null",
			Type:        'c',
			Major:       1,
			Minor:       3,
			Permissions: "rwm",
			FileMode:    0666,
		},
		{
			Path:        "/dev/zero",
			Type:        'c',
			Major:       1,
			Minor:       5,
			Permissions: "rwm",
			FileMode:    0666,
		},

		{
			Path:        "/dev/full",
			Type:        'c',
			Major:       1,
			Minor:       7,
			Permissions: "rwm",
			FileMode:    0666,
		},

		// consoles and ttys
		{
			Path:        "/dev/tty",
			Type:        'c',
			Major:       5,
			Minor:       0,
			Permissions: "rwm",
			FileMode:    0666,
		},

		// /dev/urandom,/dev/random
		{
			Path:        "/dev/urandom",
			Type:        'c',
			Major:       1,
			Minor:       9,
			Permissions: "rwm",
			FileMode:    0666,
		},
		{
			Path:        "/dev/random",
			Type:        'c',
			Major:       1,
			Minor:       8,
			Permissions: "rwm",
			FileMode:    0666,
		},
	}
	// DefaultAllowedDevices is the list of entries below followed by every
	// entry of DefaultSimpleDevices.
	DefaultAllowedDevices = append([]*Device{
		// allow mknod for any device
		{
			Type:        'c',
			Major:       Wildcard,
			Minor:       Wildcard,
			Permissions: "m",
		},
		{
			Type:        'b',
			Major:       Wildcard,
			Minor:       Wildcard,
			Permissions: "m",
		},

		{
			Path:        "/dev/console",
			Type:        'c',
			Major:       5,
			Minor:       1,
			Permissions: "rwm",
		},
		// /dev/pts/ - pts namespaces are "coming soon"
		{
			Path:        "",
			Type:        'c',
			Major:       136,
			Minor:       Wildcard,
			Permissions: "rwm",
		},
		{
			Path:        "",
			Type:        'c',
			Major:       5,
			Minor:       2,
			Permissions: "rwm",
		},

		// tuntap
		{
			Path:        "",
			Type:        'c',
			Major:       10,
			Minor:       200,
			Permissions: "rwm",
		},
	}, DefaultSimpleDevices...)
	// DefaultAutoCreatedDevices is a separate slice holding the same *Device
	// entries as DefaultSimpleDevices.
	DefaultAutoCreatedDevices = append([]*Device{}, DefaultSimpleDevices...)
)
|
|
@ -1,9 +0,0 @@
|
|||
package configs
|
||||
|
||||
// HugepageLimit pairs a hugepage size with a usage limit for it.
type HugepageLimit struct {
	// which type of hugepage to limit.
	Pagesize string `json:"page_size"`

	// usage limit for hugepage.
	Limit uint64 `json:"limit"`
}
|
|
@ -1,7 +0,0 @@
|
|||
package configs
|
||||
|
||||
// IntelRdt holds the Intel RDT setting carried in this configuration; the only
// field is the L3 cache schema.
type IntelRdt struct {
	// The schema for L3 cache id and capacity bitmask (CBM)
	// Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
	L3CacheSchema string `json:"l3_cache_schema,omitempty"`
}
|
14
vendor/github.com/opencontainers/runc/libcontainer/configs/interface_priority_map.go
generated
vendored
14
vendor/github.com/opencontainers/runc/libcontainer/configs/interface_priority_map.go
generated
vendored
|
@ -1,14 +0,0 @@
|
|||
package configs
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
)
|
||||
|
||||
// IfPrioMap associates a network interface name with a priority value.
type IfPrioMap struct {
	Interface string `json:"interface"`
	Priority  int64  `json:"priority"`
}

// CgroupString renders the entry as "<interface> <priority>", with the
// priority in decimal.
func (i *IfPrioMap) CgroupString() string {
	// The explicit " " operand is a string, so Sprint inserts no extra spaces.
	return fmt.Sprint(i.Interface, " ", i.Priority)
}
|
|
@ -1,39 +0,0 @@
|
|||
package configs
|
||||
|
||||
const (
	// EXT_COPYUP is a directive to copy up the contents of a directory when
	// a tmpfs is mounted over it. As the first constant of this block, 1 << iota
	// gives it the value 1.
	EXT_COPYUP = 1 << iota
)
|
||||
|
||||
// Mount describes a single mount: where it comes from, where it goes inside
// the container, and the flags, data and commands that accompany it.
type Mount struct {
	// Source path for the mount.
	Source string `json:"source"`

	// Destination path for the mount inside the container.
	Destination string `json:"destination"`

	// Device the mount is for.
	Device string `json:"device"`

	// Mount flags.
	Flags int `json:"flags"`

	// Propagation Flags
	PropagationFlags []int `json:"propagation_flags"`

	// Mount data applied to the mount.
	Data string `json:"data"`

	// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
	Relabel string `json:"relabel"`

	// Extensions are additional flags that are specific to runc.
	Extensions int `json:"extensions"`

	// Optional Command to be run before Source is mounted.
	PremountCmds []Command `json:"premount_cmds"`

	// Optional Command to be run after Source is mounted.
	PostmountCmds []Command `json:"postmount_cmds"`
}
|
|
@ -1,5 +0,0 @@
|
|||
package configs
|
||||
|
||||
// NamespaceType is the string identifier of a kind of namespace.
type NamespaceType string
|
||||
|
||||
// Namespaces is a list of Namespace entries.
type Namespaces []Namespace
|
122
vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go
generated
vendored
122
vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go
generated
vendored
|
@ -1,122 +0,0 @@
|
|||
package configs
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// Namespace type identifiers. NsName maps each of them to the file name used
// under /proc/<pid>/ns.
const (
	NEWNET  NamespaceType = "NEWNET"
	NEWPID  NamespaceType = "NEWPID"
	NEWNS   NamespaceType = "NEWNS"
	NEWUTS  NamespaceType = "NEWUTS"
	NEWIPC  NamespaceType = "NEWIPC"
	NEWUSER NamespaceType = "NEWUSER"
)
|
||||
|
||||
var (
	// nsLock is held by IsNamespaceSupported while it uses supportedNamespaces.
	nsLock sync.Mutex
	// supportedNamespaces caches the result of IsNamespaceSupported per type.
	supportedNamespaces = make(map[NamespaceType]bool)
)
|
||||
|
||||
// NsName converts the namespace type to its filename
|
||||
func NsName(ns NamespaceType) string {
|
||||
switch ns {
|
||||
case NEWNET:
|
||||
return "net"
|
||||
case NEWNS:
|
||||
return "mnt"
|
||||
case NEWPID:
|
||||
return "pid"
|
||||
case NEWIPC:
|
||||
return "ipc"
|
||||
case NEWUSER:
|
||||
return "user"
|
||||
case NEWUTS:
|
||||
return "uts"
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// IsNamespaceSupported returns whether a namespace is available or
|
||||
// not
|
||||
func IsNamespaceSupported(ns NamespaceType) bool {
|
||||
nsLock.Lock()
|
||||
defer nsLock.Unlock()
|
||||
supported, ok := supportedNamespaces[ns]
|
||||
if ok {
|
||||
return supported
|
||||
}
|
||||
nsFile := NsName(ns)
|
||||
// if the namespace type is unknown, just return false
|
||||
if nsFile == "" {
|
||||
return false
|
||||
}
|
||||
_, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile))
|
||||
// a namespace is supported if it exists and we have permissions to read it
|
||||
supported = err == nil
|
||||
supportedNamespaces[ns] = supported
|
||||
return supported
|
||||
}
|
||||
|
||||
// NamespaceTypes returns the namespace types in a fixed order, with NEWUSER
// first. A fresh slice is returned on every call.
func NamespaceTypes() []NamespaceType {
	return []NamespaceType{
		NEWUSER, // Keep user NS always first, don't move it.
		NEWIPC,
		NEWUTS,
		NEWNET,
		NEWPID,
		NEWNS,
	}
}
|
||||
|
||||
// Namespace defines configuration for each namespace. It specifies an
// alternate path that is able to be joined via setns.
type Namespace struct {
	// Type identifies which kind of namespace this entry configures.
	Type NamespaceType `json:"type"`
	// Path is the alternate path; CloneFlags skips entries whose Path is set.
	Path string `json:"path"`
}
|
||||
|
||||
// GetPath returns "/proc/<pid>/ns/<name>" for this namespace's type, where
// <name> comes from NsName. The Path field is not consulted.
func (n *Namespace) GetPath(pid int) string {
	return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type))
}
|
||||
|
||||
func (n *Namespaces) Remove(t NamespaceType) bool {
|
||||
i := n.index(t)
|
||||
if i == -1 {
|
||||
return false
|
||||
}
|
||||
*n = append((*n)[:i], (*n)[i+1:]...)
|
||||
return true
|
||||
}
|
||||
|
||||
func (n *Namespaces) Add(t NamespaceType, path string) {
|
||||
i := n.index(t)
|
||||
if i == -1 {
|
||||
*n = append(*n, Namespace{Type: t, Path: path})
|
||||
return
|
||||
}
|
||||
(*n)[i].Path = path
|
||||
}
|
||||
|
||||
func (n *Namespaces) index(t NamespaceType) int {
|
||||
for i, ns := range *n {
|
||||
if ns.Type == t {
|
||||
return i
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// Contains reports whether the list has an entry of type t.
func (n *Namespaces) Contains(t NamespaceType) bool {
	return n.index(t) != -1
}
|
||||
|
||||
func (n *Namespaces) PathOf(t NamespaceType) string {
|
||||
i := n.index(t)
|
||||
if i == -1 {
|
||||
return ""
|
||||
}
|
||||
return (*n)[i].Path
|
||||
}
|
31
vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go
generated
vendored
31
vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go
generated
vendored
|
@ -1,31 +0,0 @@
|
|||
// +build linux
|
||||
|
||||
package configs
|
||||
|
||||
import "golang.org/x/sys/unix"
|
||||
|
||||
// Syscall returns the clone flag registered in namespaceInfo for this
// namespace's type, or 0 when the type has no entry there.
func (n *Namespace) Syscall() int {
	return namespaceInfo[n.Type]
}
|
||||
|
||||
// namespaceInfo maps each namespace type to its unix.CLONE_NEW* flag.
var namespaceInfo = map[NamespaceType]int{
	NEWNET:  unix.CLONE_NEWNET,
	NEWNS:   unix.CLONE_NEWNS,
	NEWUSER: unix.CLONE_NEWUSER,
	NEWIPC:  unix.CLONE_NEWIPC,
	NEWUTS:  unix.CLONE_NEWUTS,
	NEWPID:  unix.CLONE_NEWPID,
}
|
||||
|
||||
// CloneFlags parses the container's Namespaces options to set the correct
|
||||
// flags on clone, unshare. This function returns flags only for new namespaces.
|
||||
func (n *Namespaces) CloneFlags() uintptr {
|
||||
var flag int
|
||||
for _, v := range *n {
|
||||
if v.Path != "" {
|
||||
continue
|
||||
}
|
||||
flag |= namespaceInfo[v.Type]
|
||||
}
|
||||
return uintptr(flag)
|
||||
}
|
|
@ -1,13 +0,0 @@
|
|||
// +build !linux,!windows
|
||||
|
||||
package configs
|
||||
|
||||
// Syscall always panics in this build: there is no namespace syscall support.
func (n *Namespace) Syscall() int {
	panic("No namespace syscall support")
}
|
||||
|
||||
// CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. This function returns flags only for new namespaces.
// In this build it always panics, because namespace syscalls are unsupported.
func (n *Namespaces) CloneFlags() uintptr {
	panic("No namespace syscall support")
}
|
8
vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go
generated
vendored
8
vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go
generated
vendored
|
@ -1,8 +0,0 @@
|
|||
// +build !linux
|
||||
|
||||
package configs
|
||||
|
||||
// Namespace defines configuration for each namespace. It specifies an
// alternate path that is able to be joined via setns.
// In this build the type has no fields.
type Namespace struct {
}
|
|
@ -1,72 +0,0 @@
|
|||
package configs
|
||||
|
||||
// Network defines configuration for a container's networking stack
//
// The network configuration can be omitted from a container causing the
// container to be setup with the host's networking stack
type Network struct {
	// Type sets the networks type, commonly veth and loopback
	Type string `json:"type"`

	// Name of the network interface
	Name string `json:"name"`

	// The bridge to use.
	Bridge string `json:"bridge"`

	// MacAddress contains the MAC address to set on the network interface
	MacAddress string `json:"mac_address"`

	// Address contains the IPv4 and mask to set on the network interface
	Address string `json:"address"`

	// Gateway sets the gateway address that is used as the default for the interface
	Gateway string `json:"gateway"`

	// IPv6Address contains the IPv6 and mask to set on the network interface
	IPv6Address string `json:"ipv6_address"`

	// IPv6Gateway sets the ipv6 gateway address that is used as the default for the interface
	IPv6Gateway string `json:"ipv6_gateway"`

	// Mtu sets the mtu value for the interface and will be mirrored on both the host and
	// container's interfaces if a pair is created, specifically in the case of type veth
	// Note: This does not apply to loopback interfaces.
	Mtu int `json:"mtu"`

	// TxQueueLen sets the tx_queuelen value for the interface and will be mirrored on both the host and
	// container's interfaces if a pair is created, specifically in the case of type veth
	// Note: This does not apply to loopback interfaces.
	TxQueueLen int `json:"txqueuelen"`

	// HostInterfaceName is a unique name of a veth pair that resides on the host side of the
	// container.
	HostInterfaceName string `json:"host_interface_name"`

	// HairpinMode specifies if hairpin NAT should be enabled on the virtual interface
	// bridge port in the case of type veth
	// Note: This is unsupported on some systems.
	// Note: This does not apply to loopback interfaces.
	HairpinMode bool `json:"hairpin_mode"`
}
|
||||
|
||||
// Route describes an entry to create in the route table as the container is
// started.
//
// All of destination, source, and gateway should be either IPv4 or IPv6.
// One of the three options must be present, and omitted entries will use their
// IP family default for the route table. For IPv4 for example, setting the
// gateway to 1.2.3.4 and the interface to eth0 will set up a standard
// destination of 0.0.0.0(or *) when viewed in the route table.
type Route struct {
	// Sets the destination and mask, should be a CIDR. Accepts IPv4 and IPv6
	Destination string `json:"destination"`

	// Sets the source and mask, should be a CIDR. Accepts IPv4 and IPv6
	Source string `json:"source"`

	// Sets the gateway. Accepts IPv4 and IPv6
	Gateway string `json:"gateway"`

	// The device to set this route up for, for example: eth0
	InterfaceName string `json:"interface_name"`
}
|
|
@ -82,7 +82,7 @@ struct nlconfig_t {
|
|||
uint8_t is_setgroup;
|
||||
|
||||
/* Rootless container settings. */
|
||||
uint8_t is_rootless;
|
||||
uint8_t is_rootless_euid; /* boolean */
|
||||
char *uidmappath;
|
||||
size_t uidmappath_len;
|
||||
char *gidmappath;
|
||||
|
@ -100,7 +100,7 @@ struct nlconfig_t {
|
|||
#define GIDMAP_ATTR 27284
|
||||
#define SETGROUP_ATTR 27285
|
||||
#define OOM_SCORE_ADJ_ATTR 27286
|
||||
#define ROOTLESS_ATTR 27287
|
||||
#define ROOTLESS_EUID_ATTR 27287
|
||||
#define UIDMAPPATH_ATTR 27288
|
||||
#define GIDMAPPATH_ATTR 27289
|
||||
|
||||
|
@ -419,8 +419,8 @@ static void nl_parse(int fd, struct nlconfig_t *config)
|
|||
case CLONE_FLAGS_ATTR:
|
||||
config->cloneflags = readint32(current);
|
||||
break;
|
||||
case ROOTLESS_ATTR:
|
||||
config->is_rootless = readint8(current);
|
||||
case ROOTLESS_EUID_ATTR:
|
||||
config->is_rootless_euid = readint8(current); /* boolean */
|
||||
break;
|
||||
case OOM_SCORE_ADJ_ATTR:
|
||||
config->oom_score_adj = current;
|
||||
|
@ -687,7 +687,7 @@ void nsexec(void)
|
|||
* newuidmap/newgidmap shall be used.
|
||||
*/
|
||||
|
||||
if (config.is_rootless && !config.is_setgroup)
|
||||
if (config.is_rootless_euid && !config.is_setgroup)
|
||||
update_setgroups(child, SETGROUPS_DENY);
|
||||
|
||||
/* Set up mappings. */
|
||||
|
@ -953,7 +953,7 @@ void nsexec(void)
|
|||
if (setgid(0) < 0)
|
||||
bail("setgid failed");
|
||||
|
||||
if (!config.is_rootless && config.is_setgroup) {
|
||||
if (!config.is_rootless_euid && config.is_setgroup) {
|
||||
if (setgroups(0, NULL) < 0)
|
||||
bail("setgroups failed");
|
||||
}
|
||||
|
|
|
@ -1,76 +0,0 @@
|
|||
package seccomp
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
)
|
||||
|
||||
// operators maps SCMP_CMP_* names to the corresponding configs.Operator.
// It is the lookup table behind ConvertStringToOperator.
var operators = map[string]configs.Operator{
	"SCMP_CMP_NE":        configs.NotEqualTo,
	"SCMP_CMP_LT":        configs.LessThan,
	"SCMP_CMP_LE":        configs.LessThanOrEqualTo,
	"SCMP_CMP_EQ":        configs.EqualTo,
	"SCMP_CMP_GE":        configs.GreaterThanOrEqualTo,
	"SCMP_CMP_GT":        configs.GreaterThan,
	"SCMP_CMP_MASKED_EQ": configs.MaskEqualTo,
}
|
||||
|
||||
// actions maps SCMP_ACT_* names to the corresponding configs.Action.
// It is the lookup table behind ConvertStringToAction.
var actions = map[string]configs.Action{
	"SCMP_ACT_KILL":  configs.Kill,
	"SCMP_ACT_ERRNO": configs.Errno,
	"SCMP_ACT_TRAP":  configs.Trap,
	"SCMP_ACT_ALLOW": configs.Allow,
	"SCMP_ACT_TRACE": configs.Trace,
}
|
||||
|
||||
// archs maps SCMP_ARCH_* names to the architecture strings returned by
// ConvertStringToArch.
var archs = map[string]string{
	"SCMP_ARCH_X86":         "x86",
	"SCMP_ARCH_X86_64":      "amd64",
	"SCMP_ARCH_X32":         "x32",
	"SCMP_ARCH_ARM":         "arm",
	"SCMP_ARCH_AARCH64":     "arm64",
	"SCMP_ARCH_MIPS":        "mips",
	"SCMP_ARCH_MIPS64":      "mips64",
	"SCMP_ARCH_MIPS64N32":   "mips64n32",
	"SCMP_ARCH_MIPSEL":      "mipsel",
	"SCMP_ARCH_MIPSEL64":    "mipsel64",
	"SCMP_ARCH_MIPSEL64N32": "mipsel64n32",
	"SCMP_ARCH_PPC":         "ppc",
	"SCMP_ARCH_PPC64":       "ppc64",
	"SCMP_ARCH_PPC64LE":     "ppc64le",
	"SCMP_ARCH_S390":        "s390",
	"SCMP_ARCH_S390X":       "s390x",
}
|
||||
|
||||
// ConvertStringToOperator converts a string into a Seccomp comparison operator.
|
||||
// Comparison operators use the names they are assigned by Libseccomp's header.
|
||||
// Attempting to convert a string that is not a valid operator results in an
|
||||
// error.
|
||||
func ConvertStringToOperator(in string) (configs.Operator, error) {
|
||||
if op, ok := operators[in]; ok == true {
|
||||
return op, nil
|
||||
}
|
||||
return 0, fmt.Errorf("string %s is not a valid operator for seccomp", in)
|
||||
}
|
||||
|
||||
// ConvertStringToAction converts a string into a Seccomp rule match action.
|
||||
// Actions use the names they are assigned in Libseccomp's header, though some
|
||||
// (notable, SCMP_ACT_TRACE) are not available in this implementation and will
|
||||
// return errors.
|
||||
// Attempting to convert a string that is not a valid action results in an
|
||||
// error.
|
||||
func ConvertStringToAction(in string) (configs.Action, error) {
|
||||
if act, ok := actions[in]; ok == true {
|
||||
return act, nil
|
||||
}
|
||||
return 0, fmt.Errorf("string %s is not a valid action for seccomp", in)
|
||||
}
|
||||
|
||||
// ConvertStringToArch converts a string into a Seccomp comparison arch.
|
||||
func ConvertStringToArch(in string) (string, error) {
|
||||
if arch, ok := archs[in]; ok == true {
|
||||
return arch, nil
|
||||
}
|
||||
return "", fmt.Errorf("string %s is not a valid arch for seccomp", in)
|
||||
}
|
|
@ -1,258 +0,0 @@
|
|||
// +build linux,cgo,seccomp
|
||||
|
||||
package seccomp
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
libseccomp "github.com/seccomp/libseccomp-golang"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// libseccomp actions returned by getAction. actTrace and actErrno are built
// with unix.EPERM as their return code.
var (
	actAllow = libseccomp.ActAllow
	actTrap  = libseccomp.ActTrap
	actKill  = libseccomp.ActKill
	actTrace = libseccomp.ActTrace.SetReturnCode(int16(unix.EPERM))
	actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM))
)
|
||||
|
||||
const (
	// Linux system calls can have at most 6 arguments
	// matchCall sizes its per-argument counters with this value.
	syscallMaxArguments int = 6
)
|
||||
|
||||
// Filters given syscalls in a container, preventing them from being used
|
||||
// Started in the container init process, and carried over to all child processes
|
||||
// Setns calls, however, require a separate invocation, as they are not children
|
||||
// of the init until they join the namespace
|
||||
func InitSeccomp(config *configs.Seccomp) error {
|
||||
if config == nil {
|
||||
return fmt.Errorf("cannot initialize Seccomp - nil config passed")
|
||||
}
|
||||
|
||||
defaultAction, err := getAction(config.DefaultAction)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error initializing seccomp - invalid default action")
|
||||
}
|
||||
|
||||
filter, err := libseccomp.NewFilter(defaultAction)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error creating filter: %s", err)
|
||||
}
|
||||
|
||||
// Add extra architectures
|
||||
for _, arch := range config.Architectures {
|
||||
scmpArch, err := libseccomp.GetArchFromString(arch)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error validating Seccomp architecture: %s", err)
|
||||
}
|
||||
|
||||
if err := filter.AddArch(scmpArch); err != nil {
|
||||
return fmt.Errorf("error adding architecture to seccomp filter: %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Unset no new privs bit
|
||||
if err := filter.SetNoNewPrivsBit(false); err != nil {
|
||||
return fmt.Errorf("error setting no new privileges: %s", err)
|
||||
}
|
||||
|
||||
// Add a rule for each syscall
|
||||
for _, call := range config.Syscalls {
|
||||
if call == nil {
|
||||
return fmt.Errorf("encountered nil syscall while initializing Seccomp")
|
||||
}
|
||||
|
||||
if err = matchCall(filter, call); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
if err = filter.Load(); err != nil {
|
||||
return fmt.Errorf("error loading seccomp filter into kernel: %s", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// IsEnabled returns if the kernel has been configured to support seccomp.
|
||||
func IsEnabled() bool {
|
||||
// Try to read from /proc/self/status for kernels > 3.8
|
||||
s, err := parseStatusFile("/proc/self/status")
|
||||
if err != nil {
|
||||
// Check if Seccomp is supported, via CONFIG_SECCOMP.
|
||||
if err := unix.Prctl(unix.PR_GET_SECCOMP, 0, 0, 0, 0); err != unix.EINVAL {
|
||||
// Make sure the kernel has CONFIG_SECCOMP_FILTER.
|
||||
if err := unix.Prctl(unix.PR_SET_SECCOMP, unix.SECCOMP_MODE_FILTER, 0, 0, 0); err != unix.EINVAL {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
_, ok := s["Seccomp"]
|
||||
return ok
|
||||
}
|
||||
|
||||
// Convert Libcontainer Action to Libseccomp ScmpAction
|
||||
func getAction(act configs.Action) (libseccomp.ScmpAction, error) {
|
||||
switch act {
|
||||
case configs.Kill:
|
||||
return actKill, nil
|
||||
case configs.Errno:
|
||||
return actErrno, nil
|
||||
case configs.Trap:
|
||||
return actTrap, nil
|
||||
case configs.Allow:
|
||||
return actAllow, nil
|
||||
case configs.Trace:
|
||||
return actTrace, nil
|
||||
default:
|
||||
return libseccomp.ActInvalid, fmt.Errorf("invalid action, cannot use in rule")
|
||||
}
|
||||
}
|
||||
|
||||
// Convert Libcontainer Operator to Libseccomp ScmpCompareOp
|
||||
func getOperator(op configs.Operator) (libseccomp.ScmpCompareOp, error) {
|
||||
switch op {
|
||||
case configs.EqualTo:
|
||||
return libseccomp.CompareEqual, nil
|
||||
case configs.NotEqualTo:
|
||||
return libseccomp.CompareNotEqual, nil
|
||||
case configs.GreaterThan:
|
||||
return libseccomp.CompareGreater, nil
|
||||
case configs.GreaterThanOrEqualTo:
|
||||
return libseccomp.CompareGreaterEqual, nil
|
||||
case configs.LessThan:
|
||||
return libseccomp.CompareLess, nil
|
||||
case configs.LessThanOrEqualTo:
|
||||
return libseccomp.CompareLessOrEqual, nil
|
||||
case configs.MaskEqualTo:
|
||||
return libseccomp.CompareMaskedEqual, nil
|
||||
default:
|
||||
return libseccomp.CompareInvalid, fmt.Errorf("invalid operator, cannot use in rule")
|
||||
}
|
||||
}
|
||||
|
||||
// Convert Libcontainer Arg to Libseccomp ScmpCondition
|
||||
func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) {
|
||||
cond := libseccomp.ScmpCondition{}
|
||||
|
||||
if arg == nil {
|
||||
return cond, fmt.Errorf("cannot convert nil to syscall condition")
|
||||
}
|
||||
|
||||
op, err := getOperator(arg.Op)
|
||||
if err != nil {
|
||||
return cond, err
|
||||
}
|
||||
|
||||
return libseccomp.MakeCondition(arg.Index, op, arg.Value, arg.ValueTwo)
|
||||
}
|
||||
|
||||
// Add a rule to match a single syscall
|
||||
func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error {
|
||||
if call == nil || filter == nil {
|
||||
return fmt.Errorf("cannot use nil as syscall to block")
|
||||
}
|
||||
|
||||
if len(call.Name) == 0 {
|
||||
return fmt.Errorf("empty string is not a valid syscall")
|
||||
}
|
||||
|
||||
// If we can't resolve the syscall, assume it's not supported on this kernel
|
||||
// Ignore it, don't error out
|
||||
callNum, err := libseccomp.GetSyscallFromName(call.Name)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Convert the call's action to the libseccomp equivalent
|
||||
callAct, err := getAction(call.Action)
|
||||
if err != nil {
|
||||
return fmt.Errorf("action in seccomp profile is invalid: %s", err)
|
||||
}
|
||||
|
||||
// Unconditional match - just add the rule
|
||||
if len(call.Args) == 0 {
|
||||
if err = filter.AddRule(callNum, callAct); err != nil {
|
||||
return fmt.Errorf("error adding seccomp filter rule for syscall %s: %s", call.Name, err)
|
||||
}
|
||||
} else {
|
||||
// If two or more arguments have the same condition,
|
||||
// Revert to old behavior, adding each condition as a separate rule
|
||||
argCounts := make([]uint, syscallMaxArguments)
|
||||
conditions := []libseccomp.ScmpCondition{}
|
||||
|
||||
for _, cond := range call.Args {
|
||||
newCond, err := getCondition(cond)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error creating seccomp syscall condition for syscall %s: %s", call.Name, err)
|
||||
}
|
||||
|
||||
argCounts[cond.Index] += 1
|
||||
|
||||
conditions = append(conditions, newCond)
|
||||
}
|
||||
|
||||
hasMultipleArgs := false
|
||||
for _, count := range argCounts {
|
||||
if count > 1 {
|
||||
hasMultipleArgs = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if hasMultipleArgs {
|
||||
// Revert to old behavior
|
||||
// Add each condition attached to a separate rule
|
||||
for _, cond := range conditions {
|
||||
condArr := []libseccomp.ScmpCondition{cond}
|
||||
|
||||
if err = filter.AddRuleConditional(callNum, callAct, condArr); err != nil {
|
||||
return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// No conditions share same argument
|
||||
// Use new, proper behavior
|
||||
if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil {
|
||||
return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// parseStatusFile reads a "key:value" formatted file such as
// /proc/self/status and returns its entries as a map.
//
// Each line is split at the first colon only, so a value that itself contains
// a colon is kept intact. Lines without a colon are ignored. Values are stored
// exactly as they appear after the colon, including any leading whitespace.
// An error is returned if the file cannot be opened or read.
func parseStatusFile(path string) (map[string]string, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	s := bufio.NewScanner(f)
	status := make(map[string]string)

	for s.Scan() {
		// Limit the split to two parts so colons inside the value survive.
		parts := strings.SplitN(s.Text(), ":", 2)
		if len(parts) != 2 {
			continue
		}
		status[parts[0]] = parts[1]
	}
	if err := s.Err(); err != nil {
		return nil, err
	}

	return status, nil
}
|
24
vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go
generated
vendored
24
vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go
generated
vendored
|
@ -1,24 +0,0 @@
|
|||
// +build !linux !cgo !seccomp
|
||||
|
||||
package seccomp
|
||||
|
||||
import (
|
||||
"errors"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
)
|
||||
|
||||
// ErrSeccompNotEnabled is returned by InitSeccomp in this build when a
// non-nil seccomp config is supplied.
var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")
|
||||
|
||||
// InitSeccomp does nothing because seccomp is not supported.
|
||||
func InitSeccomp(config *configs.Seccomp) error {
|
||||
if config != nil {
|
||||
return ErrSeccompNotEnabled
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// IsEnabled reports whether seccomp is available. This is the stub for builds
// without seccomp support, so the answer is always false.
func IsEnabled() bool {
	return false
}
|
|
@ -1,221 +0,0 @@
|
|||
package specconv
|
||||
|
||||
import (
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
)
|
||||
|
||||
// Example returns an example spec file, with many options set so a user can
|
||||
// see what a standard spec file looks like.
|
||||
func Example() *specs.Spec {
|
||||
return &specs.Spec{
|
||||
Version: specs.Version,
|
||||
Root: &specs.Root{
|
||||
Path: "rootfs",
|
||||
Readonly: true,
|
||||
},
|
||||
Process: &specs.Process{
|
||||
Terminal: true,
|
||||
User: specs.User{},
|
||||
Args: []string{
|
||||
"sh",
|
||||
},
|
||||
Env: []string{
|
||||
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
|
||||
"TERM=xterm",
|
||||
},
|
||||
Cwd: "/",
|
||||
NoNewPrivileges: true,
|
||||
Capabilities: &specs.LinuxCapabilities{
|
||||
Bounding: []string{
|
||||
"CAP_AUDIT_WRITE",
|
||||
"CAP_KILL",
|
||||
"CAP_NET_BIND_SERVICE",
|
||||
},
|
||||
Permitted: []string{
|
||||
"CAP_AUDIT_WRITE",
|
||||
"CAP_KILL",
|
||||
"CAP_NET_BIND_SERVICE",
|
||||
},
|
||||
Inheritable: []string{
|
||||
"CAP_AUDIT_WRITE",
|
||||
"CAP_KILL",
|
||||
"CAP_NET_BIND_SERVICE",
|
||||
},
|
||||
Ambient: []string{
|
||||
"CAP_AUDIT_WRITE",
|
||||
"CAP_KILL",
|
||||
"CAP_NET_BIND_SERVICE",
|
||||
},
|
||||
Effective: []string{
|
||||
"CAP_AUDIT_WRITE",
|
||||
"CAP_KILL",
|
||||
"CAP_NET_BIND_SERVICE",
|
||||
},
|
||||
},
|
||||
Rlimits: []specs.POSIXRlimit{
|
||||
{
|
||||
Type: "RLIMIT_NOFILE",
|
||||
Hard: uint64(1024),
|
||||
Soft: uint64(1024),
|
||||
},
|
||||
},
|
||||
},
|
||||
Hostname: "runc",
|
||||
Mounts: []specs.Mount{
|
||||
{
|
||||
Destination: "/proc",
|
||||
Type: "proc",
|
||||
Source: "proc",
|
||||
Options: nil,
|
||||
},
|
||||
{
|
||||
Destination: "/dev",
|
||||
Type: "tmpfs",
|
||||
Source: "tmpfs",
|
||||
Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"},
|
||||
},
|
||||
{
|
||||
Destination: "/dev/pts",
|
||||
Type: "devpts",
|
||||
Source: "devpts",
|
||||
Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"},
|
||||
},
|
||||
{
|
||||
Destination: "/dev/shm",
|
||||
Type: "tmpfs",
|
||||
Source: "shm",
|
||||
Options: []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"},
|
||||
},
|
||||
{
|
||||
Destination: "/dev/mqueue",
|
||||
Type: "mqueue",
|
||||
Source: "mqueue",
|
||||
Options: []string{"nosuid", "noexec", "nodev"},
|
||||
},
|
||||
{
|
||||
Destination: "/sys",
|
||||
Type: "sysfs",
|
||||
Source: "sysfs",
|
||||
Options: []string{"nosuid", "noexec", "nodev", "ro"},
|
||||
},
|
||||
{
|
||||
Destination: "/sys/fs/cgroup",
|
||||
Type: "cgroup",
|
||||
Source: "cgroup",
|
||||
Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
|
||||
},
|
||||
},
|
||||
Linux: &specs.Linux{
|
||||
MaskedPaths: []string{
|
||||
"/proc/kcore",
|
||||
"/proc/latency_stats",
|
||||
"/proc/timer_list",
|
||||
"/proc/timer_stats",
|
||||
"/proc/sched_debug",
|
||||
"/sys/firmware",
|
||||
"/proc/scsi",
|
||||
},
|
||||
ReadonlyPaths: []string{
|
||||
"/proc/asound",
|
||||
"/proc/bus",
|
||||
"/proc/fs",
|
||||
"/proc/irq",
|
||||
"/proc/sys",
|
||||
"/proc/sysrq-trigger",
|
||||
},
|
||||
Resources: &specs.LinuxResources{
|
||||
Devices: []specs.LinuxDeviceCgroup{
|
||||
{
|
||||
Allow: false,
|
||||
Access: "rwm",
|
||||
},
|
||||
},
|
||||
},
|
||||
Namespaces: []specs.LinuxNamespace{
|
||||
{
|
||||
Type: "pid",
|
||||
},
|
||||
{
|
||||
Type: "network",
|
||||
},
|
||||
{
|
||||
Type: "ipc",
|
||||
},
|
||||
{
|
||||
Type: "uts",
|
||||
},
|
||||
{
|
||||
Type: "mount",
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// ToRootless converts the given spec file into one that should work with
|
||||
// rootless containers, by removing incompatible options and adding others that
|
||||
// are needed.
|
||||
func ToRootless(spec *specs.Spec) {
|
||||
var namespaces []specs.LinuxNamespace
|
||||
|
||||
// Remove networkns from the spec.
|
||||
for _, ns := range spec.Linux.Namespaces {
|
||||
switch ns.Type {
|
||||
case specs.NetworkNamespace, specs.UserNamespace:
|
||||
// Do nothing.
|
||||
default:
|
||||
namespaces = append(namespaces, ns)
|
||||
}
|
||||
}
|
||||
// Add userns to the spec.
|
||||
namespaces = append(namespaces, specs.LinuxNamespace{
|
||||
Type: specs.UserNamespace,
|
||||
})
|
||||
spec.Linux.Namespaces = namespaces
|
||||
|
||||
// Add mappings for the current user.
|
||||
spec.Linux.UIDMappings = []specs.LinuxIDMapping{{
|
||||
HostID: uint32(os.Geteuid()),
|
||||
ContainerID: 0,
|
||||
Size: 1,
|
||||
}}
|
||||
spec.Linux.GIDMappings = []specs.LinuxIDMapping{{
|
||||
HostID: uint32(os.Getegid()),
|
||||
ContainerID: 0,
|
||||
Size: 1,
|
||||
}}
|
||||
|
||||
// Fix up mounts.
|
||||
var mounts []specs.Mount
|
||||
for _, mount := range spec.Mounts {
|
||||
// Ignore all mounts that are under /sys.
|
||||
if strings.HasPrefix(mount.Destination, "/sys") {
|
||||
continue
|
||||
}
|
||||
|
||||
// Remove all gid= and uid= mappings.
|
||||
var options []string
|
||||
for _, option := range mount.Options {
|
||||
if !strings.HasPrefix(option, "gid=") && !strings.HasPrefix(option, "uid=") {
|
||||
options = append(options, option)
|
||||
}
|
||||
}
|
||||
|
||||
mount.Options = options
|
||||
mounts = append(mounts, mount)
|
||||
}
|
||||
// Add the sysfs mount as an rbind.
|
||||
mounts = append(mounts, specs.Mount{
|
||||
Source: "/sys",
|
||||
Destination: "/sys",
|
||||
Type: "none",
|
||||
Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"},
|
||||
})
|
||||
spec.Mounts = mounts
|
||||
|
||||
// Remove cgroup settings.
|
||||
spec.Linux.Resources = nil
|
||||
}
|
|
@ -1,836 +0,0 @@
|
|||
// +build linux
|
||||
|
||||
// Package specconv implements conversion of specifications to libcontainer
|
||||
// configurations
|
||||
package specconv
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
"github.com/opencontainers/runc/libcontainer/seccomp"
|
||||
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// wildcard is the Major/Minor value used by the entries of allowedDevices
// that are meant to apply to any device number.
const wildcard = -1
|
||||
|
||||
// namespaceMapping translates the namespace type names used in a runtime spec
// into the corresponding libcontainer namespace types. A spec namespace type
// that is missing from this table is rejected by CreateLibcontainerConfig.
var namespaceMapping = map[specs.LinuxNamespaceType]configs.NamespaceType{
	specs.PIDNamespace:     configs.NEWPID,
	specs.NetworkNamespace: configs.NEWNET,
	specs.MountNamespace:   configs.NEWNS,
	specs.UserNamespace:    configs.NEWUSER,
	specs.IPCNamespace:     configs.NEWIPC,
	specs.UTSNamespace:     configs.NEWUTS,
}
|
||||
|
||||
// mountPropagationMapping translates the rootfsPropagation string of a spec
// into mount flags. The "r"-prefixed names add MS_REC to the base flag. The
// empty string maps to 0, so a spec that leaves the field unset is accepted;
// any name not listed here is rejected by CreateLibcontainerConfig.
var mountPropagationMapping = map[string]int{
	"rprivate":    unix.MS_PRIVATE | unix.MS_REC,
	"private":     unix.MS_PRIVATE,
	"rslave":      unix.MS_SLAVE | unix.MS_REC,
	"slave":       unix.MS_SLAVE,
	"rshared":     unix.MS_SHARED | unix.MS_REC,
	"shared":      unix.MS_SHARED,
	"runbindable": unix.MS_UNBINDABLE | unix.MS_REC,
	"unbindable":  unix.MS_UNBINDABLE,
	"":            0,
}
|
||||
|
||||
// allowedDevices is the default set of device rules. For non-rootless
// containers createCgroupConfig stores it in Resources.AllowedDevices and,
// unless it returns early, also appends it to Resources.Devices.
var allowedDevices = []*configs.Device{
	// allow mknod for any device
	{
		Type:        'c',
		Major:       wildcard,
		Minor:       wildcard,
		Permissions: "m",
		Allow:       true,
	},
	{
		Type:        'b',
		Major:       wildcard,
		Minor:       wildcard,
		Permissions: "m",
		Allow:       true,
	},
	{
		Type:        'c',
		Path:        "/dev/null",
		Major:       1,
		Minor:       3,
		Permissions: "rwm",
		Allow:       true,
	},
	{
		Type:        'c',
		Path:        "/dev/random",
		Major:       1,
		Minor:       8,
		Permissions: "rwm",
		Allow:       true,
	},
	{
		Type:        'c',
		Path:        "/dev/full",
		Major:       1,
		Minor:       7,
		Permissions: "rwm",
		Allow:       true,
	},
	{
		Type:        'c',
		Path:        "/dev/tty",
		Major:       5,
		Minor:       0,
		Permissions: "rwm",
		Allow:       true,
	},
	{
		Type:        'c',
		Path:        "/dev/zero",
		Major:       1,
		Minor:       5,
		Permissions: "rwm",
		Allow:       true,
	},
	{
		Type:        'c',
		Path:        "/dev/urandom",
		Major:       1,
		Minor:       9,
		Permissions: "rwm",
		Allow:       true,
	},
	{
		Path:        "/dev/console",
		Type:        'c',
		Major:       5,
		Minor:       1,
		Permissions: "rwm",
		Allow:       true,
	},
	// /dev/pts/ - pts namespaces are "coming soon"
	{
		Path:        "",
		Type:        'c',
		Major:       136,
		Minor:       wildcard,
		Permissions: "rwm",
		Allow:       true,
	},
	// NOTE(review): char 5:2 is presumably /dev/ptmx — confirm.
	{
		Path:        "",
		Type:        'c',
		Major:       5,
		Minor:       2,
		Permissions: "rwm",
		Allow:       true,
	},
	// tuntap
	{
		Path:        "",
		Type:        'c',
		Major:       10,
		Minor:       200,
		Permissions: "rwm",
		Allow:       true,
	},
}
|
||||
|
||||
// CreateOpts bundles the inputs of CreateLibcontainerConfig.
type CreateOpts struct {
	// CgroupName is used as the cgroup name when the spec does not provide
	// a cgroups path.
	CgroupName string
	// UseSystemdCgroup selects the systemd interpretation of the cgroups
	// path ("slice:prefix:name").
	UseSystemdCgroup bool
	// NoPivotRoot is copied into the resulting config's NoPivotRoot field.
	NoPivotRoot bool
	// NoNewKeyring is copied into the resulting config's NoNewKeyring field.
	NoNewKeyring bool
	// Spec is the runtime spec to convert.
	Spec *specs.Spec
	// Rootless is copied into the resulting config; when set, the default
	// allowed devices are not added to the cgroup configuration.
	Rootless bool
}
|
||||
|
||||
// CreateLibcontainerConfig creates a new libcontainer configuration from a
|
||||
// given specification and a cgroup name
|
||||
func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
|
||||
// runc's cwd will always be the bundle path
|
||||
rcwd, err := os.Getwd()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
cwd, err := filepath.Abs(rcwd)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
spec := opts.Spec
|
||||
if spec.Root == nil {
|
||||
return nil, fmt.Errorf("Root must be specified")
|
||||
}
|
||||
rootfsPath := spec.Root.Path
|
||||
if !filepath.IsAbs(rootfsPath) {
|
||||
rootfsPath = filepath.Join(cwd, rootfsPath)
|
||||
}
|
||||
labels := []string{}
|
||||
for k, v := range spec.Annotations {
|
||||
labels = append(labels, fmt.Sprintf("%s=%s", k, v))
|
||||
}
|
||||
config := &configs.Config{
|
||||
Rootfs: rootfsPath,
|
||||
NoPivotRoot: opts.NoPivotRoot,
|
||||
Readonlyfs: spec.Root.Readonly,
|
||||
Hostname: spec.Hostname,
|
||||
Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)),
|
||||
NoNewKeyring: opts.NoNewKeyring,
|
||||
Rootless: opts.Rootless,
|
||||
}
|
||||
|
||||
exists := false
|
||||
for _, m := range spec.Mounts {
|
||||
config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, m))
|
||||
}
|
||||
if err := createDevices(spec, config); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
c, err := createCgroupConfig(opts)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
config.Cgroups = c
|
||||
// set linux-specific config
|
||||
if spec.Linux != nil {
|
||||
if config.RootPropagation, exists = mountPropagationMapping[spec.Linux.RootfsPropagation]; !exists {
|
||||
return nil, fmt.Errorf("rootfsPropagation=%v is not supported", spec.Linux.RootfsPropagation)
|
||||
}
|
||||
if config.NoPivotRoot && (config.RootPropagation&unix.MS_PRIVATE != 0) {
|
||||
return nil, fmt.Errorf("rootfsPropagation of [r]private is not safe without pivot_root")
|
||||
}
|
||||
|
||||
for _, ns := range spec.Linux.Namespaces {
|
||||
t, exists := namespaceMapping[ns.Type]
|
||||
if !exists {
|
||||
return nil, fmt.Errorf("namespace %q does not exist", ns)
|
||||
}
|
||||
if config.Namespaces.Contains(t) {
|
||||
return nil, fmt.Errorf("malformed spec file: duplicated ns %q", ns)
|
||||
}
|
||||
config.Namespaces.Add(t, ns.Path)
|
||||
}
|
||||
if config.Namespaces.Contains(configs.NEWNET) && config.Namespaces.PathOf(configs.NEWNET) == "" {
|
||||
config.Networks = []*configs.Network{
|
||||
{
|
||||
Type: "loopback",
|
||||
},
|
||||
}
|
||||
}
|
||||
if config.Namespaces.Contains(configs.NEWUSER) {
|
||||
if err := setupUserNamespace(spec, config); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
config.MaskPaths = spec.Linux.MaskedPaths
|
||||
config.ReadonlyPaths = spec.Linux.ReadonlyPaths
|
||||
config.MountLabel = spec.Linux.MountLabel
|
||||
config.Sysctl = spec.Linux.Sysctl
|
||||
if spec.Linux.Seccomp != nil {
|
||||
seccomp, err := SetupSeccomp(spec.Linux.Seccomp)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
config.Seccomp = seccomp
|
||||
}
|
||||
}
|
||||
if spec.Process.SelinuxLabel != "" {
|
||||
config.ProcessLabel = spec.Process.SelinuxLabel
|
||||
}
|
||||
if spec.Process != nil {
|
||||
config.OomScoreAdj = spec.Process.OOMScoreAdj
|
||||
}
|
||||
if spec.Process.Capabilities != nil {
|
||||
config.Capabilities = &configs.Capabilities{
|
||||
Bounding: spec.Process.Capabilities.Bounding,
|
||||
Effective: spec.Process.Capabilities.Effective,
|
||||
Permitted: spec.Process.Capabilities.Permitted,
|
||||
Inheritable: spec.Process.Capabilities.Inheritable,
|
||||
Ambient: spec.Process.Capabilities.Ambient,
|
||||
}
|
||||
}
|
||||
createHooks(spec, config)
|
||||
config.Version = specs.Version
|
||||
if spec.Linux.IntelRdt != nil {
|
||||
config.IntelRdt = &configs.IntelRdt{}
|
||||
if spec.Linux.IntelRdt.L3CacheSchema != "" {
|
||||
config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema
|
||||
}
|
||||
}
|
||||
return config, nil
|
||||
}
|
||||
|
||||
func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount {
|
||||
flags, pgflags, data, ext := parseMountOptions(m.Options)
|
||||
source := m.Source
|
||||
device := m.Type
|
||||
if flags&unix.MS_BIND != 0 {
|
||||
if device == "" {
|
||||
device = "bind"
|
||||
}
|
||||
if !filepath.IsAbs(source) {
|
||||
source = filepath.Join(cwd, m.Source)
|
||||
}
|
||||
}
|
||||
return &configs.Mount{
|
||||
Device: device,
|
||||
Source: source,
|
||||
Destination: m.Destination,
|
||||
Data: data,
|
||||
Flags: flags,
|
||||
PropagationFlags: pgflags,
|
||||
Extensions: ext,
|
||||
}
|
||||
}
|
||||
|
||||
// createCgroupConfig builds the libcontainer cgroup configuration from the
// cgroup-related options and the spec's Linux.Resources section.
//
// It first decides where the cgroup lives (systemd "slice:prefix:name" or a
// plain path/name), then copies device, memory, CPU, pids, block IO, hugepage
// and network resource settings from the spec. It returns an error when the
// systemd cgroups path is malformed, when a device rule has an empty access
// string, or when a device type is not recognised.
func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) {
	var (
		myCgroupPath string

		spec             = opts.Spec
		useSystemdCgroup = opts.UseSystemdCgroup
		name             = opts.CgroupName
	)

	c := &configs.Cgroup{
		Resources: &configs.Resources{},
	}

	// With systemd the raw cgroups path is kept as-is (the cleaned value is
	// overwritten); otherwise the cleaned path is used.
	if spec.Linux != nil && spec.Linux.CgroupsPath != "" {
		myCgroupPath = libcontainerUtils.CleanPath(spec.Linux.CgroupsPath)
		if useSystemdCgroup {
			myCgroupPath = spec.Linux.CgroupsPath
		}
	}

	if useSystemdCgroup {
		if myCgroupPath == "" {
			// No path in the spec: fall back to fixed defaults plus the
			// caller-supplied name.
			c.Parent = "system.slice"
			c.ScopePrefix = "runc"
			c.Name = name
		} else {
			// Parse the path from expected "slice:prefix:name"
			// for e.g. "system.slice:docker:1234"
			parts := strings.Split(myCgroupPath, ":")
			if len(parts) != 3 {
				return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups")
			}
			c.Parent = parts[0]
			c.ScopePrefix = parts[1]
			c.Name = parts[2]
		}
	} else {
		if myCgroupPath == "" {
			c.Name = name
		}
		c.Path = myCgroupPath
	}

	// In rootless containers, any attempt to make cgroup changes will fail.
	// libcontainer will validate this and we shouldn't add any cgroup options
	// the user didn't specify.
	if !opts.Rootless {
		c.Resources.AllowedDevices = allowedDevices
	}
	if spec.Linux != nil {
		r := spec.Linux.Resources
		if r == nil {
			// NOTE(review): this early return also skips the append of
			// allowedDevices to c.Resources.Devices at the end of the
			// function, whereas a spec with no Linux section at all does
			// reach it. Confirm whether that difference is intended.
			return c, nil
		}
		// Device rules: unset fields default to type "a" and major/minor -1.
		for i, d := range spec.Linux.Resources.Devices {
			var (
				t     = "a"
				major = int64(-1)
				minor = int64(-1)
			)
			if d.Type != "" {
				t = d.Type
			}
			if d.Major != nil {
				major = *d.Major
			}
			if d.Minor != nil {
				minor = *d.Minor
			}
			if d.Access == "" {
				return nil, fmt.Errorf("device access at %d field cannot be empty", i)
			}
			dt, err := stringToCgroupDeviceRune(t)
			if err != nil {
				return nil, err
			}
			dd := &configs.Device{
				Type:        dt,
				Major:       major,
				Minor:       minor,
				Permissions: d.Access,
				Allow:       d.Allow,
			}
			c.Resources.Devices = append(c.Resources.Devices, dd)
		}
		// Memory settings: only fields present in the spec are copied.
		if r.Memory != nil {
			if r.Memory.Limit != nil {
				c.Resources.Memory = *r.Memory.Limit
			}
			if r.Memory.Reservation != nil {
				c.Resources.MemoryReservation = *r.Memory.Reservation
			}
			if r.Memory.Swap != nil {
				c.Resources.MemorySwap = *r.Memory.Swap
			}
			if r.Memory.Kernel != nil {
				c.Resources.KernelMemory = *r.Memory.Kernel
			}
			if r.Memory.KernelTCP != nil {
				c.Resources.KernelMemoryTCP = *r.Memory.KernelTCP
			}
			if r.Memory.Swappiness != nil {
				// Swappiness is passed on as a pointer, not dereferenced.
				c.Resources.MemorySwappiness = r.Memory.Swappiness
			}
			if r.Memory.DisableOOMKiller != nil {
				c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller
			}
		}
		// CPU settings.
		if r.CPU != nil {
			if r.CPU.Shares != nil {
				c.Resources.CpuShares = *r.CPU.Shares
			}
			if r.CPU.Quota != nil {
				c.Resources.CpuQuota = *r.CPU.Quota
			}
			if r.CPU.Period != nil {
				c.Resources.CpuPeriod = *r.CPU.Period
			}
			if r.CPU.RealtimeRuntime != nil {
				c.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime
			}
			if r.CPU.RealtimePeriod != nil {
				c.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod
			}
			if r.CPU.Cpus != "" {
				c.Resources.CpusetCpus = r.CPU.Cpus
			}
			if r.CPU.Mems != "" {
				c.Resources.CpusetMems = r.CPU.Mems
			}
		}
		if r.Pids != nil {
			c.Resources.PidsLimit = r.Pids.Limit
		}
		// Block IO weights and per-device throttles.
		if r.BlockIO != nil {
			if r.BlockIO.Weight != nil {
				c.Resources.BlkioWeight = *r.BlockIO.Weight
			}
			if r.BlockIO.LeafWeight != nil {
				c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight
			}
			if r.BlockIO.WeightDevice != nil {
				for _, wd := range r.BlockIO.WeightDevice {
					// Missing per-device weights stay at zero.
					var weight, leafWeight uint16
					if wd.Weight != nil {
						weight = *wd.Weight
					}
					if wd.LeafWeight != nil {
						leafWeight = *wd.LeafWeight
					}
					weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight)
					c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice)
				}
			}
			if r.BlockIO.ThrottleReadBpsDevice != nil {
				for _, td := range r.BlockIO.ThrottleReadBpsDevice {
					rate := td.Rate
					throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
					c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice)
				}
			}
			if r.BlockIO.ThrottleWriteBpsDevice != nil {
				for _, td := range r.BlockIO.ThrottleWriteBpsDevice {
					rate := td.Rate
					throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
					c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice)
				}
			}
			if r.BlockIO.ThrottleReadIOPSDevice != nil {
				for _, td := range r.BlockIO.ThrottleReadIOPSDevice {
					rate := td.Rate
					throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
					c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice)
				}
			}
			if r.BlockIO.ThrottleWriteIOPSDevice != nil {
				for _, td := range r.BlockIO.ThrottleWriteIOPSDevice {
					rate := td.Rate
					throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
					c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice)
				}
			}
		}
		for _, l := range r.HugepageLimits {
			c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{
				Pagesize: l.Pagesize,
				Limit:    l.Limit,
			})
		}
		// Network class id and per-interface priorities.
		if r.Network != nil {
			if r.Network.ClassID != nil {
				c.Resources.NetClsClassid = *r.Network.ClassID
			}
			for _, m := range r.Network.Priorities {
				c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{
					Interface: m.Name,
					Priority:  int64(m.Priority),
				})
			}
		}
	}
	if !opts.Rootless {
		// append the default allowed devices to the end of the list
		c.Resources.Devices = append(c.Resources.Devices, allowedDevices...)
	}
	return c, nil
}
|
||||
|
||||
// stringToCgroupDeviceRune converts the device type of a cgroup device rule
// ("a", "b" or "c") into the corresponding rune. Any other input yields rune 0
// and an error.
func stringToCgroupDeviceRune(s string) (rune, error) {
	switch s {
	case "a", "b", "c":
		// Each accepted value is a single ASCII byte, so its first byte is
		// the rune.
		return rune(s[0]), nil
	}
	return 0, fmt.Errorf("invalid cgroup device type %q", s)
}
|
||||
|
||||
// stringToDeviceRune converts the type of a device node ("p", "u", "b" or
// "c") into the corresponding rune. Any other input yields rune 0 and an
// error.
func stringToDeviceRune(s string) (rune, error) {
	if len(s) == 1 {
		switch c := s[0]; c {
		case 'p', 'u', 'b', 'c':
			return rune(c), nil
		}
	}
	return 0, fmt.Errorf("invalid device type %q", s)
}
|
||||
|
||||
func createDevices(spec *specs.Spec, config *configs.Config) error {
|
||||
// add whitelisted devices
|
||||
config.Devices = []*configs.Device{
|
||||
{
|
||||
Type: 'c',
|
||||
Path: "/dev/null",
|
||||
Major: 1,
|
||||
Minor: 3,
|
||||
FileMode: 0666,
|
||||
Uid: 0,
|
||||
Gid: 0,
|
||||
},
|
||||
{
|
||||
Type: 'c',
|
||||
Path: "/dev/random",
|
||||
Major: 1,
|
||||
Minor: 8,
|
||||
FileMode: 0666,
|
||||
Uid: 0,
|
||||
Gid: 0,
|
||||
},
|
||||
{
|
||||
Type: 'c',
|
||||
Path: "/dev/full",
|
||||
Major: 1,
|
||||
Minor: 7,
|
||||
FileMode: 0666,
|
||||
Uid: 0,
|
||||
Gid: 0,
|
||||
},
|
||||
{
|
||||
Type: 'c',
|
||||
Path: "/dev/tty",
|
||||
Major: 5,
|
||||
Minor: 0,
|
||||
FileMode: 0666,
|
||||
Uid: 0,
|
||||
Gid: 0,
|
||||
},
|
||||
{
|
||||
Type: 'c',
|
||||
Path: "/dev/zero",
|
||||
Major: 1,
|
||||
Minor: 5,
|
||||
FileMode: 0666,
|
||||
Uid: 0,
|
||||
Gid: 0,
|
||||
},
|
||||
{
|
||||
Type: 'c',
|
||||
Path: "/dev/urandom",
|
||||
Major: 1,
|
||||
Minor: 9,
|
||||
FileMode: 0666,
|
||||
Uid: 0,
|
||||
Gid: 0,
|
||||
},
|
||||
}
|
||||
// merge in additional devices from the spec
|
||||
if spec.Linux != nil {
|
||||
for _, d := range spec.Linux.Devices {
|
||||
var uid, gid uint32
|
||||
var filemode os.FileMode = 0666
|
||||
|
||||
if d.UID != nil {
|
||||
uid = *d.UID
|
||||
}
|
||||
if d.GID != nil {
|
||||
gid = *d.GID
|
||||
}
|
||||
dt, err := stringToDeviceRune(d.Type)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if d.FileMode != nil {
|
||||
filemode = *d.FileMode
|
||||
}
|
||||
device := &configs.Device{
|
||||
Type: dt,
|
||||
Path: d.Path,
|
||||
Major: d.Major,
|
||||
Minor: d.Minor,
|
||||
FileMode: filemode,
|
||||
Uid: uid,
|
||||
Gid: gid,
|
||||
}
|
||||
config.Devices = append(config.Devices, device)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func setupUserNamespace(spec *specs.Spec, config *configs.Config) error {
|
||||
create := func(m specs.LinuxIDMapping) configs.IDMap {
|
||||
return configs.IDMap{
|
||||
HostID: int(m.HostID),
|
||||
ContainerID: int(m.ContainerID),
|
||||
Size: int(m.Size),
|
||||
}
|
||||
}
|
||||
if spec.Linux != nil {
|
||||
for _, m := range spec.Linux.UIDMappings {
|
||||
config.UidMappings = append(config.UidMappings, create(m))
|
||||
}
|
||||
for _, m := range spec.Linux.GIDMappings {
|
||||
config.GidMappings = append(config.GidMappings, create(m))
|
||||
}
|
||||
}
|
||||
rootUID, err := config.HostRootUID()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
rootGID, err := config.HostRootGID()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, node := range config.Devices {
|
||||
node.Uid = uint32(rootUID)
|
||||
node.Gid = uint32(rootGID)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// parseMountOptions parses the string and returns the flags, propagation
|
||||
// flags and any mount data that it contains.
|
||||
func parseMountOptions(options []string) (int, []int, string, int) {
|
||||
var (
|
||||
flag int
|
||||
pgflag []int
|
||||
data []string
|
||||
extFlags int
|
||||
)
|
||||
flags := map[string]struct {
|
||||
clear bool
|
||||
flag int
|
||||
}{
|
||||
"acl": {false, unix.MS_POSIXACL},
|
||||
"async": {true, unix.MS_SYNCHRONOUS},
|
||||
"atime": {true, unix.MS_NOATIME},
|
||||
"bind": {false, unix.MS_BIND},
|
||||
"defaults": {false, 0},
|
||||
"dev": {true, unix.MS_NODEV},
|
||||
"diratime": {true, unix.MS_NODIRATIME},
|
||||
"dirsync": {false, unix.MS_DIRSYNC},
|
||||
"exec": {true, unix.MS_NOEXEC},
|
||||
"iversion": {false, unix.MS_I_VERSION},
|
||||
"lazytime": {false, unix.MS_LAZYTIME},
|
||||
"loud": {true, unix.MS_SILENT},
|
||||
"mand": {false, unix.MS_MANDLOCK},
|
||||
"noacl": {true, unix.MS_POSIXACL},
|
||||
"noatime": {false, unix.MS_NOATIME},
|
||||
"nodev": {false, unix.MS_NODEV},
|
||||
"nodiratime": {false, unix.MS_NODIRATIME},
|
||||
"noexec": {false, unix.MS_NOEXEC},
|
||||
"noiversion": {true, unix.MS_I_VERSION},
|
||||
"nolazytime": {true, unix.MS_LAZYTIME},
|
||||
"nomand": {true, unix.MS_MANDLOCK},
|
||||
"norelatime": {true, unix.MS_RELATIME},
|
||||
"nostrictatime": {true, unix.MS_STRICTATIME},
|
||||
"nosuid": {false, unix.MS_NOSUID},
|
||||
"rbind": {false, unix.MS_BIND | unix.MS_REC},
|
||||
"relatime": {false, unix.MS_RELATIME},
|
||||
"remount": {false, unix.MS_REMOUNT},
|
||||
"ro": {false, unix.MS_RDONLY},
|
||||
"rw": {true, unix.MS_RDONLY},
|
||||
"silent": {false, unix.MS_SILENT},
|
||||
"strictatime": {false, unix.MS_STRICTATIME},
|
||||
"suid": {true, unix.MS_NOSUID},
|
||||
"sync": {false, unix.MS_SYNCHRONOUS},
|
||||
}
|
||||
propagationFlags := map[string]int{
|
||||
"private": unix.MS_PRIVATE,
|
||||
"shared": unix.MS_SHARED,
|
||||
"slave": unix.MS_SLAVE,
|
||||
"unbindable": unix.MS_UNBINDABLE,
|
||||
"rprivate": unix.MS_PRIVATE | unix.MS_REC,
|
||||
"rshared": unix.MS_SHARED | unix.MS_REC,
|
||||
"rslave": unix.MS_SLAVE | unix.MS_REC,
|
||||
"runbindable": unix.MS_UNBINDABLE | unix.MS_REC,
|
||||
}
|
||||
extensionFlags := map[string]struct {
|
||||
clear bool
|
||||
flag int
|
||||
}{
|
||||
"tmpcopyup": {false, configs.EXT_COPYUP},
|
||||
}
|
||||
for _, o := range options {
|
||||
// If the option does not exist in the flags table or the flag
|
||||
// is not supported on the platform,
|
||||
// then it is a data value for a specific fs type
|
||||
if f, exists := flags[o]; exists && f.flag != 0 {
|
||||
if f.clear {
|
||||
flag &= ^f.flag
|
||||
} else {
|
||||
flag |= f.flag
|
||||
}
|
||||
} else if f, exists := propagationFlags[o]; exists && f != 0 {
|
||||
pgflag = append(pgflag, f)
|
||||
} else if f, exists := extensionFlags[o]; exists && f.flag != 0 {
|
||||
if f.clear {
|
||||
extFlags &= ^f.flag
|
||||
} else {
|
||||
extFlags |= f.flag
|
||||
}
|
||||
} else {
|
||||
data = append(data, o)
|
||||
}
|
||||
}
|
||||
return flag, pgflag, strings.Join(data, ","), extFlags
|
||||
}
|
||||
|
||||
func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) {
|
||||
if config == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// No default action specified, no syscalls listed, assume seccomp disabled
|
||||
if config.DefaultAction == "" && len(config.Syscalls) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
newConfig := new(configs.Seccomp)
|
||||
newConfig.Syscalls = []*configs.Syscall{}
|
||||
|
||||
if len(config.Architectures) > 0 {
|
||||
newConfig.Architectures = []string{}
|
||||
for _, arch := range config.Architectures {
|
||||
newArch, err := seccomp.ConvertStringToArch(string(arch))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
newConfig.Architectures = append(newConfig.Architectures, newArch)
|
||||
}
|
||||
}
|
||||
|
||||
// Convert default action from string representation
|
||||
newDefaultAction, err := seccomp.ConvertStringToAction(string(config.DefaultAction))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
newConfig.DefaultAction = newDefaultAction
|
||||
|
||||
// Loop through all syscall blocks and convert them to libcontainer format
|
||||
for _, call := range config.Syscalls {
|
||||
newAction, err := seccomp.ConvertStringToAction(string(call.Action))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, name := range call.Names {
|
||||
newCall := configs.Syscall{
|
||||
Name: name,
|
||||
Action: newAction,
|
||||
Args: []*configs.Arg{},
|
||||
}
|
||||
// Loop through all the arguments of the syscall and convert them
|
||||
for _, arg := range call.Args {
|
||||
newOp, err := seccomp.ConvertStringToOperator(string(arg.Op))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
newArg := configs.Arg{
|
||||
Index: arg.Index,
|
||||
Value: arg.Value,
|
||||
ValueTwo: arg.ValueTwo,
|
||||
Op: newOp,
|
||||
}
|
||||
|
||||
newCall.Args = append(newCall.Args, &newArg)
|
||||
}
|
||||
newConfig.Syscalls = append(newConfig.Syscalls, &newCall)
|
||||
}
|
||||
}
|
||||
|
||||
return newConfig, nil
|
||||
}
|
||||
|
||||
func createHooks(rspec *specs.Spec, config *configs.Config) {
|
||||
config.Hooks = &configs.Hooks{}
|
||||
if rspec.Hooks != nil {
|
||||
|
||||
for _, h := range rspec.Hooks.Prestart {
|
||||
cmd := createCommandHook(h)
|
||||
config.Hooks.Prestart = append(config.Hooks.Prestart, configs.NewCommandHook(cmd))
|
||||
}
|
||||
for _, h := range rspec.Hooks.Poststart {
|
||||
cmd := createCommandHook(h)
|
||||
config.Hooks.Poststart = append(config.Hooks.Poststart, configs.NewCommandHook(cmd))
|
||||
}
|
||||
for _, h := range rspec.Hooks.Poststop {
|
||||
cmd := createCommandHook(h)
|
||||
config.Hooks.Poststop = append(config.Hooks.Poststop, configs.NewCommandHook(cmd))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func createCommandHook(h specs.Hook) configs.Command {
|
||||
cmd := configs.Command{
|
||||
Path: h.Path,
|
||||
Args: h.Args,
|
||||
Env: h.Env,
|
||||
}
|
||||
if h.Timeout != nil {
|
||||
d := time.Duration(*h.Timeout) * time.Second
|
||||
cmd.Timeout = &d
|
||||
}
|
||||
return cmd
|
||||
}
|
|
@ -1,93 +0,0 @@
|
|||
// +build linux
|
||||
|
||||
package utils
|
||||
|
||||
/*
|
||||
* Copyright 2016, 2017 SUSE LLC
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// MaxNameLen is the maximum length of the name of a file descriptor being
// sent using SendFd. The name of the file handle returned by RecvFd will never
// be larger than this value.
const MaxNameLen = 4096
|
||||
|
||||
// oobSpace is the size of the out-of-band (control message) buffer required
// to store a single FD. Note that unix.UnixRights appears to make the
// assumption that fd is always int32, so sizeof(fd) = 4.
var oobSpace = unix.CmsgSpace(4)
|
||||
|
||||
// RecvFd waits for a file descriptor to be sent over the given AF_UNIX
|
||||
// socket. The file name of the remote file descriptor will be recreated
|
||||
// locally (it is sent as non-auxiliary data in the same payload).
|
||||
func RecvFd(socket *os.File) (*os.File, error) {
|
||||
// For some reason, unix.Recvmsg uses the length rather than the capacity
|
||||
// when passing the msg_controllen and other attributes to recvmsg. So we
|
||||
// have to actually set the length.
|
||||
name := make([]byte, MaxNameLen)
|
||||
oob := make([]byte, oobSpace)
|
||||
|
||||
sockfd := socket.Fd()
|
||||
n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if n >= MaxNameLen || oobn != oobSpace {
|
||||
return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
|
||||
}
|
||||
|
||||
// Truncate.
|
||||
name = name[:n]
|
||||
oob = oob[:oobn]
|
||||
|
||||
scms, err := unix.ParseSocketControlMessage(oob)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(scms) != 1 {
|
||||
return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
|
||||
}
|
||||
scm := scms[0]
|
||||
|
||||
fds, err := unix.ParseUnixRights(&scm)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(fds) != 1 {
|
||||
return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds))
|
||||
}
|
||||
fd := uintptr(fds[0])
|
||||
|
||||
return os.NewFile(fd, string(name)), nil
|
||||
}
|
||||
|
||||
// SendFd sends a file descriptor over the given AF_UNIX socket. In
|
||||
// addition, the file.Name() of the given file will also be sent as
|
||||
// non-auxiliary data in the same payload (allowing to send contextual
|
||||
// information for a file descriptor).
|
||||
func SendFd(socket *os.File, name string, fd uintptr) error {
|
||||
if len(name) >= MaxNameLen {
|
||||
return fmt.Errorf("sendfd: filename too long: %s", name)
|
||||
}
|
||||
oob := unix.UnixRights(int(fd))
|
||||
return unix.Sendmsg(int(socket.Fd()), []byte(name), oob, nil, 0)
|
||||
}
|
|
@ -1,112 +0,0 @@
|
|||
package utils
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"unsafe"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
const (
|
||||
exitSignalOffset = 128
|
||||
)
|
||||
|
||||
// ResolveRootfs turns uncleanRootfs into an absolute path and then resolves
// every symlink in it. It returns the resolved path, or an empty string and
// the error if the path cannot be made absolute; errors from symlink
// resolution are returned as reported by filepath.EvalSymlinks.
func ResolveRootfs(uncleanRootfs string) (string, error) {
	absolute, err := filepath.Abs(uncleanRootfs)
	if err == nil {
		return filepath.EvalSymlinks(absolute)
	}
	return "", err
}
|
||||
|
||||
// ExitStatus returns the correct exit status for a process based on if it
|
||||
// was signaled or exited cleanly
|
||||
func ExitStatus(status unix.WaitStatus) int {
|
||||
if status.Signaled() {
|
||||
return exitSignalOffset + int(status.Signal())
|
||||
}
|
||||
return status.ExitStatus()
|
||||
}
|
||||
|
||||
// WriteJSON marshals v with encoding/json and writes the resulting bytes to w
// in a single Write call. Nothing is written if marshaling fails; the
// marshaling or write error is returned.
func WriteJSON(w io.Writer, v interface{}) error {
	encoded, err := json.Marshal(v)
	if err == nil {
		_, err = w.Write(encoded)
	}
	return err
}
|
||||
|
||||
// CleanPath normalizes path so that it can be appended to another path with
// filepath.Join without lexically escaping it. Absolute paths are simply
// cleaned. Relative paths are cleaned as if they were rooted at the path
// separator and then made relative again, which strips any leading ".."
// components. An empty path is returned unchanged.
//
// The processing is purely lexical: symlinks are not considered, so a path
// that traverses a symlink is not made safe by this function.
func CleanPath(path string) string {
	if path == "" {
		return ""
	}

	root := string(os.PathSeparator)

	// First pass removes redundant elements such as "/../../../".
	cleaned := filepath.Clean(path)

	// Relative paths such as "../../x" are anchored at the root, cleaned
	// there, and converted back; absolute paths stay absolute.
	if !filepath.IsAbs(cleaned) {
		anchored := filepath.Clean(root + cleaned)
		// The error is ignored: the anchored path is always below root.
		cleaned, _ = filepath.Rel(root, anchored)
	}

	return filepath.Clean(cleaned)
}
|
||||
|
||||
// SearchLabels looks through labels, each expected in "key=value" form, and
// returns the value of the first entry whose key equals query. Entries
// without an '=' are skipped; only the first '=' separates key from value.
// An empty string is returned when no entry matches.
func SearchLabels(labels []string, query string) string {
	for _, label := range labels {
		sep := strings.Index(label, "=")
		if sep < 0 {
			continue
		}
		if label[:sep] == query {
			return label[sep+1:]
		}
	}
	return ""
}
|
||||
|
||||
// Annotations splits "key=value" labels into the bundle path and the
// remaining user annotations. The value of the "bundle" key is returned
// separately (it is a label added by libcontainer rather than by the user);
// every other key/value pair lands in the returned map. Entries without an
// '=' are ignored, and only the first '=' separates key from value. The map
// is always non-nil.
func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
	userAnnotations = map[string]string{}
	for _, label := range labels {
		sep := strings.Index(label, "=")
		if sep < 0 {
			continue
		}
		key, value := label[:sep], label[sep+1:]
		switch key {
		case "bundle":
			bundle = value
		default:
			userAnnotations[key] = value
		}
	}
	return bundle, userAnnotations
}
|
||||
|
||||
// GetIntSize returns the size, in bytes, of Go's int type on this platform.
func GetIntSize() int {
	var sample int
	return int(unsafe.Sizeof(sample))
}
|
|
@ -1,44 +0,0 @@
|
|||
// +build !windows
|
||||
|
||||
package utils
|
||||
|
||||
import (
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"strconv"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
func CloseExecFrom(minFd int) error {
|
||||
fdList, err := ioutil.ReadDir("/proc/self/fd")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, fi := range fdList {
|
||||
fd, err := strconv.Atoi(fi.Name())
|
||||
if err != nil {
|
||||
// ignore non-numeric file names
|
||||
continue
|
||||
}
|
||||
|
||||
if fd < minFd {
|
||||
// ignore descriptors lower than our specified minimum
|
||||
continue
|
||||
}
|
||||
|
||||
// intentionally ignore errors from unix.CloseOnExec
|
||||
unix.CloseOnExec(fd)
|
||||
// the cases where this might fail are basically file descriptors that have already been closed (including and especially the one that was created when ioutil.ReadDir did the "opendir" syscall)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// NewSockPair returns a new unix socket pair
|
||||
func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
|
||||
fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
|
||||
}
|
|
@ -1,22 +0,0 @@
|
|||
Copyright (c) 2015 Matthew Heon <mheon@redhat.com>
|
||||
Copyright (c) 2015 Paul Moore <pmoore@redhat.com>
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
- Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -1,26 +0,0 @@
|
|||
libseccomp-golang: Go Language Bindings for the libseccomp Project
|
||||
===============================================================================
|
||||
https://github.com/seccomp/libseccomp-golang
|
||||
https://github.com/seccomp/libseccomp
|
||||
|
||||
The libseccomp library provides an easy to use, platform independent, interface
|
||||
to the Linux Kernel's syscall filtering mechanism. The libseccomp API is
|
||||
designed to abstract away the underlying BPF based syscall filter language and
|
||||
present a more conventional function-call based filtering interface that should
|
||||
be familiar to, and easily adopted by, application developers.
|
||||
|
||||
The libseccomp-golang library provides a Go based interface to the libseccomp
|
||||
library.
|
||||
|
||||
* Online Resources
|
||||
|
||||
The library source repository currently lives on GitHub at the following URLs:
|
||||
|
||||
-> https://github.com/seccomp/libseccomp-golang
|
||||
-> https://github.com/seccomp/libseccomp
|
||||
|
||||
The project mailing list is currently hosted on Google Groups at the URL below,
|
||||
please note that a Google account is not required to subscribe to the mailing
|
||||
list.
|
||||
|
||||
-> https://groups.google.com/d/forum/libseccomp
|
|
@ -1,857 +0,0 @@
|
|||
// +build linux
|
||||
|
||||
// Public API specification for libseccomp Go bindings
|
||||
// Contains public API for the bindings
|
||||
|
||||
// Package seccomp provides bindings for libseccomp, a library wrapping the Linux
|
||||
// seccomp syscall. Seccomp enables an application to restrict system call use
|
||||
// for itself and its children.
|
||||
package seccomp
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"runtime"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
// C wrapping code
|
||||
|
||||
// #cgo pkg-config: libseccomp
|
||||
// #include <stdlib.h>
|
||||
// #include <seccomp.h>
|
||||
import "C"
|
||||
|
||||
// Exported types
|
||||
|
||||
type (
	// ScmpArch identifies a CPU architecture. Seccomp can restrict syscalls
	// on a per-architecture basis.
	ScmpArch uint

	// ScmpAction is the action taken when a filter rule matches in
	// libseccomp.
	ScmpAction uint

	// ScmpCompareOp is a comparison operator usable in a filter rule.
	ScmpCompareOp uint

	// ScmpCondition is a rule in a libseccomp filter context: the syscall
	// argument index, the comparison operator, and up to two operands.
	ScmpCondition struct {
		Argument uint          `json:"argument,omitempty"`
		Op       ScmpCompareOp `json:"operator,omitempty"`
		Operand1 uint64        `json:"operand_one,omitempty"`
		Operand2 uint64        `json:"operand_two,omitempty"`
	}

	// ScmpSyscall is a Linux system call number.
	ScmpSyscall int32
)
|
||||
|
||||
// Exported Constants
|
||||
|
||||
const (
|
||||
// Valid architectures recognized by libseccomp
|
||||
// ARM64 and all MIPS architectures are unsupported by versions of the
|
||||
// library before v2.2 and will return errors if used
|
||||
|
||||
// ArchInvalid is a placeholder to ensure uninitialized ScmpArch
|
||||
// variables are invalid
|
||||
ArchInvalid ScmpArch = iota
|
||||
// ArchNative is the native architecture of the kernel
|
||||
ArchNative ScmpArch = iota
|
||||
// ArchX86 represents 32-bit x86 syscalls
|
||||
ArchX86 ScmpArch = iota
|
||||
// ArchAMD64 represents 64-bit x86-64 syscalls
|
||||
ArchAMD64 ScmpArch = iota
|
||||
// ArchX32 represents 64-bit x86-64 syscalls (32-bit pointers)
|
||||
ArchX32 ScmpArch = iota
|
||||
// ArchARM represents 32-bit ARM syscalls
|
||||
ArchARM ScmpArch = iota
|
||||
// ArchARM64 represents 64-bit ARM syscalls
|
||||
ArchARM64 ScmpArch = iota
|
||||
// ArchMIPS represents 32-bit MIPS syscalls
|
||||
ArchMIPS ScmpArch = iota
|
||||
// ArchMIPS64 represents 64-bit MIPS syscalls
|
||||
ArchMIPS64 ScmpArch = iota
|
||||
// ArchMIPS64N32 represents 64-bit MIPS syscalls (32-bit pointers)
|
||||
ArchMIPS64N32 ScmpArch = iota
|
||||
// ArchMIPSEL represents 32-bit MIPS syscalls (little endian)
|
||||
ArchMIPSEL ScmpArch = iota
|
||||
// ArchMIPSEL64 represents 64-bit MIPS syscalls (little endian)
|
||||
ArchMIPSEL64 ScmpArch = iota
|
||||
// ArchMIPSEL64N32 represents 64-bit MIPS syscalls (little endian,
|
||||
// 32-bit pointers)
|
||||
ArchMIPSEL64N32 ScmpArch = iota
|
||||
// ArchPPC represents 32-bit POWERPC syscalls
|
||||
ArchPPC ScmpArch = iota
|
||||
// ArchPPC64 represents 64-bit POWER syscalls (big endian)
|
||||
ArchPPC64 ScmpArch = iota
|
||||
// ArchPPC64LE represents 64-bit POWER syscalls (little endian)
|
||||
ArchPPC64LE ScmpArch = iota
|
||||
// ArchS390 represents 31-bit System z/390 syscalls
|
||||
ArchS390 ScmpArch = iota
|
||||
// ArchS390X represents 64-bit System z/390 syscalls
|
||||
ArchS390X ScmpArch = iota
|
||||
)
|
||||
|
||||
const (
|
||||
// Supported actions on filter match
|
||||
|
||||
// ActInvalid is a placeholder to ensure uninitialized ScmpAction
|
||||
// variables are invalid
|
||||
ActInvalid ScmpAction = iota
|
||||
// ActKill kills the process
|
||||
ActKill ScmpAction = iota
|
||||
// ActTrap throws SIGSYS
|
||||
ActTrap ScmpAction = iota
|
||||
// ActErrno causes the syscall to return a negative error code. This
|
||||
// code can be set with the SetReturnCode method
|
||||
ActErrno ScmpAction = iota
|
||||
// ActTrace causes the syscall to notify tracing processes with the
|
||||
// given error code. This code can be set with the SetReturnCode method
|
||||
ActTrace ScmpAction = iota
|
||||
// ActAllow permits the syscall to continue execution
|
||||
ActAllow ScmpAction = iota
|
||||
)
|
||||
|
||||
const (
|
||||
// These are comparison operators used in conditional seccomp rules
|
||||
// They are used to compare the value of a single argument of a syscall
|
||||
// against a user-defined constant
|
||||
|
||||
// CompareInvalid is a placeholder to ensure uninitialized ScmpCompareOp
|
||||
// variables are invalid
|
||||
CompareInvalid ScmpCompareOp = iota
|
||||
// CompareNotEqual returns true if the argument is not equal to the
|
||||
// given value
|
||||
CompareNotEqual ScmpCompareOp = iota
|
||||
// CompareLess returns true if the argument is less than the given value
|
||||
CompareLess ScmpCompareOp = iota
|
||||
// CompareLessOrEqual returns true if the argument is less than or equal
|
||||
// to the given value
|
||||
CompareLessOrEqual ScmpCompareOp = iota
|
||||
// CompareEqual returns true if the argument is equal to the given value
|
||||
CompareEqual ScmpCompareOp = iota
|
||||
// CompareGreaterEqual returns true if the argument is greater than or
|
||||
// equal to the given value
|
||||
CompareGreaterEqual ScmpCompareOp = iota
|
||||
// CompareGreater returns true if the argument is greater than the given
|
||||
// value
|
||||
CompareGreater ScmpCompareOp = iota
|
||||
// CompareMaskedEqual returns true if the argument is equal to the given
|
||||
// value, when masked (bitwise &) against the second given value
|
||||
CompareMaskedEqual ScmpCompareOp = iota
|
||||
)
|
||||
|
||||
// Helpers for types
|
||||
|
||||
// GetArchFromString returns an ScmpArch constant from a string representing an
|
||||
// architecture
|
||||
func GetArchFromString(arch string) (ScmpArch, error) {
|
||||
switch strings.ToLower(arch) {
|
||||
case "x86":
|
||||
return ArchX86, nil
|
||||
case "amd64", "x86-64", "x86_64", "x64":
|
||||
return ArchAMD64, nil
|
||||
case "x32":
|
||||
return ArchX32, nil
|
||||
case "arm":
|
||||
return ArchARM, nil
|
||||
case "arm64", "aarch64":
|
||||
return ArchARM64, nil
|
||||
case "mips":
|
||||
return ArchMIPS, nil
|
||||
case "mips64":
|
||||
return ArchMIPS64, nil
|
||||
case "mips64n32":
|
||||
return ArchMIPS64N32, nil
|
||||
case "mipsel":
|
||||
return ArchMIPSEL, nil
|
||||
case "mipsel64":
|
||||
return ArchMIPSEL64, nil
|
||||
case "mipsel64n32":
|
||||
return ArchMIPSEL64N32, nil
|
||||
case "ppc":
|
||||
return ArchPPC, nil
|
||||
case "ppc64":
|
||||
return ArchPPC64, nil
|
||||
case "ppc64le":
|
||||
return ArchPPC64LE, nil
|
||||
case "s390":
|
||||
return ArchS390, nil
|
||||
case "s390x":
|
||||
return ArchS390X, nil
|
||||
default:
|
||||
return ArchInvalid, fmt.Errorf("cannot convert unrecognized string %s", arch)
|
||||
}
|
||||
}
|
||||
|
||||
// String returns a string representation of an architecture constant
|
||||
func (a ScmpArch) String() string {
|
||||
switch a {
|
||||
case ArchX86:
|
||||
return "x86"
|
||||
case ArchAMD64:
|
||||
return "amd64"
|
||||
case ArchX32:
|
||||
return "x32"
|
||||
case ArchARM:
|
||||
return "arm"
|
||||
case ArchARM64:
|
||||
return "arm64"
|
||||
case ArchMIPS:
|
||||
return "mips"
|
||||
case ArchMIPS64:
|
||||
return "mips64"
|
||||
case ArchMIPS64N32:
|
||||
return "mips64n32"
|
||||
case ArchMIPSEL:
|
||||
return "mipsel"
|
||||
case ArchMIPSEL64:
|
||||
return "mipsel64"
|
||||
case ArchMIPSEL64N32:
|
||||
return "mipsel64n32"
|
||||
case ArchPPC:
|
||||
return "ppc"
|
||||
case ArchPPC64:
|
||||
return "ppc64"
|
||||
case ArchPPC64LE:
|
||||
return "ppc64le"
|
||||
case ArchS390:
|
||||
return "s390"
|
||||
case ArchS390X:
|
||||
return "s390x"
|
||||
case ArchNative:
|
||||
return "native"
|
||||
case ArchInvalid:
|
||||
return "Invalid architecture"
|
||||
default:
|
||||
return "Unknown architecture"
|
||||
}
|
||||
}
|
||||
|
||||
// String returns a string representation of a comparison operator constant
|
||||
func (a ScmpCompareOp) String() string {
|
||||
switch a {
|
||||
case CompareNotEqual:
|
||||
return "Not equal"
|
||||
case CompareLess:
|
||||
return "Less than"
|
||||
case CompareLessOrEqual:
|
||||
return "Less than or equal to"
|
||||
case CompareEqual:
|
||||
return "Equal"
|
||||
case CompareGreaterEqual:
|
||||
return "Greater than or equal to"
|
||||
case CompareGreater:
|
||||
return "Greater than"
|
||||
case CompareMaskedEqual:
|
||||
return "Masked equality"
|
||||
case CompareInvalid:
|
||||
return "Invalid comparison operator"
|
||||
default:
|
||||
return "Unrecognized comparison operator"
|
||||
}
|
||||
}
|
||||
|
||||
// String returns a string representation of a seccomp match action
|
||||
func (a ScmpAction) String() string {
|
||||
switch a & 0xFFFF {
|
||||
case ActKill:
|
||||
return "Action: Kill Process"
|
||||
case ActTrap:
|
||||
return "Action: Send SIGSYS"
|
||||
case ActErrno:
|
||||
return fmt.Sprintf("Action: Return error code %d", (a >> 16))
|
||||
case ActTrace:
|
||||
return fmt.Sprintf("Action: Notify tracing processes with code %d",
|
||||
(a >> 16))
|
||||
case ActAllow:
|
||||
return "Action: Allow system call"
|
||||
default:
|
||||
return "Unrecognized Action"
|
||||
}
|
||||
}
|
||||
|
||||
// SetReturnCode adds a return code to a supporting ScmpAction, clearing any
|
||||
// existing code Only valid on ActErrno and ActTrace. Takes no action otherwise.
|
||||
// Accepts 16-bit return code as argument.
|
||||
// Returns a valid ScmpAction of the original type with the new error code set.
|
||||
func (a ScmpAction) SetReturnCode(code int16) ScmpAction {
|
||||
aTmp := a & 0x0000FFFF
|
||||
if aTmp == ActErrno || aTmp == ActTrace {
|
||||
return (aTmp | (ScmpAction(code)&0xFFFF)<<16)
|
||||
}
|
||||
return a
|
||||
}
|
||||
|
||||
// GetReturnCode returns the return code of an ScmpAction
|
||||
func (a ScmpAction) GetReturnCode() int16 {
|
||||
return int16(a >> 16)
|
||||
}
|
||||
|
||||
// General utility functions
|
||||
|
||||
// GetLibraryVersion returns the version of the library the bindings are built
|
||||
// against.
|
||||
// The version is formatted as follows: Major.Minor.Micro
|
||||
func GetLibraryVersion() (major, minor, micro int) {
|
||||
return verMajor, verMinor, verMicro
|
||||
}
|
||||
|
||||
// Syscall functions
|
||||
|
||||
// GetName retrieves the name of a syscall from its number.
|
||||
// Acts on any syscall number.
|
||||
// Returns either a string containing the name of the syscall, or an error.
|
||||
func (s ScmpSyscall) GetName() (string, error) {
|
||||
return s.GetNameByArch(ArchNative)
|
||||
}
|
||||
|
||||
// GetNameByArch retrieves the name of a syscall from its number for a given
|
||||
// architecture.
|
||||
// Acts on any syscall number.
|
||||
// Accepts a valid architecture constant.
|
||||
// Returns either a string containing the name of the syscall, or an error.
|
||||
// if the syscall is unrecognized or an issue occurred.
|
||||
func (s ScmpSyscall) GetNameByArch(arch ScmpArch) (string, error) {
|
||||
if err := sanitizeArch(arch); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
cString := C.seccomp_syscall_resolve_num_arch(arch.toNative(), C.int(s))
|
||||
if cString == nil {
|
||||
return "", fmt.Errorf("could not resolve syscall name")
|
||||
}
|
||||
defer C.free(unsafe.Pointer(cString))
|
||||
|
||||
finalStr := C.GoString(cString)
|
||||
return finalStr, nil
|
||||
}
|
||||
|
||||
// GetSyscallFromName returns the number of a syscall by name on the kernel's
|
||||
// native architecture.
|
||||
// Accepts a string containing the name of a syscall.
|
||||
// Returns the number of the syscall, or an error if no syscall with that name
|
||||
// was found.
|
||||
func GetSyscallFromName(name string) (ScmpSyscall, error) {
|
||||
cString := C.CString(name)
|
||||
defer C.free(unsafe.Pointer(cString))
|
||||
|
||||
result := C.seccomp_syscall_resolve_name(cString)
|
||||
if result == scmpError {
|
||||
return 0, fmt.Errorf("could not resolve name to syscall")
|
||||
}
|
||||
|
||||
return ScmpSyscall(result), nil
|
||||
}
|
||||
|
||||
// GetSyscallFromNameByArch returns the number of a syscall by name for a given
|
||||
// architecture's ABI.
|
||||
// Accepts the name of a syscall and an architecture constant.
|
||||
// Returns the number of the syscall, or an error if an invalid architecture is
|
||||
// passed or a syscall with that name was not found.
|
||||
func GetSyscallFromNameByArch(name string, arch ScmpArch) (ScmpSyscall, error) {
|
||||
if err := sanitizeArch(arch); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
cString := C.CString(name)
|
||||
defer C.free(unsafe.Pointer(cString))
|
||||
|
||||
result := C.seccomp_syscall_resolve_name_arch(arch.toNative(), cString)
|
||||
if result == scmpError {
|
||||
return 0, fmt.Errorf("could not resolve name to syscall")
|
||||
}
|
||||
|
||||
return ScmpSyscall(result), nil
|
||||
}
|
||||
|
||||
// MakeCondition creates and returns a new condition to attach to a filter rule.
|
||||
// Associated rules will only match if this condition is true.
|
||||
// Accepts the number the argument we are checking, and a comparison operator
|
||||
// and value to compare to.
|
||||
// The rule will match if argument $arg (zero-indexed) of the syscall is
|
||||
// $COMPARE_OP the provided comparison value.
|
||||
// Some comparison operators accept two values. Masked equals, for example,
|
||||
// will mask $arg of the syscall with the second value provided (via bitwise
|
||||
// AND) and then compare against the first value provided.
|
||||
// For example, in the less than or equal case, if the syscall argument was
|
||||
// 0 and the value provided was 1, the condition would match, as 0 is less
|
||||
// than or equal to 1.
|
||||
// Return either an error on bad argument or a valid ScmpCondition struct.
|
||||
func MakeCondition(arg uint, comparison ScmpCompareOp, values ...uint64) (ScmpCondition, error) {
|
||||
var condStruct ScmpCondition
|
||||
|
||||
if comparison == CompareInvalid {
|
||||
return condStruct, fmt.Errorf("invalid comparison operator")
|
||||
} else if arg > 5 {
|
||||
return condStruct, fmt.Errorf("syscalls only have up to 6 arguments")
|
||||
} else if len(values) > 2 {
|
||||
return condStruct, fmt.Errorf("conditions can have at most 2 arguments")
|
||||
} else if len(values) == 0 {
|
||||
return condStruct, fmt.Errorf("must provide at least one value to compare against")
|
||||
}
|
||||
|
||||
condStruct.Argument = arg
|
||||
condStruct.Op = comparison
|
||||
condStruct.Operand1 = values[0]
|
||||
if len(values) == 2 {
|
||||
condStruct.Operand2 = values[1]
|
||||
} else {
|
||||
condStruct.Operand2 = 0 // Unused
|
||||
}
|
||||
|
||||
return condStruct, nil
|
||||
}
|
||||
|
||||
// Utility Functions
|
||||
|
||||
// GetNativeArch returns architecture token representing the native kernel
|
||||
// architecture
|
||||
func GetNativeArch() (ScmpArch, error) {
|
||||
arch := C.seccomp_arch_native()
|
||||
|
||||
return archFromNative(arch)
|
||||
}
|
||||
|
||||
// Public Filter API
|
||||
|
||||
// ScmpFilter represents a filter context in libseccomp.
|
||||
// A filter context is initially empty. Rules can be added to it, and it can
|
||||
// then be loaded into the kernel.
|
||||
type ScmpFilter struct {
|
||||
// filterCtx is the libseccomp context handle this filter wraps.
filterCtx C.scmp_filter_ctx
|
||||
// valid reports whether filterCtx may still be used; Release and Merge
// (for the merged-in source filter) set it to false.
valid bool
|
||||
// lock is held by the filter's methods while they read or change valid
// and while they call into libseccomp with filterCtx.
lock sync.Mutex
|
||||
}
|
||||
|
||||
// NewFilter creates and returns a new filter context.
|
||||
// Accepts a default action to be taken for syscalls which match no rules in
|
||||
// the filter.
|
||||
// Returns a reference to a valid filter context, or nil and an error if the
|
||||
// filter context could not be created or an invalid default action was given.
|
||||
func NewFilter(defaultAction ScmpAction) (*ScmpFilter, error) {
|
||||
if err := sanitizeAction(defaultAction); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
fPtr := C.seccomp_init(defaultAction.toNative())
|
||||
if fPtr == nil {
|
||||
return nil, fmt.Errorf("could not create filter")
|
||||
}
|
||||
|
||||
filter := new(ScmpFilter)
|
||||
filter.filterCtx = fPtr
|
||||
filter.valid = true
|
||||
runtime.SetFinalizer(filter, filterFinalizer)
|
||||
|
||||
return filter, nil
|
||||
}
|
||||
|
||||
// IsValid determines whether a filter context is valid to use.
|
||||
// Some operations (Release and Merge) render filter contexts invalid and
|
||||
// consequently prevent further use.
|
||||
func (f *ScmpFilter) IsValid() bool {
|
||||
f.lock.Lock()
|
||||
defer f.lock.Unlock()
|
||||
|
||||
return f.valid
|
||||
}
|
||||
|
||||
// Reset resets a filter context, removing all its existing state.
|
||||
// Accepts a new default action to be taken for syscalls which do not match.
|
||||
// Returns an error if the filter or action provided are invalid.
|
||||
func (f *ScmpFilter) Reset(defaultAction ScmpAction) error {
|
||||
f.lock.Lock()
|
||||
defer f.lock.Unlock()
|
||||
|
||||
if err := sanitizeAction(defaultAction); err != nil {
|
||||
return err
|
||||
} else if !f.valid {
|
||||
return errBadFilter
|
||||
}
|
||||
|
||||
retCode := C.seccomp_reset(f.filterCtx, defaultAction.toNative())
|
||||
if retCode != 0 {
|
||||
return syscall.Errno(-1 * retCode)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Release releases a filter context, freeing its memory. Should be called after
|
||||
// loading into the kernel, when the filter is no longer needed.
|
||||
// After calling this function, the given filter is no longer valid and cannot
|
||||
// be used.
|
||||
// Release() will be invoked automatically when a filter context is garbage
|
||||
// collected, but can also be called manually to free memory.
|
||||
func (f *ScmpFilter) Release() {
|
||||
f.lock.Lock()
|
||||
defer f.lock.Unlock()
|
||||
|
||||
if !f.valid {
|
||||
return
|
||||
}
|
||||
|
||||
f.valid = false
|
||||
C.seccomp_release(f.filterCtx)
|
||||
}
|
||||
|
||||
// Merge merges two filter contexts.
|
||||
// The source filter src will be released as part of the process, and will no
|
||||
// longer be usable or valid after this call.
|
||||
// To be merged, filters must NOT share any architectures, and all their
|
||||
// attributes (Default Action, Bad Arch Action, No New Privs and TSync bools)
|
||||
// must match.
|
||||
// The filter src will be merged into the filter this is called on.
|
||||
// The architectures of the src filter not present in the destination, and all
|
||||
// associated rules, will be added to the destination.
|
||||
// Returns an error if merging the filters failed.
|
||||
func (f *ScmpFilter) Merge(src *ScmpFilter) error {
|
||||
f.lock.Lock()
|
||||
defer f.lock.Unlock()
|
||||
|
||||
src.lock.Lock()
|
||||
defer src.lock.Unlock()
|
||||
|
||||
if !src.valid || !f.valid {
|
||||
return fmt.Errorf("one or more of the filter contexts is invalid or uninitialized")
|
||||
}
|
||||
|
||||
// Merge the filters
|
||||
retCode := C.seccomp_merge(f.filterCtx, src.filterCtx)
|
||||
if syscall.Errno(-1*retCode) == syscall.EINVAL {
|
||||
return fmt.Errorf("filters could not be merged due to a mismatch in attributes or invalid filter")
|
||||
} else if retCode != 0 {
|
||||
return syscall.Errno(-1 * retCode)
|
||||
}
|
||||
|
||||
src.valid = false
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// IsArchPresent checks if an architecture is present in a filter.
|
||||
// If a filter contains an architecture, it uses its default action for
|
||||
// syscalls which do not match rules in it, and its rules can match syscalls
|
||||
// for that ABI.
|
||||
// If a filter does not contain an architecture, all syscalls made to that
|
||||
// kernel ABI will fail with the filter's default Bad Architecture Action
|
||||
// (by default, killing the process).
|
||||
// Accepts an architecture constant.
|
||||
// Returns true if the architecture is present in the filter, false otherwise,
|
||||
// and an error on an invalid filter context, architecture constant, or an
|
||||
// issue with the call to libseccomp.
|
||||
func (f *ScmpFilter) IsArchPresent(arch ScmpArch) (bool, error) {
|
||||
f.lock.Lock()
|
||||
defer f.lock.Unlock()
|
||||
|
||||
if err := sanitizeArch(arch); err != nil {
|
||||
return false, err
|
||||
} else if !f.valid {
|
||||
return false, errBadFilter
|
||||
}
|
||||
|
||||
retCode := C.seccomp_arch_exist(f.filterCtx, arch.toNative())
|
||||
if syscall.Errno(-1*retCode) == syscall.EEXIST {
|
||||
// -EEXIST is "arch not present"
|
||||
return false, nil
|
||||
} else if retCode != 0 {
|
||||
return false, syscall.Errno(-1 * retCode)
|
||||
}
|
||||
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// AddArch adds an architecture to the filter.
|
||||
// Accepts an architecture constant.
|
||||
// Returns an error on invalid filter context or architecture token, or an
|
||||
// issue with the call to libseccomp.
|
||||
func (f *ScmpFilter) AddArch(arch ScmpArch) error {
|
||||
f.lock.Lock()
|
||||
defer f.lock.Unlock()
|
||||
|
||||
if err := sanitizeArch(arch); err != nil {
|
||||
return err
|
||||
} else if !f.valid {
|
||||
return errBadFilter
|
||||
}
|
||||
|
||||
// Libseccomp returns -EEXIST if the specified architecture is already
|
||||
// present. Succeed silently in this case, as it's not fatal, and the
|
||||
// architecture is present already.
|
||||
retCode := C.seccomp_arch_add(f.filterCtx, arch.toNative())
|
||||
if retCode != 0 && syscall.Errno(-1*retCode) != syscall.EEXIST {
|
||||
return syscall.Errno(-1 * retCode)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// RemoveArch removes an architecture from the filter.
|
||||
// Accepts an architecture constant.
|
||||
// Returns an error on invalid filter context or architecture token, or an
|
||||
// issue with the call to libseccomp.
|
||||
func (f *ScmpFilter) RemoveArch(arch ScmpArch) error {
|
||||
f.lock.Lock()
|
||||
defer f.lock.Unlock()
|
||||
|
||||
if err := sanitizeArch(arch); err != nil {
|
||||
return err
|
||||
} else if !f.valid {
|
||||
return errBadFilter
|
||||
}
|
||||
|
||||
// Similar to AddArch, -EEXIST is returned if the arch is not present
|
||||
// Succeed silently in that case, this is not fatal and the architecture
|
||||
// is not present in the filter after RemoveArch
|
||||
retCode := C.seccomp_arch_remove(f.filterCtx, arch.toNative())
|
||||
if retCode != 0 && syscall.Errno(-1*retCode) != syscall.EEXIST {
|
||||
return syscall.Errno(-1 * retCode)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Load loads a filter context into the kernel.
|
||||
// Returns an error if the filter context is invalid or the syscall failed.
|
||||
func (f *ScmpFilter) Load() error {
|
||||
f.lock.Lock()
|
||||
defer f.lock.Unlock()
|
||||
|
||||
if !f.valid {
|
||||
return errBadFilter
|
||||
}
|
||||
|
||||
if retCode := C.seccomp_load(f.filterCtx); retCode != 0 {
|
||||
return syscall.Errno(-1 * retCode)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetDefaultAction returns the default action taken on a syscall which does not
|
||||
// match a rule in the filter, or an error if an issue was encountered
|
||||
// retrieving the value.
|
||||
func (f *ScmpFilter) GetDefaultAction() (ScmpAction, error) {
|
||||
action, err := f.getFilterAttr(filterAttrActDefault)
|
||||
if err != nil {
|
||||
return 0x0, err
|
||||
}
|
||||
|
||||
return actionFromNative(action)
|
||||
}
|
||||
|
||||
// GetBadArchAction returns the default action taken on a syscall for an
|
||||
// architecture not in the filter, or an error if an issue was encountered
|
||||
// retrieving the value.
|
||||
func (f *ScmpFilter) GetBadArchAction() (ScmpAction, error) {
|
||||
action, err := f.getFilterAttr(filterAttrActBadArch)
|
||||
if err != nil {
|
||||
return 0x0, err
|
||||
}
|
||||
|
||||
return actionFromNative(action)
|
||||
}
|
||||
|
||||
// GetNoNewPrivsBit returns the current state the No New Privileges bit will be set
|
||||
// to on the filter being loaded, or an error if an issue was encountered
|
||||
// retrieving the value.
|
||||
// The No New Privileges bit tells the kernel that new processes run with exec()
|
||||
// cannot gain more privileges than the process that ran exec().
|
||||
// For example, a process with No New Privileges set would be unable to exec
|
||||
// setuid/setgid executables.
|
||||
func (f *ScmpFilter) GetNoNewPrivsBit() (bool, error) {
|
||||
noNewPrivs, err := f.getFilterAttr(filterAttrNNP)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
if noNewPrivs == 0 {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// GetTsyncBit returns whether Thread Synchronization will be enabled on the
|
||||
// filter being loaded, or an error if an issue was encountered retrieving the
|
||||
// value.
|
||||
// Thread Sync ensures that all members of the thread group of the calling
|
||||
// process will share the same Seccomp filter set.
|
||||
// Tsync is a fairly recent addition to the Linux kernel and older kernels
|
||||
// lack support. If the running kernel does not support Tsync and it is
|
||||
// requested in a filter, Libseccomp will not enable TSync support and will
|
||||
// proceed as normal.
|
||||
// This function is unavailable before v2.2 of libseccomp and will return an
|
||||
// error.
|
||||
func (f *ScmpFilter) GetTsyncBit() (bool, error) {
|
||||
tSync, err := f.getFilterAttr(filterAttrTsync)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
if tSync == 0 {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// SetBadArchAction sets the default action taken on a syscall for an
|
||||
// architecture not in the filter, or an error if an issue was encountered
|
||||
// setting the value.
|
||||
func (f *ScmpFilter) SetBadArchAction(action ScmpAction) error {
|
||||
if err := sanitizeAction(action); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return f.setFilterAttr(filterAttrActBadArch, action.toNative())
|
||||
}
|
||||
|
||||
// SetNoNewPrivsBit sets the state of the No New Privileges bit, which will be
|
||||
// applied on filter load, or an error if an issue was encountered setting the
|
||||
// value.
|
||||
// Filters with No New Privileges set to 0 can only be loaded if the process
|
||||
// has the CAP_SYS_ADMIN capability.
|
||||
func (f *ScmpFilter) SetNoNewPrivsBit(state bool) error {
|
||||
var toSet C.uint32_t = 0x0
|
||||
|
||||
if state {
|
||||
toSet = 0x1
|
||||
}
|
||||
|
||||
return f.setFilterAttr(filterAttrNNP, toSet)
|
||||
}
|
||||
|
||||
// SetTsync sets whether Thread Synchronization will be enabled on the filter
|
||||
// being loaded. Returns an error if setting Tsync failed, or the filter is
|
||||
// invalid.
|
||||
// Thread Sync ensures that all members of the thread group of the calling
|
||||
// process will share the same Seccomp filter set.
|
||||
// Tsync is a fairly recent addition to the Linux kernel and older kernels
|
||||
// lack support. If the running kernel does not support Tsync and it is
|
||||
// requested in a filter, Libseccomp will not enable TSync support and will
|
||||
// proceed as normal.
|
||||
// This function is unavailable before v2.2 of libseccomp and will return an
|
||||
// error.
|
||||
func (f *ScmpFilter) SetTsync(enable bool) error {
|
||||
var toSet C.uint32_t = 0x0
|
||||
|
||||
if enable {
|
||||
toSet = 0x1
|
||||
}
|
||||
|
||||
return f.setFilterAttr(filterAttrTsync, toSet)
|
||||
}
|
||||
|
||||
// SetSyscallPriority sets a syscall's priority.
|
||||
// This provides a hint to the filter generator in libseccomp about the
|
||||
// importance of this syscall. High-priority syscalls are placed
|
||||
// first in the filter code, and incur less overhead (at the expense of
|
||||
// lower-priority syscalls).
|
||||
func (f *ScmpFilter) SetSyscallPriority(call ScmpSyscall, priority uint8) error {
|
||||
f.lock.Lock()
|
||||
defer f.lock.Unlock()
|
||||
|
||||
if !f.valid {
|
||||
return errBadFilter
|
||||
}
|
||||
|
||||
if retCode := C.seccomp_syscall_priority(f.filterCtx, C.int(call),
|
||||
C.uint8_t(priority)); retCode != 0 {
|
||||
return syscall.Errno(-1 * retCode)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// AddRule adds a single rule for an unconditional action on a syscall.
|
||||
// Accepts the number of the syscall and the action to be taken on the call
|
||||
// being made.
|
||||
// Returns an error if an issue was encountered adding the rule.
|
||||
func (f *ScmpFilter) AddRule(call ScmpSyscall, action ScmpAction) error {
|
||||
return f.addRuleGeneric(call, action, false, nil)
|
||||
}
|
||||
|
||||
// AddRuleExact adds a single rule for an unconditional action on a syscall.
|
||||
// Accepts the number of the syscall and the action to be taken on the call
|
||||
// being made.
|
||||
// No modifications will be made to the rule, and it will fail to add if it
|
||||
// cannot be applied to the current architecture without modification.
|
||||
// The rule will function exactly as described, but it may not function identically
|
||||
// (or be able to be applied to) all architectures.
|
||||
// Returns an error if an issue was encountered adding the rule.
|
||||
func (f *ScmpFilter) AddRuleExact(call ScmpSyscall, action ScmpAction) error {
|
||||
return f.addRuleGeneric(call, action, true, nil)
|
||||
}
|
||||
|
||||
// AddRuleConditional adds a single rule for a conditional action on a syscall.
|
||||
// Returns an error if an issue was encountered adding the rule.
|
||||
// All conditions must match for the rule to match.
|
||||
// There is a bug in library versions below v2.2.1 which can, in some cases,
|
||||
// cause conditions to be lost when more than one are used. Consequently,
|
||||
// AddRuleConditional is disabled on library versions lower than v2.2.1
|
||||
func (f *ScmpFilter) AddRuleConditional(call ScmpSyscall, action ScmpAction, conds []ScmpCondition) error {
|
||||
return f.addRuleGeneric(call, action, false, conds)
|
||||
}
|
||||
|
||||
// AddRuleConditionalExact adds a single rule for a conditional action on a
|
||||
// syscall.
|
||||
// No modifications will be made to the rule, and it will fail to add if it
|
||||
// cannot be applied to the current architecture without modification.
|
||||
// The rule will function exactly as described, but it may not function identically
|
||||
// (or be able to be applied to) all architectures.
|
||||
// Returns an error if an issue was encountered adding the rule.
|
||||
// There is a bug in library versions below v2.2.1 which can, in some cases,
|
||||
// cause conditions to be lost when more than one are used. Consequently,
|
||||
// AddRuleConditionalExact is disabled on library versions lower than v2.2.1
|
||||
func (f *ScmpFilter) AddRuleConditionalExact(call ScmpSyscall, action ScmpAction, conds []ScmpCondition) error {
|
||||
return f.addRuleGeneric(call, action, true, conds)
|
||||
}
|
||||
|
||||
// ExportPFC output PFC-formatted, human-readable dump of a filter context's
|
||||
// rules to a file.
|
||||
// Accepts file to write to (must be open for writing).
|
||||
// Returns an error if writing to the file fails.
|
||||
func (f *ScmpFilter) ExportPFC(file *os.File) error {
|
||||
f.lock.Lock()
|
||||
defer f.lock.Unlock()
|
||||
|
||||
fd := file.Fd()
|
||||
|
||||
if !f.valid {
|
||||
return errBadFilter
|
||||
}
|
||||
|
||||
if retCode := C.seccomp_export_pfc(f.filterCtx, C.int(fd)); retCode != 0 {
|
||||
return syscall.Errno(-1 * retCode)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// ExportBPF outputs Berkeley Packet Filter-formatted, kernel-readable dump of a
|
||||
// filter context's rules to a file.
|
||||
// Accepts file to write to (must be open for writing).
|
||||
// Returns an error if writing to the file fails.
|
||||
func (f *ScmpFilter) ExportBPF(file *os.File) error {
|
||||
f.lock.Lock()
|
||||
defer f.lock.Unlock()
|
||||
|
||||
fd := file.Fd()
|
||||
|
||||
if !f.valid {
|
||||
return errBadFilter
|
||||
}
|
||||
|
||||
if retCode := C.seccomp_export_bpf(f.filterCtx, C.int(fd)); retCode != 0 {
|
||||
return syscall.Errno(-1 * retCode)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
|
@ -1,506 +0,0 @@
|
|||
// +build linux
|
||||
|
||||
// Internal functions for libseccomp Go bindings
|
||||
// No exported functions
|
||||
|
||||
package seccomp
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// Unexported C wrapping code - provides the C-Golang interface
|
||||
// Get the seccomp header in scope
|
||||
// Need stdlib.h for free() on cstrings
|
||||
|
||||
// #cgo pkg-config: libseccomp
|
||||
/*
|
||||
#include <stdlib.h>
|
||||
#include <seccomp.h>
|
||||
|
||||
#if SCMP_VER_MAJOR < 2
|
||||
#error Minimum supported version of Libseccomp is v2.1.0
|
||||
#elif SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR < 1
|
||||
#error Minimum supported version of Libseccomp is v2.1.0
|
||||
#endif
|
||||
|
||||
#define ARCH_BAD ~0
|
||||
|
||||
const uint32_t C_ARCH_BAD = ARCH_BAD;
|
||||
|
||||
#ifndef SCMP_ARCH_AARCH64
|
||||
#define SCMP_ARCH_AARCH64 ARCH_BAD
|
||||
#endif
|
||||
|
||||
#ifndef SCMP_ARCH_MIPS
|
||||
#define SCMP_ARCH_MIPS ARCH_BAD
|
||||
#endif
|
||||
|
||||
#ifndef SCMP_ARCH_MIPS64
|
||||
#define SCMP_ARCH_MIPS64 ARCH_BAD
|
||||
#endif
|
||||
|
||||
#ifndef SCMP_ARCH_MIPS64N32
|
||||
#define SCMP_ARCH_MIPS64N32 ARCH_BAD
|
||||
#endif
|
||||
|
||||
#ifndef SCMP_ARCH_MIPSEL
|
||||
#define SCMP_ARCH_MIPSEL ARCH_BAD
|
||||
#endif
|
||||
|
||||
#ifndef SCMP_ARCH_MIPSEL64
|
||||
#define SCMP_ARCH_MIPSEL64 ARCH_BAD
|
||||
#endif
|
||||
|
||||
#ifndef SCMP_ARCH_MIPSEL64N32
|
||||
#define SCMP_ARCH_MIPSEL64N32 ARCH_BAD
|
||||
#endif
|
||||
|
||||
#ifndef SCMP_ARCH_PPC
|
||||
#define SCMP_ARCH_PPC ARCH_BAD
|
||||
#endif
|
||||
|
||||
#ifndef SCMP_ARCH_PPC64
|
||||
#define SCMP_ARCH_PPC64 ARCH_BAD
|
||||
#endif
|
||||
|
||||
#ifndef SCMP_ARCH_PPC64LE
|
||||
#define SCMP_ARCH_PPC64LE ARCH_BAD
|
||||
#endif
|
||||
|
||||
#ifndef SCMP_ARCH_S390
|
||||
#define SCMP_ARCH_S390 ARCH_BAD
|
||||
#endif
|
||||
|
||||
#ifndef SCMP_ARCH_S390X
|
||||
#define SCMP_ARCH_S390X ARCH_BAD
|
||||
#endif
|
||||
|
||||
const uint32_t C_ARCH_NATIVE = SCMP_ARCH_NATIVE;
|
||||
const uint32_t C_ARCH_X86 = SCMP_ARCH_X86;
|
||||
const uint32_t C_ARCH_X86_64 = SCMP_ARCH_X86_64;
|
||||
const uint32_t C_ARCH_X32 = SCMP_ARCH_X32;
|
||||
const uint32_t C_ARCH_ARM = SCMP_ARCH_ARM;
|
||||
const uint32_t C_ARCH_AARCH64 = SCMP_ARCH_AARCH64;
|
||||
const uint32_t C_ARCH_MIPS = SCMP_ARCH_MIPS;
|
||||
const uint32_t C_ARCH_MIPS64 = SCMP_ARCH_MIPS64;
|
||||
const uint32_t C_ARCH_MIPS64N32 = SCMP_ARCH_MIPS64N32;
|
||||
const uint32_t C_ARCH_MIPSEL = SCMP_ARCH_MIPSEL;
|
||||
const uint32_t C_ARCH_MIPSEL64 = SCMP_ARCH_MIPSEL64;
|
||||
const uint32_t C_ARCH_MIPSEL64N32 = SCMP_ARCH_MIPSEL64N32;
|
||||
const uint32_t C_ARCH_PPC = SCMP_ARCH_PPC;
|
||||
const uint32_t C_ARCH_PPC64 = SCMP_ARCH_PPC64;
|
||||
const uint32_t C_ARCH_PPC64LE = SCMP_ARCH_PPC64LE;
|
||||
const uint32_t C_ARCH_S390 = SCMP_ARCH_S390;
|
||||
const uint32_t C_ARCH_S390X = SCMP_ARCH_S390X;
|
||||
|
||||
const uint32_t C_ACT_KILL = SCMP_ACT_KILL;
|
||||
const uint32_t C_ACT_TRAP = SCMP_ACT_TRAP;
|
||||
const uint32_t C_ACT_ERRNO = SCMP_ACT_ERRNO(0);
|
||||
const uint32_t C_ACT_TRACE = SCMP_ACT_TRACE(0);
|
||||
const uint32_t C_ACT_ALLOW = SCMP_ACT_ALLOW;
|
||||
|
||||
// If TSync is not supported, make sure it doesn't map to a supported filter attribute
|
||||
// Don't worry about major version < 2, the minimum version checks should catch that case
|
||||
#if SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR < 2
|
||||
#define SCMP_FLTATR_CTL_TSYNC _SCMP_CMP_MIN
|
||||
#endif
|
||||
|
||||
const uint32_t C_ATTRIBUTE_DEFAULT = (uint32_t)SCMP_FLTATR_ACT_DEFAULT;
|
||||
const uint32_t C_ATTRIBUTE_BADARCH = (uint32_t)SCMP_FLTATR_ACT_BADARCH;
|
||||
const uint32_t C_ATTRIBUTE_NNP = (uint32_t)SCMP_FLTATR_CTL_NNP;
|
||||
const uint32_t C_ATTRIBUTE_TSYNC = (uint32_t)SCMP_FLTATR_CTL_TSYNC;
|
||||
|
||||
const int C_CMP_NE = (int)SCMP_CMP_NE;
|
||||
const int C_CMP_LT = (int)SCMP_CMP_LT;
|
||||
const int C_CMP_LE = (int)SCMP_CMP_LE;
|
||||
const int C_CMP_EQ = (int)SCMP_CMP_EQ;
|
||||
const int C_CMP_GE = (int)SCMP_CMP_GE;
|
||||
const int C_CMP_GT = (int)SCMP_CMP_GT;
|
||||
const int C_CMP_MASKED_EQ = (int)SCMP_CMP_MASKED_EQ;
|
||||
|
||||
const int C_VERSION_MAJOR = SCMP_VER_MAJOR;
|
||||
const int C_VERSION_MINOR = SCMP_VER_MINOR;
|
||||
const int C_VERSION_MICRO = SCMP_VER_MICRO;
|
||||
|
||||
typedef struct scmp_arg_cmp* scmp_cast_t;
|
||||
|
||||
// Wrapper to create an scmp_arg_cmp struct
|
||||
void*
|
||||
make_struct_arg_cmp(
|
||||
unsigned int arg,
|
||||
int compare,
|
||||
uint64_t a,
|
||||
uint64_t b
|
||||
)
|
||||
{
|
||||
struct scmp_arg_cmp *s = malloc(sizeof(struct scmp_arg_cmp));
|
||||
|
||||
s->arg = arg;
|
||||
s->op = compare;
|
||||
s->datum_a = a;
|
||||
s->datum_b = b;
|
||||
|
||||
return s;
|
||||
}
|
||||
*/
|
||||
import "C"
|
||||
|
||||
// Nonexported types

// scmpFilterAttr is the package-internal identifier for a filter attribute;
// its toNative method translates it to the value libseccomp expects.
type scmpFilterAttr uint32

// Nonexported constants

// Filter attributes, numbered consecutively from zero.
const (
	filterAttrActDefault scmpFilterAttr = iota
	filterAttrActBadArch
	filterAttrNNP
	filterAttrTsync
)
|
||||
|
||||
const (
|
||||
// An error return from certain libseccomp functions
|
||||
scmpError C.int = -1
|
||||
// Comparison boundaries to check for architecture validity
|
||||
archStart ScmpArch = ArchNative
|
||||
archEnd ScmpArch = ArchS390X
|
||||
// Comparison boundaries to check for action validity
|
||||
actionStart ScmpAction = ActKill
|
||||
actionEnd ScmpAction = ActAllow
|
||||
// Comparison boundaries to check for comparison operator validity
|
||||
compareOpStart ScmpCompareOp = CompareNotEqual
|
||||
compareOpEnd ScmpCompareOp = CompareMaskedEqual
|
||||
)
|
||||
|
||||
var (
|
||||
// Error thrown on bad filter context
|
||||
errBadFilter = fmt.Errorf("filter is invalid or uninitialized")
|
||||
// Constants representing library major, minor, and micro versions
|
||||
verMajor = int(C.C_VERSION_MAJOR)
|
||||
verMinor = int(C.C_VERSION_MINOR)
|
||||
verMicro = int(C.C_VERSION_MICRO)
|
||||
)
|
||||
|
||||
// Nonexported functions
|
||||
|
||||
// Check if library version is greater than or equal to the given one
|
||||
func checkVersionAbove(major, minor, micro int) bool {
|
||||
return (verMajor > major) ||
|
||||
(verMajor == major && verMinor > minor) ||
|
||||
(verMajor == major && verMinor == minor && verMicro >= micro)
|
||||
}
|
||||
|
||||
// Init function: Verify library version is appropriate
|
||||
func init() {
|
||||
if !checkVersionAbove(2, 1, 0) {
|
||||
fmt.Fprintf(os.Stderr, "Libseccomp version too low: minimum supported is 2.1.0, detected %d.%d.%d", C.C_VERSION_MAJOR, C.C_VERSION_MINOR, C.C_VERSION_MICRO)
|
||||
os.Exit(-1)
|
||||
}
|
||||
}
|
||||
|
||||
// Filter helpers
|
||||
|
||||
// Filter finalizer - ensure that kernel context for filters is freed
|
||||
func filterFinalizer(f *ScmpFilter) {
|
||||
f.Release()
|
||||
}
|
||||
|
||||
// Get a raw filter attribute
|
||||
func (f *ScmpFilter) getFilterAttr(attr scmpFilterAttr) (C.uint32_t, error) {
|
||||
f.lock.Lock()
|
||||
defer f.lock.Unlock()
|
||||
|
||||
if !f.valid {
|
||||
return 0x0, errBadFilter
|
||||
}
|
||||
|
||||
if !checkVersionAbove(2, 2, 0) && attr == filterAttrTsync {
|
||||
return 0x0, fmt.Errorf("the thread synchronization attribute is not supported in this version of the library")
|
||||
}
|
||||
|
||||
var attribute C.uint32_t
|
||||
|
||||
retCode := C.seccomp_attr_get(f.filterCtx, attr.toNative(), &attribute)
|
||||
if retCode != 0 {
|
||||
return 0x0, syscall.Errno(-1 * retCode)
|
||||
}
|
||||
|
||||
return attribute, nil
|
||||
}
|
||||
|
||||
// Set a raw filter attribute
|
||||
func (f *ScmpFilter) setFilterAttr(attr scmpFilterAttr, value C.uint32_t) error {
|
||||
f.lock.Lock()
|
||||
defer f.lock.Unlock()
|
||||
|
||||
if !f.valid {
|
||||
return errBadFilter
|
||||
}
|
||||
|
||||
if !checkVersionAbove(2, 2, 0) && attr == filterAttrTsync {
|
||||
return fmt.Errorf("the thread synchronization attribute is not supported in this version of the library")
|
||||
}
|
||||
|
||||
retCode := C.seccomp_attr_set(f.filterCtx, attr.toNative(), value)
|
||||
if retCode != 0 {
|
||||
return syscall.Errno(-1 * retCode)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// DOES NOT LOCK OR CHECK VALIDITY
|
||||
// Assumes caller has already done this
|
||||
// Wrapper for seccomp_rule_add_... functions
|
||||
func (f *ScmpFilter) addRuleWrapper(call ScmpSyscall, action ScmpAction, exact bool, cond C.scmp_cast_t) error {
|
||||
var length C.uint
|
||||
if cond != nil {
|
||||
length = 1
|
||||
} else {
|
||||
length = 0
|
||||
}
|
||||
|
||||
var retCode C.int
|
||||
if exact {
|
||||
retCode = C.seccomp_rule_add_exact_array(f.filterCtx, action.toNative(), C.int(call), length, cond)
|
||||
} else {
|
||||
retCode = C.seccomp_rule_add_array(f.filterCtx, action.toNative(), C.int(call), length, cond)
|
||||
}
|
||||
|
||||
if syscall.Errno(-1*retCode) == syscall.EFAULT {
|
||||
return fmt.Errorf("unrecognized syscall")
|
||||
} else if syscall.Errno(-1*retCode) == syscall.EPERM {
|
||||
return fmt.Errorf("requested action matches default action of filter")
|
||||
} else if retCode != 0 {
|
||||
return syscall.Errno(-1 * retCode)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Generic add function for filter rules
|
||||
func (f *ScmpFilter) addRuleGeneric(call ScmpSyscall, action ScmpAction, exact bool, conds []ScmpCondition) error {
|
||||
f.lock.Lock()
|
||||
defer f.lock.Unlock()
|
||||
|
||||
if !f.valid {
|
||||
return errBadFilter
|
||||
}
|
||||
|
||||
if len(conds) == 0 {
|
||||
if err := f.addRuleWrapper(call, action, exact, nil); err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
// We don't support conditional filtering in library version v2.1
|
||||
if !checkVersionAbove(2, 2, 1) {
|
||||
return fmt.Errorf("conditional filtering requires libseccomp version >= 2.2.1")
|
||||
}
|
||||
|
||||
for _, cond := range conds {
|
||||
cmpStruct := C.make_struct_arg_cmp(C.uint(cond.Argument), cond.Op.toNative(), C.uint64_t(cond.Operand1), C.uint64_t(cond.Operand2))
|
||||
defer C.free(cmpStruct)
|
||||
|
||||
if err := f.addRuleWrapper(call, action, exact, C.scmp_cast_t(cmpStruct)); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Generic Helpers
|
||||
|
||||
// Helper - Sanitize Arch token input
|
||||
func sanitizeArch(in ScmpArch) error {
|
||||
if in < archStart || in > archEnd {
|
||||
return fmt.Errorf("unrecognized architecture")
|
||||
}
|
||||
|
||||
if in.toNative() == C.C_ARCH_BAD {
|
||||
return fmt.Errorf("architecture is not supported on this version of the library")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func sanitizeAction(in ScmpAction) error {
|
||||
inTmp := in & 0x0000FFFF
|
||||
if inTmp < actionStart || inTmp > actionEnd {
|
||||
return fmt.Errorf("unrecognized action")
|
||||
}
|
||||
|
||||
if inTmp != ActTrace && inTmp != ActErrno && (in&0xFFFF0000) != 0 {
|
||||
return fmt.Errorf("highest 16 bits must be zeroed except for Trace and Errno")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func sanitizeCompareOp(in ScmpCompareOp) error {
|
||||
if in < compareOpStart || in > compareOpEnd {
|
||||
return fmt.Errorf("unrecognized comparison operator")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func archFromNative(a C.uint32_t) (ScmpArch, error) {
|
||||
switch a {
|
||||
case C.C_ARCH_X86:
|
||||
return ArchX86, nil
|
||||
case C.C_ARCH_X86_64:
|
||||
return ArchAMD64, nil
|
||||
case C.C_ARCH_X32:
|
||||
return ArchX32, nil
|
||||
case C.C_ARCH_ARM:
|
||||
return ArchARM, nil
|
||||
case C.C_ARCH_NATIVE:
|
||||
return ArchNative, nil
|
||||
case C.C_ARCH_AARCH64:
|
||||
return ArchARM64, nil
|
||||
case C.C_ARCH_MIPS:
|
||||
return ArchMIPS, nil
|
||||
case C.C_ARCH_MIPS64:
|
||||
return ArchMIPS64, nil
|
||||
case C.C_ARCH_MIPS64N32:
|
||||
return ArchMIPS64N32, nil
|
||||
case C.C_ARCH_MIPSEL:
|
||||
return ArchMIPSEL, nil
|
||||
case C.C_ARCH_MIPSEL64:
|
||||
return ArchMIPSEL64, nil
|
||||
case C.C_ARCH_MIPSEL64N32:
|
||||
return ArchMIPSEL64N32, nil
|
||||
case C.C_ARCH_PPC:
|
||||
return ArchPPC, nil
|
||||
case C.C_ARCH_PPC64:
|
||||
return ArchPPC64, nil
|
||||
case C.C_ARCH_PPC64LE:
|
||||
return ArchPPC64LE, nil
|
||||
case C.C_ARCH_S390:
|
||||
return ArchS390, nil
|
||||
case C.C_ARCH_S390X:
|
||||
return ArchS390X, nil
|
||||
default:
|
||||
return 0x0, fmt.Errorf("unrecognized architecture")
|
||||
}
|
||||
}
|
||||
|
||||
// Only use with sanitized arches, no error handling
|
||||
func (a ScmpArch) toNative() C.uint32_t {
|
||||
switch a {
|
||||
case ArchX86:
|
||||
return C.C_ARCH_X86
|
||||
case ArchAMD64:
|
||||
return C.C_ARCH_X86_64
|
||||
case ArchX32:
|
||||
return C.C_ARCH_X32
|
||||
case ArchARM:
|
||||
return C.C_ARCH_ARM
|
||||
case ArchARM64:
|
||||
return C.C_ARCH_AARCH64
|
||||
case ArchMIPS:
|
||||
return C.C_ARCH_MIPS
|
||||
case ArchMIPS64:
|
||||
return C.C_ARCH_MIPS64
|
||||
case ArchMIPS64N32:
|
||||
return C.C_ARCH_MIPS64N32
|
||||
case ArchMIPSEL:
|
||||
return C.C_ARCH_MIPSEL
|
||||
case ArchMIPSEL64:
|
||||
return C.C_ARCH_MIPSEL64
|
||||
case ArchMIPSEL64N32:
|
||||
return C.C_ARCH_MIPSEL64N32
|
||||
case ArchPPC:
|
||||
return C.C_ARCH_PPC
|
||||
case ArchPPC64:
|
||||
return C.C_ARCH_PPC64
|
||||
case ArchPPC64LE:
|
||||
return C.C_ARCH_PPC64LE
|
||||
case ArchS390:
|
||||
return C.C_ARCH_S390
|
||||
case ArchS390X:
|
||||
return C.C_ARCH_S390X
|
||||
case ArchNative:
|
||||
return C.C_ARCH_NATIVE
|
||||
default:
|
||||
return 0x0
|
||||
}
|
||||
}
|
||||
|
||||
// Only use with sanitized ops, no error handling
|
||||
func (a ScmpCompareOp) toNative() C.int {
|
||||
switch a {
|
||||
case CompareNotEqual:
|
||||
return C.C_CMP_NE
|
||||
case CompareLess:
|
||||
return C.C_CMP_LT
|
||||
case CompareLessOrEqual:
|
||||
return C.C_CMP_LE
|
||||
case CompareEqual:
|
||||
return C.C_CMP_EQ
|
||||
case CompareGreaterEqual:
|
||||
return C.C_CMP_GE
|
||||
case CompareGreater:
|
||||
return C.C_CMP_GT
|
||||
case CompareMaskedEqual:
|
||||
return C.C_CMP_MASKED_EQ
|
||||
default:
|
||||
return 0x0
|
||||
}
|
||||
}
|
||||
|
||||
func actionFromNative(a C.uint32_t) (ScmpAction, error) {
|
||||
aTmp := a & 0xFFFF
|
||||
switch a & 0xFFFF0000 {
|
||||
case C.C_ACT_KILL:
|
||||
return ActKill, nil
|
||||
case C.C_ACT_TRAP:
|
||||
return ActTrap, nil
|
||||
case C.C_ACT_ERRNO:
|
||||
return ActErrno.SetReturnCode(int16(aTmp)), nil
|
||||
case C.C_ACT_TRACE:
|
||||
return ActTrace.SetReturnCode(int16(aTmp)), nil
|
||||
case C.C_ACT_ALLOW:
|
||||
return ActAllow, nil
|
||||
default:
|
||||
return 0x0, fmt.Errorf("unrecognized action")
|
||||
}
|
||||
}
|
||||
|
||||
// Only use with sanitized actions, no error handling
|
||||
func (a ScmpAction) toNative() C.uint32_t {
|
||||
switch a & 0xFFFF {
|
||||
case ActKill:
|
||||
return C.C_ACT_KILL
|
||||
case ActTrap:
|
||||
return C.C_ACT_TRAP
|
||||
case ActErrno:
|
||||
return C.C_ACT_ERRNO | (C.uint32_t(a) >> 16)
|
||||
case ActTrace:
|
||||
return C.C_ACT_TRACE | (C.uint32_t(a) >> 16)
|
||||
case ActAllow:
|
||||
return C.C_ACT_ALLOW
|
||||
default:
|
||||
return 0x0
|
||||
}
|
||||
}
|
||||
|
||||
// Internal only, assumes safe attribute
|
||||
func (a scmpFilterAttr) toNative() uint32 {
|
||||
switch a {
|
||||
case filterAttrActDefault:
|
||||
return uint32(C.C_ATTRIBUTE_DEFAULT)
|
||||
case filterAttrActBadArch:
|
||||
return uint32(C.C_ATTRIBUTE_BADARCH)
|
||||
case filterAttrNNP:
|
||||
return uint32(C.C_ATTRIBUTE_NNP)
|
||||
case filterAttrTsync:
|
||||
return uint32(C.C_ATTRIBUTE_TSYNC)
|
||||
default:
|
||||
return 0x0
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue