-
Notifications
You must be signed in to change notification settings - Fork 19
/
runtime.go
452 lines (389 loc) · 12.8 KB
/
runtime.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
// Package lxcri provides an OCI specific runtime interface for lxc.
package lxcri
import (
"context"
"fmt"
"net"
"os"
"os/exec"
"path/filepath"
"time"
"github.com/creack/pty"
"github.com/drachenfels-de/gocapability/capability"
"github.com/lxc/lxcri/pkg/specki"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/rs/zerolog"
"golang.org/x/sys/unix"
"gopkg.in/lxc/go-lxc.v2"
)
// BundleConfigFile is the file name of the config file within an OCI
// container bundle. The file content is a JSON encoded specs.Spec.
const BundleConfigFile = "config.json"
// The variables below name the required runtime executables.
// They are loaded from Runtime.LibexecDir.

// ExecStart is the executable that starts the liblxc monitor process,
// similar to lxc-start.
var ExecStart = "lxcri-start"

// ExecHook is the executable that is run as liblxc hook.
// It creates additional devices and remounts masked paths.
var ExecHook = "lxcri-hook"

// ExecHookBuiltin is the executable that Runtime.Init registers as the
// CreateContainer hook of the runtime.
var ExecHookBuiltin = "lxcri-hook-builtin"

// ExecInit is the container init process. It execs the container process.
var ExecInit = "lxcri-init"
// ErrNotExist is the error returned when the container (its runtime
// directory) does not exist.
var ErrNotExist = fmt.Errorf("container does not exist")
// RuntimeFeatures are (security) features supported by the Runtime.
// The supported features are enabled on any Container instance
// created by Runtime.Create.
//
// NOTE(review): the code that reads or toggles these flags is outside
// this part of the file; the field notes below are inferred from the
// field names only and should be confirmed against that code.
type RuntimeFeatures struct {
// Seccomp: presumably seccomp support — TODO confirm.
Seccomp bool
// Capabilities: presumably process capability handling — TODO confirm.
Capabilities bool
// Apparmor: presumably AppArmor profile support — TODO confirm.
Apparmor bool
// CgroupDevices: presumably the cgroup devices controller — TODO confirm.
CgroupDevices bool
}
// Runtime is a factory for creating and managing containers.
// The exported methods of Runtime are required to implement the
// OCI container runtime interface spec (CRI).
// It holds the settings that are common to all containers of the runtime.
type Runtime struct {
// Log is the logger used by the runtime.
Log zerolog.Logger `json:"-"`
// Root is the file path to the runtime directory.
// Directories for containers created by the runtime
// are created within this directory.
Root string `json:",omitempty"`
// MonitorCgroup is the path for the lxc monitor cgroup (lxc specific feature).
// This is the cgroup where the liblxc monitor process (lxcri-start)
// will be placed in. It's similar to /etc/crio/crio.conf#conmon_cgroup
MonitorCgroup string `json:",omitempty"`
// LibexecDir is the directory that contains the runtime executables.
LibexecDir string `json:",omitempty"`
// Features are runtime (security) features that apply to all containers
// created by the runtime.
Features RuntimeFeatures
// env is the environment passed to `lxcri-start`.
// Entries are appended by keepEnv and assigned to the start command
// environment in runStartCmd.
env []string
// caps are the capabilities loaded in Init; hasCapability queries
// their effective set.
caps capability.Capabilities
// Hooks are the embedded OCI hooks of the runtime.
// Init sets Hooks.CreateContainer to the ExecHookBuiltin executable.
specs.Hooks `json:",omitempty"`
}
// libexec returns the path of the runtime executable with the given
// name within rt.LibexecDir.
func (rt *Runtime) libexec(name string) string {
	path := filepath.Join(rt.LibexecDir, name)
	return path
}
// hasCapability reports whether the capability with the name s is set in
// the effective set of rt.caps.
// If capability.Parse does not know the name, a warning is logged and
// false is returned.
func (rt *Runtime) hasCapability(s string) bool {
	if c, known := capability.Parse(s); known {
		return rt.caps.Get(capability.EFFECTIVE, c)
	}
	rt.Log.Warn().Msgf("undefined capability %q", s)
	return false
}
// Init prepares the runtime instance for use.
// It performs the following steps, in this order:
//   - load the process capabilities into rt.caps
//   - keep HOME, XDG_RUNTIME_DIR and PATH for the start command environment
//   - check that the start, hook and init executables can be executed
//   - check that procfs is mounted on /proc
//   - detect the cgroup root (a failure is only logged as a warning)
//   - require liblxc >= 3.1.0 and warn if it is older than 4.0.5
//   - register the builtin hook executable as the CreateContainer hook
//
// Init must be called once for a runtime instance before calling any other method.
func (rt *Runtime) Init() error {
	procCaps, err := capability.NewPid2(0)
	if err != nil {
		return errorf("failed to create capabilities object: %w", err)
	}
	if err = procCaps.Load(); err != nil {
		return errorf("failed to load process capabilities: %w", err)
	}
	rt.caps = procCaps

	rt.keepEnv("HOME", "XDG_RUNTIME_DIR", "PATH")

	if err = canExecute(rt.libexec(ExecStart), rt.libexec(ExecHook), rt.libexec(ExecInit)); err != nil {
		return errorf("access check failed: %w", err)
	}

	if err = isFilesystem("/proc", "proc"); err != nil {
		return errorf("procfs not mounted on /proc: %w", err)
	}

	// A failed detection is not fatal. The package-level cgroupRoot is
	// assigned either way and reported below.
	if cgroupRoot, err = detectCgroupRoot(); err != nil {
		rt.Log.Warn().Msgf("cgroup root detection failed: %s", err)
	}
	rt.Log.Info().Msgf("using cgroup root %s", cgroupRoot)

	switch {
	case !lxc.VersionAtLeast(3, 1, 0):
		return errorf("liblxc runtime version is %s, but >= 3.1.0 is required", lxc.Version())
	case !lxc.VersionAtLeast(4, 0, 5):
		rt.Log.Warn().Msgf("liblxc runtime version >= 4.0.5 is recommended (was %s)", lxc.Version())
	}

	rt.Hooks.CreateContainer = []specs.Hook{{Path: rt.libexec(ExecHookBuiltin)}}
	return nil
}
// checkConfig validates the container config cfg.
// The container ID must not be empty and cfg.Spec must pass rt.checkSpec.
func (rt *Runtime) checkConfig(cfg *ContainerConfig) error {
	if len(cfg.ContainerID) > 0 {
		return rt.checkSpec(cfg.Spec)
	}
	return errorf("missing container ID")
}
// checkSpec validates the parts of spec that are required by the runtime.
//
// An error is returned if spec is nil, if spec.Root is nil or has an empty
// path, if spec.Process is nil or has no arguments, if a namespace check
// fails, or if the container wants to share the mount namespace with the
// runtime.
//
// checkSpec modifies spec: an unset spec.Process.Cwd is set to "/".
// Sharing the PID namespace with the runtime is tolerated, but logged as
// a warning.
func (rt *Runtime) checkSpec(spec *specs.Spec) error {
	// Guard against a missing spec, the checks below dereference it.
	if spec == nil {
		return errorf("spec is nil")
	}
	if spec.Root == nil {
		return errorf("spec.Root is nil")
	}
	if len(spec.Root.Path) == 0 {
		return errorf("empty spec.Root.Path")
	}
	if spec.Process == nil {
		return errorf("spec.Process is nil")
	}
	if len(spec.Process.Args) == 0 {
		return errorf("specs.Process.Args is empty")
	}
	if spec.Process.Cwd == "" {
		rt.Log.Info().Msg("specs.Process.Cwd is unset defaulting to '/'")
		spec.Process.Cwd = "/"
	}

	yes, err := isNamespaceSharedWithRuntime(getNamespace(spec, specs.MountNamespace))
	if err != nil {
		return errorf("failed to check mount namespace: %w", err)
	}
	if yes {
		return errorf("container wants to share the runtimes mount namespace")
	}

	// It should be best practise not to do so, but there are containers that
	// want to share the runtimes PID namespaces. e.g sonobuoy/sonobuoy-systemd-logs-daemon-set
	yes, err = isNamespaceSharedWithRuntime(getNamespace(spec, specs.PIDNamespace))
	if err != nil {
		return errorf("failed to check PID namespace: %w", err)
	}
	if yes {
		rt.Log.Warn().Msg("container shares the PID namespace with the runtime")
	}
	return nil
}
// keepEnv appends a NAME=value entry to rt.env for every given
// environment variable name that has a non-empty value in the
// environment of the current process.
func (rt *Runtime) keepEnv(names ...string) {
	for i := range names {
		val := os.Getenv(names[i])
		if val == "" {
			continue
		}
		rt.env = append(rt.env, names[i]+"="+val)
	}
}
// Load loads the container with the given ID from the runtime directory.
// The container must have been created with Runtime.Create.
// ErrNotExist is returned if the runtime directory of the container
// does not exist. Any other stat error is not handled here, loading is
// attempted anyway.
// The logger Container.Log is set to Runtime.Log by default.
// A loaded Container must be released with Container.Release after use.
func (rt *Runtime) Load(containerID string) (*Container, error) {
	runtimeDir := filepath.Join(rt.Root, containerID)
	_, statErr := os.Stat(runtimeDir)
	if os.IsNotExist(statErr) {
		return nil, ErrNotExist
	}

	cfg := &ContainerConfig{Log: rt.Log}
	loaded := &Container{ContainerConfig: cfg, runtimeDir: runtimeDir}
	if err := loaded.load(); err != nil {
		return nil, err
	}
	return loaded, nil
}
// Start starts the given container.
// The container must be in the state specs.StateCreated, otherwise an
// error is returned. Start simply unblocks the init process `lxcri-init`,
// which then executes the container process.
// If the container spec defines hooks, the Poststart hooks are run with
// the container state queried after the start.
// The given container must have been created with Runtime.Create.
func (rt *Runtime) Start(ctx context.Context, c *Container) error {
	rt.Log.Info().Msg("notify init to start container process")

	current, err := c.State()
	if err != nil {
		return errorf("failed to get container state: %w", err)
	}
	if status := current.SpecState.Status; status != specs.StateCreated {
		return fmt.Errorf("invalid container state. expected %q, but was %q", specs.StateCreated, status)
	}

	if err = c.start(ctx); err != nil {
		return err
	}

	if c.Spec.Hooks == nil {
		return nil
	}
	// The state is queried again, the hooks must see the state after the start.
	updated, err := c.State()
	if err != nil {
		return errorf("failed to get container state: %w", err)
	}
	specki.RunHooks(ctx, &updated.SpecState, c.Spec.Hooks.Poststart, true)
	return nil
}
// runStartCmd runs the monitor executable (ExecStart) for the container c
// and waits until the container is created.
//
// The steps are order dependent:
//  1. prepare the command (environment from rt.env, working directory is
//     the container runtime path)
//  2. apply config changes and save the lxc config file
//  3. start the command, through runStartCmdConsole if a console socket is set
//  4. record creation time and monitor PID and write them to lxcri.json
//  5. wait for the container to be created
func (rt *Runtime) runStartCmd(ctx context.Context, c *Container) (err error) {
	// #nosec
	cmd := exec.Command(rt.libexec(ExecStart), c.LinuxContainer.Name(), rt.Root, c.ConfigFilePath())
	cmd.Env = rt.env
	cmd.Dir = c.RuntimePath()

	if c.ConsoleSocket == "" && !c.Spec.Process.Terminal {
		// Inherit stdio from calling process (conmon).
		// lxc.console.path must be set to 'none' or stdio of init process is replaced with a PTY by lxc
		if err = c.setConfigItem("lxc.console.path", "none"); err != nil {
			return err
		}
		cmd.Stdin, cmd.Stdout, cmd.Stderr = os.Stdin, os.Stdout, os.Stderr
	}

	// NOTE any config change via clxc.setConfigItem
	// must be done before calling SaveConfigFile
	if err = c.LinuxContainer.SaveConfigFile(c.ConfigFilePath()); err != nil {
		return errorf("failed to save config file to %q: %w", c.ConfigFilePath(), err)
	}

	rt.Log.Debug().Msg("starting lxc monitor process")
	if c.ConsoleSocket == "" {
		err = cmd.Start()
	} else {
		err = runStartCmdConsole(ctx, cmd, c.ConsoleSocket)
	}
	if err != nil {
		return err
	}

	c.CreatedAt = time.Now()
	c.Pid = cmd.Process.Pid
	rt.Log.Info().Int("pid", cmd.Process.Pid).Msg("monitor process started")

	// O_EXCL makes the write fail if the file already exists.
	stateFile := c.RuntimePath("lxcri.json")
	if err = specki.EncodeJSONFile(stateFile, c, os.O_EXCL|os.O_CREATE, 0440); err != nil {
		return err
	}

	waitCtx, cancel := context.WithCancel(ctx)
	defer cancel()

	rt.Log.Debug().Msg("waiting for init")
	return c.waitCreated(waitCtx)
}
// runStartCmdConsole starts cmd attached to a new pseudo terminal and sends
// the pty file descriptor over the unix socket consoleSocket.
//
// The connection to consoleSocket is established with the given context.
// If ctx has a deadline, it is applied to the connection as well.
// The connection, the file obtained from the connection and the local pty
// handle are closed before the function returns; the receiver of the
// message holds its own copy of the pty file descriptor.
//
// An error is returned if the connection can not be established, is not a
// unix connection, if cmd fails to start, if the descriptor can not be
// sent, or if closing the local pty handle fails.
func runStartCmdConsole(ctx context.Context, cmd *exec.Cmd, consoleSocket string) error {
	dialer := net.Dialer{}
	c, err := dialer.DialContext(ctx, "unix", consoleSocket)
	if err != nil {
		return fmt.Errorf("connecting to console socket failed: %w", err)
	}
	defer c.Close()

	conn, ok := c.(*net.UnixConn)
	if !ok {
		// Report the dynamic type of c. conn is always a nil *net.UnixConn here.
		return fmt.Errorf("expected a unix connection but was %T", c)
	}

	if deadline, ok := ctx.Deadline(); ok {
		err = conn.SetDeadline(deadline)
		if err != nil {
			return fmt.Errorf("failed to set connection deadline: %w", err)
		}
	}

	sockFile, err := conn.File()
	if err != nil {
		return fmt.Errorf("failed to get file from unix connection: %w", err)
	}
	// The returned file is independent of conn and must be closed separately.
	defer sockFile.Close()

	ptmx, err := pty.Start(cmd)
	if err != nil {
		return fmt.Errorf("failed to start with pty: %w", err)
	}

	// Send the pty file descriptor over the console socket (to the 'conmon' process)
	// For technical backgrounds see:
	// * `man sendmsg 2`, `man unix 3`, `man cmsg 1`
	// * https://blog.cloudflare.com/know-your-scm_rights/
	oob := unix.UnixRights(int(ptmx.Fd()))
	// Don't know whether 'terminal' is the right data to send, but conmon doesn't care anyway.
	err = unix.Sendmsg(int(sockFile.Fd()), []byte("terminal"), oob, nil, 0)
	if err != nil {
		// Best effort cleanup, the send error is the one worth reporting.
		_ = ptmx.Close()
		return fmt.Errorf("failed to send console fd: %w", err)
	}
	return ptmx.Close()
}
// Kill sends the signal signum to the container init process.
// An error is returned if the container state can not be determined
// or if the container is already stopped.
func (rt *Runtime) Kill(ctx context.Context, c *Container, signum unix.Signal) error {
	switch state, err := c.ContainerState(); {
	case err != nil:
		return err
	case state == specs.StateStopped:
		return errorf("container already stopped")
	default:
		return c.kill(ctx, signum)
	}
}
// Delete removes the container from the runtime directory.
// The container must be stopped or force must be set to true.
// If the container is not stopped but force is set to true,
// the container will be killed with unix.SIGKILL.
//
// containerID must be a single path element. An empty ID, ".", ".." and
// IDs containing a path separator are rejected, because the runtime
// directory of the container is removed recursively.
//
// ErrNotExist is returned if the container does not exist.
// If the container exists but can not be loaded, only its runtime
// directory is removed and no hooks are run.
// Otherwise the container is destroyed, its cgroup is deleted, the
// Poststop hooks are run (if the spec defines hooks) and finally the
// runtime directory of the container is removed.
func (rt *Runtime) Delete(ctx context.Context, containerID string, force bool) error {
	rt.Log.Info().Bool("force", force).Msg("delete container")

	// The fallback for unloadable containers below removes
	// filepath.Join(rt.Root, containerID) recursively. Never let that path
	// resolve to the runtime root itself or to a path outside of it.
	if containerID == "" || containerID == "." || containerID == ".." || filepath.Base(containerID) != containerID {
		return errorf("invalid container ID %q", containerID)
	}

	c, err := rt.Load(containerID)
	if err == ErrNotExist {
		return err
	}
	if err != nil {
		// NOTE hooks won't run in this case
		rt.Log.Warn().Msgf("deleting runtime dir for unloadable container: %s", err)
		return os.RemoveAll(filepath.Join(rt.Root, containerID))
	}
	defer c.Release()

	state, err := c.ContainerState()
	if err != nil {
		return err
	}
	if state != specs.StateStopped {
		c.Log.Debug().Msgf("delete state:%s", state)
		if !force {
			return errorf("container is not stopped (current state %s)", state)
		}
		if err := c.kill(ctx, unix.SIGKILL); err != nil {
			return errorf("failed to kill container: %w", err)
		}
	}

	// A monitor that does not stop is logged only, the cleanup continues.
	if err := c.waitMonitorStopped(ctx); err != nil {
		c.Log.Error().Err(err).Msgf("failed to stop monitor process %d", c.Pid)
	}

	// From OCI runtime spec
	// "Note that resources associated with the container, but not
	// created by this container, MUST NOT be deleted."
	// The *lxc.Container is created with `rootfs.managed=0`,
	// so calling *lxc.Container.Destroy will not delete container resources.
	if err := c.LinuxContainer.Destroy(); err != nil {
		return fmt.Errorf("failed to destroy container: %w", err)
	}

	// the monitor might be part of the cgroup so wait for it to exit
	eventsFile := filepath.Join(cgroupRoot, c.CgroupDir, "cgroup.events")
	err = pollCgroupEvents(ctx, eventsFile, func(ev cgroupEvents) bool {
		return !ev.populated
	})
	if err != nil && !os.IsNotExist(err) {
		// try to delete the cgroup anyways
		c.Log.Warn().Msgf("failed to wait until cgroup.events populated=0: %s", err)
	}

	err = deleteCgroup(c.CgroupDir)
	if err != nil && !os.IsNotExist(err) {
		return fmt.Errorf("failed to delete cgroup: %w", err)
	}

	if c.Spec.Hooks != nil {
		state, err := c.State()
		if err != nil {
			return errorf("failed to get container state: %w", err)
		}
		specki.RunHooks(ctx, &state.SpecState, c.Spec.Hooks.Poststop, true)
	}
	return os.RemoveAll(c.RuntimePath())
}
// List returns the IDs for all existing containers.
// The IDs are the names of the entries within rt.Root. Entries with
// a name that starts with a dot (hidden elements) are skipped.
func (rt *Runtime) List() ([]string, error) {
	root, err := os.Open(rt.Root)
	if err != nil {
		return nil, err
	}
	defer root.Close()

	entries, err := root.Readdirnames(-1)
	if err != nil {
		return nil, err
	}

	ids := make([]string, 0, len(entries))
	for i := range entries {
		// ignore hidden elements
		if entries[i][0] == '.' {
			continue
		}
		ids = append(ids, entries[i])
	}
	return ids, nil
}