Replace godep with dep

This commit is contained in:
Manuel de Brito Fontes 2017-10-06 17:26:14 -03:00
parent 1e7489927c
commit bf5616c65b
14883 changed files with 3937406 additions and 361781 deletions

131
vendor/k8s.io/kubernetes/pkg/kubelet/cm/BUILD generated vendored Normal file
View file

@ -0,0 +1,131 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
# Library target for this package (vendor/k8s.io/kubernetes/pkg/kubelet/cm).
# Sources and dependencies are split into an unconditional list plus
# platform-specific additions chosen through select().
go_library(
name = "go_default_library",
# Sources listed for every platform.
srcs = [
"cgroup_manager_unsupported.go",
"container_manager.go",
"container_manager_stub.go",
"container_manager_unsupported.go",
"device_plugin_handler.go",
"device_plugin_handler_stub.go",
"fake_internal_container_lifecycle.go",
"helpers_unsupported.go",
"internal_container_lifecycle.go",
"pod_container_manager_stub.go",
"pod_container_manager_unsupported.go",
"types.go",
] + select({
# Additional sources only for linux/amd64.
"@io_bazel_rules_go//go/platform:linux_amd64": [
"cgroup_manager_linux.go",
"container_manager_linux.go",
"helpers_linux.go",
"node_container_manager.go",
"pod_container_manager_linux.go",
"qos_container_manager_linux.go",
],
# Additional sources only for windows/amd64.
"@io_bazel_rules_go//go/platform:windows_amd64": [
"container_manager_windows.go",
],
# Any other platform adds nothing.
"//conditions:default": [],
}),
visibility = ["//visibility:public"],
# Dependencies listed for every platform.
deps = [
"//pkg/features:go_default_library",
"//pkg/kubelet/apis/cri:go_default_library",
"//pkg/kubelet/apis/deviceplugin/v1alpha1:go_default_library",
"//pkg/kubelet/apis/kubeletconfig:go_default_library",
"//pkg/kubelet/cadvisor:go_default_library",
"//pkg/kubelet/cm/cpumanager:go_default_library",
"//pkg/kubelet/container:go_default_library",
"//pkg/kubelet/deviceplugin:go_default_library",
"//pkg/kubelet/eviction/api:go_default_library",
"//pkg/kubelet/status:go_default_library",
"//pkg/util/mount:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/types:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library",
"//vendor/k8s.io/client-go/tools/record:go_default_library",
] + select({
# Additional dependencies only for linux/amd64 (cgroup handling via runc's libcontainer, among others).
"@io_bazel_rules_go//go/platform:linux_amd64": [
"//pkg/api:go_default_library",
"//pkg/api/v1/helper:go_default_library",
"//pkg/api/v1/helper/qos:go_default_library",
"//pkg/api/v1/resource:go_default_library",
"//pkg/kubelet/cm/util:go_default_library",
"//pkg/kubelet/events:go_default_library",
"//pkg/kubelet/metrics:go_default_library",
"//pkg/kubelet/qos:go_default_library",
"//pkg/util/file:go_default_library",
"//pkg/util/oom:go_default_library",
"//pkg/util/procfs:go_default_library",
"//pkg/util/sysctl:go_default_library",
"//pkg/util/version:go_default_library",
"//vendor/github.com/docker/go-units:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/errors:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library",
],
"//conditions:default": [],
}),
)
# Test target for this package; it is built against :go_default_library.
go_test(
name = "go_default_test",
# Test sources listed for every platform.
srcs = [
"container_manager_unsupported_test.go",
"device_plugin_handler_test.go",
] + select({
# Additional test sources only for linux/amd64.
"@io_bazel_rules_go//go/platform:linux_amd64": [
"cgroup_manager_linux_test.go",
"cgroup_manager_test.go",
"container_manager_linux_test.go",
"helpers_linux_test.go",
"node_container_manager_test.go",
],
"//conditions:default": [],
}),
library = ":go_default_library",
# Test dependencies listed for every platform.
deps = [
"//pkg/kubelet/apis/deviceplugin/v1alpha1:go_default_library",
"//pkg/util/mount:go_default_library",
"//vendor/github.com/stretchr/testify/assert:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/uuid:go_default_library",
] + select({
# Additional test dependencies only for linux/amd64.
"@io_bazel_rules_go//go/platform:linux_amd64": [
"//pkg/kubelet/apis/kubeletconfig:go_default_library",
"//pkg/kubelet/eviction/api:go_default_library",
"//vendor/github.com/stretchr/testify/require:go_default_library",
],
"//conditions:default": [],
}),
)
# Every file under this package directory (glob "**"); private to this package
# and re-exported through the :all-srcs filegroup.
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
# Publicly visible aggregate: this package's sources plus the :all-srcs
# filegroups of the cpumanager, cpuset and util sub-packages.
filegroup(
name = "all-srcs",
srcs = [
":package-srcs",
"//pkg/kubelet/cm/cpumanager:all-srcs",
"//pkg/kubelet/cm/cpuset:all-srcs",
"//pkg/kubelet/cm/util:all-srcs",
],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)

View file

@ -0,0 +1,570 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
"os"
"path"
"path/filepath"
"strings"
"time"
units "github.com/docker/go-units"
"github.com/golang/glog"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
cgroupsystemd "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
libcontainerconfigs "github.com/opencontainers/runc/libcontainer/configs"
"k8s.io/apimachinery/pkg/util/sets"
utilfeature "k8s.io/apiserver/pkg/util/feature"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/metrics"
)
// libcontainerCgroupManagerType defines how to interface with libcontainer.
// Its value selects which libcontainer cgroup manager implementation
// (cgroupfs or systemd) the adapter instantiates.
type libcontainerCgroupManagerType string
const (
// libcontainerCgroupfs means use libcontainer with cgroupfs
libcontainerCgroupfs libcontainerCgroupManagerType = "cgroupfs"
// libcontainerSystemd means use libcontainer with systemd
libcontainerSystemd libcontainerCgroupManagerType = "systemd"
)
// hugePageSizeList holds the unit suffixes used to render a huge page size
// in the hugetlb canonical unit, which is what is expected when interacting
// with libcontainer. It is handed to units.CustomSize together with a base
// of 1024 when huge page limits are translated.
var hugePageSizeList = []string{"B", "kB", "MB", "GB", "TB", "PB"}
// ConvertCgroupNameToSystemd translates an internal cgroup name into the
// equivalent systemd slice name.
// For example, /Burstable/pod_123-456 becomes Burstable-pod_123_456.slice.
// When outputToCgroupFs is true the slice name is additionally expanded into
// its cgroupfs form, e.g. /Burstable.slice/Burstable-pod_123_456.slice.
// It panics if the slice name cannot be expanded.
func ConvertCgroupNameToSystemd(cgroupName CgroupName, outputToCgroupFs bool) string {
	const sliceSuffix = ".slice"
	raw := string(cgroupName)

	var unit string
	switch raw {
	case "", "/":
		// the root cgroup maps to "-"
		unit = "-"
	default:
		var segments []string
		for _, segment := range strings.Split(raw, "/") {
			// empty segments come from leading or repeated separators
			if segment == "" {
				continue
			}
			if strings.HasSuffix(segment, sliceSuffix) {
				// already a systemd style name: keep only the last step of
				// the hierarchy so that it is not encoded twice.
				segment = strings.TrimSuffix(segment, sliceSuffix)
				if idx := strings.LastIndex(segment, "-"); idx >= 0 {
					segment = segment[idx+1:]
				}
			} else {
				// systemd reads "-" as a hierarchy step, so escape it as "_"
				segment = strings.Replace(segment, "-", "_", -1)
			}
			segments = append(segments, segment)
		}
		// hierarchy steps are joined the systemd way, with "-"
		unit = strings.Join(segments, "-")
	}

	// the unit name always carries the .slice suffix
	if !strings.HasSuffix(unit, sliceSuffix) {
		unit += sliceSuffix
	}
	if !outputToCgroupFs {
		return unit
	}

	// the caller asked for the cgroupfs representation
	expanded, err := cgroupsystemd.ExpandSlice(unit)
	if err != nil {
		panic(fmt.Errorf("error adapting cgroup name, input: %v, err: %v", raw, err))
	}
	return expanded
}
// ConvertCgroupFsNameToSystemd reduces an expanded cgroupfs name to its systemd name.
// For example, test.slice/test-a.slice/test-a-b.slice becomes test-a-b.slice.
// The returned error is always nil in this implementation.
// NOTE: this is exported so that dockermanager and dockershim can use it; ideally both
// would rely on libcontainer if a function of this style is accepted upstream.
func ConvertCgroupFsNameToSystemd(cgroupfsName string) (string, error) {
	// TODO: check whether the libcontainer systemd implementation could host
	// something similar and, if so, move this function there. It would then
	// most likely validate systemd specifics beyond the simple assumption made
	// here: that, per systemd convention, the last path element already
	// encodes the whole hierarchy.
	systemdName := path.Base(cgroupfsName)
	return systemdName, nil
}
// libcontainerAdapter provides a simplified interface to libcontainer based on libcontainer type.
// It translates cgroup names to and from the driver specific form and
// creates the libcontainer manager that matches the configured driver.
type libcontainerAdapter struct {
// cgroupManagerType defines how to interface with libcontainer
cgroupManagerType libcontainerCgroupManagerType
}
// newLibcontainerAdapter builds a libcontainerAdapter bound to the given
// manager type. No further initialization is performed here.
func newLibcontainerAdapter(cgroupManagerType libcontainerCgroupManagerType) *libcontainerAdapter {
	adapter := new(libcontainerAdapter)
	adapter.cgroupManagerType = cgroupManagerType
	return adapter
}
// newManager creates the libcontainer cgroups.Manager that corresponds to the
// adapter's manager type, wired with the given cgroup config and paths.
// It returns an error for an unknown manager type and panics when the systemd
// manager is requested but systemd is not usable on the host.
func (l *libcontainerAdapter) newManager(cgroups *libcontainerconfigs.Cgroup, paths map[string]string) (libcontainercgroups.Manager, error) {
	if l.cgroupManagerType == libcontainerCgroupfs {
		return &cgroupfs.Manager{Cgroups: cgroups, Paths: paths}, nil
	}
	if l.cgroupManagerType != libcontainerSystemd {
		return nil, fmt.Errorf("invalid cgroup manager configuration")
	}
	// systemd was asked to manage cgroups but is not available on the host;
	// there is nothing sensible left to do but panic.
	if !cgroupsystemd.UseSystemd() {
		panic("systemd cgroup manager not available")
	}
	return &cgroupsystemd.Manager{Cgroups: cgroups, Paths: paths}, nil
}
// revertName maps a literal cgroupfs name back to the internal CgroupName.
// For non-systemd drivers the name is used as is. For systemd the last path
// element is taken, its .slice suffix removed, every "-" turned back into a
// path separator and every "_" turned back into "-".
func (l *libcontainerAdapter) revertName(name string) CgroupName {
	if l.cgroupManagerType == libcontainerSystemd {
		unit, err := ConvertCgroupFsNameToSystemd(name)
		if err != nil {
			panic(err)
		}
		unit = strings.TrimSuffix(unit, ".slice")
		// single pass: "-" -> "/" and "_" -> "-", same result as doing the
		// two replacements one after the other in that order.
		decoded := strings.NewReplacer("-", "/", "_", "-").Replace(unit)
		return CgroupName(decoded)
	}
	return CgroupName(name)
}
// adaptName turns a CgroupName into the value expected by the configured driver.
// Only the systemd driver needs a conversion; outputToCgroupFs then selects
// the cgroupfs form of the result instead of the systemd unit name. Any other
// driver receives the name unchanged.
func (l *libcontainerAdapter) adaptName(cgroupName CgroupName, outputToCgroupFs bool) string {
	if l.cgroupManagerType == libcontainerSystemd {
		return ConvertCgroupNameToSystemd(cgroupName, outputToCgroupFs)
	}
	return string(cgroupName)
}
// CgroupSubsystems holds information about the mounted cgroup subsystems.
// The cgroup manager joins each MountPoints value with a cgroup name to
// obtain the per-subsystem path of that cgroup.
type CgroupSubsystems struct {
// Cgroup subsystem mounts.
// e.g.: "/sys/fs/cgroup/cpu" -> ["cpu", "cpuacct"]
Mounts []libcontainercgroups.Mount
// Cgroup subsystem to their mount location.
// e.g.: "cpu" -> "/sys/fs/cgroup/cpu"
MountPoints map[string]string
}
// cgroupManagerImpl implements the CgroupManager interface.
// It keeps no per-cgroup state, so one instance can be used to
// update, create or delete any number of cgroups.
// Cgroup management is delegated to libcontainer through the adapter.
type cgroupManagerImpl struct {
// subsystems holds information about all the
// mounted cgroup subsystems on the node
subsystems *CgroupSubsystems
// simplifies interaction with libcontainer and its cgroup managers
adapter *libcontainerAdapter
}
// Make sure that cgroupManagerImpl implements the CgroupManager interface
var _ CgroupManager = &cgroupManagerImpl{}
// NewCgroupManager is a factory method that returns a CgroupManager for the
// given subsystems. The systemd driver is selected only when cgroupDriver is
// exactly "systemd"; every other value falls back to cgroupfs.
func NewCgroupManager(cs *CgroupSubsystems, cgroupDriver string) CgroupManager {
	var managerType libcontainerCgroupManagerType
	switch cgroupDriver {
	case string(libcontainerSystemd):
		managerType = libcontainerSystemd
	default:
		managerType = libcontainerCgroupfs
	}
	manager := &cgroupManagerImpl{subsystems: cs}
	manager.adapter = newLibcontainerAdapter(managerType)
	return manager
}
// Name returns the driver specific value of the cgroup name, in cgroupfs form.
func (m *cgroupManagerImpl) Name(name CgroupName) string {
	const asCgroupFs = true
	return m.adapter.adaptName(name, asCgroupFs)
}
// CgroupName maps the literal cgroupfs name found on the host back to the
// internal identifier used by the kubelet.
func (m *cgroupManagerImpl) CgroupName(name string) CgroupName {
	internalName := m.adapter.revertName(name)
	return internalName
}
// buildCgroupPaths returns, for every known subsystem mount point, the path
// of the named cgroup below that mount point, keyed by subsystem.
func (m *cgroupManagerImpl) buildCgroupPaths(name CgroupName) map[string]string {
	relativeName := m.Name(name)
	mountPoints := m.subsystems.MountPoints
	pathsBySubsystem := make(map[string]string, len(mountPoints))
	for subsystemName, mountPoint := range mountPoints {
		pathsBySubsystem[subsystemName] = path.Join(mountPoint, relativeName)
	}
	return pathsBySubsystem
}
// Exists reports whether the cgroup is present in every whitelisted
// subsystem for which a mount point is known.
func (m *cgroupManagerImpl) Exists(name CgroupName) bool {
	// control groups that runc does not know about confuse the kubelet
	// existence checks, so only the controllers below are considered.
	// ideally runc would offer Exists() logic scoped to the control groups it
	// understands; see https://github.com/opencontainers/runc/issues/1440
	// once that is resolved, this filtering can go away.
	whitelistControllers := sets.NewString("cpu", "cpuacct", "cpuset", "memory", "systemd")

	// a single missing path among the considered controllers means the
	// cgroup does not exist.
	for controller, cgroupPath := range m.buildCgroupPaths(name) {
		if whitelistControllers.Has(controller) && !libcontainercgroups.PathExists(cgroupPath) {
			return false
		}
	}
	return true
}
// Destroy removes the cgroup described by cgroupConfig through the
// libcontainer manager of the configured driver. The time taken is recorded
// in the "destroy" cgroup manager latency metric.
func (m *cgroupManagerImpl) Destroy(cgroupConfig *CgroupConfig) error {
	begin := time.Now()
	defer func() {
		metrics.CgroupManagerLatency.WithLabelValues("destroy").Observe(metrics.SinceInMicroseconds(begin))
	}()

	paths := m.buildCgroupPaths(cgroupConfig.Name)

	// the location is given in traditional cgroupfs format.
	fsName := string(cgroupConfig.Name)
	parent := m.adapter.adaptName(CgroupName(path.Dir(fsName)), false)
	var leaf string
	if m.adapter.cgroupManagerType == libcontainerSystemd {
		// an ugly abstraction bleed: the systemd cgroup driver requires full paths...
		leaf = m.adapter.adaptName(cgroupConfig.Name, false)
	} else {
		leaf = m.adapter.adaptName(CgroupName(path.Base(fsName)), false)
	}

	// libcontainer's cgroup config, using driver specific naming.
	manager, err := m.adapter.newManager(&libcontainerconfigs.Cgroup{
		Name:   leaf,
		Parent: parent,
	}, paths)
	if err != nil {
		return err
	}

	// let the libcontainer manager delete the cgroups
	if destroyErr := manager.Destroy(); destroyErr != nil {
		return fmt.Errorf("Unable to destroy cgroup paths for cgroup %v : %v", cgroupConfig.Name, destroyErr)
	}
	return nil
}
// subsystem is the part of a libcontainer cgroupfs subsystem that the cgroup
// manager relies on: its name, applying a config and reading stats.
type subsystem interface {
// Name returns the name of the subsystem.
Name() string
// Set the cgroup represented by cgroup.
Set(path string, cgroup *libcontainerconfigs.Cgroup) error
// GetStats returns the statistics associated with the cgroup
GetStats(path string, stats *libcontainercgroups.Stats) error
}
// getSupportedSubsystems lists the subsystems the cgroup manager acts on:
// memory and cpu always, hugetlb only when the HugePages feature gate is on.
func getSupportedSubsystems() []subsystem {
	if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
		return []subsystem{
			&cgroupfs.MemoryGroup{},
			&cgroupfs.CpuGroup{},
		}
	}
	return []subsystem{
		&cgroupfs.MemoryGroup{},
		&cgroupfs.CpuGroup{},
		&cgroupfs.HugetlbGroup{},
	}
}
// setSupportedSubsystems applies the cgroup resource limits to the supported
// subsystems only (see getSupportedSubsystems).
// libcontainer's cgroup/fs Set() is deliberately not used because it cannot
// skip the devices cgroup: writing 'a' to devices.allow or devices.deny, i.e.
// allowing or denying all devices, is impossible once the devices cgroup has
// children. As soon as pod level cgroups exist below a QOS level cgroup, the
// QOS level devices cgroup can no longer be updated, so it has to be skipped,
// which libcontainer's Set() does not permit.
// See https://github.com/opencontainers/runc/issues/932
func setSupportedSubsystems(cgroupConfig *libcontainerconfigs.Cgroup) error {
	for _, sys := range getSupportedSubsystems() {
		subsystemPath, mounted := cgroupConfig.Paths[sys.Name()]
		if !mounted {
			return fmt.Errorf("Failed to find subsystem mount for subsystem: %v", sys.Name())
		}
		if err := sys.Set(subsystemPath, cgroupConfig); err != nil {
			return fmt.Errorf("Failed to set config for supported subsystems : %v", err)
		}
	}
	return nil
}
// toResources converts the kubelet ResourceConfig into libcontainer
// Resources. A nil config yields empty Resources. Only fields that are set
// are copied. With the HugePages feature gate enabled, every page size in
// HugePageLimit gets its limit, and every page size known to cgroupfs but
// absent from the config is limited to 0.
func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcontainerconfigs.Resources {
	out := &libcontainerconfigs.Resources{}
	if resourceConfig == nil {
		return out
	}
	if memory := resourceConfig.Memory; memory != nil {
		out.Memory = *memory
	}
	if shares := resourceConfig.CpuShares; shares != nil {
		out.CpuShares = *shares
	}
	if quota := resourceConfig.CpuQuota; quota != nil {
		out.CpuQuota = *quota
	}
	if period := resourceConfig.CpuPeriod; period != nil {
		out.CpuPeriod = *period
	}
	// huge page limits are only handed to libcontainer when the feature is on
	if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
		return out
	}

	// explicit limits, one per configured page size
	configured := sets.NewString()
	for pageSize, limit := range resourceConfig.HugePageLimit {
		canonical := units.CustomSize("%g%s", float64(pageSize), 1024.0, hugePageSizeList)
		configured.Insert(canonical)
		out.HugetlbLimit = append(out.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
			Pagesize: canonical,
			Limit:    uint64(limit),
		})
	}
	// page sizes that were not configured are limited to 0
	for _, known := range cgroupfs.HugePageSizes {
		if !configured.Has(known) {
			out.HugetlbLimit = append(out.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
				Pagesize: known,
				Limit:    uint64(0),
			})
		}
	}
	return out
}
// Update applies the resource parameters of cgroupConfig to the supported
// subsystems of the cgroup. The time taken is recorded in the "update"
// cgroup manager latency metric.
func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error {
	begin := time.Now()
	defer func() {
		metrics.CgroupManagerLatency.WithLabelValues("update").Observe(metrics.SinceInMicroseconds(begin))
	}()

	// translate the requested resource parameters and locate the cgroup
	limits := m.toResources(cgroupConfig.ResourceParameters)
	paths := m.buildCgroupPaths(cgroupConfig.Name)

	// the location is given in traditional cgroupfs format.
	fsName := string(cgroupConfig.Name)
	parent := m.adapter.adaptName(CgroupName(path.Dir(fsName)), false)
	var leaf string
	if m.adapter.cgroupManagerType == libcontainerSystemd {
		// an ugly abstraction bleed: the systemd cgroup driver requires full paths...
		leaf = m.adapter.adaptName(cgroupConfig.Name, false)
	} else {
		leaf = m.adapter.adaptName(CgroupName(path.Base(fsName)), false)
	}

	// libcontainer's cgroup config
	config := &libcontainerconfigs.Cgroup{
		Name:      leaf,
		Parent:    parent,
		Resources: limits,
		Paths:     paths,
	}
	err := setSupportedSubsystems(config)
	if err == nil {
		return nil
	}
	return fmt.Errorf("failed to set supported cgroup subsystems for cgroup %v: %v", cgroupConfig.Name, err)
}
// Create creates the cgroup described by cgroupConfig and then applies its
// resource parameters. The time taken is recorded in the "create" cgroup
// manager latency metric.
// An error is returned when the libcontainer manager cannot be built or the
// cgroup directories cannot be created. A failure of the follow-up Update is
// treated as best-effort: it is logged, not returned.
func (m *cgroupManagerImpl) Create(cgroupConfig *CgroupConfig) error {
	start := time.Now()
	defer func() {
		metrics.CgroupManagerLatency.WithLabelValues("create").Observe(metrics.SinceInMicroseconds(start))
	}()
	// we take the location in traditional cgroupfs format.
	abstractCgroupFsName := string(cgroupConfig.Name)
	abstractParent := CgroupName(path.Dir(abstractCgroupFsName))
	abstractName := CgroupName(path.Base(abstractCgroupFsName))
	driverParent := m.adapter.adaptName(abstractParent, false)
	driverName := m.adapter.adaptName(abstractName, false)
	// this is an ugly abstraction bleed, but systemd cgroup driver requires full paths...
	if m.adapter.cgroupManagerType == libcontainerSystemd {
		driverName = m.adapter.adaptName(cgroupConfig.Name, false)
	}
	resources := m.toResources(cgroupConfig.ResourceParameters)
	// Initialize libcontainer's cgroup config with driver specific naming.
	libcontainerCgroupConfig := &libcontainerconfigs.Cgroup{
		Name:      driverName,
		Parent:    driverParent,
		Resources: resources,
	}
	// get the manager with the specified cgroup configuration
	manager, err := m.adapter.newManager(libcontainerCgroupConfig, nil)
	if err != nil {
		return err
	}
	// Apply(-1) is a hack to create the cgroup directories for each resource
	// subsystem. The function [cgroups.Manager.apply()] applies cgroup
	// configuration to the process with the specified pid.
	// It creates cgroup files for each subsystems and writes the pid
	// in the tasks file. We use the function to create all the required
	// cgroup files but not attach any "real" pid to the cgroup.
	if err := manager.Apply(-1); err != nil {
		return err
	}
	// it may confuse why we call set after we do apply, but the issue is that runc
	// follows a similar pattern. it's needed to ensure cpu quota is set properly.
	// The cgroup itself exists at this point, so a failed update does not fail
	// the creation; it is logged so that it no longer goes unnoticed.
	if err := m.Update(cgroupConfig); err != nil {
		glog.Warningf("cgroup manager failed to update cgroup %v after creating it: %v", cgroupConfig.Name, err)
	}
	return nil
}
// Pids scans every subsystem mount point for the given cgroup and returns the
// set of pids found in the cgroup itself and in any cgroup directory below it.
func (m *cgroupManagerImpl) Pids(name CgroupName) []int {
	// the driver specific name is what exists on disk
	cgroupFsName := m.Name(name)

	// all pids discovered so far, without duplicates
	found := sets.NewInt()
	for _, mountPoint := range m.subsystems.MountPoints {
		root := path.Join(mountPoint, cgroupFsName)
		if _, statErr := os.Stat(root); os.IsNotExist(statErr) {
			// this subsystem's cgroup is already gone, nothing to collect
			continue
		}
		// pids still charged to the cgroup itself
		rootPids, err := getCgroupProcs(root)
		if err != nil {
			continue
		}
		found.Insert(rootPids...)

		// WalkFunc invoked for each file and directory below the cgroup dir
		collect := func(current string, info os.FileInfo, walkErr error) error {
			if walkErr != nil {
				glog.V(4).Infof("cgroup manager encountered error scanning cgroup path %q: %v", current, walkErr)
				return filepath.SkipDir
			}
			if !info.IsDir() {
				return nil
			}
			procs, procErr := getCgroupProcs(current)
			if procErr != nil {
				glog.V(4).Infof("cgroup manager encountered error getting procs for cgroup path %q: %v", current, procErr)
				return filepath.SkipDir
			}
			found.Insert(procs...)
			return nil
		}
		// container cgroups below the pod cgroup may not have been GCed yet;
		// walk the directory to pick up processes attached to any of them.
		if walkErr := filepath.Walk(root, collect); walkErr != nil {
			glog.V(4).Infof("cgroup manager encountered error scanning pids for directory: %q: %v", root, walkErr)
		}
	}
	return found.List()
}
// ReduceCPULimits lowers the cpu shares of the cgroup to MinShares by issuing
// an Update that carries only that value.
func (m *cgroupManagerImpl) ReduceCPULimits(cgroupName CgroupName) error {
	lowestShares := uint64(MinShares)
	return m.Update(&CgroupConfig{
		Name: cgroupName,
		ResourceParameters: &ResourceConfig{
			CpuShares: &lowestShares,
		},
	})
}
// getStatsSupportedSubsystems collects the stats of every supported subsystem
// from the given per-subsystem cgroup paths. It fails when a supported
// subsystem has no path or when reading its stats fails.
func getStatsSupportedSubsystems(cgroupPaths map[string]string) (*libcontainercgroups.Stats, error) {
	collected := libcontainercgroups.NewStats()
	for _, sys := range getSupportedSubsystems() {
		subsystemPath, mounted := cgroupPaths[sys.Name()]
		if !mounted {
			return nil, fmt.Errorf("Failed to find subsystem mount for subsystem: %v", sys.Name())
		}
		if err := sys.GetStats(subsystemPath, collected); err != nil {
			return nil, fmt.Errorf("Failed to get stats for supported subsystems : %v", err)
		}
	}
	return collected, nil
}
// toResourceStats converts libcontainer stats into ResourceStats. Only the
// memory usage is carried over.
func toResourceStats(stats *libcontainercgroups.Stats) *ResourceStats {
	memory := &MemoryStats{
		Usage: int64(stats.MemoryStats.Usage.Usage),
	}
	return &ResourceStats{MemoryStats: memory}
}
// GetResourceStats returns the resource stats of the specified cgroup as read
// from the supported cgroup subsystems.
func (m *cgroupManagerImpl) GetResourceStats(name CgroupName) (*ResourceStats, error) {
	raw, err := getStatsSupportedSubsystems(m.buildCgroupPaths(name))
	if err == nil {
		return toResourceStats(raw), nil
	}
	return nil, fmt.Errorf("failed to get stats supported cgroup subsystems for cgroup %v: %v", name, err)
}

View file

@ -0,0 +1,101 @@
// +build linux
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import "testing"
// TestLibcontainerAdapterAdaptToSystemd checks the systemd unit names that
// the systemd adapter produces for internal and systemd style cgroup names.
func TestLibcontainerAdapterAdaptToSystemd(t *testing.T) {
	adapter := newLibcontainerAdapter(libcontainerSystemd)
	for _, tc := range []struct {
		input    string
		expected string
	}{
		{input: "/", expected: "-.slice"},
		{input: "/system.slice", expected: "system.slice"},
		{input: "/system.slice/Burstable", expected: "system-Burstable.slice"},
		{input: "/Burstable.slice/Burstable-pod_123.slice", expected: "Burstable-pod_123.slice"},
		{input: "/test.slice/test-a.slice/test-a-b.slice", expected: "test-a-b.slice"},
		{input: "/test.slice/test-a.slice/test-a-b.slice/Burstable", expected: "test-a-b-Burstable.slice"},
		{input: "/Burstable", expected: "Burstable.slice"},
		{input: "/Burstable/pod_123", expected: "Burstable-pod_123.slice"},
		{
			input:    "/BestEffort/pod_6c1a4e95-6bb6-11e6-bc26-28d2444e470d",
			expected: "BestEffort-pod_6c1a4e95_6bb6_11e6_bc26_28d2444e470d.slice",
		},
	} {
		got := adapter.adaptName(CgroupName(tc.input), false)
		if got == tc.expected {
			continue
		}
		t.Errorf("Unexpected result, input: %v, expected: %v, actual: %v", tc.input, tc.expected, got)
	}
}
// TestLibcontainerAdapterAdaptToSystemdAsCgroupFs checks the cgroupfs form
// that the systemd adapter produces for internal cgroup names.
func TestLibcontainerAdapterAdaptToSystemdAsCgroupFs(t *testing.T) {
	adapter := newLibcontainerAdapter(libcontainerSystemd)
	for _, tc := range []struct {
		input    string
		expected string
	}{
		{input: "/", expected: "/"},
		{input: "/Burstable", expected: "Burstable.slice/"},
		{input: "/Burstable/pod_123", expected: "Burstable.slice/Burstable-pod_123.slice/"},
		{
			input:    "/BestEffort/pod_6c1a4e95-6bb6-11e6-bc26-28d2444e470d",
			expected: "BestEffort.slice/BestEffort-pod_6c1a4e95_6bb6_11e6_bc26_28d2444e470d.slice/",
		},
	} {
		got := adapter.adaptName(CgroupName(tc.input), true)
		if got == tc.expected {
			continue
		}
		t.Errorf("Unexpected result, input: %v, expected: %v, actual: %v", tc.input, tc.expected, got)
	}
}

View file

@ -0,0 +1,83 @@
// +build linux
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"reflect"
"testing"
"k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig"
)
// Test feeds a series of QOS reserved settings to ParseQOSReserved and checks
// the parsed result. An expected value of nil means that parsing must not
// yield a result.
func Test(t *testing.T) {
	tests := []struct {
		input    string
		expected *map[v1.ResourceName]int64
	}{
		{
			input:    "memory",
			expected: nil,
		},
		{
			input:    "memory=a",
			expected: nil,
		},
		{
			input:    "memory=a%",
			expected: nil,
		},
		{
			input:    "memory=200%",
			expected: nil,
		},
		{
			input: "memory=0%",
			expected: &map[v1.ResourceName]int64{
				v1.ResourceMemory: 0,
			},
		},
		{
			input: "memory=100%",
			expected: &map[v1.ResourceName]int64{
				v1.ResourceMemory: 100,
			},
		},
		{
			// need to change this when CPU is added as a supported resource
			input:    "memory=100%,cpu=50%",
			expected: nil,
		},
	}
	for _, test := range tests {
		m := kubeletconfig.ConfigurationMap{}
		m.Set(test.input)
		actual, err := ParseQOSReserved(m)
		// Exactly one case applies per input. The pointers are dereferenced
		// only in the last case, where both are known to be non-nil, so a
		// mismatch is reported instead of panicking on a nil dereference.
		switch {
		case actual == nil && test.expected == nil:
			// no result was expected and none was produced
		case actual != nil && test.expected == nil:
			t.Errorf("Unexpected success, input: %v, expected: %v, actual: %v, err: %v", test.input, test.expected, actual, err)
		case actual == nil && test.expected != nil:
			t.Errorf("Unexpected failure, input: %v, expected: %v, actual: %v, err: %v", test.input, test.expected, actual, err)
		case !reflect.DeepEqual(*actual, *test.expected):
			t.Errorf("Unexpected result, input: %v, expected: %v, actual: %v, err: %v", test.input, test.expected, actual, err)
		}
	}
}

View file

@ -0,0 +1,79 @@
// +build !linux
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import "fmt"
// unsupportedCgroupManager is the CgroupManager used in non-linux builds.
// It carries no state; its methods return zero values or a "not supported" error.
type unsupportedCgroupManager struct{}
// Make sure that unsupportedCgroupManager implements the CgroupManager interface
var _ CgroupManager = &unsupportedCgroupManager{}
// CgroupSubsystems is the non-linux placeholder for the information about
// mounted cgroup subsystems. Mounts is untyped here, and nothing in this
// file populates either field.
type CgroupSubsystems struct {
Mounts []interface{}
MountPoints map[string]string
}
// NewCgroupManager returns the unsupported CgroupManager used in non-linux
// builds. The argument is ignored.
func NewCgroupManager(_ interface{}) CgroupManager {
	return new(unsupportedCgroupManager)
}
// Name always returns the empty string in non-linux builds.
func (*unsupportedCgroupManager) Name(CgroupName) string {
	return ""
}
// Exists always reports false in non-linux builds.
func (*unsupportedCgroupManager) Exists(CgroupName) bool {
	return false
}
// Destroy does nothing and reports success in non-linux builds.
func (*unsupportedCgroupManager) Destroy(*CgroupConfig) error {
	return nil
}
// Update does nothing and reports success in non-linux builds.
func (*unsupportedCgroupManager) Update(*CgroupConfig) error {
	return nil
}
// Create always fails in non-linux builds: cgroups cannot be created here.
func (*unsupportedCgroupManager) Create(*CgroupConfig) error {
	err := fmt.Errorf("Cgroup Manager is not supported in this build")
	return err
}
// GetResourceStats always fails in non-linux builds and returns no stats.
func (*unsupportedCgroupManager) GetResourceStats(CgroupName) (*ResourceStats, error) {
	err := fmt.Errorf("Cgroup Manager is not supported in this build")
	return nil, err
}
// Pids always returns nil in non-linux builds.
func (*unsupportedCgroupManager) Pids(CgroupName) []int {
	return nil
}
// CgroupName always returns the empty CgroupName in non-linux builds.
func (*unsupportedCgroupManager) CgroupName(string) CgroupName {
	return CgroupName("")
}
// ReduceCPULimits does nothing and reports success in non-linux builds.
func (*unsupportedCgroupManager) ReduceCPULimits(CgroupName) error {
	return nil
}
// ConvertCgroupFsNameToSystemd is the non-linux stub: whatever the input, it
// returns an empty name and a nil error.
func ConvertCgroupFsNameToSystemd(cgroupfsName string) (string, error) {
	var systemdName string
	return systemdName, nil
}
// ConvertCgroupNameToSystemd is the non-linux stub: whatever the input, it
// returns the empty string.
func ConvertCgroupNameToSystemd(cgroupName CgroupName, outputToCgroupFs bool) string {
	var systemdName string
	return systemdName
}

View file

@ -0,0 +1,154 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"time"
"k8s.io/apimachinery/pkg/util/sets"
// TODO: Migrate kubelet to either use its own internal objects or client library.
"k8s.io/api/core/v1"
internalapi "k8s.io/kubernetes/pkg/kubelet/apis/cri"
"k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
"k8s.io/kubernetes/pkg/kubelet/status"
"fmt"
"strconv"
"strings"
)
// ActivePodsFunc is a callback that returns a list of pods.
// NOTE(review): presumably the pods currently active on the node - confirm
// against the kubelet caller.
type ActivePodsFunc func() []*v1.Pod
// ContainerManager manages the containers running on a machine.
type ContainerManager interface {
// Start runs the container manager's housekeeping.
// - Ensures that the Docker daemon is in a container.
// - Creates the system container where all non-containerized processes run.
Start(*v1.Node, ActivePodsFunc, status.PodStatusProvider, internalapi.RuntimeService) error
// SystemCgroupsLimit returns resources allocated to system cgroups in the machine.
// These cgroups include the system and Kubernetes services.
SystemCgroupsLimit() v1.ResourceList
// GetNodeConfig returns a NodeConfig that is being used by the container manager.
GetNodeConfig() NodeConfig
// Status returns internal Status.
Status() Status
// NewPodContainerManager is a factory method which returns a podContainerManager object.
// Returns a noop implementation if qos cgroup hierarchy is not enabled.
NewPodContainerManager() PodContainerManager
// GetMountedSubsystems returns the mounted cgroup subsystems on the node.
GetMountedSubsystems() *CgroupSubsystems
// GetQOSContainersInfo returns the names of top level QoS containers.
GetQOSContainersInfo() QOSContainersInfo
// GetNodeAllocatableReservation returns the amount of compute resources that have to be reserved from scheduling.
GetNodeAllocatableReservation() v1.ResourceList
// GetCapacity returns the amount of compute resources tracked by container manager available on the node.
GetCapacity() v1.ResourceList
// UpdateQOSCgroups performs housekeeping updates to ensure that the top
// level QoS containers have their desired state in a thread-safe way.
UpdateQOSCgroups() error
// GetResources returns RunContainerOptions with devices, mounts, and env fields populated for
// extended resources required by container.
GetResources(pod *v1.Pod, container *v1.Container, activePods []*v1.Pod) (*kubecontainer.RunContainerOptions, error)
// InternalContainerLifecycle returns the InternalContainerLifecycle associated
// with this container manager.
InternalContainerLifecycle() InternalContainerLifecycle
}
// NodeConfig is the node-level configuration handed to the container manager
// and returned by ContainerManager.GetNodeConfig. It embeds
// NodeAllocatableConfig.
type NodeConfig struct {
// Names of the cgroups for the container runtime, system processes and the
// kubelet respectively.
// NOTE(review): exact semantics of empty values are decided by the
// platform-specific container manager - confirm there.
RuntimeCgroupsName string
SystemCgroupsName string
KubeletCgroupsName string
// ContainerRuntime names the container runtime in use.
ContainerRuntime string
// CgroupsPerQOS enables the QoS cgroup hierarchy (see
// ContainerManager.NewPodContainerManager).
CgroupsPerQOS bool
CgroupRoot string
CgroupDriver string
ProtectKernelDefaults bool
NodeAllocatableConfig
// ExperimentalQOSReserved maps a resource name to an int64 value.
// NOTE(review): presumably the percentage produced by ParseQOSReserved - confirm.
ExperimentalQOSReserved map[v1.ResourceName]int64
ExperimentalCPUManagerPolicy string
ExperimentalCPUManagerReconcilePeriod time.Duration
}
// NodeAllocatableConfig groups the settings related to node allocatable
// enforcement. It is embedded in NodeConfig.
type NodeAllocatableConfig struct {
KubeReservedCgroupName string
SystemReservedCgroupName string
// EnforceNodeAllocatable is a set of strings.
// NOTE(review): presumably drawn from the *EnforcementKey constants in this
// file - confirm.
EnforceNodeAllocatable sets.String
KubeReserved v1.ResourceList
SystemReserved v1.ResourceList
HardEvictionThresholds []evictionapi.Threshold
}
// Status is the internal status reported by ContainerManager.Status.
type Status struct {
// Any soft requirements that were unsatisfied.
SoftRequirements error
}
const (
// User visible keys for managing node allocatable enforcement on the node.
NodeAllocatableEnforcementKey = "pods"
SystemReservedEnforcementKey = "system-reserved"
KubeReservedEnforcementKey = "kube-reserved"
)
// containerManager for the kubelet is currently an injected dependency.
// We need to parse the --qos-reserve-requests option in
// cmd/kubelet/app/server.go and there isn't really a good place to put
// the code. If/When the kubelet dependency injection gets worked out,
// maybe there will be a better place for it.

// parsePercentage parses a string of the form "<integer>%" and returns the
// integer part.
//
// It returns an error when v does not end in "%", when the text before the
// single trailing "%" is not a base-10 integer, or when the value lies outside
// the inclusive range 0..100.
func parsePercentage(v string) (int64, error) {
	if !strings.HasSuffix(v, "%") {
		return 0, fmt.Errorf("percentage expected, got '%s'", v)
	}
	// TrimSuffix removes exactly one trailing "%". TrimRight treats its second
	// argument as a cutset and strips a whole run of them, which let malformed
	// input such as "50%%" through as 50.
	percentage, err := strconv.ParseInt(strings.TrimSuffix(v, "%"), 10, 0)
	if err != nil {
		return 0, fmt.Errorf("invalid number in percentage '%s'", v)
	}
	if percentage < 0 || percentage > 100 {
		return 0, fmt.Errorf("percentage must be between 0 and 100")
	}
	return percentage, nil
}
// ParseQOSReserved parses the --qos-reserve-requests option.
//
// Every key of m must name the memory resource and every value must be a
// percentage accepted by parsePercentage. The first offending entry
// encountered aborts parsing and its error is returned with a nil map.
func ParseQOSReserved(m kubeletconfig.ConfigurationMap) (*map[v1.ResourceName]int64, error) {
	reserved := map[v1.ResourceName]int64{}
	for key, raw := range m {
		name := v1.ResourceName(key)
		// Only memory resources are supported.
		if name != v1.ResourceMemory {
			return nil, fmt.Errorf("cannot reserve %q resource", key)
		}
		pct, err := parsePercentage(raw)
		if err != nil {
			return nil, err
		}
		reserved[name] = pct
	}
	return &reserved, nil
}

View file

@ -0,0 +1,959 @@
// +build linux
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"bufio"
"fmt"
"io/ioutil"
"os"
"os/exec"
"path"
"strconv"
"sync"
"time"
"github.com/golang/glog"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/configs"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/client-go/tools/record"
kubefeatures "k8s.io/kubernetes/pkg/features"
internalapi "k8s.io/kubernetes/pkg/kubelet/apis/cri"
"k8s.io/kubernetes/pkg/kubelet/cadvisor"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
cmutil "k8s.io/kubernetes/pkg/kubelet/cm/util"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/qos"
"k8s.io/kubernetes/pkg/kubelet/status"
utilfile "k8s.io/kubernetes/pkg/util/file"
"k8s.io/kubernetes/pkg/util/mount"
"k8s.io/kubernetes/pkg/util/oom"
"k8s.io/kubernetes/pkg/util/procfs"
utilsysctl "k8s.io/kubernetes/pkg/util/sysctl"
utilversion "k8s.io/kubernetes/pkg/util/version"
)
const (
// The percent of the machine memory capacity. The value is used to calculate
// docker memory resource container's hardlimit to workaround docker memory
// leakage issue. Please see kubernetes/issues/9881 for more detail.
DockerMemoryLimitThresholdPercent = 70
// The minimum memory limit allocated to docker container: 150Mi
MinDockerMemoryLimit = 150 * 1024 * 1024
// Process name and pid file used to locate the docker daemon.
dockerProcessName = "docker"
dockerPidFile = "/var/run/docker.pid"
// Process name and pid file used to locate docker-containerd.
containerdProcessName = "docker-containerd"
containerdPidFile = "/run/docker/libcontainerd/docker-containerd.pid"
)
var (
// The docker version in which containerd was introduced.
// EnsureDockerInContainer compares the docker API version against it to
// decide whether the containerd process must be handled as well.
containerdAPIVersion = utilversion.MustParseGeneric("1.23")
)
// systemContainer is a non-user container tracked by the Kubelet.
type systemContainer struct {
// Absolute name of the container.
name string
// CPU limit in millicores. Summed over all system containers by
// SystemCgroupsLimit.
cpuMillicores int64
// Function that ensures the state of the container.
// m is the cgroup manager for the specified container.
// May be nil, in which case Start skips this container.
ensureStateFunc func(m *fs.Manager) error
// Manager for the cgroups of the external container.
manager *fs.Manager
}
// newSystemCgroups returns a systemContainer named containerName whose cgroup
// manager is built by createManager. The CPU limit and ensureStateFunc are
// left at their zero values.
func newSystemCgroups(containerName string) *systemContainer {
	sc := new(systemContainer)
	sc.name = containerName
	sc.manager = createManager(containerName)
	return sc
}
// containerManagerImpl is the Linux implementation of ContainerManager.
// The embedded RWMutex is taken by the methods in this file around accesses
// to the embedded NodeConfig, status and capacity.
type containerManagerImpl struct {
sync.RWMutex
cadvisorInterface cadvisor.Interface
mountUtil mount.Interface
NodeConfig
status Status
// External containers being managed.
systemContainers []*systemContainer
qosContainers QOSContainersInfo
// Tasks that are run periodically
periodicTasks []func()
// holds all the mounted cgroup subsystems
subsystems *CgroupSubsystems
// Node object cached by Start.
nodeInfo *v1.Node
// Interface for cgroup management
cgroupManager CgroupManager
// Capacity of this node.
capacity v1.ResourceList
// Absolute cgroupfs path to a cgroup that Kubelet needs to place all pods under.
// This path include a top level container for enforcing Node Allocatable.
cgroupRoot string
// Event recorder interface.
recorder record.EventRecorder
// Interface for QoS cgroup management
qosContainerManager QOSContainerManager
// Interface for exporting and allocating devices reported by device plugins.
devicePluginHandler DevicePluginHandler
// Interface for CPU affinity management.
// Only set by NewContainerManager when the CPUManager feature gate is enabled.
cpuManager cpumanager.Manager
}
// features records optional capabilities detected by
// validateSystemRequirements.
type features struct {
// cpuHardcapping is true when both cpu.cfs_period_us and cpu.cfs_quota_us
// were found under the cpu cgroup mount point.
cpuHardcapping bool
}
// Compile-time check that *containerManagerImpl satisfies ContainerManager.
var _ ContainerManager = &containerManagerImpl{}
// validateSystemRequirements checks that the cgroup subsystems "cpu",
// "cpuacct", "cpuset" and "memory" each appear among the options of some mount
// of type "cgroup", and returns an error naming the missing ones otherwise.
//
// CPU quota support is a soft requirement: it is reported through the returned
// features value rather than as an error, and failures while probing for it
// are only logged.
func validateSystemRequirements(mountUtil mount.Interface) (features, error) {
	const (
		cgroupMountType = "cgroup"
		localErr        = "system validation failed"
	)
	var f features

	mountPoints, err := mountUtil.List()
	if err != nil {
		return f, fmt.Errorf("%s - %v", localErr, err)
	}

	// Tick off each required subsystem as it is seen; whatever remains is missing.
	missing := sets.NewString("cpu", "cpuacct", "cpuset", "memory")
	cpuMountPoint := ""
	for _, mp := range mountPoints {
		if mp.Type != cgroupMountType {
			continue
		}
		for _, opt := range mp.Opts {
			if missing.Has(opt) {
				missing.Delete(opt)
			}
			if opt == "cpu" {
				cpuMountPoint = mp.Path
			}
		}
	}
	if missing.Len() > 0 {
		return f, fmt.Errorf("%s - Following Cgroup subsystem not mounted: %v", localErr, missing.List())
	}

	// Check if cpu quota is available.
	// The cpu cgroup is required, so it is expected to be mounted at this point.
	periodExists, err := utilfile.FileExists(path.Join(cpuMountPoint, "cpu.cfs_period_us"))
	if err != nil {
		glog.Errorf("failed to detect if CPU cgroup cpu.cfs_period_us is available - %v", err)
	}
	quotaExists, err := utilfile.FileExists(path.Join(cpuMountPoint, "cpu.cfs_quota_us"))
	if err != nil {
		glog.Errorf("failed to detect if CPU cgroup cpu.cfs_quota_us is available - %v", err)
	}
	f.cpuHardcapping = quotaExists && periodExists
	return f, nil
}
// NewContainerManager builds the Linux ContainerManager.
//
// TODO(vmarmol): Add limits to the system containers.
// Takes the absolute name of the specified containers.
// Empty container name disables use of the specified container.
//
// The constructor:
//   - reads the mounted cgroup subsystems;
//   - inspects /proc/swaps and, when failSwapOn is true, fails if swap is enabled;
//   - derives the node capacity from cAdvisor machine info;
//   - when nodeConfig.CgroupsPerQOS is set, requires a non-empty, existing
//     cgroup root and appends the node allocatable cgroup name to it;
//   - creates the QoS container manager, the device plugin handler (a stub when
//     devicePluginEnabled is false) and, if the CPUManager feature gate is
//     enabled, the CPU manager.
//
// Any failure in these steps is returned with a nil ContainerManager.
func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.Interface, nodeConfig NodeConfig, failSwapOn bool, devicePluginEnabled bool, recorder record.EventRecorder) (ContainerManager, error) {
	subsystems, err := GetCgroupSubsystems()
	if err != nil {
		return nil, fmt.Errorf("failed to get mounted cgroup subsystems: %v", err)
	}

	// Check whether swap is enabled. The Kubelet does not support running with swap enabled.
	cmd := exec.Command("cat", "/proc/swaps")
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		return nil, err
	}
	if err := cmd.Start(); err != nil {
		return nil, err
	}
	var buf []string
	scanner := bufio.NewScanner(stdout)
	for scanner.Scan() { // Splits on newlines by default
		buf = append(buf, scanner.Text())
	}
	// Remember a scan failure but still reap the child before reporting it.
	scanErr := scanner.Err()
	if err := cmd.Wait(); err != nil { // Clean up
		return nil, err
	}
	if scanErr != nil {
		return nil, fmt.Errorf("failed to read /proc/swaps: %v", scanErr)
	}

	// Running with swap enabled should be considered an error, but in order to maintain legacy
	// behavior we have to require an opt-in to this error for a period of time.
	// If there is more than one line (table headers) in /proc/swaps, swap is enabled and we should
	// error out unless --fail-swap-on is set to false.
	if failSwapOn && len(buf) > 1 {
		return nil, fmt.Errorf("Running with swap on is not supported, please disable swap! or set --fail-swap-on flag to false. /proc/swaps contained: %v", buf)
	}

	// It is safe to invoke `MachineInfo` on cAdvisor before logically initializing cAdvisor here because
	// machine info is computed and cached once as part of cAdvisor object creation.
	// But `RootFsInfo` and `ImagesFsInfo` are not available at this moment so they will be called later during manager starts
	machineInfo, err := cadvisorInterface.MachineInfo()
	if err != nil {
		return nil, err
	}
	capacity := cadvisor.CapacityFromMachineInfo(machineInfo)

	cgroupRoot := nodeConfig.CgroupRoot
	cgroupManager := NewCgroupManager(subsystems, nodeConfig.CgroupDriver)
	// Check if Cgroup-root actually exists on the node
	if nodeConfig.CgroupsPerQOS {
		// this does default to / when enabled, but this tests against regressions.
		if nodeConfig.CgroupRoot == "" {
			return nil, fmt.Errorf("invalid configuration: cgroups-per-qos was specified and cgroup-root was not specified. To enable the QoS cgroup hierarchy you need to specify a valid cgroup-root")
		}

		// we need to check that the cgroup root actually exists for each subsystem
		// of note, we always use the cgroupfs driver when performing this check since
		// the input is provided in that format.
		// this is important because we do not want any name conversion to occur.
		if !cgroupManager.Exists(CgroupName(cgroupRoot)) {
			// Exists reports only a bool, so there is no error value to attach here;
			// the previous message formatted a stale err that was always nil.
			return nil, fmt.Errorf("invalid configuration: cgroup-root %q doesn't exist", cgroupRoot)
		}
		glog.Infof("container manager verified user specified cgroup-root exists: %v", cgroupRoot)
		// Include the top level cgroup for enforcing node allocatable into cgroup-root.
		// This way, all sub modules can avoid having to understand the concept of node allocatable.
		cgroupRoot = path.Join(cgroupRoot, defaultNodeAllocatableCgroupName)
	}
	glog.Infof("Creating Container Manager object based on Node Config: %+v", nodeConfig)

	qosContainerManager, err := NewQOSContainerManager(subsystems, cgroupRoot, nodeConfig)
	if err != nil {
		return nil, err
	}

	cm := &containerManagerImpl{
		cadvisorInterface:   cadvisorInterface,
		mountUtil:           mountUtil,
		NodeConfig:          nodeConfig,
		subsystems:          subsystems,
		cgroupManager:       cgroupManager,
		capacity:            capacity,
		cgroupRoot:          cgroupRoot,
		recorder:            recorder,
		qosContainerManager: qosContainerManager,
	}

	// Callback handed to the device plugin handler: merges device capacity
	// updates into cm.capacity, dropping resources whose value is not positive.
	updateDeviceCapacityFunc := func(updates v1.ResourceList) {
		cm.Lock()
		defer cm.Unlock()
		for k, v := range updates {
			if v.Value() <= 0 {
				delete(cm.capacity, k)
			} else {
				cm.capacity[k] = v
			}
		}
	}

	glog.Infof("Creating device plugin handler: %t", devicePluginEnabled)
	if devicePluginEnabled {
		cm.devicePluginHandler, err = NewDevicePluginHandlerImpl(updateDeviceCapacityFunc)
	} else {
		cm.devicePluginHandler, err = NewDevicePluginHandlerStub()
	}
	if err != nil {
		return nil, err
	}

	// Initialize CPU manager
	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUManager) {
		cm.cpuManager, err = cpumanager.NewManager(
			nodeConfig.ExperimentalCPUManagerPolicy,
			nodeConfig.ExperimentalCPUManagerReconcilePeriod,
			machineInfo,
			cm.GetNodeAllocatableReservation(),
		)
		if err != nil {
			glog.Errorf("failed to initialize cpu manager: %v", err)
			return nil, err
		}
	}

	return cm, nil
}
// NewPodContainerManager is a factory for PodContainerManager objects.
// Without per-QoS cgroups it hands back a no-op manager rooted at the
// manager's cgroup root; otherwise it returns the general pod container
// manager wired to this manager's QoS container info, subsystems and cgroup
// manager.
func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
	if !cm.NodeConfig.CgroupsPerQOS {
		return &podContainerManagerNoop{cgroupRoot: CgroupName(cm.cgroupRoot)}
	}
	return &podContainerManagerImpl{
		qosContainersInfo: cm.GetQOSContainersInfo(),
		subsystems:        cm.subsystems,
		cgroupManager:     cm.cgroupManager,
	}
}
// InternalContainerLifecycle returns a new internalContainerLifecycleImpl
// wrapping this manager's CPU manager (which is nil unless the CPUManager
// feature gate was enabled at construction time).
func (cm *containerManagerImpl) InternalContainerLifecycle() InternalContainerLifecycle {
return &internalContainerLifecycleImpl{cm.cpuManager}
}
// createManager returns an fs.Manager for the cgroup named containerName,
// parented at "/" and with access to all devices allowed.
func createManager(containerName string) *fs.Manager {
	allowAll := true
	resources := &configs.Resources{AllowAllDevices: &allowAll}
	cgroup := &configs.Cgroup{
		Parent:    "/",
		Name:      containerName,
		Resources: resources,
	}
	return &fs.Manager{Cgroups: cgroup}
}
// KernelTunableBehavior selects what setupKernelTunables does when a kernel
// tunable does not have the expected value.
type KernelTunableBehavior string
const (
// KernelTunableWarn logs the mismatch and carries on.
KernelTunableWarn KernelTunableBehavior = "warn"
// KernelTunableError reports the mismatch as an error.
KernelTunableError KernelTunableBehavior = "error"
// KernelTunableModify rewrites the tunable to the expected value.
KernelTunableModify KernelTunableBehavior = "modify"
)
// setupKernelTunables compares a fixed set of kernel tunables against their
// expected values. For each tunable that differs, option decides whether the
// mismatch is logged (warn), collected as an error (error), or corrected by
// writing the expected value (modify). Failures to read or write a tunable are
// collected too, and everything collected is returned as one aggregate error.
func setupKernelTunables(option KernelTunableBehavior) error {
	desiredState := map[string]int{
		utilsysctl.VmOvercommitMemory: utilsysctl.VmOvercommitMemoryAlways,
		utilsysctl.VmPanicOnOOM:       utilsysctl.VmPanicOnOOMInvokeOOMKiller,
		utilsysctl.KernelPanic:        utilsysctl.KernelPanicRebootTimeout,
		utilsysctl.KernelPanicOnOops:  utilsysctl.KernelPanicOnOopsAlways,
		utilsysctl.RootMaxKeys:        utilsysctl.RootMaxKeysSetting,
		utilsysctl.RootMaxBytes:       utilsysctl.RootMaxBytesSetting,
	}

	sysctl := utilsysctl.New()

	problems := []error{}
	for flag, expectedValue := range desiredState {
		val, err := sysctl.GetSysctl(flag)
		switch {
		case err != nil:
			problems = append(problems, err)
		case val == expectedValue:
			// Already in the desired state.
		case option == KernelTunableError:
			problems = append(problems, fmt.Errorf("Invalid kernel flag: %v, expected value: %v, actual value: %v", flag, expectedValue, val))
		case option == KernelTunableWarn:
			glog.V(2).Infof("Invalid kernel flag: %v, expected value: %v, actual value: %v", flag, expectedValue, val)
		case option == KernelTunableModify:
			glog.V(2).Infof("Updating kernel flag: %v, expected value: %v, actual value: %v", flag, expectedValue, val)
			if setErr := sysctl.SetSysctl(flag, expectedValue); setErr != nil {
				problems = append(problems, setErr)
			}
		}
	}
	return utilerrors.NewAggregate(problems)
}
// setupNode performs the one-time node preparation invoked from Start:
// it validates system requirements, applies or checks kernel tunables, sets up
// the node allocatable and QoS cgroups when CgroupsPerQOS is enabled, enforces
// node allocatable, and finally registers the system containers and periodic
// tasks that Start later runs in the background.
// It returns the first error encountered.
func (cm *containerManagerImpl) setupNode(activePods ActivePodsFunc) error {
f, err := validateSystemRequirements(cm.mountUtil)
if err != nil {
return err
}
// Missing CPU quota support is only a soft requirement; record it in the status.
if !f.cpuHardcapping {
cm.status.SoftRequirements = fmt.Errorf("CPU hardcapping unsupported")
}
// Kernel tunables are modified by default; with ProtectKernelDefaults a
// mismatch is an error instead.
b := KernelTunableModify
if cm.GetNodeConfig().ProtectKernelDefaults {
b = KernelTunableError
}
if err := setupKernelTunables(b); err != nil {
return err
}
// Setup top level qos containers only if CgroupsPerQOS flag is specified as true
if cm.NodeConfig.CgroupsPerQOS {
if err := cm.createNodeAllocatableCgroups(); err != nil {
return err
}
err = cm.qosContainerManager.Start(cm.getNodeAllocatableAbsolute, activePods)
if err != nil {
return fmt.Errorf("failed to initialize top level QOS containers: %v", err)
}
}
// Enforce Node Allocatable (if required)
if err := cm.enforceNodeAllocatableCgroups(); err != nil {
return err
}
systemContainers := []*systemContainer{}
if cm.ContainerRuntime == "docker" {
// With the docker-CRI integration, dockershim will manage the cgroups
// and oom score for the docker processes.
// In the future, NodeSpec should mandate the cgroup that the
// runtime processes need to be in. For now, we still check the
// cgroup for docker periodically, so that kubelet can recognize
// the cgroup for docker and serve stats for the runtime.
// TODO(#27097): Fix this after NodeSpec is clearly defined.
cm.periodicTasks = append(cm.periodicTasks, func() {
glog.V(4).Infof("[ContainerManager]: Adding periodic tasks for docker CRI integration")
cont, err := getContainerNameForProcess(dockerProcessName, dockerPidFile)
if err != nil {
glog.Error(err)
return
}
glog.V(2).Infof("[ContainerManager]: Discovered runtime cgroups name: %s", cont)
// RuntimeCgroupsName lives in the embedded NodeConfig, which readers
// access under the manager's lock.
cm.Lock()
defer cm.Unlock()
cm.RuntimeCgroupsName = cont
})
}
if cm.SystemCgroupsName != "" {
if cm.SystemCgroupsName == "/" {
return fmt.Errorf("system container cannot be root (\"/\")")
}
cont := newSystemCgroups(cm.SystemCgroupsName)
// Periodically move processes from the root cgroup into the system container.
cont.ensureStateFunc = func(manager *fs.Manager) error {
return ensureSystemCgroups("/", manager)
}
systemContainers = append(systemContainers, cont)
}
if cm.KubeletCgroupsName != "" {
cont := newSystemCgroups(cm.KubeletCgroupsName)
allowAllDevices := true
manager := fs.Manager{
Cgroups: &configs.Cgroup{
Parent: "/",
Name: cm.KubeletCgroupsName,
Resources: &configs.Resources{
AllowAllDevices: &allowAllDevices,
},
},
}
// The closure ignores the manager it is handed and uses the one built
// just above to keep the kubelet process in its cgroup with the kubelet
// OOM score adjustment.
cont.ensureStateFunc = func(_ *fs.Manager) error {
return ensureProcessInContainerWithOOMScore(os.Getpid(), qos.KubeletOOMScoreAdj, &manager)
}
systemContainers = append(systemContainers, cont)
} else {
// No kubelet cgroup configured: only apply the OOM score adjustment (nil
// manager means no cgroup move) and record whichever cgroup the kubelet
// is found in.
cm.periodicTasks = append(cm.periodicTasks, func() {
if err := ensureProcessInContainerWithOOMScore(os.Getpid(), qos.KubeletOOMScoreAdj, nil); err != nil {
glog.Error(err)
return
}
cont, err := getContainer(os.Getpid())
if err != nil {
glog.Errorf("failed to find cgroups of kubelet - %v", err)
return
}
cm.Lock()
defer cm.Unlock()
cm.KubeletCgroupsName = cont
})
}
cm.systemContainers = systemContainers
return nil
}
// getContainerNameForProcess resolves the cgroup of the process identified by
// name and pidFile. It returns an empty name with a nil error when no matching
// pid is found, and uses only the first pid when several are returned.
func getContainerNameForProcess(name, pidFile string) (string, error) {
	pids, err := getPidsForProcess(name, pidFile)
	switch {
	case err != nil:
		return "", fmt.Errorf("failed to detect process id for %q - %v", name, err)
	case len(pids) == 0:
		return "", nil
	}
	// getContainer already yields "" alongside any error.
	return getContainer(pids[0])
}
// GetNodeConfig returns a copy of the embedded NodeConfig, read under the
// manager's read lock.
func (cm *containerManagerImpl) GetNodeConfig() NodeConfig {
cm.RLock()
defer cm.RUnlock()
return cm.NodeConfig
}
// GetMountedSubsystems returns the cgroup subsystems captured when the
// manager was constructed.
func (cm *containerManagerImpl) GetMountedSubsystems() *CgroupSubsystems {
return cm.subsystems
}
// GetQOSContainersInfo delegates to the QoS container manager.
func (cm *containerManagerImpl) GetQOSContainersInfo() QOSContainersInfo {
return cm.qosContainerManager.GetQOSContainersInfo()
}
// UpdateQOSCgroups delegates to the QoS container manager's UpdateCgroups.
func (cm *containerManagerImpl) UpdateQOSCgroups() error {
return cm.qosContainerManager.UpdateCgroups()
}
// Status returns a copy of the manager's status, read under the read lock.
func (cm *containerManagerImpl) Status() Status {
cm.RLock()
defer cm.RUnlock()
return cm.status
}
// Start brings the container manager up: it starts the CPU manager (when the
// CPUManager feature gate is enabled), caches node, validates the node
// allocatable configuration, runs setupNode, launches the background loops
// described below and finally starts the device plugin handler.
// It returns the first error from validation, setup or the device plugin
// handler. The background goroutines for ensure-state functions and periodic
// tasks use wait.NeverStop.
func (cm *containerManagerImpl) Start(node *v1.Node,
activePods ActivePodsFunc,
podStatusProvider status.PodStatusProvider,
runtimeService internalapi.RuntimeService) error {
// Initialize CPU manager
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUManager) {
cm.cpuManager.Start(cpumanager.ActivePodsFunc(activePods), podStatusProvider, runtimeService)
}
// cache the node Info including resource capacity and
// allocatable of the node
cm.nodeInfo = node
// Ensure that node allocatable configuration is valid.
if err := cm.validateNodeAllocatable(); err != nil {
return err
}
// Setup the node
if err := cm.setupNode(activePods); err != nil {
return err
}
// Don't run a background thread if there are no ensureStateFuncs.
hasEnsureStateFuncs := false
for _, cont := range cm.systemContainers {
if cont.ensureStateFunc != nil {
hasEnsureStateFuncs = true
break
}
}
if hasEnsureStateFuncs {
// Run ensure state functions every minute.
go wait.Until(func() {
for _, cont := range cm.systemContainers {
if cont.ensureStateFunc != nil {
if err := cont.ensureStateFunc(cont.manager); err != nil {
glog.Warningf("[ContainerManager] Failed to ensure state of %q: %v", cont.name, err)
}
}
}
}, time.Minute, wait.NeverStop)
}
// Run the periodic tasks registered by setupNode every five minutes.
if len(cm.periodicTasks) > 0 {
go wait.Until(func() {
for _, task := range cm.periodicTasks {
if task != nil {
task()
}
}
}, 5*time.Minute, wait.NeverStop)
}
// Local storage filesystem information from `RootFsInfo` and `ImagesFsInfo` is available at a later time
// depending on the time when cadvisor manager updates container stats. Therefore use a go routine to keep
// retrieving the information until it is available.
stopChan := make(chan struct{})
go wait.Until(func() {
if err := cm.setFsCapacity(); err != nil {
glog.Errorf("[ContainerManager]: %v", err)
return
}
// Success: closing the stop channel ends this retry loop.
close(stopChan)
}, time.Second, stopChan)
// Starts device plugin manager.
if err := cm.devicePluginHandler.Start(); err != nil {
return err
}
return nil
}
// setFsCapacity asks cAdvisor for root filesystem information and merges the
// ephemeral storage capacity derived from it into cm.capacity while holding
// the write lock. It fails if the root filesystem information is unavailable.
func (cm *containerManagerImpl) setFsCapacity() error {
	rootfs, err := cm.cadvisorInterface.RootFsInfo()
	if err != nil {
		return fmt.Errorf("Fail to get rootfs information %v", err)
	}

	cm.Lock()
	defer cm.Unlock()
	for name, quantity := range cadvisor.EphemeralStorageCapacityFromFsInfo(rootfs) {
		cm.capacity[name] = quantity
	}
	return nil
}
// GetResources asks the device plugin handler to allocate the devices needed
// by container and translates the allocation responses into
// RunContainerOptions (devices, mounts and environment variables).
//
// Duplicates are collapsed: the first mapping seen for a container device
// path, container mount path or environment variable name wins; later entries
// are skipped, and a conflicting value is logged as an error rather than
// returned.
//
// On allocation failure the (empty, non-nil) options are returned together
// with the error.
//
// TODO: move the GetResources logic to PodContainerManager.
func (cm *containerManagerImpl) GetResources(pod *v1.Pod, container *v1.Container, activePods []*v1.Pod) (*kubecontainer.RunContainerOptions, error) {
opts := &kubecontainer.RunContainerOptions{}
// Gets devices, mounts, and envs from device plugin handler.
glog.V(3).Infof("Calling devicePluginHandler AllocateDevices")
// Maps to detect duplicate settings.
// devsMap and mountsMap are keyed by container path with the host path as
// value; envsMap is keyed by variable name.
devsMap := make(map[string]string)
mountsMap := make(map[string]string)
envsMap := make(map[string]string)
allocResps, err := cm.devicePluginHandler.Allocate(pod, container, activePods)
if err != nil {
return opts, err
}
// Loops through AllocationResponses of all required extended resources.
for _, resp := range allocResps {
// Loops through runtime spec of all devices of the given resource.
for _, devRuntime := range resp.Spec {
// Updates RunContainerOptions.Devices.
for _, dev := range devRuntime.Devices {
if d, ok := devsMap[dev.ContainerPath]; ok {
glog.V(3).Infof("skip existing device %s %s", dev.ContainerPath, dev.HostPath)
if d != dev.HostPath {
glog.Errorf("Container device %s has conflicting mapping host devices: %s and %s",
dev.ContainerPath, d, dev.HostPath)
}
continue
}
devsMap[dev.ContainerPath] = dev.HostPath
opts.Devices = append(opts.Devices, kubecontainer.DeviceInfo{
PathOnHost: dev.HostPath,
PathInContainer: dev.ContainerPath,
Permissions: dev.Permissions,
})
}
// Updates RunContainerOptions.Mounts.
for _, mount := range devRuntime.Mounts {
if m, ok := mountsMap[mount.ContainerPath]; ok {
glog.V(3).Infof("skip existing mount %s %s", mount.ContainerPath, mount.HostPath)
if m != mount.HostPath {
glog.Errorf("Container mount %s has conflicting mapping host mounts: %s and %s",
mount.ContainerPath, m, mount.HostPath)
}
continue
}
mountsMap[mount.ContainerPath] = mount.HostPath
// The container path doubles as the mount's name.
opts.Mounts = append(opts.Mounts, kubecontainer.Mount{
Name: mount.ContainerPath,
ContainerPath: mount.ContainerPath,
HostPath: mount.HostPath,
ReadOnly: mount.ReadOnly,
SELinuxRelabel: false,
})
}
// Updates RunContainerOptions.Envs.
for k, v := range devRuntime.Envs {
if e, ok := envsMap[k]; ok {
glog.V(3).Infof("skip existing envs %s %s", k, v)
if e != v {
glog.Errorf("Environment variable %s has conflicting setting: %s and %s", k, e, v)
}
continue
}
envsMap[k] = v
opts.Envs = append(opts.Envs, kubecontainer.EnvVar{Name: k, Value: v})
}
}
}
return opts, nil
}
// SystemCgroupsLimit returns a ResourceList whose only entry is the CPU limit,
// computed as the sum of the millicore limits of all system containers.
func (cm *containerManagerImpl) SystemCgroupsLimit() v1.ResourceList {
	var totalMillis int64
	for i := range cm.systemContainers {
		totalMillis += cm.systemContainers[i].cpuMillicores
	}

	limits := v1.ResourceList{}
	limits[v1.ResourceCPU] = *resource.NewMilliQuantity(totalMillis, resource.DecimalSI)
	return limits
}
// isProcessRunningInHost reports whether pid shares its pid namespace with
// process 1, determined by comparing the targets of the /proc/<pid>/ns/pid and
// /proc/1/ns/pid links.
//
// It returns an error when either link cannot be read; the underlying cause is
// included in the message.
func isProcessRunningInHost(pid int) (bool, error) {
	// Get init pid namespace.
	initPidNs, err := os.Readlink("/proc/1/ns/pid")
	if err != nil {
		return false, fmt.Errorf("failed to find pid namespace of init process: %v", err)
	}
	glog.V(10).Infof("init pid ns is %q", initPidNs)
	processPidNs, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", pid))
	if err != nil {
		// %d, not %q: for an integer %q prints a quoted character literal
		// rather than the pid's number.
		return false, fmt.Errorf("failed to find pid namespace of process %d: %v", pid, err)
	}
	glog.V(10).Infof("Pid %d pid ns is %q", pid, processPidNs)
	return initPidNs == processPidNs, nil
}
// getPidFromPidFile reads pidFile and parses its entire contents as a decimal
// pid. The contents are not trimmed, so surrounding whitespace makes parsing
// fail. Open, read and parse failures are reported with distinct messages.
func getPidFromPidFile(pidFile string) (int, error) {
	f, openErr := os.Open(pidFile)
	if openErr != nil {
		return 0, fmt.Errorf("error opening pid file %s: %v", pidFile, openErr)
	}
	defer f.Close()

	raw, readErr := ioutil.ReadAll(f)
	if readErr != nil {
		return 0, fmt.Errorf("error reading pid file %s: %v", pidFile, readErr)
	}

	contents := string(raw)
	pid, parseErr := strconv.Atoi(contents)
	if parseErr != nil {
		return 0, fmt.Errorf("error parsing %s as a number: %v", contents, parseErr)
	}
	return pid, nil
}
// getPidsForProcess returns the pids of the process called name.
//
// With an empty pidFile the lookup goes straight to procfs.PidOf. Otherwise
// the pid file is consulted first and procfs.PidOf is the fallback. If both
// fail, the pid-file error is returned, since that lookup was expected to
// work, together with an empty slice.
func getPidsForProcess(name, pidFile string) ([]int, error) {
	if pidFile == "" {
		return procfs.PidOf(name)
	}

	pid, fileErr := getPidFromPidFile(pidFile)
	if fileErr == nil {
		return []int{pid}, nil
	}

	// Try to lookup pid by process name
	if pids, lookupErr := procfs.PidOf(name); lookupErr == nil {
		return pids, nil
	}

	glog.V(4).Infof("unable to get pid from %s: %v", pidFile, fileErr)
	return []int{}, fileErr
}
// EnsureDockerInContainer ensures that the Docker daemon, and docker-containerd
// when dockerAPIVersion is at least the version that introduced it, run in the
// container described by manager with the given OOM score adjustment.
// Failures for individual processes are collected and returned as one
// aggregate error.
//
// Temporarily exported so that dockershim can use it.
// TODO(yujuhong): Move this function to dockershim once kubelet migrates to
// dockershim as the default.
func EnsureDockerInContainer(dockerAPIVersion *utilversion.Version, oomScoreAdj int, manager *fs.Manager) error {
	type process struct{ name, file string }

	targets := make([]process, 0, 2)
	targets = append(targets, process{name: dockerProcessName, file: dockerPidFile})
	if dockerAPIVersion.AtLeast(containerdAPIVersion) {
		targets = append(targets, process{name: containerdProcessName, file: containerdPidFile})
	}

	var errs []error
	for _, target := range targets {
		pids, lookupErr := getPidsForProcess(target.name, target.file)
		if lookupErr != nil {
			errs = append(errs, fmt.Errorf("failed to get pids for %q: %v", target.name, lookupErr))
			continue
		}

		// Move if the pid is not already in the desired container.
		for _, pid := range pids {
			moveErr := ensureProcessInContainerWithOOMScore(pid, oomScoreAdj, manager)
			if moveErr != nil {
				errs = append(errs, fmt.Errorf("errors moving %q pid: %v", target.name, moveErr))
			}
		}
	}
	return utilerrors.NewAggregate(errs)
}
// ensureProcessInContainerWithOOMScore makes sure that pid runs in the cgroup
// described by manager (skipped when manager is nil) and applies oomScoreAdj
// to it.
//
// Processes that are not in the host pid namespace are left untouched and nil
// is returned. Errors from the cgroup move and from the OOM score adjustment
// are collected and returned together as one aggregate error.
func ensureProcessInContainerWithOOMScore(pid int, oomScoreAdj int, manager *fs.Manager) error {
if runningInHost, err := isProcessRunningInHost(pid); err != nil {
// Err on the side of caution. Avoid moving the docker daemon unless we are able to identify its context.
return err
} else if !runningInHost {
// Process is running inside a container. Don't touch that.
glog.V(2).Infof("pid %d is not running in the host namespaces", pid)
return nil
}
var errs []error
if manager != nil {
cont, err := getContainer(pid)
if err != nil {
errs = append(errs, fmt.Errorf("failed to find container of PID %d: %v", pid, err))
}
// Note: when getContainer failed, cont is empty, so this comparison
// generally still leads to an attempt to move the process.
if cont != manager.Cgroups.Name {
err = manager.Apply(pid)
if err != nil {
errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q: %v", pid, cont, manager.Cgroups.Name, err))
}
}
}
// Also apply oom-score-adj to processes
oomAdjuster := oom.NewOOMAdjuster()
glog.V(5).Infof("attempting to apply oom_score_adj of %d to pid %d", oomScoreAdj, pid)
if err := oomAdjuster.ApplyOOMScoreAdj(pid, oomScoreAdj); err != nil {
glog.V(3).Infof("Failed to apply oom_score_adj %d for pid %d: %v", oomScoreAdj, pid, err)
errs = append(errs, fmt.Errorf("failed to apply oom score %d to PID %d: %v", oomScoreAdj, pid, err))
}
return utilerrors.NewAggregate(errs)
}
// getContainer returns the cgroup associated with the specified pid, as read
// from /proc/<pid>/cgroup.
//
// The cpu and memory cgroups must both be present and identical (a unified
// hierarchy), because the result is used for accounting. When a name=systemd
// entry exists it takes precedence over the cpu cgroup.
func getContainer(pid int) (string, error) {
	cgs, err := cgroups.ParseCgroupFile(fmt.Sprintf("/proc/%d/cgroup", pid))
	if err != nil {
		return "", err
	}

	cpu, ok := cgs["cpu"]
	if !ok {
		return "", cgroups.NewNotFoundError("cpu")
	}
	memory, ok := cgs["memory"]
	if !ok {
		return "", cgroups.NewNotFoundError("memory")
	}

	// since we use this container for accounting, we need to ensure its a unified hierarchy.
	if cpu != memory {
		return "", fmt.Errorf("cpu and memory cgroup hierarchy not unified. cpu: %s, memory: %s", cpu, memory)
	}

	systemd, ok := cgs["name=systemd"]
	if !ok {
		return cpu, nil
	}

	// On systemd, every pid is in a unified cgroup hierarchy (name=systemd as
	// seen in systemd-cgls). CPU and memory accounting is off by default; users
	// may enable it globally via /etc/systemd/system.conf
	// (DefaultCPUAccounting=true DefaultMemoryAccounting=true) or per unit via
	// CPUAccounting=true and MemoryAccounting=true.
	//
	// A mismatch only produces a warning so that local development flows, where
	// kubelet is launched from a terminal, keep working. There the systemd
	// cgroup looks like /user.slice/user-X.slice/session-X.scope while the cpu
	// and memory cgroup is the closest ancestor where accounting is performed
	// (most likely /), so no cpu or memory statistics are available for the
	// kubelet, nor for the runtime unless accounting is enabled on its unit or
	// globally.
	if systemd != cpu {
		glog.Warningf("CPUAccounting not enabled for pid: %d", pid)
	}
	if systemd != memory {
		glog.Warningf("MemoryAccounting not enabled for pid: %d", pid)
	}
	return systemd, nil
}
// ensureSystemCgroups moves every PID found in rootCgroupPath into the system
// container managed by manager, except for PID 1 and kernel threads.
//
// Kernel threads are left in the root cgroup because we don't want to tie the
// execution of these threads to the to-be-defined /system quota and create
// priority inversions.
//
// The move is retried a bounded number of times, because new processes may
// appear in the root cgroup while earlier ones are being moved. The function
// succeeds (returns nil) as soon as one attempt finds nothing left to move.
// Otherwise it returns an aggregate of the errors from the final attempt plus
// a "ran out of attempts" error.
func ensureSystemCgroups(rootCgroupPath string, manager *fs.Manager) error {
	// Move non-kernel PIDs to the system container.
	attemptsRemaining := 10
	// allMoved records that an attempt found no movable PID. It is tracked
	// explicitly because the attempt counter alone cannot distinguish
	// "succeeded on the final attempt" from "exhausted all attempts".
	allMoved := false
	var errs []error
	for attemptsRemaining >= 0 {
		// Only keep errors on latest attempt.
		errs = []error{}
		attemptsRemaining--

		allPids, err := cmutil.GetPids(rootCgroupPath)
		if err != nil {
			errs = append(errs, fmt.Errorf("failed to list PIDs for root: %v", err))
			continue
		}

		// Drop the protected PIDs: PID 1 and kernel threads stay where they are.
		pids := make([]int, 0, len(allPids))
		for _, pid := range allPids {
			if pid == 1 || isKernelPid(pid) {
				continue
			}
			pids = append(pids, pid)
		}
		glog.Infof("Found %d PIDs in root, %d of them are not to be moved", len(allPids), len(allPids)-len(pids))

		// Check if we have moved all the non-kernel PIDs.
		if len(pids) == 0 {
			allMoved = true
			break
		}

		glog.Infof("Moving non-kernel processes: %v", pids)
		for _, pid := range pids {
			if err := manager.Apply(pid); err != nil {
				errs = append(errs, fmt.Errorf("failed to move PID %d into the system container %q: %v", pid, manager.Cgroups.Name, err))
			}
		}
	}
	if !allMoved {
		errs = append(errs, fmt.Errorf("ran out of attempts to create system containers %q", manager.Cgroups.Name))
	}
	return utilerrors.NewAggregate(errs)
}
// isKernelPid reports whether pid looks like a kernel thread.
//
// Kernel threads have no associated executable, so the check is simply whether
// the /proc/<pid>/exe symlink can be read. Any readlink failure (including a
// PID that does not exist) is therefore reported as a kernel PID.
func isKernelPid(pid int) bool {
	exeLink := fmt.Sprintf("/proc/%d/exe", pid)
	if _, err := os.Readlink(exeLink); err != nil {
		return true
	}
	return false
}
// getDockerAPIVersion returns the docker API version reported by cAdvisor.
//
// It never fails: if cAdvisor cannot be queried, or the reported API version
// cannot be parsed, the problem is logged and version "0.0" is returned
// instead.
func getDockerAPIVersion(cadvisor cadvisor.Interface) *utilversion.Version {
	versions, err := cadvisor.VersionInfo()
	if err != nil {
		glog.Errorf("Error requesting cAdvisor VersionInfo: %v", err)
		return utilversion.MustParseSemantic("0.0")
	}
	dockerAPIVersion, err := utilversion.ParseGeneric(versions.DockerAPIVersion)
	if err != nil {
		// Log the value that actually failed to parse (the API version, not
		// the docker engine version).
		glog.Errorf("Error parsing docker version %q: %v", versions.DockerAPIVersion, err)
		return utilversion.MustParseSemantic("0.0")
	}
	return dockerAPIVersion
}
// GetCapacity returns the capacity resource list held by the container
// manager. The field is read while holding the manager's read lock.
func (cm *containerManagerImpl) GetCapacity() v1.ResourceList {
	cm.RLock()
	capacity := cm.capacity
	cm.RUnlock()
	return capacity
}

View file

@ -0,0 +1,193 @@
// +build linux
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
"io/ioutil"
"os"
"path"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"k8s.io/kubernetes/pkg/util/mount"
)
// fakeMountInterface is a test double for mount.Interface that serves a fixed,
// in-memory list of mount points.
type fakeMountInterface struct {
	// mountPoints is the list handed back by List and searched by DeviceOpened.
	mountPoints []mount.MountPoint
}

// Mount is not implemented by the fake and always fails.
func (mi *fakeMountInterface) Mount(source, target, fstype string, options []string) error {
	return fmt.Errorf("unsupported")
}

// Unmount is not implemented by the fake and always fails.
func (mi *fakeMountInterface) Unmount(target string) error {
	return fmt.Errorf("unsupported")
}

// List returns the configured mount points and never fails.
func (mi *fakeMountInterface) List() ([]mount.MountPoint, error) {
	return mi.mountPoints, nil
}

// IsMountPointMatch reports whether the mount point's path equals dir.
func (mi *fakeMountInterface) IsMountPointMatch(mp mount.MountPoint, dir string) bool {
	return mp.Path == dir
}

// IsNotMountPoint is not implemented by the fake and always fails.
func (mi *fakeMountInterface) IsNotMountPoint(dir string) (bool, error) {
	return false, fmt.Errorf("unsupported")
}

// IsLikelyNotMountPoint is not implemented by the fake and always fails.
func (mi *fakeMountInterface) IsLikelyNotMountPoint(file string) (bool, error) {
	return false, fmt.Errorf("unsupported")
}

// GetDeviceNameFromMount always returns an empty device name and no error.
func (mi *fakeMountInterface) GetDeviceNameFromMount(mountPath, pluginDir string) (string, error) {
	return "", nil
}

// DeviceOpened reports whether any configured mount point uses pathname as its
// device.
func (mi *fakeMountInterface) DeviceOpened(pathname string) (bool, error) {
	for i := range mi.mountPoints {
		if mi.mountPoints[i].Device == pathname {
			return true, nil
		}
	}
	return false, nil
}

// PathIsDevice treats every path as a device.
func (mi *fakeMountInterface) PathIsDevice(pathname string) (bool, error) {
	return true, nil
}

// MakeRShared is a no-op that always succeeds.
func (mi *fakeMountInterface) MakeRShared(path string) error {
	return nil
}
// fakeContainerMgrMountInt returns a fake mount.Interface that lists one
// cgroup mount per subsystem: cpuset, cpu, cpuacct and memory (in that order).
func fakeContainerMgrMountInt() mount.Interface {
	subsystems := []string{"cpuset", "cpu", "cpuacct", "memory"}
	mountPoints := make([]mount.MountPoint, 0, len(subsystems))
	for _, subsystem := range subsystems {
		mountPoints = append(mountPoints, mount.MountPoint{
			Device: "cgroup",
			Type:   "cgroup",
			Opts:   []string{"rw", "relatime", subsystem},
		})
	}
	return &fakeMountInterface{mountPoints: mountPoints}
}
func TestCgroupMountValidationSuccess(t *testing.T) {
f, err := validateSystemRequirements(fakeContainerMgrMountInt())
assert.Nil(t, err)
assert.False(t, f.cpuHardcapping, "cpu hardcapping is expected to be disabled")
}
// TestCgroupMountValidationMemoryMissing checks that validation fails when no
// mount point carries the memory subsystem.
func TestCgroupMountValidationMemoryMissing(t *testing.T) {
	// cpuset, cpu and cpuacct are mounted; memory is deliberately absent.
	mountInt := &fakeMountInterface{
		mountPoints: []mount.MountPoint{
			{Device: "cgroup", Type: "cgroup", Opts: []string{"rw", "relatime", "cpuset"}},
			{Device: "cgroup", Type: "cgroup", Opts: []string{"rw", "relatime", "cpu"}},
			{Device: "cgroup", Type: "cgroup", Opts: []string{"rw", "relatime", "cpuacct"}},
		},
	}

	_, err := validateSystemRequirements(mountInt)
	assert.Error(t, err)
}
// TestCgroupMountValidationMultipleSubsystem checks that validation succeeds
// when one mount point carries several subsystems (cpuset and memory here).
func TestCgroupMountValidationMultipleSubsystem(t *testing.T) {
	mountInt := &fakeMountInterface{
		mountPoints: []mount.MountPoint{
			// memory shares a mount with cpuset.
			{Device: "cgroup", Type: "cgroup", Opts: []string{"rw", "relatime", "cpuset", "memory"}},
			{Device: "cgroup", Type: "cgroup", Opts: []string{"rw", "relatime", "cpu"}},
			{Device: "cgroup", Type: "cgroup", Opts: []string{"rw", "relatime", "cpuacct"}},
		},
	}

	_, err := validateSystemRequirements(mountInt)
	assert.Nil(t, err)
}
// TestSoftRequirementsValidationSuccess checks that cpu hardcapping is
// reported as enabled when the cpu cgroup mount path contains the
// cpu.cfs_period_us and cpu.cfs_quota_us files.
func TestSoftRequirementsValidationSuccess(t *testing.T) {
	req := require.New(t)

	// A temporary directory stands in for the cpu cgroup mount path.
	tempDir, err := ioutil.TempDir("", "")
	req.NoError(err)
	defer os.RemoveAll(tempDir)

	for _, name := range []string{"cpu.cfs_period_us", "cpu.cfs_quota_us"} {
		req.NoError(ioutil.WriteFile(path.Join(tempDir, name), []byte("0"), os.ModePerm))
	}

	mountInt := &fakeMountInterface{
		mountPoints: []mount.MountPoint{
			{Device: "cgroup", Type: "cgroup", Opts: []string{"rw", "relatime", "cpuset"}},
			{Device: "cgroup", Type: "cgroup", Opts: []string{"rw", "relatime", "cpu"}, Path: tempDir},
			{Device: "cgroup", Type: "cgroup", Opts: []string{"rw", "relatime", "cpuacct", "memory"}},
		},
	}

	got, err := validateSystemRequirements(mountInt)
	assert.NoError(t, err)
	assert.True(t, got.cpuHardcapping, "cpu hardcapping is expected to be enabled")
}

View file

@ -0,0 +1,84 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"github.com/golang/glog"
"k8s.io/api/core/v1"
internalapi "k8s.io/kubernetes/pkg/kubelet/apis/cri"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/status"
)
// containerManagerStub is a ContainerManager whose methods do nothing and
// return empty (or nil) values.
type containerManagerStub struct{}

// Compile-time check that the stub satisfies ContainerManager.
var _ ContainerManager = (*containerManagerStub)(nil)

// Start only logs that the stub is starting; it never fails.
func (*containerManagerStub) Start(_ *v1.Node, _ ActivePodsFunc, _ status.PodStatusProvider, _ internalapi.RuntimeService) error {
	glog.V(2).Infof("Starting stub container manager")
	return nil
}

// SystemCgroupsLimit returns an empty resource list.
func (*containerManagerStub) SystemCgroupsLimit() v1.ResourceList {
	return v1.ResourceList{}
}

// GetNodeConfig returns a zero-valued NodeConfig.
func (*containerManagerStub) GetNodeConfig() NodeConfig {
	return NodeConfig{}
}

// GetMountedSubsystems returns an empty CgroupSubsystems.
func (*containerManagerStub) GetMountedSubsystems() *CgroupSubsystems {
	return &CgroupSubsystems{}
}

// GetQOSContainersInfo returns a zero-valued QOSContainersInfo.
func (*containerManagerStub) GetQOSContainersInfo() QOSContainersInfo {
	return QOSContainersInfo{}
}

// UpdateQOSCgroups does nothing and always succeeds.
func (*containerManagerStub) UpdateQOSCgroups() error {
	return nil
}

// Status returns a zero-valued Status.
func (*containerManagerStub) Status() Status {
	return Status{}
}

// GetNodeAllocatableReservation returns nil.
func (*containerManagerStub) GetNodeAllocatableReservation() v1.ResourceList {
	return nil
}

// GetCapacity returns nil.
func (*containerManagerStub) GetCapacity() v1.ResourceList {
	return nil
}

// NewPodContainerManager returns a stub pod container manager.
func (*containerManagerStub) NewPodContainerManager() PodContainerManager {
	return &podContainerManagerStub{}
}

// GetResources returns empty run options regardless of its arguments.
func (*containerManagerStub) GetResources(pod *v1.Pod, container *v1.Container, activePods []*v1.Pod) (*kubecontainer.RunContainerOptions, error) {
	return &kubecontainer.RunContainerOptions{}, nil
}

// InternalContainerLifecycle returns a lifecycle implementation built around a
// fake CPU manager.
func (*containerManagerStub) InternalContainerLifecycle() InternalContainerLifecycle {
	fakeCPUManager := cpumanager.NewFakeManager()
	return &internalContainerLifecycleImpl{fakeCPUManager}
}

// NewStubContainerManager returns a ContainerManager backed by the no-op stub.
func NewStubContainerManager() ContainerManager {
	return &containerManagerStub{}
}

View file

@ -0,0 +1,89 @@
// +build !linux,!windows
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
"k8s.io/api/core/v1"
"k8s.io/client-go/tools/record"
internalapi "k8s.io/kubernetes/pkg/kubelet/apis/cri"
"k8s.io/kubernetes/pkg/kubelet/cadvisor"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/status"
"k8s.io/kubernetes/pkg/util/mount"
)
// unsupportedContainerManager is the ContainerManager used by builds that
// target neither linux nor windows. Start always fails; every other method
// returns an empty (or nil) value.
type unsupportedContainerManager struct{}

// Compile-time check that the type satisfies ContainerManager.
var _ ContainerManager = (*unsupportedContainerManager)(nil)

// Start always fails, because the container manager is unsupported here.
func (unsupportedContainerManager) Start(_ *v1.Node, _ ActivePodsFunc, _ status.PodStatusProvider, _ internalapi.RuntimeService) error {
	return fmt.Errorf("Container Manager is unsupported in this build")
}

// SystemCgroupsLimit returns an empty resource list.
func (unsupportedContainerManager) SystemCgroupsLimit() v1.ResourceList {
	return v1.ResourceList{}
}

// GetNodeConfig returns a zero-valued NodeConfig.
func (unsupportedContainerManager) GetNodeConfig() NodeConfig {
	return NodeConfig{}
}

// GetMountedSubsystems returns an empty CgroupSubsystems.
func (unsupportedContainerManager) GetMountedSubsystems() *CgroupSubsystems {
	return &CgroupSubsystems{}
}

// GetQOSContainersInfo returns a zero-valued QOSContainersInfo.
func (unsupportedContainerManager) GetQOSContainersInfo() QOSContainersInfo {
	return QOSContainersInfo{}
}

// UpdateQOSCgroups does nothing and always succeeds.
func (unsupportedContainerManager) UpdateQOSCgroups() error {
	return nil
}

// Status returns a zero-valued Status.
func (*unsupportedContainerManager) Status() Status {
	return Status{}
}

// GetNodeAllocatableReservation returns nil.
func (*unsupportedContainerManager) GetNodeAllocatableReservation() v1.ResourceList {
	return nil
}

// GetCapacity returns nil.
func (*unsupportedContainerManager) GetCapacity() v1.ResourceList {
	return nil
}

// NewPodContainerManager returns the unsupported pod container manager.
func (*unsupportedContainerManager) NewPodContainerManager() PodContainerManager {
	return &unsupportedPodContainerManager{}
}

// GetResources returns empty run options regardless of its arguments.
func (*unsupportedContainerManager) GetResources(pod *v1.Pod, container *v1.Container, activePods []*v1.Pod) (*kubecontainer.RunContainerOptions, error) {
	return &kubecontainer.RunContainerOptions{}, nil
}

// InternalContainerLifecycle returns a lifecycle implementation built around a
// fake CPU manager.
func (*unsupportedContainerManager) InternalContainerLifecycle() InternalContainerLifecycle {
	fakeCPUManager := cpumanager.NewFakeManager()
	return &internalContainerLifecycleImpl{fakeCPUManager}
}

// NewContainerManager ignores all of its arguments and returns the unsupported
// container manager; it never returns an error.
func NewContainerManager(_ mount.Interface, _ cadvisor.Interface, _ NodeConfig, failSwapOn bool, devicePluginEnabled bool, recorder record.EventRecorder) (ContainerManager, error) {
	manager := &unsupportedContainerManager{}
	return manager, nil
}

View file

@ -0,0 +1,101 @@
// +build !linux,!windows
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
"k8s.io/kubernetes/pkg/util/mount"
)
// fakeMountInterface is a test double for mount.Interface that serves a fixed,
// in-memory list of mount points.
type fakeMountInterface struct {
	// mountPoints is the list handed back by List and searched by DeviceOpened.
	mountPoints []mount.MountPoint
}

// Mount is not implemented by the fake and always fails.
func (mi *fakeMountInterface) Mount(source string, target string, fstype string, options []string) error {
	return fmt.Errorf("unsupported")
}

// Unmount is not implemented by the fake and always fails.
func (mi *fakeMountInterface) Unmount(target string) error {
	return fmt.Errorf("unsupported")
}

// List returns the configured mount points and never fails.
func (mi *fakeMountInterface) List() ([]mount.MountPoint, error) {
	return mi.mountPoints, nil
}

// IsMountPointMatch reports whether the mount point's path equals dir.
func (mi *fakeMountInterface) IsMountPointMatch(mp mount.MountPoint, dir string) bool {
	return mp.Path == dir
}

// IsNotMountPoint is not implemented by the fake and always fails.
func (mi *fakeMountInterface) IsNotMountPoint(dir string) (bool, error) {
	return false, fmt.Errorf("unsupported")
}

// IsLikelyNotMountPoint is not implemented by the fake and always fails.
func (mi *fakeMountInterface) IsLikelyNotMountPoint(file string) (bool, error) {
	return false, fmt.Errorf("unsupported")
}

// DeviceOpened reports whether any configured mount point uses pathname as its
// device.
func (mi *fakeMountInterface) DeviceOpened(pathname string) (bool, error) {
	for _, mp := range mi.mountPoints {
		if mp.Device == pathname {
			return true, nil
		}
	}
	return false, nil
}

// PathIsDevice treats every path as a device.
func (mi *fakeMountInterface) PathIsDevice(pathname string) (bool, error) {
	return true, nil
}

// GetDeviceNameFromMount always returns an empty device name and no error.
func (mi *fakeMountInterface) GetDeviceNameFromMount(mountPath, pluginDir string) (string, error) {
	return "", nil
}

// MakeRShared is a no-op that always succeeds.
func (mi *fakeMountInterface) MakeRShared(path string) error {
	return nil
}
// fakeContainerMgrMountInt returns a fake mount.Interface that lists one
// cgroup mount per subsystem: cpuset, cpu, cpuacct and memory (in that order).
func fakeContainerMgrMountInt() mount.Interface {
	subsystems := []string{"cpuset", "cpu", "cpuacct", "memory"}
	mountPoints := make([]mount.MountPoint, 0, len(subsystems))
	for _, subsystem := range subsystems {
		mountPoints = append(mountPoints, mount.MountPoint{
			Device: "cgroup",
			Type:   "cgroup",
			Opts:   []string{"rw", "relatime", subsystem},
		})
	}
	return &fakeMountInterface{mountPoints: mountPoints}
}

View file

@ -0,0 +1,45 @@
// +build windows
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"github.com/golang/glog"
"k8s.io/api/core/v1"
"k8s.io/client-go/tools/record"
internalapi "k8s.io/kubernetes/pkg/kubelet/apis/cri"
"k8s.io/kubernetes/pkg/kubelet/cadvisor"
"k8s.io/kubernetes/pkg/kubelet/status"
"k8s.io/kubernetes/pkg/util/mount"
)
// containerManagerImpl is the Windows container manager. It embeds
// containerManagerStub and overrides only Start.
type containerManagerImpl struct {
	containerManagerStub
}

// Compile-time check that the type satisfies ContainerManager.
var _ ContainerManager = (*containerManagerImpl)(nil)

// Start only logs that the Windows stub is starting; it never fails.
func (*containerManagerImpl) Start(_ *v1.Node, _ ActivePodsFunc, _ status.PodStatusProvider, _ internalapi.RuntimeService) error {
	glog.V(2).Infof("Starting Windows stub container manager")
	return nil
}

// NewContainerManager ignores all of its arguments and returns the Windows
// stub container manager; it never returns an error.
func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.Interface, nodeConfig NodeConfig, failSwapOn bool, devicePluginEnabled bool, recorder record.EventRecorder) (ContainerManager, error) {
	manager := &containerManagerImpl{}
	return manager, nil
}

View file

@ -0,0 +1,68 @@
# Bazel build rules for the cpumanager package.
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
# Library target: the non-test Go sources of this package.
go_library(
name = "go_default_library",
srcs = [
"cpu_assignment.go",
"cpu_manager.go",
"fake_cpu_manager.go",
"policy.go",
"policy_none.go",
"policy_static.go",
],
visibility = ["//visibility:public"],
deps = [
"//pkg/api/v1/helper/qos:go_default_library",
"//pkg/kubelet/apis/cri/v1alpha1/runtime:go_default_library",
"//pkg/kubelet/cm/cpumanager/state:go_default_library",
"//pkg/kubelet/cm/cpumanager/topology:go_default_library",
"//pkg/kubelet/cm/cpuset:go_default_library",
"//pkg/kubelet/container:go_default_library",
"//pkg/kubelet/status:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/github.com/google/cadvisor/info/v1:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library",
],
)
# Test target: the package's *_test.go files, built against the library above.
go_test(
name = "go_default_test",
srcs = [
"cpu_assignment_test.go",
"cpu_manager_test.go",
"policy_none_test.go",
"policy_static_test.go",
"policy_test.go",
],
library = ":go_default_library",
deps = [
"//pkg/kubelet/apis/cri/v1alpha1/runtime:go_default_library",
"//pkg/kubelet/cm/cpumanager/state:go_default_library",
"//pkg/kubelet/cm/cpumanager/topology:go_default_library",
"//pkg/kubelet/cm/cpuset:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/runtime:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/types:go_default_library",
],
)
# Every file under this package directory (private; tagged "automanaged").
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
# This package's sources plus those of the state and topology sub-packages.
filegroup(
name = "all-srcs",
srcs = [
":package-srcs",
"//pkg/kubelet/cm/cpumanager/state:all-srcs",
"//pkg/kubelet/cm/cpumanager/topology:all-srcs",
],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)

View file

@ -0,0 +1,6 @@
approvers:
- derekwaynecarr
- vishh
- ConnorDoyle
- sjenning
- balajismaniam

View file

@ -0,0 +1,197 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"fmt"
"sort"
"github.com/golang/glog"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
)
// cpuAccumulator carries the state of one CPU allocation request while CPUs
// are being picked for it.
type cpuAccumulator struct {
	// topo is the full CPU topology of the machine.
	topo *topology.CPUTopology
	// details is the topology restricted to the CPUs that are still available.
	details topology.CPUDetails
	// numCPUsNeeded is how many more CPUs are required; take decrements it.
	numCPUsNeeded int
	// result is the set of CPUs taken so far.
	result cpuset.CPUSet
}

// newCPUAccumulator returns an accumulator that needs numCPUs CPUs, may only
// pick from availableCPUs, and has taken nothing yet.
func newCPUAccumulator(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int) *cpuAccumulator {
	acc := new(cpuAccumulator)
	acc.topo = topo
	acc.details = topo.CPUDetails.KeepOnly(availableCPUs)
	acc.numCPUsNeeded = numCPUs
	acc.result = cpuset.NewCPUSet()
	return acc
}
// take adds cpus to the result, removes every CPU of the result from the
// available details, and lowers the outstanding need by the size of cpus.
func (a *cpuAccumulator) take(cpus cpuset.CPUSet) {
	a.result = a.result.Union(cpus)
	stillAvailable := a.details.CPUs().Difference(a.result)
	a.details = a.details.KeepOnly(stillAvailable)
	a.numCPUsNeeded -= cpus.Size()
}
// isSocketFree reports whether the given socket is fully available, i.e. the
// number of its CPUs left in a.details equals the topology's CPUs per socket.
func (a *cpuAccumulator) isSocketFree(socketID int) bool {
	available := a.details.CPUsInSocket(socketID).Size()
	return available == a.topo.CPUsPerSocket()
}

// isCoreFree reports whether the given core is fully available, i.e. the
// number of its CPUs left in a.details equals the topology's CPUs per core.
func (a *cpuAccumulator) isCoreFree(coreID int) bool {
	available := a.details.CPUsInCore(coreID).Size()
	return available == a.topo.CPUsPerCore()
}
// freeSockets returns the IDs of the sockets that are fully available.
//
// The intended order is socket ID ascending; that relies on ToSlice returning
// sorted IDs, which is not visible from this function.
func (a *cpuAccumulator) freeSockets() []int {
	free := a.details.Sockets().Filter(a.isSocketFree)
	return free.ToSlice()
}
// freeCores returns the IDs of the fully available cores, ordered by:
// - the number of whole available cores on the socket, ascending
// - socket ID, ascending
// - core ID, ascending (as produced by ToSlice for each socket)
//
// The result is always a non-nil slice, even when no core is free.
func (a *cpuAccumulator) freeCores() []int {
	socketIDs := a.details.Sockets().ToSlice()
	sort.Slice(socketIDs,
		func(i, j int) bool {
			iFree := a.details.CoresInSocket(socketIDs[i]).Filter(a.isCoreFree).Size()
			jFree := a.details.CoresInSocket(socketIDs[j]).Filter(a.isCoreFree).Size()
			// Compare lexicographically: the socket ID only breaks ties.
			// OR-ing the two criteria together would let less(i, j) and
			// less(j, i) both be true, which is not a valid ordering for
			// sort.Slice.
			if iFree != jFree {
				return iFree < jFree
			}
			return socketIDs[i] < socketIDs[j]
		})

	coreIDs := []int{}
	for _, s := range socketIDs {
		coreIDs = append(coreIDs, a.details.CoresInSocket(s).Filter(a.isCoreFree).ToSlice()...)
	}
	return coreIDs
}
// freeCPUs returns the IDs of the available CPUs, grouped by core. Cores are
// ordered by the following criteria, each one breaking ties of the previous:
// - socket affinity with the result (more CPUs already taken on the core's
//   socket first)
// - number of CPUs available on the same socket, ascending
// - number of CPUs available on the same core, ascending
// - socket ID, ascending
// - core ID, ascending
//
// The result is always a non-nil slice, even when no CPU is free.
func (a *cpuAccumulator) freeCPUs() []int {
	result := []int{}
	cores := a.details.Cores().ToSlice()

	sort.Slice(
		cores,
		func(i, j int) bool {
			iCore := cores[i]
			jCore := cores[j]

			// Look the socket up through the full topology: every core listed
			// in a.details also exists there and has at least one CPU.
			iCPUs := a.topo.CPUDetails.CPUsInCore(iCore).ToSlice()
			jCPUs := a.topo.CPUDetails.CPUsInCore(jCore).ToSlice()
			iSocket := a.topo.CPUDetails[iCPUs[0]].SocketID
			jSocket := a.topo.CPUDetails[jCPUs[0]].SocketID

			// The criteria are compared lexicographically. OR-ing them
			// together would let less(i, j) and less(j, i) both be true,
			// which is not a valid ordering for sort.Slice.

			// Number of CPUs in the result that reside on the same socket as
			// each core; a higher score sorts first.
			iSocketColoScore := a.topo.CPUDetails.CPUsInSocket(iSocket).Intersection(a.result).Size()
			jSocketColoScore := a.topo.CPUDetails.CPUsInSocket(jSocket).Intersection(a.result).Size()
			if iSocketColoScore != jSocketColoScore {
				return iSocketColoScore > jSocketColoScore
			}

			// Number of CPUs still available on the same socket as each core.
			iSocketFreeScore := a.details.CPUsInSocket(iSocket).Size()
			jSocketFreeScore := a.details.CPUsInSocket(jSocket).Size()
			if iSocketFreeScore != jSocketFreeScore {
				return iSocketFreeScore < jSocketFreeScore
			}

			// Number of CPUs still available on each core.
			iCoreFreeScore := a.details.CPUsInCore(iCore).Size()
			jCoreFreeScore := a.details.CPUsInCore(jCore).Size()
			if iCoreFreeScore != jCoreFreeScore {
				return iCoreFreeScore < jCoreFreeScore
			}

			if iSocket != jSocket {
				return iSocket < jSocket
			}
			return iCore < jCore
		})

	// For each core, append its available CPU IDs to the result.
	for _, core := range cores {
		result = append(result, a.details.CPUsInCore(core).ToSlice()...)
	}
	return result
}
// needs reports whether at least n more CPUs are still required.
func (a *cpuAccumulator) needs(n int) bool {
	return n <= a.numCPUsNeeded
}

// isSatisfied reports whether no further CPUs are required.
func (a *cpuAccumulator) isSatisfied() bool {
	return a.numCPUsNeeded <= 0
}

// isFailed reports whether more CPUs are required than remain available.
func (a *cpuAccumulator) isFailed() bool {
	available := a.details.CPUs().Size()
	return a.numCPUsNeeded > available
}
// takeByTopology picks numCPUs CPUs out of availableCPUs using a
// topology-aware best-fit strategy and returns them as a set.
//
// It returns an empty set and an error when the request cannot be satisfied.
// A request for zero (or fewer) CPUs succeeds with an empty set.
func takeByTopology(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int) (cpuset.CPUSet, error) {
	acc := newCPUAccumulator(topo, availableCPUs, numCPUs)
	switch {
	case acc.isSatisfied():
		return acc.result, nil
	case acc.isFailed():
		return cpuset.NewCPUSet(), fmt.Errorf("not enough cpus available to satisfy request")
	}

	// Step 1: claim whole sockets while the request still needs at least a
	// socket's worth of CPUs.
	for _, socketID := range acc.freeSockets() {
		if !acc.needs(acc.topo.CPUsPerSocket()) {
			continue
		}
		glog.V(4).Infof("[cpumanager] takeByTopology: claiming socket [%d]", socketID)
		acc.take(acc.details.CPUsInSocket(socketID))
		if acc.isSatisfied() {
			return acc.result, nil
		}
	}

	// Step 2: claim whole cores while the request still needs at least a
	// core's worth of CPUs.
	for _, coreID := range acc.freeCores() {
		if !acc.needs(acc.topo.CPUsPerCore()) {
			continue
		}
		glog.V(4).Infof("[cpumanager] takeByTopology: claiming core [%d]", coreID)
		acc.take(acc.details.CPUsInCore(coreID))
		if acc.isSatisfied() {
			return acc.result, nil
		}
	}

	// Step 3: claim single threads, preferring to fill partially-allocated
	// cores on the same sockets as the whole cores already taken in this
	// allocation.
	for _, cpuID := range acc.freeCPUs() {
		glog.V(4).Infof("[cpumanager] takeByTopology: claiming CPU [%d]", cpuID)
		if acc.needs(1) {
			acc.take(cpuset.NewCPUSet(cpuID))
		}
		if acc.isSatisfied() {
			return acc.result, nil
		}
	}

	return cpuset.NewCPUSet(), fmt.Errorf("failed to allocate cpus")
}

View file

@ -0,0 +1,385 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"reflect"
"testing"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
)
// TestCPUAccumulatorFreeSockets checks which socket IDs freeSockets reports
// for various sets of available CPUs.
func TestCPUAccumulatorFreeSockets(t *testing.T) {
	testCases := []struct {
		description   string
		topo          *topology.CPUTopology
		availableCPUs cpuset.CPUSet
		expect        []int
	}{
		{
			description:   "single socket HT, 1 socket free",
			topo:          topoSingleSocketHT,
			availableCPUs: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
			expect:        []int{0},
		},
		{
			description:   "single socket HT, 0 sockets free",
			topo:          topoSingleSocketHT,
			availableCPUs: cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7),
			expect:        []int{},
		},
		{
			description:   "dual socket HT, 2 sockets free",
			topo:          topoDualSocketHT,
			availableCPUs: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
			expect:        []int{0, 1},
		},
		{
			description:   "dual socket HT, 1 socket free",
			topo:          topoDualSocketHT,
			availableCPUs: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11),
			expect:        []int{1},
		},
		{
			description:   "dual socket HT, 0 sockets free",
			topo:          topoDualSocketHT,
			availableCPUs: cpuset.NewCPUSet(0, 2, 3, 4, 5, 6, 7, 8, 9, 11),
			expect:        []int{},
		},
	}

	for _, tc := range testCases {
		// The number of CPUs requested is irrelevant for freeSockets.
		got := newCPUAccumulator(tc.topo, tc.availableCPUs, 0).freeSockets()
		if reflect.DeepEqual(got, tc.expect) {
			continue
		}
		t.Errorf("[%s] expected %v to equal %v", tc.description, got, tc.expect)
	}
}
// TestCPUAccumulatorFreeCores checks which core IDs freeCores reports, and in
// which order, for various sets of available CPUs.
func TestCPUAccumulatorFreeCores(t *testing.T) {
	testCases := []struct {
		description   string
		topo          *topology.CPUTopology
		availableCPUs cpuset.CPUSet
		expect        []int
	}{
		{
			description:   "single socket HT, 4 cores free",
			topo:          topoSingleSocketHT,
			availableCPUs: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
			expect:        []int{0, 1, 2, 3},
		},
		{
			description:   "single socket HT, 3 cores free",
			topo:          topoSingleSocketHT,
			availableCPUs: cpuset.NewCPUSet(0, 1, 2, 4, 5, 6),
			expect:        []int{0, 1, 2},
		},
		{
			description:   "single socket HT, 3 cores free (1 partially consumed)",
			topo:          topoSingleSocketHT,
			availableCPUs: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6),
			expect:        []int{0, 1, 2},
		},
		{
			description:   "single socket HT, 0 cores free",
			topo:          topoSingleSocketHT,
			availableCPUs: cpuset.NewCPUSet(),
			expect:        []int{},
		},
		{
			description:   "single socket HT, 0 cores free (4 partially consumed)",
			topo:          topoSingleSocketHT,
			availableCPUs: cpuset.NewCPUSet(0, 1, 2, 3),
			expect:        []int{},
		},
		{
			description:   "dual socket HT, 6 cores free",
			topo:          topoDualSocketHT,
			availableCPUs: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
			expect:        []int{0, 2, 4, 1, 3, 5},
		},
		{
			description:   "dual socket HT, 5 cores free (1 consumed from socket 0)",
			topo:          topoDualSocketHT,
			availableCPUs: cpuset.NewCPUSet(2, 1, 3, 4, 5, 7, 8, 9, 10, 11),
			expect:        []int{2, 4, 1, 3, 5},
		},
		{
			description:   "dual socket HT, 4 cores free (1 consumed from each socket)",
			topo:          topoDualSocketHT,
			availableCPUs: cpuset.NewCPUSet(2, 3, 4, 5, 8, 9, 10, 11),
			expect:        []int{2, 4, 3, 5},
		},
	}

	for _, tc := range testCases {
		// The number of CPUs requested is irrelevant for freeCores.
		got := newCPUAccumulator(tc.topo, tc.availableCPUs, 0).freeCores()
		if reflect.DeepEqual(got, tc.expect) {
			continue
		}
		t.Errorf("[%s] expected %v to equal %v", tc.description, got, tc.expect)
	}
}
// TestCPUAccumulatorFreeCPUs checks which CPU IDs freeCPUs reports, and in
// which order, for various sets of available CPUs.
func TestCPUAccumulatorFreeCPUs(t *testing.T) {
	testCases := []struct {
		description   string
		topo          *topology.CPUTopology
		availableCPUs cpuset.CPUSet
		expect        []int
	}{
		{
			description:   "single socket HT, 8 cpus free",
			topo:          topoSingleSocketHT,
			availableCPUs: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
			expect:        []int{0, 4, 1, 5, 2, 6, 3, 7},
		},
		{
			description:   "single socket HT, 5 cpus free",
			topo:          topoSingleSocketHT,
			availableCPUs: cpuset.NewCPUSet(3, 4, 5, 6, 7),
			expect:        []int{4, 5, 6, 3, 7},
		},
		{
			description:   "dual socket HT, 12 cpus free",
			topo:          topoDualSocketHT,
			availableCPUs: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
			expect:        []int{0, 6, 2, 8, 4, 10, 1, 7, 3, 9, 5, 11},
		},
		{
			description:   "dual socket HT, 11 cpus free",
			topo:          topoDualSocketHT,
			availableCPUs: cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
			expect:        []int{6, 2, 8, 4, 10, 1, 7, 3, 9, 5, 11},
		},
		{
			description:   "dual socket HT, 10 cpus free",
			topo:          topoDualSocketHT,
			availableCPUs: cpuset.NewCPUSet(1, 2, 3, 4, 5, 7, 8, 9, 10, 11),
			expect:        []int{2, 8, 4, 10, 1, 7, 3, 9, 5, 11},
		},
	}

	for _, tc := range testCases {
		// The number of CPUs requested is irrelevant for freeCPUs.
		got := newCPUAccumulator(tc.topo, tc.availableCPUs, 0).freeCPUs()
		if reflect.DeepEqual(got, tc.expect) {
			continue
		}
		t.Errorf("[%s] expected %v to equal %v", tc.description, got, tc.expect)
	}
}
// TestCPUAccumulatorTake applies a sequence of take calls to a fresh
// accumulator and then checks isSatisfied, isFailed, the bookkeeping of the
// available and taken CPU sets, and the remaining CPU count.
func TestCPUAccumulatorTake(t *testing.T) {
	testCases := []struct {
		description     string
		topo            *topology.CPUTopology
		availableCPUs   cpuset.CPUSet
		takeCPUs        []cpuset.CPUSet
		numCPUs         int
		expectSatisfied bool
		expectFailed    bool
	}{
		{
			description:     "take 0 cpus from a single socket HT, require 1",
			topo:            topoSingleSocketHT,
			availableCPUs:   cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
			takeCPUs:        []cpuset.CPUSet{cpuset.NewCPUSet()},
			numCPUs:         1,
			expectSatisfied: false,
			expectFailed:    false,
		},
		{
			description:     "take 0 cpus from a single socket HT, require 1, none available",
			topo:            topoSingleSocketHT,
			availableCPUs:   cpuset.NewCPUSet(),
			takeCPUs:        []cpuset.CPUSet{cpuset.NewCPUSet()},
			numCPUs:         1,
			expectSatisfied: false,
			expectFailed:    true,
		},
		{
			description:     "take 1 cpu from a single socket HT, require 1",
			topo:            topoSingleSocketHT,
			availableCPUs:   cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
			takeCPUs:        []cpuset.CPUSet{cpuset.NewCPUSet(0)},
			numCPUs:         1,
			expectSatisfied: true,
			expectFailed:    false,
		},
		{
			description:     "take 1 cpu from a single socket HT, require 2",
			topo:            topoSingleSocketHT,
			availableCPUs:   cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
			takeCPUs:        []cpuset.CPUSet{cpuset.NewCPUSet(0)},
			numCPUs:         2,
			expectSatisfied: false,
			expectFailed:    false,
		},
		{
			description:     "take 2 cpu from a single socket HT, require 4, expect failed",
			topo:            topoSingleSocketHT,
			availableCPUs:   cpuset.NewCPUSet(0, 1, 2),
			takeCPUs:        []cpuset.CPUSet{cpuset.NewCPUSet(0), cpuset.NewCPUSet(1)},
			numCPUs:         4,
			expectSatisfied: false,
			expectFailed:    true,
		},
		{
			description:   "take all cpus one at a time from a single socket HT, require 8",
			topo:          topoSingleSocketHT,
			availableCPUs: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
			takeCPUs: []cpuset.CPUSet{
				cpuset.NewCPUSet(0),
				cpuset.NewCPUSet(1),
				cpuset.NewCPUSet(2),
				cpuset.NewCPUSet(3),
				cpuset.NewCPUSet(4),
				cpuset.NewCPUSet(5),
				cpuset.NewCPUSet(6),
				cpuset.NewCPUSet(7),
			},
			numCPUs:         8,
			expectSatisfied: true,
			expectFailed:    false,
		},
	}

	for _, tc := range testCases {
		acc := newCPUAccumulator(tc.topo, tc.availableCPUs, tc.numCPUs)

		// Apply every take in order and remember how many CPUs that was.
		totalTaken := 0
		for _, taken := range tc.takeCPUs {
			acc.take(taken)
			totalTaken += taken.Size()
		}

		if acc.isSatisfied() != tc.expectSatisfied {
			t.Errorf("[%s] expected acc.isSatisfied() to be %t", tc.description, tc.expectSatisfied)
		}
		if acc.isFailed() != tc.expectFailed {
			t.Errorf("[%s] expected acc.isFailed() to be %t", tc.description, tc.expectFailed)
		}

		// Every taken CPU must have left the available set and joined the result.
		remaining := acc.details.CPUs()
		for _, taken := range tc.takeCPUs {
			if taken.Intersection(remaining).Size() > 0 {
				t.Errorf("[%s] expected intersection of taken cpus [%s] and acc.details.CPUs() [%s] to be empty", tc.description, taken, remaining)
			}
			if !taken.IsSubsetOf(acc.result) {
				t.Errorf("[%s] expected [%s] to be a subset of acc.result [%s]", tc.description, taken, acc.result)
			}
		}

		if want := tc.numCPUs - totalTaken; acc.numCPUsNeeded != want {
			t.Errorf("[%s] expected acc.numCPUsNeeded to be %d (got %d)", tc.description, want, acc.numCPUsNeeded)
		}
	}
}
// TestTakeByTopology runs takeByTopology against fixed topologies and checks
// both the returned CPU set and the returned error.
//
// An empty expErr means the call must succeed; otherwise the error message
// must match expErr exactly.
func TestTakeByTopology(t *testing.T) {
	testCases := []struct {
		description   string
		topo          *topology.CPUTopology
		availableCPUs cpuset.CPUSet
		numCPUs       int
		expErr        string
		expResult     cpuset.CPUSet
	}{
		{
			"take more cpus than are available from single socket with HT",
			topoSingleSocketHT,
			cpuset.NewCPUSet(0, 2, 4, 6),
			5,
			"not enough cpus available to satisfy request",
			cpuset.NewCPUSet(),
		},
		{
			"take zero cpus from single socket with HT",
			topoSingleSocketHT,
			cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
			0,
			"",
			cpuset.NewCPUSet(),
		},
		{
			"take one cpu from single socket with HT",
			topoSingleSocketHT,
			cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
			1,
			"",
			cpuset.NewCPUSet(0),
		},
		{
			"take one cpu from single socket with HT, some cpus are taken",
			topoSingleSocketHT,
			cpuset.NewCPUSet(1, 3, 5, 6, 7),
			1,
			"",
			cpuset.NewCPUSet(6),
		},
		{
			"take two cpus from single socket with HT",
			topoSingleSocketHT,
			cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
			2,
			"",
			cpuset.NewCPUSet(0, 4),
		},
		{
			"take all cpus from single socket with HT",
			topoSingleSocketHT,
			cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
			8,
			"",
			cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
		},
		{
			"take two cpus from single socket with HT, only one core totally free",
			topoSingleSocketHT,
			cpuset.NewCPUSet(0, 1, 2, 3, 6),
			2,
			"",
			cpuset.NewCPUSet(2, 6),
		},
		{
			// NOTE(review): the description says "three cpus" but the case
			// requests a single CPU — confirm which one was intended.
			"take three cpus from dual socket with HT - core from Socket 0",
			topoDualSocketHT,
			cpuset.NewCPUSet(1, 2, 3, 4, 5, 7, 8, 9, 10, 11),
			1,
			"",
			cpuset.NewCPUSet(2),
		},
		{
			"take a socket of cpus from dual socket with HT",
			topoDualSocketHT,
			cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
			6,
			"",
			cpuset.NewCPUSet(0, 2, 4, 6, 8, 10),
		},
	}

	for _, tc := range testCases {
		result, err := takeByTopology(tc.topo, tc.availableCPUs, tc.numCPUs)

		switch {
		case tc.expErr == "" && err != nil:
			// A failure must not go unnoticed when success was expected.
			t.Errorf("expected no error but got [%v] in test \"%s\"", err, tc.description)
		case tc.expErr != "" && err == nil:
			// Guard against calling Error() on a nil error.
			t.Errorf("expected error to be [%v] but it was [%v] in test \"%s\"", tc.expErr, err, tc.description)
		case tc.expErr != "" && err.Error() != tc.expErr:
			t.Errorf("expected error to be [%v] but it was [%v] in test \"%s\"", tc.expErr, err, tc.description)
		}

		if !result.Equals(tc.expResult) {
			t.Errorf("expected result [%s] to equal [%s] in test \"%s\"", result, tc.expResult, tc.description)
		}
	}
}

View file

@ -0,0 +1,276 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"fmt"
"math"
"sync"
"time"
"github.com/golang/glog"
cadvisorapi "github.com/google/cadvisor/info/v1"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/wait"
runtimeapi "k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1/runtime"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/status"
)
// ActivePodsFunc is a function that returns a list of pods to reconcile.
type ActivePodsFunc func() []*v1.Pod
// runtimeService is the part of the container runtime API that the CPU
// manager uses: pushing updated resource settings to a container by ID.
type runtimeService interface {
UpdateContainerResources(id string, resources *runtimeapi.LinuxContainerResources) error
}
// policyName is the string type used for the names of CPU manager policies
// (see PolicyNone and PolicyStatic).
type policyName string
// Manager interface provides methods for Kubelet to manage pod cpus.
type Manager interface {
// Start is called during Kubelet initialization. It receives the
// collaborators the manager needs: a source of active pods, a pod status
// provider and the container runtime service.
Start(activePods ActivePodsFunc, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService)
// AddContainer is called between container create and container start
// so that initial CPU affinity settings can be written through to the
// container runtime before the first process begins to execute.
AddContainer(p *v1.Pod, c *v1.Container, containerID string) error
// RemoveContainer is called after Kubelet decides to kill or delete a
// container. After this call, the CPU manager stops trying to reconcile
// that container and any CPUs dedicated to the container are freed.
RemoveContainer(containerID string) error
// State returns a read-only interface to the internal CPU manager state.
State() state.Reader
}
// manager is the Manager implementation returned by NewManager. It pairs a
// Policy with a shared state and reapplies the recorded CPU sets to
// containers through the container runtime.
type manager struct {
// The embedded mutex is held by AddContainer and RemoveContainer around
// their calls into the policy.
sync.Mutex
// policy decides how containers are assigned CPUs.
policy Policy
// reconcilePeriod is the duration between calls to reconcileState.
reconcilePeriod time.Duration
// state allows pluggable CPU assignment policies while sharing a common
// representation of state for the system to inspect and reconcile.
state state.State
// containerRuntime is the container runtime service interface needed
// to make UpdateContainerResources() calls against the containers.
containerRuntime runtimeService
// activePods is a method for listing active pods on the node
// so all the containers can be updated in the reconciliation loop.
activePods ActivePodsFunc
// podStatusProvider provides a method for obtaining pod statuses
// and the containerID of their containers
podStatusProvider status.PodStatusProvider
// machineInfo is the cAdvisor machine description passed to NewManager.
machineInfo *cadvisorapi.MachineInfo
// nodeAllocatableReservation is the resource reservation passed to
// NewManager; its CPU entry sizes the static policy's reserved pool.
nodeAllocatableReservation v1.ResourceList
}
// Ensure manager implements the Manager interface.
var _ Manager = &manager{}
// NewManager creates a new CPU manager based on the provided policy name.
//
// cpuPolicyName selects the policy: PolicyNone, PolicyStatic, or, for any
// unrecognised name, a logged fallback to the none policy. The static policy
// needs the CPU topology discovered from machineInfo and a non-zero CPU entry
// in nodeAllocatableReservation. When either is unavailable, an error is
// returned instead of panicking, so the caller decides how to fail.
// NewStaticPolicy may itself still panic if it cannot reserve the requested
// number of CPUs.
func NewManager(
	cpuPolicyName string,
	reconcilePeriod time.Duration,
	machineInfo *cadvisorapi.MachineInfo,
	nodeAllocatableReservation v1.ResourceList,
) (Manager, error) {
	var policy Policy

	switch policyName(cpuPolicyName) {

	case PolicyNone:
		policy = NewNonePolicy()

	case PolicyStatic:
		topo, err := topology.Discover(machineInfo)
		if err != nil {
			return nil, err
		}
		glog.Infof("[cpumanager] detected CPU topology: %v", topo)

		reservedCPUs, ok := nodeAllocatableReservation[v1.ResourceCPU]
		if !ok {
			// The static policy cannot initialize without this information.
			return nil, fmt.Errorf("[cpumanager] unable to determine reserved CPU resources for static policy")
		}
		if reservedCPUs.IsZero() {
			// The static policy requires this to be nonzero. Zero CPU reservation
			// would allow the shared pool to be completely exhausted. At that point
			// either we would violate our guarantee of exclusivity or need to evict
			// any pod that has at least one container that requires zero CPUs.
			// See the comments in policy_static.go for more details.
			return nil, fmt.Errorf("[cpumanager] the static policy requires systemreserved.cpu + kubereserved.cpu to be greater than zero")
		}

		// Take the ceiling of the reservation, since fractional CPUs cannot be
		// exclusively allocated.
		reservedCPUsFloat := float64(reservedCPUs.MilliValue()) / 1000
		numReservedCPUs := int(math.Ceil(reservedCPUsFloat))
		policy = NewStaticPolicy(topo, numReservedCPUs)

	default:
		glog.Errorf("[cpumanager] Unknown policy \"%s\", falling back to default policy \"%s\"", cpuPolicyName, PolicyNone)
		policy = NewNonePolicy()
	}

	m := &manager{
		policy:                     policy,
		reconcilePeriod:            reconcilePeriod,
		state:                      state.NewMemoryState(),
		machineInfo:                machineInfo,
		nodeAllocatableReservation: nodeAllocatableReservation,
	}
	return m, nil
}
// Start stores the collaborators handed over by the Kubelet, starts the
// policy with the manager's state and, unless the none policy is active,
// launches a goroutine that runs reconcileState every reconcilePeriod.
func (m *manager) Start(activePods ActivePodsFunc, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService) {
	glog.Infof("[cpumanager] starting with %s policy", m.policy.Name())
	glog.Infof("[cpumanager] reconciling every %v", m.reconcilePeriod)

	m.activePods = activePods
	m.podStatusProvider = podStatusProvider
	m.containerRuntime = containerRuntime

	m.policy.Start(m.state)

	// The none policy never assigns CPU sets, so there is nothing to reconcile.
	if m.policy.Name() == string(PolicyNone) {
		return
	}
	// wait.NeverStop is used as the stop channel for the reconcile loop.
	go wait.Until(func() { m.reconcileState() }, m.reconcilePeriod, wait.NeverStop)
}
// AddContainer asks the policy to account for the new container and then
// writes the resulting CPU set (or the default set) to the container runtime.
// The manager lock covers only the policy call and the state read; the
// runtime update runs without the lock held.
func (m *manager) AddContainer(p *v1.Pod, c *v1.Container, containerID string) error {
	m.Lock()
	if addErr := m.policy.AddContainer(m.state, p, c, containerID); addErr != nil {
		glog.Errorf("[cpumanager] AddContainer error: %v", addErr)
		m.Unlock()
		return addErr
	}
	assigned := m.state.GetCPUSetOrDefault(containerID)
	m.Unlock()

	if updateErr := m.updateContainerCPUSet(containerID, assigned); updateErr != nil {
		glog.Errorf("[cpumanager] AddContainer error: %v", updateErr)
		return updateErr
	}
	return nil
}
// RemoveContainer forwards the removal to the policy while holding the
// manager lock, logging and returning any error the policy reports.
func (m *manager) RemoveContainer(containerID string) error {
	m.Lock()
	defer m.Unlock()

	if err := m.policy.RemoveContainer(m.state, containerID); err != nil {
		glog.Errorf("[cpumanager] RemoveContainer error: %v", err)
		return err
	}
	return nil
}
// State returns the manager's state, exposed through the state.Reader
// interface.
func (m *manager) State() state.Reader {
return m.state
}
// reconciledContainer identifies one container visited by reconcileState.
// containerID is left empty when the ID could not be determined.
type reconciledContainer struct {
podName string
containerName string
containerID string
}
// reconcileState visits every init and regular container of every active pod
// and re-applies the CPU set recorded in the state (or the default set)
// through the container runtime.
//
// It returns the containers that were updated successfully and those that
// were not. Both slices are always non-nil. A container is a failure when
// the pod status is missing (the rest of that pod is then skipped), when its
// container ID is not in the status, when its CPU set is empty, or when the
// runtime update fails.
func (m *manager) reconcileState() (success []reconciledContainer, failure []reconciledContainer) {
	success = []reconciledContainer{}
	failure = []reconciledContainer{}

	for _, pod := range m.activePods() {
		// Copy into a fresh slice. Appending directly to
		// pod.Spec.InitContainers could write into spare capacity of the
		// pod's own backing array, which is shared with other users of the
		// pod object.
		allContainers := make([]v1.Container, 0, len(pod.Spec.InitContainers)+len(pod.Spec.Containers))
		allContainers = append(allContainers, pod.Spec.InitContainers...)
		allContainers = append(allContainers, pod.Spec.Containers...)

		for _, container := range allContainers {
			status, ok := m.podStatusProvider.GetPodStatus(pod.UID)
			if !ok {
				glog.Warningf("[cpumanager] reconcileState: skipping pod; status not found (pod: %s, container: %s)", pod.Name, container.Name)
				failure = append(failure, reconciledContainer{pod.Name, container.Name, ""})
				// Without a status, no container of this pod can be resolved.
				break
			}

			containerID, err := findContainerIDByName(&status, container.Name)
			if err != nil {
				glog.Warningf("[cpumanager] reconcileState: skipping container; ID not found in status (pod: %s, container: %s, error: %v)", pod.Name, container.Name, err)
				failure = append(failure, reconciledContainer{pod.Name, container.Name, ""})
				continue
			}

			cset := m.state.GetCPUSetOrDefault(containerID)
			if cset.IsEmpty() {
				// NOTE: This should not happen outside of tests.
				glog.Infof("[cpumanager] reconcileState: skipping container; assigned cpuset is empty (pod: %s, container: %s)", pod.Name, container.Name)
				failure = append(failure, reconciledContainer{pod.Name, container.Name, containerID})
				continue
			}

			glog.Infof("[cpumanager] reconcileState: updating container (pod: %s, container: %s, container id: %s, cpuset: \"%v\")", pod.Name, container.Name, containerID, cset)
			err = m.updateContainerCPUSet(containerID, cset)
			if err != nil {
				glog.Errorf("[cpumanager] reconcileState: failed to update container (pod: %s, container: %s, container id: %s, cpuset: \"%v\", error: %v)", pod.Name, container.Name, containerID, cset, err)
				failure = append(failure, reconciledContainer{pod.Name, container.Name, containerID})
				continue
			}
			success = append(success, reconciledContainer{pod.Name, container.Name, containerID})
		}
	}
	return success, failure
}
// findContainerIDByName looks through status.ContainerStatuses for the first
// entry with the given name and a non-empty ContainerID, parses that ID and
// returns its ID component. It returns an error if parsing fails or if no
// such entry exists.
func findContainerIDByName(status *v1.PodStatus, name string) (string, error) {
	for i := range status.ContainerStatuses {
		cs := &status.ContainerStatuses[i]
		if cs.Name != name || cs.ContainerID == "" {
			continue
		}

		var parsed kubecontainer.ContainerID
		if err := parsed.ParseString(cs.ContainerID); err != nil {
			return "", err
		}
		return parsed.ID, nil
	}
	return "", fmt.Errorf("unable to find ID for container with name %v in pod status (it may not be running)", name)
}
// updateContainerCPUSet asks the container runtime to apply cpus to the
// container with the given ID. Only the CpusetCpus field of the resources is
// populated.
func (m *manager) updateContainerCPUSet(containerID string, cpus cpuset.CPUSet) error {
	// TODO: Consider adding a `ResourceConfigForContainer` helper in
	// helpers_linux.go similar to what exists for pods.
	// It would be better to pass the full container resources here instead of
	// this patch-like partial resources.
	resources := &runtimeapi.LinuxContainerResources{
		CpusetCpus: cpus.String(),
	}
	return m.containerRuntime.UpdateContainerResources(containerID, resources)
}

View file

@ -0,0 +1,452 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"fmt"
"reflect"
"testing"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
runtimeapi "k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1/runtime"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
)
// mockState is an in-memory stand-in for the CPU manager state, used by the
// tests in this package. It performs no locking.
type mockState struct {
	// assignments maps a container ID to its CPU set.
	assignments map[string]cpuset.CPUSet
	// defaultCPUSet is returned for containers without an assignment.
	defaultCPUSet cpuset.CPUSet
}

// GetCPUSet returns a clone of the CPU set stored for containerID and
// whether an entry existed.
func (s *mockState) GetCPUSet(containerID string) (cpuset.CPUSet, bool) {
	assigned, found := s.assignments[containerID]
	return assigned.Clone(), found
}

// GetDefaultCPUSet returns a clone of the default CPU set.
func (s *mockState) GetDefaultCPUSet() cpuset.CPUSet {
	return s.defaultCPUSet.Clone()
}

// GetCPUSetOrDefault returns the CPU set stored for containerID, or the
// default CPU set when there is no entry for it.
func (s *mockState) GetCPUSetOrDefault(containerID string) cpuset.CPUSet {
	assigned, found := s.GetCPUSet(containerID)
	if !found {
		return s.GetDefaultCPUSet()
	}
	return assigned
}

// SetCPUSet stores cset as the assignment for containerID.
func (s *mockState) SetCPUSet(containerID string, cset cpuset.CPUSet) {
	s.assignments[containerID] = cset
}

// SetDefaultCPUSet replaces the default CPU set.
func (s *mockState) SetDefaultCPUSet(cset cpuset.CPUSet) {
	s.defaultCPUSet = cset
}

// Delete removes the assignment for containerID, if any.
func (s *mockState) Delete(containerID string) {
	delete(s.assignments, containerID)
}
// mockPolicy is a test policy that does not touch the state; AddContainer and
// RemoveContainer simply return the configured err.
type mockPolicy struct {
err error
}
// Name returns the fixed name "mock".
func (p *mockPolicy) Name() string {
return "mock"
}
// Start is a no-op.
func (p *mockPolicy) Start(s state.State) {
}
// AddContainer returns the configured error without modifying s.
func (p *mockPolicy) AddContainer(s state.State, pod *v1.Pod, container *v1.Container, containerID string) error {
return p.err
}
// RemoveContainer returns the configured error without modifying s.
func (p *mockPolicy) RemoveContainer(s state.State, containerID string) error {
return p.err
}
// mockRuntimeService is a test double for runtimeService whose update call
// returns the configured err.
type mockRuntimeService struct {
err error
}
// UpdateContainerResources ignores its arguments and returns the configured error.
func (rt mockRuntimeService) UpdateContainerResources(id string, resources *runtimeapi.LinuxContainerResources) error {
return rt.err
}
// mockPodStatusProvider returns the same canned pod status and found flag for
// every pod UID.
type mockPodStatusProvider struct {
podStatus v1.PodStatus
found bool
}
// GetPodStatus ignores uid and returns the canned status and found flag.
func (psp mockPodStatusProvider) GetPodStatus(uid types.UID) (v1.PodStatus, bool) {
return psp.podStatus, psp.found
}
// mockPodKiller records the pods it was asked to kill.
type mockPodKiller struct {
killedPods []*v1.Pod
}
// killPodNow appends pod to killedPods and always returns nil; status and
// gracePeriodOverride are ignored.
func (f *mockPodKiller) killPodNow(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int64) error {
f.killedPods = append(f.killedPods, pod)
return nil
}
// mockPodProvider serves a fixed list of pods.
type mockPodProvider struct {
pods []*v1.Pod
}
// getPods returns the configured pod list as-is (no copy is made).
func (f *mockPodProvider) getPods() []*v1.Pod {
return f.pods
}
// mockRecorder is an event recorder stand-in that discards every event.
type mockRecorder struct{}
// Eventf is a no-op.
func (r *mockRecorder) Eventf(object runtime.Object, eventtype, reason, messageFmt string, args ...interface{}) {
}
// makePod builds a pod with a single container whose CPU request and limit
// are parsed from the given strings, and whose memory request and limit are
// both "1G". Parsing uses resource.MustParse.
func makePod(cpuRequest, cpuLimit string) *v1.Pod {
	requests := v1.ResourceList{
		v1.ResourceCPU:    resource.MustParse(cpuRequest),
		v1.ResourceMemory: resource.MustParse("1G"),
	}
	limits := v1.ResourceList{
		v1.ResourceCPU:    resource.MustParse(cpuLimit),
		v1.ResourceMemory: resource.MustParse("1G"),
	}

	container := v1.Container{
		Resources: v1.ResourceRequirements{
			Requests: requests,
			Limits:   limits,
		},
	}
	return &v1.Pod{
		Spec: v1.PodSpec{
			Containers: []v1.Container{container},
		},
	}
}
// prepareCPUNodeStatus builds a NodeStatus whose Capacity and Allocatable
// each hold a single CPU entry parsed from the given strings. Parse errors
// are ignored, so the corresponding quantity is whatever ParseQuantity
// returned alongside the error.
//
// CPUAllocatable must be <= CPUCapacity.
func prepareCPUNodeStatus(CPUCapacity, CPUAllocatable string) v1.NodeStatus {
	cpucap, _ := resource.ParseQuantity(CPUCapacity)
	cpuall, _ := resource.ParseQuantity(CPUAllocatable)

	return v1.NodeStatus{
		Capacity:    v1.ResourceList{v1.ResourceCPU: cpucap},
		Allocatable: v1.ResourceList{v1.ResourceCPU: cpuall},
	}
}
func TestCPUManagerAdd(t *testing.T) {
testCases := []struct {
description string
regErr error
updateErr error
expErr error
}{
{
description: "cpu manager add - no error",
regErr: nil,
updateErr: nil,
expErr: nil,
},
{
description: "cpu manager add - policy add container error",
regErr: fmt.Errorf("fake reg error"),
updateErr: nil,
expErr: fmt.Errorf("fake reg error"),
},
{
description: "cpu manager add - container update error",
regErr: nil,
updateErr: fmt.Errorf("fake update error"),
expErr: fmt.Errorf("fake update error"),
},
}
for _, testCase := range testCases {
mgr := &manager{
policy: &mockPolicy{
err: testCase.regErr,
},
state: &mockState{
assignments: map[string]cpuset.CPUSet{},
defaultCPUSet: cpuset.NewCPUSet(),
},
containerRuntime: mockRuntimeService{
err: testCase.updateErr,
},
activePods: func() []*v1.Pod { return nil },
podStatusProvider: mockPodStatusProvider{},
}
pod := makePod("1000", "1000")
container := &pod.Spec.Containers[0]
err := mgr.AddContainer(pod, container, "fakeID")
if !reflect.DeepEqual(err, testCase.expErr) {
t.Errorf("CPU Manager AddContainer() error (%v). expected error: %v but got: %v",
testCase.description, testCase.expErr, err)
}
}
}
func TestCPUManagerRemove(t *testing.T) {
mgr := &manager{
policy: &mockPolicy{
err: nil,
},
state: &mockState{
assignments: map[string]cpuset.CPUSet{},
defaultCPUSet: cpuset.NewCPUSet(),
},
containerRuntime: mockRuntimeService{},
activePods: func() []*v1.Pod { return nil },
podStatusProvider: mockPodStatusProvider{},
}
err := mgr.RemoveContainer("fakeID")
if err != nil {
t.Errorf("CPU Manager RemoveContainer() error. expected error to be nil but got: %v", err)
}
mgr = &manager{
policy: &mockPolicy{
err: fmt.Errorf("fake error"),
},
state: state.NewMemoryState(),
containerRuntime: mockRuntimeService{},
activePods: func() []*v1.Pod { return nil },
podStatusProvider: mockPodStatusProvider{},
}
err = mgr.RemoveContainer("fakeID")
if !reflect.DeepEqual(err, fmt.Errorf("fake error")) {
t.Errorf("CPU Manager RemoveContainer() error. expected error: fake error but got: %v", err)
}
}
// TestReconcileState runs manager.reconcileState against a single fake pod
// under different pod-status, state and runtime conditions.
//
// A case with an empty expectFailedContainerName must produce no failures.
// Any other case must report a failure for the named container.
func TestReconcileState(t *testing.T) {
	// newFakePods returns a fresh one-pod, one-container list so that cases
	// never share pod objects.
	newFakePods := func() []*v1.Pod {
		return []*v1.Pod{
			{
				ObjectMeta: metav1.ObjectMeta{
					Name: "fakePodName",
					UID:  "fakeUID",
				},
				Spec: v1.PodSpec{
					Containers: []v1.Container{
						{
							Name: "fakeName",
						},
					},
				},
			},
		}
	}
	// statusFor returns a pod status with one container status carrying the
	// given name and the container ID "docker://fakeID".
	statusFor := func(containerName string) v1.PodStatus {
		return v1.PodStatus{
			ContainerStatuses: []v1.ContainerStatus{
				{
					Name:        containerName,
					ContainerID: "docker://fakeID",
				},
			},
		}
	}

	testCases := []struct {
		description               string
		activePods                []*v1.Pod
		pspPS                     v1.PodStatus
		pspFound                  bool
		stAssignments             map[string]cpuset.CPUSet
		stDefaultCPUSet           cpuset.CPUSet
		updateErr                 error
		expectFailedContainerName string
	}{
		{
			description: "cpu manager reconcile - no error",
			activePods:  newFakePods(),
			pspPS:       statusFor("fakeName"),
			pspFound:    true,
			stAssignments: map[string]cpuset.CPUSet{
				"fakeID": cpuset.NewCPUSet(1, 2),
			},
			stDefaultCPUSet:           cpuset.NewCPUSet(3, 4, 5, 6, 7),
			updateErr:                 nil,
			expectFailedContainerName: "",
		},
		{
			description:               "cpu manager reconcile - pod status not found",
			activePods:                newFakePods(),
			pspPS:                     v1.PodStatus{},
			pspFound:                  false,
			stAssignments:             map[string]cpuset.CPUSet{},
			stDefaultCPUSet:           cpuset.NewCPUSet(),
			updateErr:                 nil,
			expectFailedContainerName: "fakeName",
		},
		{
			description: "cpu manager reconcile - container id not found",
			activePods:  newFakePods(),
			// The status names a different container, so the lookup by name fails.
			pspPS:                     statusFor("fakeName1"),
			pspFound:                  true,
			stAssignments:             map[string]cpuset.CPUSet{},
			stDefaultCPUSet:           cpuset.NewCPUSet(),
			updateErr:                 nil,
			expectFailedContainerName: "fakeName",
		},
		{
			description: "cpu manager reconcile - cpuset is empty",
			activePods:  newFakePods(),
			pspPS:       statusFor("fakeName"),
			pspFound:    true,
			stAssignments: map[string]cpuset.CPUSet{
				"fakeID": cpuset.NewCPUSet(),
			},
			stDefaultCPUSet:           cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7),
			updateErr:                 nil,
			expectFailedContainerName: "fakeName",
		},
		{
			description: "cpu manager reconcile - container update error",
			activePods:  newFakePods(),
			pspPS:       statusFor("fakeName"),
			pspFound:    true,
			stAssignments: map[string]cpuset.CPUSet{
				"fakeID": cpuset.NewCPUSet(1, 2),
			},
			stDefaultCPUSet:           cpuset.NewCPUSet(3, 4, 5, 6, 7),
			updateErr:                 fmt.Errorf("fake container update error"),
			expectFailedContainerName: "fakeName",
		},
	}

	for _, testCase := range testCases {
		// Bind the pod list to a per-iteration variable for the closure below.
		pods := testCase.activePods

		mgr := &manager{
			policy: &mockPolicy{
				err: nil,
			},
			state: &mockState{
				assignments:   testCase.stAssignments,
				defaultCPUSet: testCase.stDefaultCPUSet,
			},
			containerRuntime: mockRuntimeService{
				err: testCase.updateErr,
			},
			activePods: func() []*v1.Pod {
				return pods
			},
			podStatusProvider: mockPodStatusProvider{
				podStatus: testCase.pspPS,
				found:     testCase.pspFound,
			},
		}

		_, failure := mgr.reconcileState()

		if testCase.expectFailedContainerName == "" {
			// The success path must not report any failed container.
			if len(failure) != 0 {
				t.Errorf("%s: expected no reconciliation failures but got: %v", testCase.description, failure)
			}
			continue
		}

		// Search failed reconciled containers for the supplied name.
		foundFailedContainer := false
		for _, reconciled := range failure {
			if reconciled.containerName == testCase.expectFailedContainerName {
				foundFailedContainer = true
				break
			}
		}
		if !foundFailedContainer {
			t.Errorf("%s: expected reconciliation failure for container: %s", testCase.description, testCase.expectFailedContainerName)
		}
	}
}

View file

@ -0,0 +1,58 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"github.com/golang/glog"
"k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/status"
)
// fakeManager is a Manager implementation whose operations only log. It is
// returned by NewFakeManager.
type fakeManager struct {
// state is the state exposed through State().
state state.State
}
// Start logs the call and ignores its arguments.
func (m *fakeManager) Start(activePods ActivePodsFunc, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService) {
glog.Info("[fake cpumanager] Start()")
}
// Policy logs the call and returns a new none policy on every invocation.
func (m *fakeManager) Policy() Policy {
glog.Info("[fake cpumanager] Policy()")
return NewNonePolicy()
}
// AddContainer logs the pod, container and container ID and returns nil.
func (m *fakeManager) AddContainer(pod *v1.Pod, container *v1.Container, containerID string) error {
glog.Infof("[fake cpumanager] AddContainer (pod: %s, container: %s, container id: %s)", pod.Name, container.Name, containerID)
return nil
}
// RemoveContainer logs the container ID and returns nil.
func (m *fakeManager) RemoveContainer(containerID string) error {
glog.Infof("[fake cpumanager] RemoveContainer (container id: %s)", containerID)
return nil
}
// State returns the fake manager's state through the state.Reader interface.
func (m *fakeManager) State() state.Reader {
return m.state
}
// NewFakeManager creates empty/fake cpu manager backed by a new memory state.
func NewFakeManager() Manager {
return &fakeManager{
state: state.NewMemoryState(),
}
}

View file

@ -0,0 +1,30 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
)
// Policy implements logic for pod container to CPU assignment.
type Policy interface {
// Name returns the name of the policy.
Name() string
// Start hands the policy the shared state so that it can initialize it.
Start(s state.State)
// AddContainer lets the policy record a CPU assignment for the container
// identified by containerID in s. A non-nil error reports that the
// container could not be accommodated.
AddContainer(s state.State, pod *v1.Pod, container *v1.Container, containerID string) error
// RemoveContainer lets the policy release whatever it recorded in s for
// the container identified by containerID.
RemoveContainer(s state.State, containerID string) error
}

View file

@ -0,0 +1,51 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"github.com/golang/glog"
"k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
)
// nonePolicy is a Policy that performs no CPU assignment.
type nonePolicy struct{}
// Ensure nonePolicy implements the Policy interface.
var _ Policy = &nonePolicy{}
// PolicyNone name of none policy
const PolicyNone policyName = "none"
// NewNonePolicy returns a cpuset manager policy that does nothing
func NewNonePolicy() Policy {
return &nonePolicy{}
}
// Name returns the policy name, "none".
func (p *nonePolicy) Name() string {
return string(PolicyNone)
}
// Start only logs that the policy was started; the state is left untouched.
func (p *nonePolicy) Start(s state.State) {
glog.Info("[cpumanager] none policy: Start")
}
// AddContainer does nothing and returns nil.
func (p *nonePolicy) AddContainer(s state.State, pod *v1.Pod, container *v1.Container, containerID string) error {
return nil
}
// RemoveContainer does nothing and returns nil.
func (p *nonePolicy) RemoveContainer(s state.State, containerID string) error {
return nil
}

View file

@ -0,0 +1,64 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"testing"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
)
// TestNonePolicyName verifies that the none policy reports the name "none".
func TestNonePolicyName(t *testing.T) {
	const want = "none"

	got := (&nonePolicy{}).Name()
	if got != want {
		t.Errorf("NonePolicy Name() error. expected: none, returned: %v",
			got)
	}
}
func TestNonePolicyAdd(t *testing.T) {
policy := &nonePolicy{}
st := &mockState{
assignments: map[string]cpuset.CPUSet{},
defaultCPUSet: cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7),
}
testPod := makePod("1000m", "1000m")
container := &testPod.Spec.Containers[0]
err := policy.AddContainer(st, testPod, container, "fakeID")
if err != nil {
t.Errorf("NonePolicy AddContainer() error. expected no error but got: %v", err)
}
}
func TestNonePolicyRemove(t *testing.T) {
policy := &nonePolicy{}
st := &mockState{
assignments: map[string]cpuset.CPUSet{},
defaultCPUSet: cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7),
}
err := policy.RemoveContainer(st, "fakeID")
if err != nil {
t.Errorf("NonePolicy RemoveContainer() error. expected no error but got %v", err)
}
}

View file

@ -0,0 +1,172 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"fmt"
"github.com/golang/glog"
"k8s.io/api/core/v1"
v1qos "k8s.io/kubernetes/pkg/api/v1/helper/qos"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
)
// PolicyStatic is the name of the static policy
const PolicyStatic policyName = "static"
// staticPolicy is a CPU manager policy that does not change CPU
// assignments for exclusively pinned guaranteed containers after the main
// container process starts.
//
// This policy allocates CPUs exclusively for a container if all the following
// conditions are met:
//
// - The pod QoS class is Guaranteed.
// - The CPU request is a positive integer.
//
// The static policy maintains the following sets of logical CPUs:
//
// - SHARED: Burstable, BestEffort, and non-integral Guaranteed containers
// run here. Initially this contains all CPU IDs on the system. As
// exclusive allocations are created and destroyed, this CPU set shrinks
// and grows, accordingly. This is stored in the state as the default
// CPU set.
//
// - RESERVED: A subset of the shared pool which is not exclusively
// allocatable. The membership of this pool is static for the lifetime of
// the Kubelet. The size of the reserved pool is
// ceil(systemreserved.cpu + kubereserved.cpu).
// Reserved CPUs are taken topologically starting with lowest-indexed
// physical core, as reported by cAdvisor.
//
// - ASSIGNABLE: Equal to SHARED - RESERVED. Exclusive CPUs are allocated
// from this pool.
//
// - EXCLUSIVE ALLOCATIONS: CPU sets assigned exclusively to one container.
// These are stored as explicit assignments in the state.
//
// When an exclusive allocation is made, the static policy also updates the
// default cpuset in the state abstraction. The CPU manager's periodic
// reconcile loop takes care of rewriting the cpuset in cgroupfs for any
// containers that may be running in the shared pool. For this reason,
// applications running within exclusively-allocated containers must tolerate
// potentially sharing their allocated CPUs for up to the CPU manager
// reconcile period.
type staticPolicy struct {
// cpu socket topology; supplies the full set of CPU IDs and is passed to
// takeByTopology when CPUs are selected
topology *topology.CPUTopology
// set of CPUs that is not available for exclusive assignment; computed
// once in NewStaticPolicy
reserved cpuset.CPUSet
}
// Ensure staticPolicy implements Policy interface
var _ Policy = &staticPolicy{}
// NewStaticPolicy returns a CPU manager policy that does not change CPU
// assignments for exclusively pinned guaranteed containers after the main
// container process starts.
//
// numReservedCPUs CPUs are set aside as the reserved pool. The function
// panics if that many CPUs cannot be taken from the topology.
func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int) Policy {
	// takeByTopology allocates CPUs associated with low-numbered cores from
	// the full set of CPUs in the topology.
	//
	// For example: Given a system with 8 CPUs available and HT enabled,
	// if numReservedCPUs=2, then reserved={0,4}
	reserved, _ := takeByTopology(topology, topology.CPUDetails.CPUs(), numReservedCPUs)

	// The error above is discarded; a short allocation is caught here instead.
	if got := reserved.Size(); got != numReservedCPUs {
		panic(fmt.Sprintf("[cpumanager] unable to reserve the required amount of CPUs (size of %s did not equal %d)", reserved, numReservedCPUs))
	}

	glog.Infof("[cpumanager] reserved %d CPUs (\"%s\") not available for exclusive assignment", reserved.Size(), reserved)

	policy := &staticPolicy{
		topology: topology,
		reserved: reserved,
	}
	return policy
}
// Name returns the policy name, "static".
func (p *staticPolicy) Name() string {
return string(PolicyStatic)
}
// Start configures the shared pool (the default CPU set in s) to include all
// detected CPU IDs.
func (p *staticPolicy) Start(s state.State) {
	s.SetDefaultCPUSet(p.topology.CPUDetails.CPUs())
}
// assignableCPUs returns the set of unassigned CPUs minus the reserved set,
// i.e. the default CPU set in s with p.reserved removed.
func (p *staticPolicy) assignableCPUs(s state.State) cpuset.CPUSet {
	shared := s.GetDefaultCPUSet()
	return shared.Difference(p.reserved)
}
// AddContainer gives the container an exclusive CPU set when guaranteedCPUs
// reports a non-zero CPU count for it, and records that set in s under
// containerID. Any other container is left on the default CPU set. An
// allocation failure is logged and returned.
func (p *staticPolicy) AddContainer(s state.State, pod *v1.Pod, container *v1.Container, containerID string) error {
	glog.Infof("[cpumanager] static policy: AddContainer (pod: %s, container: %s, container id: %s)", pod.Name, container.Name, containerID)

	numCPUs := guaranteedCPUs(pod, container)
	if numCPUs == 0 {
		// container belongs in the shared pool (nothing to do; use default cpuset)
		return nil
	}

	// container belongs in an exclusively allocated pool
	cset, err := p.allocateCPUs(s, numCPUs)
	if err != nil {
		glog.Errorf("[cpumanager] unable to allocate %d CPUs (container id: %s, error: %v)", numCPUs, containerID, err)
		return err
	}
	s.SetCPUSet(containerID, cset)
	return nil
}
// RemoveContainer deletes the assignment stored for containerID, if any, and
// returns its CPUs to the default CPU set. It always returns nil.
func (p *staticPolicy) RemoveContainer(s state.State, containerID string) error {
	glog.Infof("[cpumanager] static policy: RemoveContainer (container id: %s)", containerID)

	toRelease, ok := s.GetCPUSet(containerID)
	if !ok {
		// No explicit assignment recorded, so there is nothing to release.
		return nil
	}

	s.Delete(containerID)
	// Mutate the shared pool, adding released cpus.
	s.SetDefaultCPUSet(s.GetDefaultCPUSet().Union(toRelease))
	return nil
}
// allocateCPUs takes numCPUs CPUs from the assignable pool via
// takeByTopology, removes them from the default CPU set in s and returns
// them. On error it returns an empty CPU set and leaves s unchanged.
func (p *staticPolicy) allocateCPUs(s state.State, numCPUs int) (cpuset.CPUSet, error) {
	glog.Infof("[cpumanager] allocateCpus: (numCPUs: %d)", numCPUs)

	assignable := p.assignableCPUs(s)
	result, err := takeByTopology(p.topology, assignable, numCPUs)
	if err != nil {
		return cpuset.NewCPUSet(), err
	}

	// Remove allocated CPUs from the shared CPUSet.
	remaining := s.GetDefaultCPUSet().Difference(result)
	s.SetDefaultCPUSet(remaining)

	glog.Infof("[cpumanager] allocateCPUs: returning \"%v\"", result)
	return result, nil
}
// guaranteedCPUs returns the container's CPU request as a whole number of
// CPUs. It returns 0 when the pod's QoS class is not Guaranteed or when the
// request is not an integral number of CPUs.
func guaranteedCPUs(pod *v1.Pod, container *v1.Container) int {
	if qos := v1qos.GetPodQOS(pod); qos != v1.PodQOSGuaranteed {
		return 0
	}

	cpuQuantity := container.Resources.Requests[v1.ResourceCPU]
	wholeCPUs := cpuQuantity.Value()
	// A fractional request has a milli-value that is not exactly 1000x its
	// whole-unit value.
	if wholeCPUs*1000 != cpuQuantity.MilliValue() {
		return 0
	}

	// Safe downcast to do for all systems with < 2.1 billion CPUs.
	// Per the language spec, `int` is guaranteed to be at least 32 bits wide.
	// https://golang.org/ref/spec#Numeric_types
	return int(wholeCPUs)
}

View file

@ -0,0 +1,437 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"fmt"
"reflect"
"testing"
"k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
)
// staticPolicyTest describes one table-driven scenario for the static policy
// tests: the policy configuration, the initial mock state, the pod to add
// and the expected outcome.
type staticPolicyTest struct {
description string
// topo and numReservedCPUs are the arguments for NewStaticPolicy.
topo *topology.CPUTopology
numReservedCPUs int
containerID string
// stAssignments and stDefaultCPUSet seed the mock state.
stAssignments map[string]cpuset.CPUSet
stDefaultCPUSet cpuset.CPUSet
pod *v1.Pod
// expErr is the expected error; expCPUAlloc says whether an exclusive
// allocation is expected and expCSet which CPU set it should be.
expErr error
expCPUAlloc bool
expCSet cpuset.CPUSet
}
// TestStaticPolicyName checks that a policy built by NewStaticPolicy reports
// the name "static".
func TestStaticPolicyName(t *testing.T) {
	got := NewStaticPolicy(topoSingleSocketHT, 1).Name()
	if got == "static" {
		return
	}
	t.Errorf("StaticPolicy Name() error. expected: static, returned: %v",
		got)
}
func TestStaticPolicyStart(t *testing.T) {
policy := NewStaticPolicy(topoSingleSocketHT, 1).(*staticPolicy)
st := &mockState{
assignments: map[string]cpuset.CPUSet{},
defaultCPUSet: cpuset.NewCPUSet(),
}
policy.Start(st)
for cpuid := 1; cpuid < policy.topology.NumCPUs; cpuid++ {
if !st.defaultCPUSet.Contains(cpuid) {
t.Errorf("StaticPolicy Start() error. expected cpuid %d to be present in defaultCPUSet", cpuid)
}
}
}
// TestStaticPolicyAdd runs AddContainer against a table of scenarios. For each
// scenario it seeds a mockState with the given assignments and default CPU
// set, adds the pod's first container, and then checks:
//   - the returned error equals expErr (reflect.DeepEqual);
//   - when expCPUAlloc is true, the container has an assignment equal to
//     expCSet and that assignment is disjoint from the default CPU set;
//   - when expCPUAlloc is false, the container has no assignment.
func TestStaticPolicyAdd(t *testing.T) {
	// Derive the full CPU set and the per-socket CPU sets (sockets 0 and 1)
	// of the large topology from its CPUDetails table.
	largeTopoBuilder := cpuset.NewBuilder()
	largeTopoSock0Builder := cpuset.NewBuilder()
	largeTopoSock1Builder := cpuset.NewBuilder()
	largeTopo := *topoQuadSocketFourWayHT
	for cpuid, val := range largeTopo.CPUDetails {
		largeTopoBuilder.Add(cpuid)
		if val.SocketID == 0 {
			largeTopoSock0Builder.Add(cpuid)
		} else if val.SocketID == 1 {
			largeTopoSock1Builder.Add(cpuid)
		}
	}
	largeTopoCPUSet := largeTopoBuilder.Result()
	largeTopoSock0CPUSet := largeTopoSock0Builder.Result()
	largeTopoSock1CPUSet := largeTopoSock1Builder.Result()

	testCases := []staticPolicyTest{
		{
			description: "GuPodSingleCore, SingleSocketHT, ExpectError",
			topo: topoSingleSocketHT,
			numReservedCPUs: 1,
			containerID: "fakeID2",
			stAssignments: map[string]cpuset.CPUSet{},
			stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
			pod: makePod("8000m", "8000m"),
			expErr: fmt.Errorf("not enough cpus available to satisfy request"),
			expCPUAlloc: false,
			expCSet: cpuset.NewCPUSet(),
		},
		{
			description: "GuPodSingleCore, SingleSocketHT, ExpectAllocOneCPU",
			topo: topoSingleSocketHT,
			numReservedCPUs: 1,
			containerID: "fakeID2",
			stAssignments: map[string]cpuset.CPUSet{},
			stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
			pod: makePod("1000m", "1000m"),
			expErr: nil,
			expCPUAlloc: true,
			expCSet: cpuset.NewCPUSet(4), // expect sibling of partial core
		},
		{
			description: "GuPodMultipleCores, SingleSocketHT, ExpectAllocOneCore",
			topo: topoSingleSocketHT,
			numReservedCPUs: 1,
			containerID: "fakeID3",
			stAssignments: map[string]cpuset.CPUSet{
				"fakeID100": cpuset.NewCPUSet(2, 3, 6, 7),
			},
			stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 4, 5),
			pod: makePod("2000m", "2000m"),
			expErr: nil,
			expCPUAlloc: true,
			expCSet: cpuset.NewCPUSet(1, 5),
		},
		{
			description: "GuPodMultipleCores, DualSocketHT, ExpectAllocOneSocket",
			topo: topoDualSocketHT,
			numReservedCPUs: 1,
			containerID: "fakeID3",
			stAssignments: map[string]cpuset.CPUSet{
				"fakeID100": cpuset.NewCPUSet(2),
			},
			stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11),
			pod: makePod("6000m", "6000m"),
			expErr: nil,
			expCPUAlloc: true,
			expCSet: cpuset.NewCPUSet(1, 3, 5, 7, 9, 11),
		},
		{
			description: "GuPodMultipleCores, DualSocketHT, ExpectAllocThreeCores",
			topo: topoDualSocketHT,
			numReservedCPUs: 1,
			containerID: "fakeID3",
			stAssignments: map[string]cpuset.CPUSet{
				"fakeID100": cpuset.NewCPUSet(1, 5),
			},
			stDefaultCPUSet: cpuset.NewCPUSet(0, 2, 3, 4, 6, 7, 8, 9, 10, 11),
			pod: makePod("6000m", "6000m"),
			expErr: nil,
			expCPUAlloc: true,
			expCSet: cpuset.NewCPUSet(2, 3, 4, 8, 9, 10),
		},
		{
			description: "GuPodMultipleCores, DualSocketNoHT, ExpectAllocOneSocket",
			topo: topoDualSocketNoHT,
			numReservedCPUs: 1,
			containerID: "fakeID1",
			stAssignments: map[string]cpuset.CPUSet{
				"fakeID100": cpuset.NewCPUSet(),
			},
			stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 3, 4, 5, 6, 7),
			pod: makePod("4000m", "4000m"),
			expErr: nil,
			expCPUAlloc: true,
			expCSet: cpuset.NewCPUSet(4, 5, 6, 7),
		},
		{
			description: "GuPodMultipleCores, DualSocketNoHT, ExpectAllocFourCores",
			topo: topoDualSocketNoHT,
			numReservedCPUs: 1,
			containerID: "fakeID1",
			stAssignments: map[string]cpuset.CPUSet{
				"fakeID100": cpuset.NewCPUSet(4, 5),
			},
			stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 3, 6, 7),
			pod: makePod("4000m", "4000m"),
			expErr: nil,
			expCPUAlloc: true,
			expCSet: cpuset.NewCPUSet(1, 3, 6, 7),
		},
		{
			description: "GuPodMultipleCores, DualSocketHT, ExpectAllocOneSocketOneCore",
			topo: topoDualSocketHT,
			numReservedCPUs: 1,
			containerID: "fakeID3",
			stAssignments: map[string]cpuset.CPUSet{
				"fakeID100": cpuset.NewCPUSet(2),
			},
			stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11),
			pod: makePod("8000m", "8000m"),
			expErr: nil,
			expCPUAlloc: true,
			expCSet: cpuset.NewCPUSet(1, 3, 4, 5, 7, 9, 10, 11),
		},
		{
			description: "NonGuPod, SingleSocketHT, NoAlloc",
			topo: topoSingleSocketHT,
			numReservedCPUs: 1,
			containerID: "fakeID1",
			stAssignments: map[string]cpuset.CPUSet{},
			stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
			pod: makePod("1000m", "2000m"),
			expErr: nil,
			expCPUAlloc: false,
			expCSet: cpuset.NewCPUSet(),
		},
		{
			description: "GuPodNonIntegerCore, SingleSocketHT, NoAlloc",
			topo: topoSingleSocketHT,
			numReservedCPUs: 1,
			containerID: "fakeID4",
			stAssignments: map[string]cpuset.CPUSet{},
			stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7),
			pod: makePod("977m", "977m"),
			expErr: nil,
			expCPUAlloc: false,
			expCSet: cpuset.NewCPUSet(),
		},
		{
			description: "GuPodMultipleCores, SingleSocketHT, NoAllocExpectError",
			topo: topoSingleSocketHT,
			numReservedCPUs: 1,
			containerID: "fakeID5",
			stAssignments: map[string]cpuset.CPUSet{
				"fakeID100": cpuset.NewCPUSet(1, 2, 3, 4, 5, 6),
			},
			stDefaultCPUSet: cpuset.NewCPUSet(0, 7),
			pod: makePod("2000m", "2000m"),
			expErr: fmt.Errorf("not enough cpus available to satisfy request"),
			expCPUAlloc: false,
			expCSet: cpuset.NewCPUSet(),
		},
		{
			description: "GuPodMultipleCores, DualSocketHT, NoAllocExpectError",
			topo: topoDualSocketHT,
			numReservedCPUs: 1,
			containerID: "fakeID5",
			stAssignments: map[string]cpuset.CPUSet{
				"fakeID100": cpuset.NewCPUSet(1, 2, 3),
			},
			stDefaultCPUSet: cpuset.NewCPUSet(0, 4, 5, 6, 7, 8, 9, 10, 11),
			pod: makePod("10000m", "10000m"),
			expErr: fmt.Errorf("not enough cpus available to satisfy request"),
			expCPUAlloc: false,
			expCSet: cpuset.NewCPUSet(),
		},
		{
			// All the CPUs from Socket 0 are available. Some CPUs from each
			// Socket have been already assigned.
			// Expect all CPUs from Socket 0.
			description: "GuPodMultipleCores, topoQuadSocketFourWayHT, ExpectAllocSock0",
			topo: topoQuadSocketFourWayHT,
			containerID: "fakeID5",
			stAssignments: map[string]cpuset.CPUSet{
				"fakeID100": cpuset.NewCPUSet(3, 11, 4, 5, 6, 7),
			},
			stDefaultCPUSet: largeTopoCPUSet.Difference(cpuset.NewCPUSet(3, 11, 4, 5, 6, 7)),
			pod: makePod("72000m", "72000m"),
			expErr: nil,
			expCPUAlloc: true,
			expCSet: largeTopoSock0CPUSet,
		},
		{
			// Only 2 full cores from three Sockets and some partial cores are available.
			// Expect CPUs from the 2 full cores available from the three Sockets.
			description: "GuPodMultipleCores, topoQuadSocketFourWayHT, ExpectAllocAllFullCoresFromThreeSockets",
			topo: topoQuadSocketFourWayHT,
			containerID: "fakeID5",
			stAssignments: map[string]cpuset.CPUSet{
				"fakeID100": largeTopoCPUSet.Difference(cpuset.NewCPUSet(1, 25, 13, 38, 2, 9, 11, 35, 23, 48, 12, 51,
					53, 173, 113, 233, 54, 61)),
			},
			stDefaultCPUSet: cpuset.NewCPUSet(1, 25, 13, 38, 2, 9, 11, 35, 23, 48, 12, 51, 53, 173, 113, 233, 54, 61),
			pod: makePod("12000m", "12000m"),
			expErr: nil,
			expCPUAlloc: true,
			expCSet: cpuset.NewCPUSet(1, 25, 13, 38, 11, 35, 23, 48, 53, 173, 113, 233),
		},
		{
			// All CPUs from Socket 1, 1 full core and some partial cores are available.
			// Expect all CPUs from Socket 1 and the hyper-threads from the full core.
			description: "GuPodMultipleCores, topoQuadSocketFourWayHT, ExpectAllocAllSock1+FullCore",
			topo: topoQuadSocketFourWayHT,
			containerID: "fakeID5",
			stAssignments: map[string]cpuset.CPUSet{
				"fakeID100": largeTopoCPUSet.Difference(largeTopoSock1CPUSet.Union(cpuset.NewCPUSet(10, 34, 22, 47, 53,
					173, 61, 181, 108, 228, 115, 235))),
			},
			stDefaultCPUSet: largeTopoSock1CPUSet.Union(cpuset.NewCPUSet(10, 34, 22, 47, 53, 173, 61, 181, 108, 228,
				115, 235)),
			pod: makePod("76000m", "76000m"),
			expErr: nil,
			expCPUAlloc: true,
			expCSet: largeTopoSock1CPUSet.Union(cpuset.NewCPUSet(10, 34, 22, 47)),
		},
		{
			// Only partial cores are available in the entire system.
			// Expect allocation of all the CPUs from the partial cores.
			description: "GuPodMultipleCores, topoQuadSocketFourWayHT, ExpectAllocCPUs",
			topo: topoQuadSocketFourWayHT,
			containerID: "fakeID5",
			stAssignments: map[string]cpuset.CPUSet{
				"fakeID100": largeTopoCPUSet.Difference(cpuset.NewCPUSet(10, 11, 53, 37, 55, 67, 52)),
			},
			stDefaultCPUSet: cpuset.NewCPUSet(10, 11, 53, 67, 52),
			pod: makePod("5000m", "5000m"),
			expErr: nil,
			expCPUAlloc: true,
			expCSet: cpuset.NewCPUSet(10, 11, 53, 67, 52),
		},
		{
			// Only 7 CPUs are available.
			// Pod requests 76 cores.
			// Error is expected since available CPUs are less than the request.
			description: "GuPodMultipleCores, topoQuadSocketFourWayHT, NoAlloc",
			topo: topoQuadSocketFourWayHT,
			containerID: "fakeID5",
			stAssignments: map[string]cpuset.CPUSet{
				"fakeID100": largeTopoCPUSet.Difference(cpuset.NewCPUSet(10, 11, 53, 37, 55, 67, 52)),
			},
			stDefaultCPUSet: cpuset.NewCPUSet(10, 11, 53, 37, 55, 67, 52),
			pod: makePod("76000m", "76000m"),
			expErr: fmt.Errorf("not enough cpus available to satisfy request"),
			expCPUAlloc: false,
			expCSet: cpuset.NewCPUSet(),
		},
	}

	for _, testCase := range testCases {
		policy := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs)
		st := &mockState{
			assignments: testCase.stAssignments,
			defaultCPUSet: testCase.stDefaultCPUSet,
		}
		container := &testCase.pod.Spec.Containers[0]
		err := policy.AddContainer(st, testCase.pod, container, testCase.containerID)
		if !reflect.DeepEqual(err, testCase.expErr) {
			t.Errorf("StaticPolicy AddContainer() error (%v). expected add error: %v but got: %v",
				testCase.description, testCase.expErr, err)
		}

		cset, found := st.assignments[testCase.containerID]

		// No allocation expected: the container must not show up in the state.
		if !testCase.expCPUAlloc {
			if found {
				t.Errorf("StaticPolicy AddContainer() error (%v). Did not expect container id %v to be present in assignments %v",
					testCase.description, testCase.containerID, st.assignments)
			}
			continue
		}

		// Allocation expected: check presence, exact CPU set, and that the
		// exclusive CPUs were removed from the shared pool.
		if !found {
			t.Errorf("StaticPolicy AddContainer() error (%v). expected container id %v to be present in assignments %v",
				testCase.description, testCase.containerID, st.assignments)
		}
		if !reflect.DeepEqual(cset, testCase.expCSet) {
			t.Errorf("StaticPolicy AddContainer() error (%v). expected cpuset %v but got %v",
				testCase.description, testCase.expCSet, cset)
		}
		if !cset.Intersection(st.defaultCPUSet).IsEmpty() {
			t.Errorf("StaticPolicy AddContainer() error (%v). expected cpuset %v to be disjoint from the shared cpuset %v",
				testCase.description, cset, st.defaultCPUSet)
		}
	}
}
// TestStaticPolicyRemove runs RemoveContainer against a table of scenarios.
// After each removal it checks that the state's default CPU set equals expCSet
// and that the removed container id no longer has an assignment.
func TestStaticPolicyRemove(t *testing.T) {
	testCases := []staticPolicyTest{
		{
			description: "SingleSocketHT, DeAllocOneContainer",
			topo: topoSingleSocketHT,
			containerID: "fakeID1",
			stAssignments: map[string]cpuset.CPUSet{
				"fakeID1": cpuset.NewCPUSet(1, 2, 3),
			},
			stDefaultCPUSet: cpuset.NewCPUSet(4, 5, 6, 7),
			expCSet: cpuset.NewCPUSet(1, 2, 3, 4, 5, 6, 7),
		},
		{
			description: "SingleSocketHT, DeAllocOneContainer, BeginEmpty",
			topo: topoSingleSocketHT,
			containerID: "fakeID1",
			stAssignments: map[string]cpuset.CPUSet{
				"fakeID1": cpuset.NewCPUSet(1, 2, 3),
				"fakeID2": cpuset.NewCPUSet(4, 5, 6, 7),
			},
			stDefaultCPUSet: cpuset.NewCPUSet(),
			expCSet: cpuset.NewCPUSet(1, 2, 3),
		},
		{
			description: "SingleSocketHT, DeAllocTwoContainer",
			topo: topoSingleSocketHT,
			containerID: "fakeID1",
			stAssignments: map[string]cpuset.CPUSet{
				"fakeID1": cpuset.NewCPUSet(1, 3, 5),
				"fakeID2": cpuset.NewCPUSet(2, 4),
			},
			stDefaultCPUSet: cpuset.NewCPUSet(6, 7),
			expCSet: cpuset.NewCPUSet(1, 3, 5, 6, 7),
		},
		{
			description: "SingleSocketHT, NoDeAlloc",
			topo: topoSingleSocketHT,
			containerID: "fakeID2",
			stAssignments: map[string]cpuset.CPUSet{
				"fakeID1": cpuset.NewCPUSet(1, 3, 5),
			},
			stDefaultCPUSet: cpuset.NewCPUSet(2, 4, 6, 7),
			expCSet: cpuset.NewCPUSet(2, 4, 6, 7),
		},
	}

	for _, tc := range testCases {
		st := &mockState{
			assignments: tc.stAssignments,
			defaultCPUSet: tc.stDefaultCPUSet,
		}
		policy := NewStaticPolicy(tc.topo, tc.numReservedCPUs)
		policy.RemoveContainer(st, tc.containerID)

		// The released CPUs must have been returned to the default set.
		if got := st.defaultCPUSet; !reflect.DeepEqual(got, tc.expCSet) {
			t.Errorf("StaticPolicy RemoveContainer() error (%v). expected default cpuset %v but got %v",
				tc.description, tc.expCSet, got)
		}

		// The container must no longer be tracked.
		_, stillAssigned := st.assignments[tc.containerID]
		if stillAssigned {
			t.Errorf("StaticPolicy RemoveContainer() error (%v). expected containerID %v not be in assignments %v",
				tc.description, tc.containerID, st.assignments)
		}
	}
}

View file

@ -0,0 +1,390 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
)
var (
// topoSingleSocketHT is a test fixture: 1 socket, 4 cores, 8 logical CPUs.
// CPU n and CPU n+4 share core n (two hardware threads per core).
topoSingleSocketHT = &topology.CPUTopology{
NumCPUs: 8,
NumSockets: 1,
NumCores: 4,
CPUDetails: map[int]topology.CPUInfo{
0: {CoreID: 0, SocketID: 0},
1: {CoreID: 1, SocketID: 0},
2: {CoreID: 2, SocketID: 0},
3: {CoreID: 3, SocketID: 0},
4: {CoreID: 0, SocketID: 0},
5: {CoreID: 1, SocketID: 0},
6: {CoreID: 2, SocketID: 0},
7: {CoreID: 3, SocketID: 0},
},
}
// topoDualSocketHT is a test fixture: 2 sockets, 6 cores, 12 logical CPUs.
// Even-numbered CPUs are on socket 0 and odd-numbered CPUs on socket 1;
// CPU n and CPU n+6 share core n (two hardware threads per core).
topoDualSocketHT = &topology.CPUTopology{
NumCPUs: 12,
NumSockets: 2,
NumCores: 6,
CPUDetails: map[int]topology.CPUInfo{
0: {CoreID: 0, SocketID: 0},
1: {CoreID: 1, SocketID: 1},
2: {CoreID: 2, SocketID: 0},
3: {CoreID: 3, SocketID: 1},
4: {CoreID: 4, SocketID: 0},
5: {CoreID: 5, SocketID: 1},
6: {CoreID: 0, SocketID: 0},
7: {CoreID: 1, SocketID: 1},
8: {CoreID: 2, SocketID: 0},
9: {CoreID: 3, SocketID: 1},
10: {CoreID: 4, SocketID: 0},
11: {CoreID: 5, SocketID: 1},
},
}
// topoDualSocketNoHT is a test fixture: 2 sockets, 8 cores, 8 logical CPUs
// (one CPU per core). CPUs 0-3 are on socket 0 and CPUs 4-7 on socket 1.
topoDualSocketNoHT = &topology.CPUTopology{
NumCPUs: 8,
NumSockets: 2,
NumCores: 8,
CPUDetails: map[int]topology.CPUInfo{
0: {CoreID: 0, SocketID: 0},
1: {CoreID: 1, SocketID: 0},
2: {CoreID: 2, SocketID: 0},
3: {CoreID: 3, SocketID: 0},
4: {CoreID: 4, SocketID: 1},
5: {CoreID: 5, SocketID: 1},
6: {CoreID: 6, SocketID: 1},
7: {CoreID: 7, SocketID: 1},
},
}
/*
Topology from https://www.open-mpi.org/projects/hwloc/lstopo/images/KNL.SNC4.H50.v1.11.png.
Socket0:
0-2,9-10,13-14,21-22,25-26,33-34,38-39,46-47,50,57-58,71-72,79-80,87-88,95-96,103-104,109-110,117-118,
131-132,139-140,147-148,155-156,163-164,169-170,177-178,191-192,199-200,207-208,215-216,223-224,229-230,
237-238,251-252,259-260,267-268,275-276,283-284
Socket1:
3-4,11-12,15-16,23-24,27-28,35-36,40-41,48-49,51-52,59-60,65-66,73-74,81-82,89-90,97-98,111-112,119-120,125-126,
133-134,141-142,149-150,157-158,171-172,179-180,185-186,193-194,201-202,209-210,217-218,231-232,239-240,245-246,
253-254,261-262,269-270,277-278
Socket2:
5-6,17-18,29-30,42-43,53-54,61-62,67-68,75-76,83-84,91-92,99-100,105-106,113-114,121-122,127-128,135-136,
143-144,151-152,159-160,165-166,173-174,181-182,187-188,195-196,203-204,211-212,219-220,225-226,233-234,241-242,
247-248,255-256,263-264,271-272,279-280,285-286
Socket3:
7-8,19-20,31-32,37,44-45,55-56,63-64,69-70,77-78,85-86,93-94,101-102,107-108,115-116,123-124,129-130,137-138,
145-146,153-154,161-162,167-168,175-176,183-184,189-190,197-198,205-206,213-214,221-222,227-228,235-236,243-244,
249-250,257-258,265-266,273-274,281-282,287
*/
// topoQuadSocketFourWayHT is a test fixture: 4 sockets, 72 cores, 288 logical
// CPUs. Every CoreID in the table is listed for exactly four CPU IDs (four
// hardware threads per core) and each socket holds 18 cores. Core IDs are not
// contiguous: values up to 75 appear even though NumCores is 72.
topoQuadSocketFourWayHT = &topology.CPUTopology{
NumCPUs: 288,
NumSockets: 4,
NumCores: 72,
CPUDetails: map[int]topology.CPUInfo{
0: {CoreID: 0, SocketID: 0},
169: {CoreID: 0, SocketID: 0},
109: {CoreID: 0, SocketID: 0},
229: {CoreID: 0, SocketID: 0},
50: {CoreID: 1, SocketID: 0},
170: {CoreID: 1, SocketID: 0},
110: {CoreID: 1, SocketID: 0},
230: {CoreID: 1, SocketID: 0},
1: {CoreID: 64, SocketID: 0},
25: {CoreID: 64, SocketID: 0},
13: {CoreID: 64, SocketID: 0},
38: {CoreID: 64, SocketID: 0},
2: {CoreID: 65, SocketID: 0},
26: {CoreID: 65, SocketID: 0},
14: {CoreID: 65, SocketID: 0},
39: {CoreID: 65, SocketID: 0},
9: {CoreID: 72, SocketID: 0},
33: {CoreID: 72, SocketID: 0},
21: {CoreID: 72, SocketID: 0},
46: {CoreID: 72, SocketID: 0},
10: {CoreID: 73, SocketID: 0},
34: {CoreID: 73, SocketID: 0},
22: {CoreID: 73, SocketID: 0},
47: {CoreID: 73, SocketID: 0},
57: {CoreID: 8, SocketID: 0},
177: {CoreID: 8, SocketID: 0},
117: {CoreID: 8, SocketID: 0},
237: {CoreID: 8, SocketID: 0},
58: {CoreID: 9, SocketID: 0},
178: {CoreID: 9, SocketID: 0},
118: {CoreID: 9, SocketID: 0},
238: {CoreID: 9, SocketID: 0},
71: {CoreID: 24, SocketID: 0},
191: {CoreID: 24, SocketID: 0},
131: {CoreID: 24, SocketID: 0},
251: {CoreID: 24, SocketID: 0},
72: {CoreID: 25, SocketID: 0},
192: {CoreID: 25, SocketID: 0},
132: {CoreID: 25, SocketID: 0},
252: {CoreID: 25, SocketID: 0},
79: {CoreID: 32, SocketID: 0},
199: {CoreID: 32, SocketID: 0},
139: {CoreID: 32, SocketID: 0},
259: {CoreID: 32, SocketID: 0},
80: {CoreID: 33, SocketID: 0},
200: {CoreID: 33, SocketID: 0},
140: {CoreID: 33, SocketID: 0},
260: {CoreID: 33, SocketID: 0},
87: {CoreID: 40, SocketID: 0},
207: {CoreID: 40, SocketID: 0},
147: {CoreID: 40, SocketID: 0},
267: {CoreID: 40, SocketID: 0},
88: {CoreID: 41, SocketID: 0},
208: {CoreID: 41, SocketID: 0},
148: {CoreID: 41, SocketID: 0},
268: {CoreID: 41, SocketID: 0},
95: {CoreID: 48, SocketID: 0},
215: {CoreID: 48, SocketID: 0},
155: {CoreID: 48, SocketID: 0},
275: {CoreID: 48, SocketID: 0},
96: {CoreID: 49, SocketID: 0},
216: {CoreID: 49, SocketID: 0},
156: {CoreID: 49, SocketID: 0},
276: {CoreID: 49, SocketID: 0},
103: {CoreID: 56, SocketID: 0},
223: {CoreID: 56, SocketID: 0},
163: {CoreID: 56, SocketID: 0},
283: {CoreID: 56, SocketID: 0},
104: {CoreID: 57, SocketID: 0},
224: {CoreID: 57, SocketID: 0},
164: {CoreID: 57, SocketID: 0},
284: {CoreID: 57, SocketID: 0},
3: {CoreID: 66, SocketID: 1},
27: {CoreID: 66, SocketID: 1},
15: {CoreID: 66, SocketID: 1},
40: {CoreID: 66, SocketID: 1},
4: {CoreID: 67, SocketID: 1},
28: {CoreID: 67, SocketID: 1},
16: {CoreID: 67, SocketID: 1},
41: {CoreID: 67, SocketID: 1},
11: {CoreID: 74, SocketID: 1},
35: {CoreID: 74, SocketID: 1},
23: {CoreID: 74, SocketID: 1},
48: {CoreID: 74, SocketID: 1},
12: {CoreID: 75, SocketID: 1},
36: {CoreID: 75, SocketID: 1},
24: {CoreID: 75, SocketID: 1},
49: {CoreID: 75, SocketID: 1},
51: {CoreID: 2, SocketID: 1},
171: {CoreID: 2, SocketID: 1},
111: {CoreID: 2, SocketID: 1},
231: {CoreID: 2, SocketID: 1},
52: {CoreID: 3, SocketID: 1},
172: {CoreID: 3, SocketID: 1},
112: {CoreID: 3, SocketID: 1},
232: {CoreID: 3, SocketID: 1},
59: {CoreID: 10, SocketID: 1},
179: {CoreID: 10, SocketID: 1},
119: {CoreID: 10, SocketID: 1},
239: {CoreID: 10, SocketID: 1},
60: {CoreID: 11, SocketID: 1},
180: {CoreID: 11, SocketID: 1},
120: {CoreID: 11, SocketID: 1},
240: {CoreID: 11, SocketID: 1},
65: {CoreID: 18, SocketID: 1},
185: {CoreID: 18, SocketID: 1},
125: {CoreID: 18, SocketID: 1},
245: {CoreID: 18, SocketID: 1},
66: {CoreID: 19, SocketID: 1},
186: {CoreID: 19, SocketID: 1},
126: {CoreID: 19, SocketID: 1},
246: {CoreID: 19, SocketID: 1},
73: {CoreID: 26, SocketID: 1},
193: {CoreID: 26, SocketID: 1},
133: {CoreID: 26, SocketID: 1},
253: {CoreID: 26, SocketID: 1},
74: {CoreID: 27, SocketID: 1},
194: {CoreID: 27, SocketID: 1},
134: {CoreID: 27, SocketID: 1},
254: {CoreID: 27, SocketID: 1},
81: {CoreID: 34, SocketID: 1},
201: {CoreID: 34, SocketID: 1},
141: {CoreID: 34, SocketID: 1},
261: {CoreID: 34, SocketID: 1},
82: {CoreID: 35, SocketID: 1},
202: {CoreID: 35, SocketID: 1},
142: {CoreID: 35, SocketID: 1},
262: {CoreID: 35, SocketID: 1},
89: {CoreID: 42, SocketID: 1},
209: {CoreID: 42, SocketID: 1},
149: {CoreID: 42, SocketID: 1},
269: {CoreID: 42, SocketID: 1},
90: {CoreID: 43, SocketID: 1},
210: {CoreID: 43, SocketID: 1},
150: {CoreID: 43, SocketID: 1},
270: {CoreID: 43, SocketID: 1},
97: {CoreID: 50, SocketID: 1},
217: {CoreID: 50, SocketID: 1},
157: {CoreID: 50, SocketID: 1},
277: {CoreID: 50, SocketID: 1},
98: {CoreID: 51, SocketID: 1},
218: {CoreID: 51, SocketID: 1},
158: {CoreID: 51, SocketID: 1},
278: {CoreID: 51, SocketID: 1},
5: {CoreID: 68, SocketID: 2},
29: {CoreID: 68, SocketID: 2},
17: {CoreID: 68, SocketID: 2},
42: {CoreID: 68, SocketID: 2},
6: {CoreID: 69, SocketID: 2},
30: {CoreID: 69, SocketID: 2},
18: {CoreID: 69, SocketID: 2},
43: {CoreID: 69, SocketID: 2},
53: {CoreID: 4, SocketID: 2},
173: {CoreID: 4, SocketID: 2},
113: {CoreID: 4, SocketID: 2},
233: {CoreID: 4, SocketID: 2},
54: {CoreID: 5, SocketID: 2},
174: {CoreID: 5, SocketID: 2},
114: {CoreID: 5, SocketID: 2},
234: {CoreID: 5, SocketID: 2},
61: {CoreID: 12, SocketID: 2},
181: {CoreID: 12, SocketID: 2},
121: {CoreID: 12, SocketID: 2},
241: {CoreID: 12, SocketID: 2},
62: {CoreID: 13, SocketID: 2},
182: {CoreID: 13, SocketID: 2},
122: {CoreID: 13, SocketID: 2},
242: {CoreID: 13, SocketID: 2},
67: {CoreID: 20, SocketID: 2},
187: {CoreID: 20, SocketID: 2},
127: {CoreID: 20, SocketID: 2},
247: {CoreID: 20, SocketID: 2},
68: {CoreID: 21, SocketID: 2},
188: {CoreID: 21, SocketID: 2},
128: {CoreID: 21, SocketID: 2},
248: {CoreID: 21, SocketID: 2},
75: {CoreID: 28, SocketID: 2},
195: {CoreID: 28, SocketID: 2},
135: {CoreID: 28, SocketID: 2},
255: {CoreID: 28, SocketID: 2},
76: {CoreID: 29, SocketID: 2},
196: {CoreID: 29, SocketID: 2},
136: {CoreID: 29, SocketID: 2},
256: {CoreID: 29, SocketID: 2},
83: {CoreID: 36, SocketID: 2},
203: {CoreID: 36, SocketID: 2},
143: {CoreID: 36, SocketID: 2},
263: {CoreID: 36, SocketID: 2},
84: {CoreID: 37, SocketID: 2},
204: {CoreID: 37, SocketID: 2},
144: {CoreID: 37, SocketID: 2},
264: {CoreID: 37, SocketID: 2},
91: {CoreID: 44, SocketID: 2},
211: {CoreID: 44, SocketID: 2},
151: {CoreID: 44, SocketID: 2},
271: {CoreID: 44, SocketID: 2},
92: {CoreID: 45, SocketID: 2},
212: {CoreID: 45, SocketID: 2},
152: {CoreID: 45, SocketID: 2},
272: {CoreID: 45, SocketID: 2},
99: {CoreID: 52, SocketID: 2},
219: {CoreID: 52, SocketID: 2},
159: {CoreID: 52, SocketID: 2},
279: {CoreID: 52, SocketID: 2},
100: {CoreID: 53, SocketID: 2},
220: {CoreID: 53, SocketID: 2},
160: {CoreID: 53, SocketID: 2},
280: {CoreID: 53, SocketID: 2},
105: {CoreID: 60, SocketID: 2},
225: {CoreID: 60, SocketID: 2},
165: {CoreID: 60, SocketID: 2},
285: {CoreID: 60, SocketID: 2},
106: {CoreID: 61, SocketID: 2},
226: {CoreID: 61, SocketID: 2},
166: {CoreID: 61, SocketID: 2},
286: {CoreID: 61, SocketID: 2},
7: {CoreID: 70, SocketID: 3},
31: {CoreID: 70, SocketID: 3},
19: {CoreID: 70, SocketID: 3},
44: {CoreID: 70, SocketID: 3},
8: {CoreID: 71, SocketID: 3},
32: {CoreID: 71, SocketID: 3},
20: {CoreID: 71, SocketID: 3},
45: {CoreID: 71, SocketID: 3},
37: {CoreID: 63, SocketID: 3},
168: {CoreID: 63, SocketID: 3},
108: {CoreID: 63, SocketID: 3},
228: {CoreID: 63, SocketID: 3},
107: {CoreID: 62, SocketID: 3},
227: {CoreID: 62, SocketID: 3},
167: {CoreID: 62, SocketID: 3},
287: {CoreID: 62, SocketID: 3},
55: {CoreID: 6, SocketID: 3},
175: {CoreID: 6, SocketID: 3},
115: {CoreID: 6, SocketID: 3},
235: {CoreID: 6, SocketID: 3},
56: {CoreID: 7, SocketID: 3},
176: {CoreID: 7, SocketID: 3},
116: {CoreID: 7, SocketID: 3},
236: {CoreID: 7, SocketID: 3},
63: {CoreID: 14, SocketID: 3},
183: {CoreID: 14, SocketID: 3},
123: {CoreID: 14, SocketID: 3},
243: {CoreID: 14, SocketID: 3},
64: {CoreID: 15, SocketID: 3},
184: {CoreID: 15, SocketID: 3},
124: {CoreID: 15, SocketID: 3},
244: {CoreID: 15, SocketID: 3},
69: {CoreID: 22, SocketID: 3},
189: {CoreID: 22, SocketID: 3},
129: {CoreID: 22, SocketID: 3},
249: {CoreID: 22, SocketID: 3},
70: {CoreID: 23, SocketID: 3},
190: {CoreID: 23, SocketID: 3},
130: {CoreID: 23, SocketID: 3},
250: {CoreID: 23, SocketID: 3},
77: {CoreID: 30, SocketID: 3},
197: {CoreID: 30, SocketID: 3},
137: {CoreID: 30, SocketID: 3},
257: {CoreID: 30, SocketID: 3},
78: {CoreID: 31, SocketID: 3},
198: {CoreID: 31, SocketID: 3},
138: {CoreID: 31, SocketID: 3},
258: {CoreID: 31, SocketID: 3},
85: {CoreID: 38, SocketID: 3},
205: {CoreID: 38, SocketID: 3},
145: {CoreID: 38, SocketID: 3},
265: {CoreID: 38, SocketID: 3},
86: {CoreID: 39, SocketID: 3},
206: {CoreID: 39, SocketID: 3},
146: {CoreID: 39, SocketID: 3},
266: {CoreID: 39, SocketID: 3},
93: {CoreID: 46, SocketID: 3},
213: {CoreID: 46, SocketID: 3},
153: {CoreID: 46, SocketID: 3},
273: {CoreID: 46, SocketID: 3},
94: {CoreID: 47, SocketID: 3},
214: {CoreID: 47, SocketID: 3},
154: {CoreID: 47, SocketID: 3},
274: {CoreID: 47, SocketID: 3},
101: {CoreID: 54, SocketID: 3},
221: {CoreID: 54, SocketID: 3},
161: {CoreID: 54, SocketID: 3},
281: {CoreID: 54, SocketID: 3},
102: {CoreID: 55, SocketID: 3},
222: {CoreID: 55, SocketID: 3},
162: {CoreID: 55, SocketID: 3},
282: {CoreID: 55, SocketID: 3},
},
}
)

View file

@ -0,0 +1,28 @@
# Bazel build rules for the cpumanager state package.
load("@io_bazel_rules_go//go:def.bzl", "go_library")
# Go library built from state.go and state_mem.go; visible to every package.
go_library(
name = "go_default_library",
srcs = [
"state.go",
"state_mem.go",
],
visibility = ["//visibility:public"],
deps = [
"//pkg/kubelet/cm/cpuset:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
],
)
# Every file under this directory (glob "**"); private to this package.
# NOTE(review): the "automanaged" tag suggests this rule is maintained by
# tooling — confirm before editing by hand.
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
# Publicly visible file group that re-exports :package-srcs.
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)

View file

@ -0,0 +1,40 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
)
// Reader is the read-only view of the cpu/pod assignment state: it exposes the
// per-container CPU sets and the default CPU set.
type Reader interface {
// GetCPUSet returns the CPU set recorded for containerID; the boolean
// reports whether an assignment exists for that container.
GetCPUSet(containerID string) (cpuset.CPUSet, bool)
// GetDefaultCPUSet returns the default CPU set.
GetDefaultCPUSet() cpuset.CPUSet
// GetCPUSetOrDefault returns the CPU set recorded for containerID, or the
// default CPU set when the container has no assignment.
GetCPUSetOrDefault(containerID string) cpuset.CPUSet
}
// writer is the mutating half of the cpu/pod assignment state. It is
// unexported and reaches callers only through the State interface.
type writer interface {
// SetCPUSet records cpuset as the assignment of containerID.
SetCPUSet(containerID string, cpuset cpuset.CPUSet)
// SetDefaultCPUSet replaces the default CPU set.
SetDefaultCPUSet(cpuset cpuset.CPUSet)
// Delete removes the assignment of containerID.
Delete(containerID string)
}
// State provides methods for tracking and setting cpu/pod assignment. It is
// the union of the read-only Reader interface and the unexported writer
// interface.
type State interface {
Reader
writer
}

View file

@ -0,0 +1,90 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"sync"
"github.com/golang/glog"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
)
// stateMemory is an in-memory implementation of State. All of its methods take
// the embedded RWMutex before touching the fields below.
type stateMemory struct {
sync.RWMutex
// assignments maps a container ID to the CPU set recorded for it.
assignments map[string]cpuset.CPUSet
// defaultCPUSet is the CPU set returned for containers without an assignment.
defaultCPUSet cpuset.CPUSet
}
// Compile-time check that *stateMemory satisfies the State interface.
var _ State = &stateMemory{}
// NewMemoryState returns a State backed by memory only. It starts with no
// container assignments and an empty default CPU set.
func NewMemoryState() State {
	glog.Infof("[cpumanager] initializing new in-memory state store")

	mem := new(stateMemory)
	mem.assignments = make(map[string]cpuset.CPUSet)
	mem.defaultCPUSet = cpuset.NewCPUSet()
	return mem
}
// GetCPUSet returns a clone of the CPU set recorded for containerID and
// whether such an assignment exists. The lookup runs under the read lock.
func (s *stateMemory) GetCPUSet(containerID string) (cpuset.CPUSet, bool) {
	s.RLock()
	defer s.RUnlock()

	assigned, exists := s.assignments[containerID]
	snapshot := assigned.Clone()
	return snapshot, exists
}
// GetDefaultCPUSet returns a clone of the default CPU set, read under the
// read lock.
func (s *stateMemory) GetDefaultCPUSet() cpuset.CPUSet {
	s.RLock()
	defer s.RUnlock()

	snapshot := s.defaultCPUSet.Clone()
	return snapshot
}
// GetCPUSetOrDefault returns a clone of the CPU set recorded for containerID,
// or a clone of the default CPU set when the container has no assignment.
//
// Both fields are read directly under a single read lock instead of going
// through GetCPUSet / GetDefaultCPUSet: those methods acquire the read lock
// themselves, and sync.RWMutex prohibits recursive read locking — a writer
// that calls Lock between the outer and the inner RLock would block the inner
// RLock while itself waiting for the outer one, deadlocking the state.
func (s *stateMemory) GetCPUSetOrDefault(containerID string) cpuset.CPUSet {
	s.RLock()
	defer s.RUnlock()

	if res, ok := s.assignments[containerID]; ok {
		return res.Clone()
	}
	return s.defaultCPUSet.Clone()
}
// SetCPUSet records cset as the CPU set of containerID, replacing any previous
// assignment, and logs the update. The write lock is held for the whole call,
// including the log statement.
func (s *stateMemory) SetCPUSet(containerID string, cset cpuset.CPUSet) {
s.Lock()
defer s.Unlock()
s.assignments[containerID] = cset
glog.Infof("[cpumanager] updated desired cpuset (container id: %s, cpuset: \"%s\")", containerID, cset)
}
// SetDefaultCPUSet replaces the default CPU set with cset and logs the update.
// The write lock is held for the whole call, including the log statement.
func (s *stateMemory) SetDefaultCPUSet(cset cpuset.CPUSet) {
s.Lock()
defer s.Unlock()
s.defaultCPUSet = cset
glog.Infof("[cpumanager] updated default cpuset: \"%s\"", cset)
}
// Delete removes the assignment recorded for containerID (a no-op for the map
// when the container has none) and logs the removal at verbosity level 2. The
// write lock is held for the whole call.
func (s *stateMemory) Delete(containerID string) {
s.Lock()
defer s.Unlock()
delete(s.assignments, containerID)
glog.V(2).Infof("[cpumanager] deleted cpuset assignment (container id: %s)", containerID)
}

View file

@ -0,0 +1,35 @@
# Bazel build rules for the cpumanager topology package.
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
# Go library built from doc.go and topology.go; visible to every package.
go_library(
name = "go_default_library",
srcs = [
"doc.go",
"topology.go",
],
visibility = ["//visibility:public"],
deps = [
"//pkg/kubelet/cm/cpuset:go_default_library",
"//vendor/github.com/google/cadvisor/info/v1:go_default_library",
],
)
# Every file under this directory (glob "**"); private to this package.
# NOTE(review): the "automanaged" tag suggests this rule is maintained by
# tooling — confirm before editing by hand.
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
# Publicly visible file group that re-exports :package-srcs.
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)
# Unit tests from topology_test.go, built against :go_default_library.
go_test(
name = "go_default_test",
srcs = ["topology_test.go"],
library = ":go_default_library",
deps = ["//vendor/github.com/google/cadvisor/info/v1:go_default_library"],
)

View file

@ -0,0 +1,18 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package topology contains helpers for the CPU manager.
package topology // import "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"

View file

@ -0,0 +1,169 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package topology
import (
"fmt"
cadvisorapi "github.com/google/cadvisor/info/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
)
// CPUDetails maps a logical CPU ID to the CPUInfo (core ID and socket ID) of
// that CPU.
type CPUDetails map[int]CPUInfo
// CPUTopology contains details of the node's CPUs. The terms used here map to
// cadvisor's terms as follows:
// CPU - logical CPU, cadvisor - thread
// Core - physical CPU, cadvisor - Core
// Socket - socket, cadvisor - Node
type CPUTopology struct {
// NumCPUs is the number of logical CPUs.
NumCPUs int
// NumCores is the number of physical cores.
NumCores int
// NumSockets is the number of sockets.
NumSockets int
// CPUDetails holds the core and socket ID of every logical CPU.
CPUDetails CPUDetails
}
// CPUsPerCore returns the number of logical CPUs associated with each core,
// computed as NumCPUs / NumCores using integer division. It returns 0 when
// NumCores is 0.
func (topo *CPUTopology) CPUsPerCore() int {
	if cores := topo.NumCores; cores != 0 {
		return topo.NumCPUs / cores
	}
	return 0
}
// CPUsPerSocket returns the number of logical CPUs associated with each
// socket, computed as NumCPUs / NumSockets using integer division. It returns
// 0 when NumSockets is 0.
func (topo *CPUTopology) CPUsPerSocket() int {
	if sockets := topo.NumSockets; sockets != 0 {
		return topo.NumCPUs / sockets
	}
	return 0
}
// CPUInfo contains the socket and core IDs associated with a logical CPU.
type CPUInfo struct {
// SocketID is the ID of the socket the CPU belongs to.
SocketID int
// CoreID is the ID of the physical core the CPU belongs to.
CoreID int
}
// KeepOnly returns a new CPUDetails holding only those entries of d whose CPU
// ID is contained in cpus. The receiver is not modified.
func (d CPUDetails) KeepOnly(cpus cpuset.CPUSet) CPUDetails {
	kept := CPUDetails{}
	for id, details := range d {
		if !cpus.Contains(id) {
			continue
		}
		kept[id] = details
	}
	return kept
}
// Sockets returns the set of socket IDs referenced by the CPUs in this
// CPUDetails. The result is a cpuset.CPUSet whose elements are socket IDs,
// not CPU IDs.
func (d CPUDetails) Sockets() cpuset.CPUSet {
	socketIDs := cpuset.NewBuilder()
	for _, details := range d {
		socketIDs.Add(details.SocketID)
	}
	return socketIDs.Result()
}
// CPUsInSocket returns the set of logical CPU IDs in this CPUDetails whose
// SocketID equals id.
func (d CPUDetails) CPUsInSocket(id int) cpuset.CPUSet {
	members := cpuset.NewBuilder()
	for cpu := range d {
		if d[cpu].SocketID != id {
			continue
		}
		members.Add(cpu)
	}
	return members.Result()
}
// Cores returns the set of core IDs referenced by the CPUs in this
// CPUDetails.
func (d CPUDetails) Cores() cpuset.CPUSet {
	cores := cpuset.NewBuilder()
	for cpu := range d {
		cores.Add(d[cpu].CoreID)
	}
	return cores.Result()
}
// CoresInSocket returns the set of core IDs of those CPUs in this
// CPUDetails whose SocketID equals id.
func (d CPUDetails) CoresInSocket(id int) cpuset.CPUSet {
	cores := cpuset.NewBuilder()
	for cpu := range d {
		if entry := d[cpu]; entry.SocketID == id {
			cores.Add(entry.CoreID)
		}
	}
	return cores.Result()
}
// CPUs returns the set of all logical CPU IDs (the map keys) in this
// CPUDetails.
func (d CPUDetails) CPUs() cpuset.CPUSet {
	all := cpuset.NewBuilder()
	for id := range d {
		all.Add(id)
	}
	return all.Result()
}
// CPUsInCore returns the set of logical CPU IDs in this CPUDetails whose
// CoreID equals id.
func (d CPUDetails) CPUsInCore(id int) cpuset.CPUSet {
	siblings := cpuset.NewBuilder()
	for cpu := range d {
		if d[cpu].CoreID != id {
			continue
		}
		siblings.Add(cpu)
	}
	return siblings.Result()
}
// Discover builds a CPUTopology from cadvisor machine info.
//
// NumCPUs is taken from machineInfo.NumCores, NumSockets is the number of
// entries in machineInfo.Topology, and NumCores is the total number of
// cores listed under those entries. Every thread of every core is recorded
// in CPUDetails with its core and socket (topology node) IDs.
//
// An error is returned when machineInfo.NumCores is 0.
func Discover(machineInfo *cadvisorapi.MachineInfo) (*CPUTopology, error) {
	if machineInfo.NumCores == 0 {
		return nil, fmt.Errorf("could not detect number of cpus")
	}

	details := CPUDetails{}
	coreCount := 0
	for _, node := range machineInfo.Topology {
		coreCount += len(node.Cores)
		for _, core := range node.Cores {
			// All threads of one core share the same core/socket pair.
			info := CPUInfo{SocketID: node.Id, CoreID: core.Id}
			for _, thread := range core.Threads {
				details[thread] = info
			}
		}
	}

	topo := &CPUTopology{
		NumCPUs:    machineInfo.NumCores,
		NumCores:   coreCount,
		NumSockets: len(machineInfo.Topology),
		CPUDetails: details,
	}
	return topo, nil
}

View file

@ -0,0 +1,123 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package topology
import (
"reflect"
"testing"
cadvisorapi "github.com/google/cadvisor/info/v1"
)
// Test_Discover checks that Discover converts cadvisor machine info into
// the expected CPUTopology, and that it reports an error exactly when the
// test case expects one.
func Test_Discover(t *testing.T) {
	tests := []struct {
		name    string
		args    *cadvisorapi.MachineInfo
		want    *CPUTopology
		wantErr bool
	}{
		{
			name: "FailNumCores",
			args: &cadvisorapi.MachineInfo{
				NumCores: 0,
			},
			want:    &CPUTopology{},
			wantErr: true,
		},
		{
			name: "OneSocketHT",
			args: &cadvisorapi.MachineInfo{
				NumCores: 8,
				Topology: []cadvisorapi.Node{
					{Id: 0,
						Cores: []cadvisorapi.Core{
							{Id: 0, Threads: []int{0, 4}},
							{Id: 1, Threads: []int{1, 5}},
							{Id: 2, Threads: []int{2, 6}},
							{Id: 3, Threads: []int{3, 7}},
						},
					},
				},
			},
			want: &CPUTopology{
				NumCPUs:    8,
				NumSockets: 1,
				NumCores:   4,
				CPUDetails: map[int]CPUInfo{
					0: {CoreID: 0, SocketID: 0},
					1: {CoreID: 1, SocketID: 0},
					2: {CoreID: 2, SocketID: 0},
					3: {CoreID: 3, SocketID: 0},
					4: {CoreID: 0, SocketID: 0},
					5: {CoreID: 1, SocketID: 0},
					6: {CoreID: 2, SocketID: 0},
					7: {CoreID: 3, SocketID: 0},
				},
			},
			wantErr: false,
		},
		{
			name: "DualSocketNoHT",
			args: &cadvisorapi.MachineInfo{
				NumCores: 4,
				Topology: []cadvisorapi.Node{
					{Id: 0,
						Cores: []cadvisorapi.Core{
							{Id: 0, Threads: []int{0}},
							{Id: 2, Threads: []int{2}},
						},
					},
					{Id: 1,
						Cores: []cadvisorapi.Core{
							{Id: 1, Threads: []int{1}},
							{Id: 3, Threads: []int{3}},
						},
					},
				},
			},
			want: &CPUTopology{
				NumCPUs:    4,
				NumSockets: 2,
				NumCores:   4,
				CPUDetails: map[int]CPUInfo{
					0: {CoreID: 0, SocketID: 0},
					1: {CoreID: 1, SocketID: 1},
					2: {CoreID: 2, SocketID: 0},
					3: {CoreID: 3, SocketID: 1},
				},
			},
			wantErr: false,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := Discover(tt.args)
			// Fail explicitly on both an unexpected error and a missing
			// expected error, instead of letting the latter fall through
			// to the struct comparison below.
			if (err != nil) != tt.wantErr {
				t.Errorf("Discover() error = %v, wantErr %v", err, tt.wantErr)
				return
			}
			if err != nil {
				t.Logf("Discover() expected error = %v", err)
				return
			}
			if !reflect.DeepEqual(got, tt.want) {
				t.Errorf("Discover() = %v, want %v", got, tt.want)
			}
		})
	}
}

28
vendor/k8s.io/kubernetes/pkg/kubelet/cm/cpuset/BUILD generated vendored Normal file
View file

@ -0,0 +1,28 @@
# Bring the go_library and go_test rules into scope.
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")

# Library target built from cpuset.go; depends on the vendored glog package.
go_library(
    name = "go_default_library",
    srcs = ["cpuset.go"],
    visibility = ["//visibility:public"],
    deps = ["//vendor/github.com/golang/glog:go_default_library"],
)

# Test target for cpuset_test.go, tied to the library target above.
go_test(
    name = "go_default_test",
    srcs = ["cpuset_test.go"],
    library = ":go_default_library",
)

# Every file under this package directory (private to the package).
filegroup(
    name = "package-srcs",
    srcs = glob(["**"]),
    tags = ["automanaged"],
    visibility = ["//visibility:private"],
)

# Publicly visible alias for the package sources.
filegroup(
    name = "all-srcs",
    srcs = [":package-srcs"],
    tags = ["automanaged"],
    visibility = ["//visibility:public"],
)

View file

@ -0,0 +1,5 @@
approvers:
- derekwaynecarr
- vishh
- ConnorDoyle
- sjenning

View file

@ -0,0 +1,280 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpuset
import (
"bytes"
"fmt"
"github.com/golang/glog"
"reflect"
"sort"
"strconv"
"strings"
)
// Builder is a mutable builder for CPUSet. Functions that mutate instances
// of this type are not thread-safe.
//
// Add and Result use pointer receivers so that the "done" flag set by
// Result is recorded on the builder itself rather than on a copy; call them
// on an addressable Builder (for example a local variable holding the
// value returned by NewBuilder).
type Builder struct {
	// result is the set under construction.
	result CPUSet
	// done is set once Result has been called; Add is a no-op afterwards.
	done bool
}

// NewBuilder returns a mutable CPUSet builder holding an empty set.
func NewBuilder() Builder {
	return Builder{
		result: CPUSet{
			elems: map[int]struct{}{},
		},
	}
}

// Add adds the supplied elements to the result. Calling Add after calling
// Result has no effect, so a CPUSet that has already been handed out is
// never modified.
func (b *Builder) Add(elems ...int) {
	if b.done {
		return
	}
	for _, elem := range elems {
		b.result.elems[elem] = struct{}{}
	}
}

// Result returns the result CPUSet containing all elements that were
// previously added to this builder. Subsequent calls to Add have no effect.
func (b *Builder) Result() CPUSet {
	// The returned set shares its map with the builder, so the builder is
	// frozen here to keep the returned set immutable.
	b.done = true
	return b.result
}

// CPUSet is a thread-safe, immutable set-like data structure for CPU IDs.
type CPUSet struct {
	// elems holds the CPU IDs that are members of the set.
	elems map[int]struct{}
}
// NewCPUSet returns a new CPUSet containing the supplied CPU IDs.
// Duplicate IDs collapse into a single member.
func NewCPUSet(cpus ...int) CPUSet {
	builder := NewBuilder()
	builder.Add(cpus...)
	return builder.Result()
}
// Size returns the number of elements in this set. A zero-value CPUSet
// (nil element map) has size 0.
func (s CPUSet) Size() int {
	return len(s.elems)
}
// IsEmpty reports whether this set has no elements.
func (s CPUSet) IsEmpty() bool {
	return len(s.elems) == 0
}
// Contains reports whether cpu is a member of this set.
func (s CPUSet) Contains(cpu int) bool {
	if _, present := s.elems[cpu]; present {
		return true
	}
	return false
}
// Equals returns true if the supplied set contains exactly the same elements
// as this set (s IsSubsetOf s2 and s2 IsSubsetOf s).
//
// Two empty sets are always equal, including when one of them is a
// zero-value CPUSet whose element map is nil.
func (s CPUSet) Equals(s2 CPUSet) bool {
	// reflect.DeepEqual distinguishes a nil map from an empty map, so the
	// all-empty case is decided here before delegating to it.
	if len(s.elems) == 0 && len(s2.elems) == 0 {
		return true
	}
	return reflect.DeepEqual(s.elems, s2.elems)
}
// Filter returns a new CPU set holding every element of this set for which
// predicate returns true. The source set is not mutated.
func (s CPUSet) Filter(predicate func(int) bool) CPUSet {
	matched := NewBuilder()
	for id := range s.elems {
		if !predicate(id) {
			continue
		}
		matched.Add(id)
	}
	return matched.Result()
}
// FilterNot returns a new CPU set holding every element of this set for
// which predicate returns false. The source set is not mutated.
func (s CPUSet) FilterNot(predicate func(int) bool) CPUSet {
	rejected := NewBuilder()
	for id := range s.elems {
		if predicate(id) {
			continue
		}
		rejected.Add(id)
	}
	return rejected.Result()
}
// IsSubsetOf returns true if the supplied set s2 contains all the elements
// of this set. The empty set is a subset of every set.
func (s CPUSet) IsSubsetOf(s2 CPUSet) bool {
	for id := range s.elems {
		if !s2.Contains(id) {
			return false
		}
	}
	return true
}
// Union returns a new CPU set holding every element that is present in
// this set, in the supplied set, or in both. Neither source set is mutated.
func (s CPUSet) Union(s2 CPUSet) CPUSet {
	merged := NewBuilder()
	for _, src := range []CPUSet{s, s2} {
		for id := range src.elems {
			merged.Add(id)
		}
	}
	return merged.Result()
}
// Intersection returns a new CPU set holding the elements that are present
// in both this set and the supplied set. Neither source set is mutated.
func (s CPUSet) Intersection(s2 CPUSet) CPUSet {
	// Keep exactly the members of s that s2 also contains.
	return s.Filter(s2.Contains)
}
// Difference returns a new CPU set holding the elements that are present in
// this set but not in the supplied set. Neither source set is mutated.
func (s CPUSet) Difference(s2 CPUSet) CPUSet {
	// Drop exactly the members of s that s2 contains.
	return s.FilterNot(s2.Contains)
}
// ToSlice returns the elements of this set as a slice of integers sorted in
// increasing order. The result is never nil; an empty set yields an empty
// slice.
func (s CPUSet) ToSlice() []int {
	ids := make([]int, 0, len(s.elems))
	for id := range s.elems {
		ids = append(ids, id)
	}
	sort.Ints(ids)
	return ids
}
// String returns a new string representation of the elements in this CPU set
// in canonical linux CPU list format: sorted, comma-separated items where a
// run of consecutive IDs is written as "first-last" (for example
// "1-3,5-6,8"). An empty set yields the empty string.
//
// See: http://man7.org/linux/man-pages/man7/cpuset.7.html#FORMATS
func (s CPUSet) String() string {
	if s.IsEmpty() {
		return ""
	}

	ids := s.ToSlice()
	var buf bytes.Buffer
	for lo := 0; lo < len(ids); {
		// Extend hi to the last element of the consecutive run that
		// starts at lo.
		hi := lo
		for hi+1 < len(ids) && ids[hi+1] == ids[hi]+1 {
			hi++
		}
		if lo == hi {
			buf.WriteString(strconv.Itoa(ids[lo]))
		} else {
			buf.WriteString(fmt.Sprintf("%d-%d", ids[lo], ids[hi]))
		}
		buf.WriteString(",")
		lo = hi + 1
	}
	// Every item was followed by a separator; drop the trailing one.
	return strings.TrimRight(buf.String(), ",")
}
// MustParse constructs a new CPU set from a Linux CPU list formatted
// string. Unlike Parse, it does not return an error: when the input cannot
// be parsed it reports the failure through glog.Fatalf.
// NOTE(review): glog.Fatalf is expected to terminate the process rather
// than panic - confirm against the glog documentation.
func MustParse(s string) CPUSet {
	parsed, parseErr := Parse(s)
	if parseErr == nil {
		return parsed
	}
	glog.Fatalf("unable to parse [%s] as CPUSet: %v", s, parseErr)
	return parsed
}
// Parse constructs a new CPU set from a Linux CPU list formatted string,
// e.g. "0-5,34,46-48". The empty string parses to the empty set.
//
// An error (together with an empty set) is returned when an item is not a
// decimal integer, when a range has more than two boundaries ("1-2-3"), or
// when a range's start is greater than its end ("5-1").
//
// See: http://man7.org/linux/man-pages/man7/cpuset.7.html#FORMATS
func Parse(s string) (CPUSet, error) {
	b := NewBuilder()

	// Handle empty string.
	if s == "" {
		return b.Result(), nil
	}

	// Split CPU list string:
	// "0-5,34,46-48" => ["0-5", "34", "46-48"]
	for _, r := range strings.Split(s, ",") {
		boundaries := strings.Split(r, "-")
		switch len(boundaries) {
		case 1:
			// Handle ranges that consist of only one element like "34".
			elem, err := strconv.Atoi(boundaries[0])
			if err != nil {
				return NewCPUSet(), err
			}
			b.Add(elem)
		case 2:
			// Handle multi-element ranges like "0-5".
			start, err := strconv.Atoi(boundaries[0])
			if err != nil {
				return NewCPUSet(), err
			}
			end, err := strconv.Atoi(boundaries[1])
			if err != nil {
				return NewCPUSet(), err
			}
			// A reversed range would otherwise contribute nothing and go
			// unnoticed.
			if start > end {
				return NewCPUSet(), fmt.Errorf("invalid range %q (%d > %d)", r, start, end)
			}
			// Add all elements to the result.
			// e.g. "0-5", "46-48" => [0, 1, 2, 3, 4, 5, 46, 47, 48].
			for e := start; e <= end; e++ {
				b.Add(e)
			}
		default:
			// More than one "-" in a single item is not a valid CPU list
			// entry; reject it instead of silently skipping it.
			return NewCPUSet(), fmt.Errorf("invalid range %q: expected a single CPU or start-end", r)
		}
	}
	return b.Result(), nil
}
// Clone returns a copy of this CPU set that holds the same elements in a
// separately built set.
func (s CPUSet) Clone() CPUSet {
	dup := NewBuilder()
	for id := range s.elems {
		dup.Add(id)
	}
	return dup.Result()
}

View file

@ -0,0 +1,324 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpuset
import (
"reflect"
"testing"
)
// TestCPUSetBuilder checks that a set produced by a Builder contains every
// added element and nothing more.
func TestCPUSetBuilder(t *testing.T) {
	want := []int{1, 2, 3, 4, 5}

	builder := NewBuilder()
	for i := range want {
		builder.Add(want[i])
	}
	got := builder.Result()

	for i := range want {
		if got.Contains(want[i]) {
			continue
		}
		t.Fatalf("expected cpuset to contain element %d: [%v]", want[i], got)
	}
	if got.Size() != len(want) {
		t.Fatalf("expected cpuset %s to have the same size as %v", got, want)
	}
}
// TestCPUSetSize checks Size for empty, single-element and multi-element
// sets.
func TestCPUSetSize(t *testing.T) {
	type sizeCase struct {
		cpuset   CPUSet
		expected int
	}
	cases := []sizeCase{
		{NewCPUSet(), 0},
		{NewCPUSet(5), 1},
		{NewCPUSet(1, 2, 3, 4, 5), 5},
	}

	for _, tc := range cases {
		if got := tc.cpuset.Size(); got != tc.expected {
			t.Fatalf("expected: %d, actual: %d, cpuset: [%v]", tc.expected, got, tc.cpuset)
		}
	}
}
// TestCPUSetIsEmpty checks IsEmpty for empty, single-element and
// multi-element sets.
func TestCPUSetIsEmpty(t *testing.T) {
	type emptyCase struct {
		cpuset   CPUSet
		expected bool
	}
	cases := []emptyCase{
		{NewCPUSet(), true},
		{NewCPUSet(5), false},
		{NewCPUSet(1, 2, 3, 4, 5), false},
	}

	for _, tc := range cases {
		if got := tc.cpuset.IsEmpty(); got != tc.expected {
			t.Fatalf("expected: %t, IsEmpty() returned: %t, cpuset: [%v]", tc.expected, got, tc.cpuset)
		}
	}
}
// TestCPUSetContains checks Contains against elements that must and must
// not be members of each set.
func TestCPUSetContains(t *testing.T) {
	type containsCase struct {
		cpuset         CPUSet
		mustContain    []int
		mustNotContain []int
	}
	cases := []containsCase{
		{NewCPUSet(), []int{}, []int{1, 2, 3, 4, 5}},
		{NewCPUSet(5), []int{5}, []int{1, 2, 3, 4}},
		{NewCPUSet(1, 2, 4, 5), []int{1, 2, 4, 5}, []int{0, 3, 6}},
	}

	for _, tc := range cases {
		for _, member := range tc.mustContain {
			if tc.cpuset.Contains(member) {
				continue
			}
			t.Fatalf("expected cpuset to contain element %d: [%v]", member, tc.cpuset)
		}
		for _, stranger := range tc.mustNotContain {
			if !tc.cpuset.Contains(stranger) {
				continue
			}
			t.Fatalf("expected cpuset not to contain element %d: [%v]", stranger, tc.cpuset)
		}
	}
}
// TestCPUSetEqual checks Equals on pairs of sets that must compare equal
// and on pairs that must not.
func TestCPUSetEqual(t *testing.T) {
	type pair struct {
		s1 CPUSet
		s2 CPUSet
	}

	equalPairs := []pair{
		{NewCPUSet(), NewCPUSet()},
		{NewCPUSet(5), NewCPUSet(5)},
		{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(1, 2, 3, 4, 5)},
	}
	unequalPairs := []pair{
		{NewCPUSet(), NewCPUSet(5)},
		{NewCPUSet(5), NewCPUSet()},
		{NewCPUSet(), NewCPUSet(1, 2, 3, 4, 5)},
		{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet()},
		{NewCPUSet(5), NewCPUSet(1, 2, 3, 4, 5)},
		{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(5)},
	}

	for _, p := range equalPairs {
		if p.s1.Equals(p.s2) {
			continue
		}
		t.Fatalf("expected cpusets to be equal: s1: [%v], s2: [%v]", p.s1, p.s2)
	}
	for _, p := range unequalPairs {
		if !p.s1.Equals(p.s2) {
			continue
		}
		t.Fatalf("expected cpusets to not be equal: s1: [%v], s2: [%v]", p.s1, p.s2)
	}
}
// TestCPUSetIsSubsetOf checks IsSubsetOf on pairs where s1 must be a subset
// of s2 and on pairs where it must not be.
func TestCPUSetIsSubsetOf(t *testing.T) {
	shouldBeSubset := []struct {
		s1 CPUSet
		s2 CPUSet
	}{
		// A set is a subset of itself
		{NewCPUSet(), NewCPUSet()},
		{NewCPUSet(5), NewCPUSet(5)},
		{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(1, 2, 3, 4, 5)},

		// Empty set is a subset of every set
		{NewCPUSet(), NewCPUSet(5)},
		{NewCPUSet(), NewCPUSet(1, 2, 3, 4, 5)},

		{NewCPUSet(5), NewCPUSet(1, 2, 3, 4, 5)},
		{NewCPUSet(1, 2, 3), NewCPUSet(1, 2, 3, 4, 5)},
		{NewCPUSet(4, 5), NewCPUSet(1, 2, 3, 4, 5)},
		{NewCPUSet(2, 3), NewCPUSet(1, 2, 3, 4, 5)},
	}

	shouldNotBeSubset := []struct {
		s1 CPUSet
		s2 CPUSet
	}{
		// A non-empty set is not a subset of the empty set
		{NewCPUSet(5), NewCPUSet()},
		{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet()},

		// A proper superset is not a subset
		{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(5)},
		{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(1, 2, 3)},

		// Disjoint and partially overlapping sets
		{NewCPUSet(1, 2), NewCPUSet(3, 4, 5)},
		{NewCPUSet(1, 2, 3), NewCPUSet(3, 4, 5)},
		{NewCPUSet(6), NewCPUSet(1, 2, 3, 4, 5)},
	}

	for _, c := range shouldBeSubset {
		if !c.s1.IsSubsetOf(c.s2) {
			t.Fatalf("expected s1 to be a subset of s2: s1: [%v], s2: [%v]", c.s1, c.s2)
		}
	}
	for _, c := range shouldNotBeSubset {
		if c.s1.IsSubsetOf(c.s2) {
			t.Fatalf("expected s1 to not be a subset of s2: s1: [%v], s2: [%v]", c.s1, c.s2)
		}
	}
}
// TestCPUSetUnion checks Union against a table of operand pairs and their
// expected results.
func TestCPUSetUnion(t *testing.T) {
	type unionCase struct {
		s1       CPUSet
		s2       CPUSet
		expected CPUSet
	}
	cases := []unionCase{
		{NewCPUSet(), NewCPUSet(), NewCPUSet()},

		{NewCPUSet(), NewCPUSet(5), NewCPUSet(5)},
		{NewCPUSet(5), NewCPUSet(), NewCPUSet(5)},
		{NewCPUSet(5), NewCPUSet(5), NewCPUSet(5)},

		{NewCPUSet(), NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(1, 2, 3, 4, 5)},
		{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(), NewCPUSet(1, 2, 3, 4, 5)},
		{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(1, 2, 3, 4, 5)},

		{NewCPUSet(5), NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(1, 2, 3, 4, 5)},
		{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(5), NewCPUSet(1, 2, 3, 4, 5)},

		{NewCPUSet(1, 2), NewCPUSet(3, 4, 5), NewCPUSet(1, 2, 3, 4, 5)},
		{NewCPUSet(1, 2, 3), NewCPUSet(3, 4, 5), NewCPUSet(1, 2, 3, 4, 5)},
	}

	for _, tc := range cases {
		got := tc.s1.Union(tc.s2)
		if got.Equals(tc.expected) {
			continue
		}
		t.Fatalf("expected the union of s1 and s2 to be [%v] (got [%v]), s1: [%v], s2: [%v]", tc.expected, got, tc.s1, tc.s2)
	}
}
// TestCPUSetIntersection checks Intersection against a table of operand
// pairs and their expected results.
func TestCPUSetIntersection(t *testing.T) {
	type intersectionCase struct {
		s1       CPUSet
		s2       CPUSet
		expected CPUSet
	}
	cases := []intersectionCase{
		{NewCPUSet(), NewCPUSet(), NewCPUSet()},

		{NewCPUSet(), NewCPUSet(5), NewCPUSet()},
		{NewCPUSet(5), NewCPUSet(), NewCPUSet()},
		{NewCPUSet(5), NewCPUSet(5), NewCPUSet(5)},

		{NewCPUSet(), NewCPUSet(1, 2, 3, 4, 5), NewCPUSet()},
		{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(), NewCPUSet()},
		{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(1, 2, 3, 4, 5)},

		{NewCPUSet(5), NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(5)},
		{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(5), NewCPUSet(5)},

		{NewCPUSet(1, 2), NewCPUSet(3, 4, 5), NewCPUSet()},
		{NewCPUSet(1, 2, 3), NewCPUSet(3, 4, 5), NewCPUSet(3)},
	}

	for _, tc := range cases {
		got := tc.s1.Intersection(tc.s2)
		if got.Equals(tc.expected) {
			continue
		}
		t.Fatalf("expected the intersection of s1 and s2 to be [%v] (got [%v]), s1: [%v], s2: [%v]", tc.expected, got, tc.s1, tc.s2)
	}
}
// TestCPUSetDifference checks Difference against a table of operand pairs
// and their expected results.
func TestCPUSetDifference(t *testing.T) {
	type differenceCase struct {
		s1       CPUSet
		s2       CPUSet
		expected CPUSet
	}
	cases := []differenceCase{
		{NewCPUSet(), NewCPUSet(), NewCPUSet()},

		{NewCPUSet(), NewCPUSet(5), NewCPUSet()},
		{NewCPUSet(5), NewCPUSet(), NewCPUSet(5)},
		{NewCPUSet(5), NewCPUSet(5), NewCPUSet()},

		{NewCPUSet(), NewCPUSet(1, 2, 3, 4, 5), NewCPUSet()},
		{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(), NewCPUSet(1, 2, 3, 4, 5)},
		{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(1, 2, 3, 4, 5), NewCPUSet()},

		{NewCPUSet(5), NewCPUSet(1, 2, 3, 4, 5), NewCPUSet()},
		{NewCPUSet(1, 2, 3, 4, 5), NewCPUSet(5), NewCPUSet(1, 2, 3, 4)},

		{NewCPUSet(1, 2), NewCPUSet(3, 4, 5), NewCPUSet(1, 2)},
		{NewCPUSet(1, 2, 3), NewCPUSet(3, 4, 5), NewCPUSet(1, 2)},
	}

	for _, tc := range cases {
		got := tc.s1.Difference(tc.s2)
		if got.Equals(tc.expected) {
			continue
		}
		t.Fatalf("expected the difference of s1 and s2 to be [%v] (got [%v]), s1: [%v], s2: [%v]", tc.expected, got, tc.s1, tc.s2)
	}
}
// TestCPUSetToSlice checks that ToSlice returns the expected slice for
// empty, single-element and multi-element sets.
func TestCPUSetToSlice(t *testing.T) {
	type sliceCase struct {
		set      CPUSet
		expected []int
	}
	cases := []sliceCase{
		{NewCPUSet(), []int{}},
		{NewCPUSet(5), []int{5}},
		{NewCPUSet(1, 2, 3, 4, 5), []int{1, 2, 3, 4, 5}},
	}

	for _, tc := range cases {
		got := tc.set.ToSlice()
		if reflect.DeepEqual(got, tc.expected) {
			continue
		}
		t.Fatalf("expected set as slice to be [%v] (got [%v]), s: [%v]", tc.expected, got, tc.set)
	}
}
// TestCPUSetString checks the CPU list string produced by String,
// including collapsing of consecutive IDs into ranges.
func TestCPUSetString(t *testing.T) {
	type stringCase struct {
		set      CPUSet
		expected string
	}
	cases := []stringCase{
		{NewCPUSet(), ""},
		{NewCPUSet(5), "5"},
		{NewCPUSet(1, 2, 3, 4, 5), "1-5"},
		{NewCPUSet(1, 2, 3, 5, 6, 8), "1-3,5-6,8"},
	}

	for _, tc := range cases {
		if got := tc.set.String(); got != tc.expected {
			t.Fatalf("expected set as string to be %s (got \"%s\"), s: [%v]", tc.expected, got, tc.set)
		}
	}
}
// TestParse checks that well-formed CPU list strings parse without error
// into the expected sets.
func TestParse(t *testing.T) {
	type parseCase struct {
		cpusetString string
		expected     CPUSet
	}
	cases := []parseCase{
		{"", NewCPUSet()},
		{"5", NewCPUSet(5)},
		{"1,2,3,4,5", NewCPUSet(1, 2, 3, 4, 5)},
		{"1-5", NewCPUSet(1, 2, 3, 4, 5)},
		{"1-2,3-5", NewCPUSet(1, 2, 3, 4, 5)},
	}

	for _, tc := range cases {
		got, parseErr := Parse(tc.cpusetString)
		if parseErr != nil {
			t.Fatalf("expected error not to have occurred: %v", parseErr)
		}
		if got.Equals(tc.expected) {
			continue
		}
		t.Fatalf("expected string \"%s\" to parse as [%v] (got [%v])", tc.cpusetString, tc.expected, got)
	}
}

View file

@ -0,0 +1,293 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"encoding/json"
"fmt"
"io/ioutil"
"os"
"sync"
"github.com/golang/glog"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/sets"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha1"
"k8s.io/kubernetes/pkg/kubelet/deviceplugin"
)
// containerDevices maps a container name to the set of device IDs recorded
// for that container.
type containerDevices map[string]sets.String

// podDevices represents a list of pod to device Id mappings: it maps a pod
// UID to the containerDevices of that pod.
type podDevices map[string]containerDevices
// pods returns the set of pod UIDs that have an entry in pdev.
func (pdev podDevices) pods() sets.String {
	uids := sets.NewString()
	for uid := range pdev {
		uids.Insert(uid)
	}
	return uids
}
// insert records device for the given pod and container, creating the
// per-pod map and the per-container set the first time they are needed.
func (pdev podDevices) insert(podUID, contName string, device string) {
	containers, known := pdev[podUID]
	if !known {
		containers = make(containerDevices)
		pdev[podUID] = containers
	}
	devs, known := containers[contName]
	if !known {
		devs = sets.NewString()
		containers[contName] = devs
	}
	devs.Insert(device)
}
// getDevices returns the set of devices recorded for the given pod and
// container, or nil when either the pod or the container has no entry.
func (pdev podDevices) getDevices(podUID, contName string) sets.String {
	if containers, ok := pdev[podUID]; ok {
		if devs, ok := containers[contName]; ok {
			return devs
		}
	}
	return nil
}
// delete removes the entries of all listed pod UIDs from pdev. UIDs that
// have no entry are ignored.
func (pdev podDevices) delete(pods []string) {
	for i := range pods {
		delete(pdev, pods[i])
	}
}
// devices returns the union of the device sets of every container of every
// pod in pdev.
func (pdev podDevices) devices() sets.String {
	all := sets.NewString()
	for podUID := range pdev {
		for contName := range pdev[podUID] {
			all = all.Union(pdev[podUID][contName])
		}
	}
	return all
}
// DevicePluginHandler is the interface through which device plugin
// registration is started, registered devices are listed, and devices are
// allocated to containers.
type DevicePluginHandler interface {
	// Start starts device plugin registration service.
	Start() error
	// Devices returns all of registered devices keyed by resourceName.
	Devices() map[string][]*pluginapi.Device
	// Allocate attempts to allocate all of required extended resources for
	// the input container, issues an Allocate rpc request for each of such
	// resources, and returns their AllocateResponses on success.
	Allocate(pod *v1.Pod, container *v1.Container, activePods []*v1.Pod) ([]*pluginapi.AllocateResponse, error)
}
// DevicePluginHandlerImpl is the DevicePluginHandler implementation backed
// by a deviceplugin.Manager. It tracks the known device IDs per resource
// and which of them are allocated to which pod and container.
type DevicePluginHandlerImpl struct {
	// The embedded mutex is held by Allocate and updateAllocatedDevices
	// while they access allDevices and allocatedDevices.
	sync.Mutex
	// devicePluginManager performs registration, the Allocate RPCs and
	// supplies the checkpoint file path.
	devicePluginManager deviceplugin.Manager
	// devicePluginManagerMonitorCallback is used for testing only.
	devicePluginManagerMonitorCallback deviceplugin.MonitorCallback
	// allDevices contains all of registered resourceNames and their exported device IDs.
	allDevices map[string]sets.String
	// allocatedDevices contains pod to allocated device mapping, keyed by resourceName.
	allocatedDevices map[string]podDevices
}
// NewDevicePluginHandlerImpl creates a DevicePluginHandlerImpl.
// updateCapacityFunc is called to update ContainerManager capacity when
// device capacity changes.
//
// The handler registers a monitor callback with the device plugin manager.
// The callback keeps allDevices in sync with the healthy devices of each
// resource and reports the resulting device count through
// updateCapacityFunc. Previously checkpointed allocations are loaded from
// disk; a failure to load them is logged and does not fail construction.
//
// An error is returned only when the device plugin manager cannot be
// created.
func NewDevicePluginHandlerImpl(updateCapacityFunc func(v1.ResourceList)) (*DevicePluginHandlerImpl, error) {
	glog.V(2).Infof("Creating Device Plugin Handler")
	handler := &DevicePluginHandlerImpl{
		allDevices:       make(map[string]sets.String),
		allocatedDevices: make(map[string]podDevices),
	}

	deviceManagerMonitorCallback := func(resourceName string, added, updated, deleted []*pluginapi.Device) {
		// allDevices is also read by Allocate under the handler mutex, so
		// every mutation here must hold the same lock.
		handler.Lock()
		if _, ok := handler.allDevices[resourceName]; !ok {
			handler.allDevices[resourceName] = sets.NewString()
		}
		known := handler.allDevices[resourceName]
		// For now, DevicePluginHandler only keeps track of healthy devices.
		// We can revisit this later when the need comes to track unhealthy devices here.
		// The two slices are walked separately (updated first, then added)
		// rather than appended together, so the caller's backing arrays
		// are never written to.
		for _, kept := range [][]*pluginapi.Device{updated, added} {
			for _, dev := range kept {
				if dev.Health == pluginapi.Healthy {
					known.Insert(dev.ID)
				} else {
					known.Delete(dev.ID)
				}
			}
		}
		for _, dev := range deleted {
			known.Delete(dev.ID)
		}
		healthy := int64(known.Len())
		handler.Unlock()

		// Report capacity outside the lock so the callee cannot block or
		// re-enter the handler while the mutex is held.
		capacity := v1.ResourceList{}
		capacity[v1.ResourceName(resourceName)] = *resource.NewQuantity(healthy, resource.DecimalSI)
		updateCapacityFunc(capacity)
	}

	mgr, err := deviceplugin.NewManagerImpl(pluginapi.KubeletSocket, deviceManagerMonitorCallback)
	if err != nil {
		return nil, fmt.Errorf("Failed to initialize device plugin manager: %+v", err)
	}

	handler.devicePluginManager = mgr
	handler.devicePluginManagerMonitorCallback = deviceManagerMonitorCallback
	// Loads in allocatedDevices information from disk.
	err = handler.readCheckpoint()
	if err != nil {
		glog.Warningf("Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date. Err: %v", err)
	}
	return handler, nil
}
// Start delegates to the device plugin manager's Start and returns its
// error.
func (h *DevicePluginHandlerImpl) Start() error {
	return h.devicePluginManager.Start()
}
// Devices returns whatever the device plugin manager's Devices reports
// (the interface describes it as all registered devices keyed by
// resourceName).
func (h *DevicePluginHandlerImpl) Devices() map[string][]*pluginapi.Device {
	return h.devicePluginManager.Devices()
}
// Allocate reserves devices for every device-plugin resource named in the
// container's resource limits and issues an Allocate request to the device
// plugin manager for each of them.
//
// Devices held by pods that are not in activePods are released first.
// Devices already recorded for this pod and container (for example after a
// container restart) are reused, and only the missing number is taken from
// the devices that are currently unallocated.
//
// It returns one AllocateResponse per device resource. An error is returned
// when not enough devices are available, when an Allocate request fails, or
// when the allocation checkpoint cannot be written.
func (h *DevicePluginHandlerImpl) Allocate(pod *v1.Pod, container *v1.Container, activePods []*v1.Pod) ([]*pluginapi.AllocateResponse, error) {
	var ret []*pluginapi.AllocateResponse
	h.updateAllocatedDevices(activePods)
	for k, v := range container.Resources.Limits {
		resource := string(k)
		needed := int(v.Value())
		glog.V(3).Infof("needs %d %s", needed, resource)
		if !deviceplugin.IsDeviceName(k) || needed == 0 {
			continue
		}
		h.Lock()
		// Gets list of devices that have already been allocated.
		// This can happen if a container restarts for example.
		if h.allocatedDevices[resource] == nil {
			h.allocatedDevices[resource] = make(podDevices)
		}
		devices := h.allocatedDevices[resource].getDevices(string(pod.UID), container.Name)
		if devices != nil {
			glog.V(3).Infof("Found pre-allocated devices for resource %s container %q in Pod %q: %v", resource, container.Name, pod.UID, devices.List())
			needed = needed - devices.Len()
			// More devices may already be recorded than are requested now;
			// a negative count would make the slice expression below panic.
			if needed < 0 {
				needed = 0
			}
		}
		// Get Devices in use.
		devicesInUse := h.allocatedDevices[resource].devices()
		// Get a list of available devices.
		available := h.allDevices[resource].Difference(devicesInUse)
		if int(available.Len()) < needed {
			h.Unlock()
			return nil, fmt.Errorf("requested number of devices unavailable for %s. Requested: %d, Available: %d", resource, needed, available.Len())
		}
		allocated := available.UnsortedList()[:needed]
		for _, device := range allocated {
			// Update internal allocated device cache.
			h.allocatedDevices[resource].insert(string(pod.UID), container.Name, device)
		}
		h.Unlock()
		// devicePluginManager.Allocate involves RPC calls to device plugin, which
		// could be heavy-weight. Therefore we want to perform this operation outside
		// mutex lock. Note if Allocate call fails, we may leave container resources
		// partially allocated for the failed container. We rely on updateAllocatedDevices()
		// to garbage collect these resources later. Another side effect is that if
		// we have X resource A and Y resource B in total, and two containers, container1
		// and container2 both require X resource A and Y resource B. Both allocation
		// requests may fail if we serve them in mixed order.
		// TODO: may revisit this part later if we see inefficient resource allocation
		// in real use as the result of this.
		resp, err := h.devicePluginManager.Allocate(resource, append(devices.UnsortedList(), allocated...))
		if err != nil {
			return nil, err
		}
		ret = append(ret, resp)
	}
	// Checkpoints device to container allocation information.
	if err := h.writeCheckpoint(); err != nil {
		return nil, err
	}
	return ret, nil
}
// updateAllocatedDevices drops the allocation records of every pod that is
// not in activePods, for all resources, which makes the devices held by
// those pods available again. It holds the handler mutex for the duration
// of the call and returns nothing.
func (h *DevicePluginHandlerImpl) updateAllocatedDevices(activePods []*v1.Pod) {
	h.Lock()
	defer h.Unlock()

	active := sets.NewString()
	for i := range activePods {
		active.Insert(string(activePods[i].UID))
	}

	for resourceName := range h.allocatedDevices {
		podDevs := h.allocatedDevices[resourceName]
		stale := podDevs.pods().Difference(active)
		glog.V(5).Infof("pods to be removed: %v", stale.List())
		podDevs.delete(stale.List())
	}
}
// checkpointEntry is one record of the checkpoint file: a single device
// allocated to one container of one pod for one resource.
type checkpointEntry struct {
	// PodUID is the UID of the pod the device is allocated to.
	PodUID string
	// ContainerName is the name of the container within that pod.
	ContainerName string
	// ResourceName is the resource the device belongs to.
	ResourceName string
	// DeviceID is the ID of the allocated device.
	DeviceID string
}
// checkpointData struct is used to store pod to device allocation information
// in a checkpoint file. It is marshalled to and unmarshalled from JSON.
// TODO: add version control when we need to change checkpoint format.
type checkpointData struct {
	// Entries lists every allocated device, one entry per device.
	Entries []checkpointEntry
}
// writeCheckpoint checkpoints device to container allocation information to
// disk: every allocated device becomes one checkpointEntry, and the list is
// written as JSON to the device plugin manager's checkpoint file.
//
// The handler mutex is taken while allocatedDevices is read, so this must
// not be called with the mutex already held. It returns the marshalling or
// file-write error, if any.
func (h *DevicePluginHandlerImpl) writeCheckpoint() error {
	checkpointPath := h.devicePluginManager.CheckpointFile()

	var data checkpointData
	// allocatedDevices is mutated under the handler mutex by Allocate and
	// updateAllocatedDevices; snapshot it under the same lock and do the
	// file I/O after releasing it.
	h.Lock()
	for resourceName, podDev := range h.allocatedDevices {
		for podUID, conDev := range podDev {
			for conName, devs := range conDev {
				for _, devID := range devs.UnsortedList() {
					data.Entries = append(data.Entries, checkpointEntry{podUID, conName, resourceName, devID})
				}
			}
		}
	}
	h.Unlock()

	dataJSON, err := json.Marshal(data)
	if err != nil {
		return err
	}
	return ioutil.WriteFile(checkpointPath, dataJSON, 0644)
}
// readCheckpoint reads device to container allocation information from
// disk, and populates h.allocatedDevices accordingly.
//
// A missing checkpoint file is not an error: there is nothing to restore
// and allocatedDevices is left unchanged. An error is returned when the
// file exists but cannot be read, or when its content is not valid
// checkpoint JSON.
func (h *DevicePluginHandlerImpl) readCheckpoint() error {
	filepath := h.devicePluginManager.CheckpointFile()
	content, err := ioutil.ReadFile(filepath)
	if err != nil {
		if os.IsNotExist(err) {
			// Without this early return the empty content would be handed
			// to json.Unmarshal, which rejects it, and every start without
			// a checkpoint would be reported as a failure.
			glog.V(2).Infof("Checkpoint file %s does not exist, nothing to restore\n", filepath)
			return nil
		}
		return fmt.Errorf("failed to read checkpoint file %q: %v", filepath, err)
	}
	glog.V(2).Infof("Read checkpoint file %s\n", filepath)

	var data checkpointData
	if err := json.Unmarshal(content, &data); err != nil {
		return fmt.Errorf("failed to unmarshal checkpoint data: %v", err)
	}
	for _, entry := range data.Entries {
		glog.V(2).Infof("Get checkpoint entry: %v %v %v %v\n", entry.PodUID, entry.ContainerName, entry.ResourceName, entry.DeviceID)
		if h.allocatedDevices[entry.ResourceName] == nil {
			h.allocatedDevices[entry.ResourceName] = make(podDevices)
		}
		h.allocatedDevices[entry.ResourceName].insert(entry.PodUID, entry.ContainerName, entry.DeviceID)
	}
	return nil
}

View file

@ -0,0 +1,42 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"k8s.io/api/core/v1"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha1"
)
// DevicePluginHandlerStub is a simple stub implementation for
// DevicePluginHandler. It holds no state.
type DevicePluginHandlerStub struct{}
// NewDevicePluginHandlerStub returns a new DevicePluginHandlerStub. The
// returned error is always nil.
func NewDevicePluginHandlerStub() (*DevicePluginHandlerStub, error) {
	return &DevicePluginHandlerStub{}, nil
}
// Start does nothing and always returns nil.
func (h *DevicePluginHandlerStub) Start() error {
	return nil
}
// Devices always returns a new, empty map.
func (h *DevicePluginHandlerStub) Devices() map[string][]*pluginapi.Device {
	return map[string][]*pluginapi.Device{}
}
// Allocate ignores its arguments and always returns a nil response slice
// and a nil error.
func (h *DevicePluginHandlerStub) Allocate(pod *v1.Pod, container *v1.Container, activePods []*v1.Pod) ([]*pluginapi.AllocateResponse, error) {
	return nil, nil
}

View file

@ -0,0 +1,285 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"flag"
"fmt"
"testing"
"github.com/stretchr/testify/assert"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/uuid"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha1"
)
// TestUpdateCapacity drives devicePluginManagerMonitorCallback with device
// add/delete notifications and compares the capacity handed to
// verifyCapacityFunc against the expected resource list.
func TestUpdateCapacity(t *testing.T) {
// expected is captured by the verifyCapacityFunc closure, so whatever it
// holds at the moment of a callback invocation is what gets compared.
var expected = v1.ResourceList{}
as := assert.New(t)
verifyCapacityFunc := func(updates v1.ResourceList) {
as.Equal(expected, updates)
}
// verifyCapacityFunc is handed to the handler constructor; presumably the
// handler calls it with the updated capacity — confirm against the handler
// implementation.
testDevicePluginHandler, err := NewDevicePluginHandlerImpl(verifyCapacityFunc)
as.NotNil(testDevicePluginHandler)
as.Nil(err)
devs := []*pluginapi.Device{
{ID: "Device1", Health: pluginapi.Healthy},
{ID: "Device2", Health: pluginapi.Healthy},
{ID: "Device3", Health: pluginapi.Unhealthy},
}
resourceName := "resource1"
// Adds three devices for resource1, two healthy and one unhealthy.
// Expects capacity for resource1 to be 2.
expected[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(2), resource.DecimalSI)
testDevicePluginHandler.devicePluginManagerMonitorCallback(resourceName, devs, []*pluginapi.Device{}, []*pluginapi.Device{})
// Deletes an unhealthy device should NOT change capacity.
testDevicePluginHandler.devicePluginManagerMonitorCallback(resourceName, []*pluginapi.Device{}, []*pluginapi.Device{}, []*pluginapi.Device{devs[2]})
// NOTE(review): the next two expectations (1, then 0) are assigned but no
// callback is invoked between them, so neither the "update to unhealthy" nor
// the "delete healthy" scenario is actually verified. TODO: add the
// corresponding callback invocations.
// Updates a healthy device to unhealthy should reduce capacity by 1.
expected[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(1), resource.DecimalSI)
// Deletes a healthy device should reduce capacity by 1.
expected[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(0), resource.DecimalSI)
// Tests adding another resource.
// resource1 is dropped from expected first, so the comparison for the
// resource2 callback is against a list containing resource2 only.
delete(expected, v1.ResourceName(resourceName))
resourceName2 := "resource2"
expected[v1.ResourceName(resourceName2)] = *resource.NewQuantity(int64(2), resource.DecimalSI)
testDevicePluginHandler.devicePluginManagerMonitorCallback(resourceName2, devs, []*pluginapi.Device{}, []*pluginapi.Device{})
}
// stringPairType is a pair of strings. The test stub's Allocate method reads
// it as (container path, host path) for devices and mounts, and as
// (name, value) for environment variables.
type stringPairType struct {
value1 string
value2 string
}
// DevicePluginManagerTestStub is a DevicePluginManager stub to test device
// Allocation behavior. Tests fill the tables below and Allocate turns the
// entries for each requested device into a runtime spec.
type DevicePluginManagerTestStub struct {
// All data structs are keyed by resourceName+DevId
// (plain string concatenation, no separator).
devRuntimeDevices map[string][]stringPairType
devRuntimeMounts map[string][]stringPairType
devRuntimeEnvs map[string][]stringPairType
}
// NewDevicePluginManagerTestStub returns a stub whose three lookup tables are
// initialised to empty maps. The returned error is always nil.
func NewDevicePluginManagerTestStub() (*DevicePluginManagerTestStub, error) {
	stub := &DevicePluginManagerTestStub{}
	stub.devRuntimeDevices = map[string][]stringPairType{}
	stub.devRuntimeMounts = map[string][]stringPairType{}
	stub.devRuntimeEnvs = map[string][]stringPairType{}
	return stub, nil
}
// Start does nothing in the test stub and always returns nil.
func (m *DevicePluginManagerTestStub) Start() error {
	return nil
}
// Devices returns a new, empty (non-nil) map on every call.
func (m *DevicePluginManagerTestStub) Devices() map[string][]*pluginapi.Device {
	return map[string][]*pluginapi.Device{}
}
// Allocate builds one DeviceRuntimeSpec per requested device id from the
// stub's tables (looked up under resourceName+id) and returns them, in request
// order, in a single AllocateResponse. Progress is printed to stdout. The
// returned error is always nil.
func (m *DevicePluginManagerTestStub) Allocate(resourceName string, devIds []string) (*pluginapi.AllocateResponse, error) {
	resp := &pluginapi.AllocateResponse{}
	for _, id := range devIds {
		tableKey := resourceName + id
		fmt.Printf("Alloc device %q for resource %q\n", id, resourceName)

		spec := &pluginapi.DeviceRuntimeSpec{}
		for _, pair := range m.devRuntimeDevices[tableKey] {
			devSpec := &pluginapi.DeviceSpec{
				ContainerPath: pair.value1,
				HostPath:      pair.value2,
				Permissions:   "mrw",
			}
			spec.Devices = append(spec.Devices, devSpec)
		}
		for _, pair := range m.devRuntimeMounts[tableKey] {
			fmt.Printf("Add mount %q %q\n", pair.value1, pair.value2)
			mnt := &pluginapi.Mount{
				ContainerPath: pair.value1,
				HostPath:      pair.value2,
				ReadOnly:      true,
			}
			spec.Mounts = append(spec.Mounts, mnt)
		}
		// Envs is always a non-nil map, even when the device has no entries.
		spec.Envs = map[string]string{}
		for _, pair := range m.devRuntimeEnvs[tableKey] {
			spec.Envs[pair.value1] = pair.value2
		}

		resp.Spec = append(resp.Spec, spec)
	}
	return resp, nil
}
// Stop does nothing in the test stub and always returns nil.
func (m *DevicePluginManagerTestStub) Stop() error {
	return nil
}
// CheckpointFile returns the fixed checkpoint path used by the test stub.
func (m *DevicePluginManagerTestStub) CheckpointFile() string {
	const checkpointPath = "/tmp/device-plugin-checkpoint"
	return checkpointPath
}
// TestCheckpoint records a set of device allocations on a handler, writes a
// checkpoint, clears the in-memory allocations, reads the checkpoint back and
// expects the restored allocations to equal the ones written.
func TestCheckpoint(t *testing.T) {
	const (
		resourceName1 = "domain1.com/resource1"
		resourceName2 = "domain2.com/resource2"
	)

	m, err := NewDevicePluginManagerTestStub()
	as := assert.New(t)
	as.Nil(err)

	handler := &DevicePluginHandlerImpl{
		devicePluginManager: m,
		allDevices:          make(map[string]sets.String),
		allocatedDevices:    make(map[string]podDevices),
	}

	// Allocations for the first resource: two pods, two containers.
	handler.allocatedDevices[resourceName1] = make(podDevices)
	handler.allocatedDevices[resourceName1].insert("pod1", "con1", "dev1")
	handler.allocatedDevices[resourceName1].insert("pod1", "con1", "dev2")
	handler.allocatedDevices[resourceName1].insert("pod1", "con2", "dev1")
	handler.allocatedDevices[resourceName1].insert("pod2", "con1", "dev1")

	// Allocations for the second resource: a single container.
	handler.allocatedDevices[resourceName2] = make(podDevices)
	handler.allocatedDevices[resourceName2].insert("pod1", "con1", "dev3")
	handler.allocatedDevices[resourceName2].insert("pod1", "con1", "dev4")

	err = handler.writeCheckpoint()
	as.Nil(err)

	// Keep the written state for comparison and start over from an empty map.
	expected := handler.allocatedDevices
	handler.allocatedDevices = make(map[string]podDevices)

	err = handler.readCheckpoint()
	as.Nil(err)
	as.Equal(expected, handler.allocatedDevices)
}
func TestPodContainerDeviceAllocation(t *testing.T) {
flag.Set("alsologtostderr", fmt.Sprintf("%t", true))
var logLevel string
flag.StringVar(&logLevel, "logLevel", "4", "test")
flag.Lookup("v").Value.Set(logLevel)
var activePods []*v1.Pod
resourceName1 := "domain1.com/resource1"
resourceQuantity1 := *resource.NewQuantity(int64(2), resource.DecimalSI)
devId1 := "dev1"
devId2 := "dev2"
resourceName2 := "domain2.com/resource2"
resourceQuantity2 := *resource.NewQuantity(int64(1), resource.DecimalSI)
devId3 := "dev3"
devId4 := "dev4"
m, err := NewDevicePluginManagerTestStub()
as := assert.New(t)
as.Nil(err)
monitorCallback := func(resourceName string, added, updated, deleted []*pluginapi.Device) {}
testDevicePluginHandler := &DevicePluginHandlerImpl{
devicePluginManager: m,
devicePluginManagerMonitorCallback: monitorCallback,
allDevices: make(map[string]sets.String),
allocatedDevices: make(map[string]podDevices),
}
testDevicePluginHandler.allDevices[resourceName1] = sets.NewString()
testDevicePluginHandler.allDevices[resourceName1].Insert(devId1)
testDevicePluginHandler.allDevices[resourceName1].Insert(devId2)
testDevicePluginHandler.allDevices[resourceName2] = sets.NewString()
testDevicePluginHandler.allDevices[resourceName2].Insert(devId3)
testDevicePluginHandler.allDevices[resourceName2].Insert(devId4)
m.devRuntimeDevices[resourceName1+devId1] = append(m.devRuntimeDevices[resourceName1+devId1], stringPairType{"/dev/aaa", "/dev/aaa"})
m.devRuntimeDevices[resourceName1+devId1] = append(m.devRuntimeDevices[resourceName1+devId1], stringPairType{"/dev/bbb", "/dev/bbb"})
m.devRuntimeDevices[resourceName1+devId2] = append(m.devRuntimeDevices[resourceName1+devId2], stringPairType{"/dev/ccc", "/dev/ccc"})
m.devRuntimeMounts[resourceName1+devId1] = append(m.devRuntimeMounts[resourceName1+devId1], stringPairType{"/container_dir1/file1", "host_dir1/file1"})
m.devRuntimeMounts[resourceName1+devId2] = append(m.devRuntimeMounts[resourceName1+devId2], stringPairType{"/container_dir1/file1", "host_dir1/file1"})
m.devRuntimeEnvs[resourceName1+devId2] = append(m.devRuntimeEnvs[resourceName1+devId2], stringPairType{"key1", "val1"})
m.devRuntimeEnvs[resourceName2+devId3] = append(m.devRuntimeEnvs[resourceName2+devId3], stringPairType{"key2", "val2"})
m.devRuntimeEnvs[resourceName2+devId4] = append(m.devRuntimeEnvs[resourceName2+devId4], stringPairType{"key2", "val2"})
pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: uuid.NewUUID(),
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: string(uuid.NewUUID()),
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceName(resourceName1): resourceQuantity1,
v1.ResourceName("cpu"): resourceQuantity1,
v1.ResourceName(resourceName2): resourceQuantity2,
},
},
},
},
},
}
cm := &containerManagerImpl{
devicePluginHandler: testDevicePluginHandler,
}
activePods = append(activePods, pod)
runContainerOpts, err := cm.GetResources(pod, &pod.Spec.Containers[0], activePods)
as.Equal(len(runContainerOpts.Devices), 3)
// Two devices require to mount the same path. Expects a single mount entry to be created.
as.Equal(len(runContainerOpts.Mounts), 1)
as.Equal(runContainerOpts.Mounts[0].ContainerPath, "/container_dir1/file1")
as.Equal(len(runContainerOpts.Envs), 2)
// Requesting to create a pod without enough resources should fail.
failPod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: uuid.NewUUID(),
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: string(uuid.NewUUID()),
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceName(resourceName1): resourceQuantity1,
},
},
},
},
},
}
runContainerOpts2, err := cm.GetResources(failPod, &failPod.Spec.Containers[0], activePods)
as.NotNil(err)
as.Equal(len(runContainerOpts2.Devices), 0)
as.Equal(len(runContainerOpts2.Mounts), 0)
as.Equal(len(runContainerOpts2.Envs), 0)
// Requesting to create a new pod with a single resourceName2 should succeed.
newPod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: uuid.NewUUID(),
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: string(uuid.NewUUID()),
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceName(resourceName2): resourceQuantity2,
},
},
},
},
},
}
runContainerOpts3, err := cm.GetResources(newPod, &newPod.Spec.Containers[0], activePods)
as.Nil(err)
as.Equal(len(runContainerOpts3.Envs), 1)
}

View file

@ -0,0 +1,39 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"k8s.io/api/core/v1"
)
// NewFakeInternalContainerLifecycle returns a new fake lifecycle whose hooks
// all succeed without doing anything.
func NewFakeInternalContainerLifecycle() *fakeInternalContainerLifecycle {
	return new(fakeInternalContainerLifecycle)
}
// fakeInternalContainerLifecycle is a stateless fake whose lifecycle hooks
// all return nil.
type fakeInternalContainerLifecycle struct{}
// PreStartContainer ignores its arguments and always returns nil.
func (f *fakeInternalContainerLifecycle) PreStartContainer(pod *v1.Pod, container *v1.Container, containerID string) error {
	return nil
}
// PreStopContainer ignores its argument and always returns nil.
func (f *fakeInternalContainerLifecycle) PreStopContainer(containerID string) error {
	return nil
}
// PostStopContainer ignores its argument and always returns nil.
func (f *fakeInternalContainerLifecycle) PostStopContainer(containerID string) error {
	return nil
}

View file

@ -0,0 +1,224 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"bufio"
"fmt"
"os"
"path/filepath"
"strconv"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
"k8s.io/api/core/v1"
v1helper "k8s.io/kubernetes/pkg/api/v1/helper"
v1qos "k8s.io/kubernetes/pkg/api/v1/helper/qos"
"k8s.io/kubernetes/pkg/api/v1/resource"
)
// CFS conversion constants.
const (
	// Taken from lmctfy https://github.com/google/lmctfy/blob/master/lmctfy/controllers/cpu_controller.cc

	// MinShares is the smallest cpu.shares value ever returned.
	MinShares = 2
	// SharesPerCPU is the number of shares that corresponds to one full CPU.
	SharesPerCPU = 1024
	// MilliCPUToCPU is the number of milliCPU in one CPU.
	MilliCPUToCPU = 1000

	// QuotaPeriod is the CFS period in microseconds; 100000 is equivalent to 100ms.
	QuotaPeriod = 100000
	// MinQuotaPeriod is the smallest CFS quota ever returned, in microseconds (1ms).
	MinQuotaPeriod = 1000
)

// MilliCPUToQuota converts milliCPU to CFS quota and period values.
//
// CFS quota is expressed with two values: cfs_period_us, the window over
// which usage is measured, and cfs_quota_us, the CPU time that may be used
// within one window. With a 100ms period, a 20ms quota limits the cgroup to
// 20% of one CPU; larger amounts scale linearly across CPUs.
//
// A milliCPU of 0 yields (0, 0). Otherwise the period is QuotaPeriod and the
// quota is milliCPU scaled to that period, raised to MinQuotaPeriod if smaller.
func MilliCPUToQuota(milliCPU int64) (quota int64, period uint64) {
	if milliCPU == 0 {
		return 0, 0
	}

	// Normalise the request over the default 100ms period.
	quota = milliCPU * QuotaPeriod / MilliCPUToCPU
	// The quota must be at least 1ms.
	if quota < MinQuotaPeriod {
		quota = MinQuotaPeriod
	}
	return quota, QuotaPeriod
}

// MilliCPUToShares converts the milliCPU to CFS shares, never returning less
// than MinShares.
func MilliCPUToShares(milliCPU int64) uint64 {
	// Docker converts zero milliCPU to unset, which maps to kernel default
	// for unset: 1024. Return 2 here to really match kernel default for
	// zero milliCPU.
	if milliCPU == 0 {
		return MinShares
	}

	// Conceptually (milliCPU / milliCPUToCPU) * sharesPerCPU, but multiplying
	// first keeps the integer division from discarding precision.
	if shares := milliCPU * SharesPerCPU / MilliCPUToCPU; shares >= MinShares {
		return uint64(shares)
	}
	return MinShares
}
// HugePageLimits converts the API representation to a map
// from huge page size (in bytes) to huge page limit (in bytes).
// Entries that are not huge page resources are ignored; entries that resolve
// to the same page size are summed.
func HugePageLimits(resourceList v1.ResourceList) map[int64]int64 {
	limitsBySize := make(map[int64]int64)
	for name, quantity := range resourceList {
		if !v1helper.IsHugePageResourceName(name) {
			continue
		}
		// The parse error is deliberately ignored, as before.
		pageSize, _ := v1helper.HugePageSizeFromResourceName(name)
		// A missing key reads as zero, so += covers both first and repeated sizes.
		limitsBySize[pageSize.Value()] += quantity.Value()
	}
	return limitsBySize
}
// ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
//
// What is populated depends on the pod's QoS class:
//   - Guaranteed: CPU shares, CPU quota/period and memory are always set.
//   - Burstable: CPU shares are always set; CPU quota/period and memory are
//     set only when every container declares the respective limit.
//   - anything else: only CPU shares, fixed at MinShares.
//
// HugePageLimit is always set, accumulated from the containers' requests.
func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
	// Pod-level sums of requests and limits.
	reqs, limits := resource.PodRequestsAndLimits(pod)

	var cpuRequests, cpuLimits, memoryLimits int64
	if q, ok := reqs[v1.ResourceCPU]; ok {
		cpuRequests = q.MilliValue()
	}
	if q, ok := limits[v1.ResourceCPU]; ok {
		cpuLimits = q.MilliValue()
	}
	if q, ok := limits[v1.ResourceMemory]; ok {
		memoryLimits = q.Value()
	}

	// Translate to CFS values.
	cpuShares := MilliCPUToShares(cpuRequests)
	cpuQuota, cpuPeriod := MilliCPUToQuota(cpuLimits)

	// A limit counts as declared only if every container sets it.
	cpuLimitsDeclared, memoryLimitsDeclared := true, true
	// Huge page size (bytes) -> limit (bytes), summed over all containers.
	hugePageLimits := map[int64]int64{}
	for _, c := range pod.Spec.Containers {
		if c.Resources.Limits.Cpu().IsZero() {
			cpuLimitsDeclared = false
		}
		if c.Resources.Limits.Memory().IsZero() {
			memoryLimitsDeclared = false
		}
		for pageSize, limit := range HugePageLimits(c.Resources.Requests) {
			hugePageLimits[pageSize] += limit
		}
	}

	result := &ResourceConfig{}
	switch v1qos.GetPodQOS(pod) {
	case v1.PodQOSGuaranteed:
		result.CpuShares = &cpuShares
		result.CpuQuota = &cpuQuota
		result.CpuPeriod = &cpuPeriod
		result.Memory = &memoryLimits
	case v1.PodQOSBurstable:
		result.CpuShares = &cpuShares
		if cpuLimitsDeclared {
			result.CpuQuota = &cpuQuota
			result.CpuPeriod = &cpuPeriod
		}
		if memoryLimitsDeclared {
			result.Memory = &memoryLimits
		}
	default:
		shares := uint64(MinShares)
		result.CpuShares = &shares
	}
	result.HugePageLimit = hugePageLimits
	return result
}
// GetCgroupSubsystems returns information about the mounted cgroup subsystems:
// the list of mounts plus a subsystem-name -> mountpoint index. On failure, or
// when no mounts are found, it returns an empty CgroupSubsystems and an error.
func GetCgroupSubsystems() (*CgroupSubsystems, error) {
	mounts, err := libcontainercgroups.GetCgroupMounts(true)
	switch {
	case err != nil:
		return &CgroupSubsystems{}, err
	case len(mounts) == 0:
		return &CgroupSubsystems{}, fmt.Errorf("failed to find cgroup mounts")
	}

	// Index every subsystem by the mountpoint it is attached to.
	bySubsystem := make(map[string]string, len(mounts))
	for _, mnt := range mounts {
		for _, name := range mnt.Subsystems {
			bySubsystem[name] = mnt.Mountpoint
		}
	}

	subsystems := &CgroupSubsystems{
		Mounts:      mounts,
		MountPoints: bySubsystem,
	}
	return subsystems, nil
}
// getCgroupProcs takes a cgroup directory name as an argument,
// reads through the cgroup's procs file and returns a list of tgids.
//
// It returns an empty, non-nil list if the procs file does not exist. Blank
// lines are skipped. An error is returned when the file cannot be opened for
// any other reason, when a line is not an integer, or when reading the file
// fails part-way through; no partial list is returned in those cases.
func getCgroupProcs(dir string) ([]int, error) {
	procsFile := filepath.Join(dir, "cgroup.procs")
	f, err := os.Open(procsFile)
	if err != nil {
		if os.IsNotExist(err) {
			// The procsFile does not exist, so no pids are attached to this directory.
			return []int{}, nil
		}
		return nil, err
	}
	defer f.Close()

	out := []int{}
	s := bufio.NewScanner(f)
	for s.Scan() {
		t := s.Text()
		if t == "" {
			continue
		}
		pid, err := strconv.Atoi(t)
		if err != nil {
			return nil, fmt.Errorf("unexpected line in %v; could not convert to pid: %v", procsFile, err)
		}
		out = append(out, pid)
	}
	// Scan returns false on both EOF and failure; without this check a read
	// error would silently produce a truncated pid list.
	if err := s.Err(); err != nil {
		return nil, fmt.Errorf("failed to read %v: %v", procsFile, err)
	}
	return out, nil
}

View file

@ -0,0 +1,199 @@
// +build linux
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"reflect"
"testing"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
)
// getResourceList builds a ResourceList from the given cpu and memory
// quantity strings. An empty string leaves the corresponding entry out.
func getResourceList(cpu, memory string) v1.ResourceList {
	list := make(v1.ResourceList)
	if len(cpu) > 0 {
		list[v1.ResourceCPU] = resource.MustParse(cpu)
	}
	if len(memory) > 0 {
		list[v1.ResourceMemory] = resource.MustParse(memory)
	}
	return list
}
// getResourceRequirements wraps the given requests and limits in a
// ResourceRequirements value.
func getResourceRequirements(requests, limits v1.ResourceList) v1.ResourceRequirements {
	return v1.ResourceRequirements{
		Requests: requests,
		Limits:   limits,
	}
}
// TestResourceConfigForPod checks the CPU period, CPU quota, CPU shares and
// memory values produced by ResourceConfigForPod for pods of each QoS class,
// including a burstable pod where only some containers declare limits.
func TestResourceConfigForPod(t *testing.T) {
	minShares := uint64(MinShares)

	// Expected values for the burstable cases.
	burstableShares := MilliCPUToShares(100)
	burstableMemQuantity := resource.MustParse("200Mi")
	burstableMemory := burstableMemQuantity.Value()
	burstablePartialShares := MilliCPUToShares(200)
	burstableQuota, burstablePeriod := MilliCPUToQuota(200)

	// Expected values for the guaranteed case.
	guaranteedShares := MilliCPUToShares(100)
	guaranteedQuota, guaranteedPeriod := MilliCPUToQuota(100)
	guaranteedMemQuantity := resource.MustParse("100Mi")
	guaranteedMemory := guaranteedMemQuantity.Value()

	// podOf builds a pod with one container per resource requirement.
	podOf := func(resources ...v1.ResourceRequirements) *v1.Pod {
		containers := make([]v1.Container, 0, len(resources))
		for _, r := range resources {
			containers = append(containers, v1.Container{Resources: r})
		}
		return &v1.Pod{Spec: v1.PodSpec{Containers: containers}}
	}
	rl, rr := getResourceList, getResourceRequirements

	testCases := map[string]struct {
		pod      *v1.Pod
		expected *ResourceConfig
	}{
		"besteffort": {
			pod:      podOf(rr(rl("", ""), rl("", ""))),
			expected: &ResourceConfig{CpuShares: &minShares},
		},
		"burstable-no-limits": {
			pod:      podOf(rr(rl("100m", "100Mi"), rl("", ""))),
			expected: &ResourceConfig{CpuShares: &burstableShares},
		},
		"burstable-with-limits": {
			pod:      podOf(rr(rl("100m", "100Mi"), rl("200m", "200Mi"))),
			expected: &ResourceConfig{CpuShares: &burstableShares, CpuQuota: &burstableQuota, CpuPeriod: &burstablePeriod, Memory: &burstableMemory},
		},
		"burstable-partial-limits": {
			pod: podOf(
				rr(rl("100m", "100Mi"), rl("200m", "200Mi")),
				rr(rl("100m", "100Mi"), rl("", "")),
			),
			expected: &ResourceConfig{CpuShares: &burstablePartialShares},
		},
		"guaranteed": {
			pod:      podOf(rr(rl("100m", "100Mi"), rl("100m", "100Mi"))),
			expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &guaranteedPeriod, Memory: &guaranteedMemory},
		},
	}

	for testName, testCase := range testCases {
		got, want := ResourceConfigForPod(testCase.pod), testCase.expected
		if !reflect.DeepEqual(got.CpuPeriod, want.CpuPeriod) {
			t.Errorf("unexpected result, test: %v, cpu period not as expected", testName)
		}
		if !reflect.DeepEqual(got.CpuQuota, want.CpuQuota) {
			t.Errorf("unexpected result, test: %v, cpu quota not as expected", testName)
		}
		if !reflect.DeepEqual(got.CpuShares, want.CpuShares) {
			t.Errorf("unexpected result, test: %v, cpu shares not as expected", testName)
		}
		if !reflect.DeepEqual(got.Memory, want.Memory) {
			t.Errorf("unexpected result, test: %v, memory not as expected", testName)
		}
	}
}
// TestMilliCPUToQuota checks the quota/period pairs returned for a range of
// milliCPU inputs, including zero and values below the minimum quota.
func TestMilliCPUToQuota(t *testing.T) {
	type quotaCase struct {
		input  int64
		quota  int64
		period uint64
	}
	cases := []quotaCase{
		{input: 0, quota: 0, period: 0},
		{input: 5, quota: 1000, period: 100000},
		{input: 9, quota: 1000, period: 100000},
		{input: 10, quota: 1000, period: 100000},
		{input: 200, quota: 20000, period: 100000},
		{input: 500, quota: 50000, period: 100000},
		{input: 1000, quota: 100000, period: 100000},
		{input: 1500, quota: 150000, period: 100000},
	}

	for _, tc := range cases {
		gotQuota, gotPeriod := MilliCPUToQuota(tc.input)
		if gotQuota == tc.quota && gotPeriod == tc.period {
			continue
		}
		t.Errorf("Input %v, expected quota %v period %v, but got quota %v period %v", tc.input, tc.quota, tc.period, gotQuota, gotPeriod)
	}
}

View file

@ -0,0 +1,54 @@
// +build !linux
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import "k8s.io/api/core/v1"
// Placeholder values for builds that are not linux (see the build tag at the
// top of this file): every constant is zero here.
const (
MinShares = 0
SharesPerCPU = 0
MilliCPUToCPU = 0
QuotaPeriod = 0
MinQuotaPeriod = 0
)
// MilliCPUToQuota is the non-linux placeholder for the CFS quota conversion:
// it ignores milliCPU and returns a zero quota and a zero period.
//
// NOTE(review): the second result is int64 here while the linux variant
// returns uint64; callers built for both platforms must not rely on the type.
func MilliCPUToQuota(milliCPU int64) (int64, int64) {
	return 0, 0
}
// MilliCPUToShares is the non-linux placeholder for the CFS shares conversion:
// it ignores milliCPU and returns 0.
//
// NOTE(review): the result is int64 here while the linux variant returns
// uint64; callers built for both platforms must not rely on the type.
func MilliCPUToShares(milliCPU int64) int64 {
	return 0
}
// ResourceConfigForPod is the non-linux placeholder: it ignores the pod and
// returns nil, so callers must be prepared for a nil *ResourceConfig.
func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
	return nil
}
// GetCgroupSubsystems is the non-linux placeholder: it returns a nil
// *CgroupSubsystems and a nil error.
func GetCgroupSubsystems() (*CgroupSubsystems, error) {
	return nil, nil
}
// getCgroupProcs is the non-linux placeholder: it ignores dir and returns a
// nil pid list and a nil error.
func getCgroupProcs(dir string) ([]int, error) {
	return nil, nil
}

View file

@ -0,0 +1,57 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"k8s.io/api/core/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
)
// InternalContainerLifecycle declares hooks tied to points in a container's
// life. Judging by the method names they are presumably invoked before a
// container starts, before it stops and after it has stopped — confirm
// against the callers.
type InternalContainerLifecycle interface {
// PreStartContainer receives the pod, the container spec and the container ID.
PreStartContainer(pod *v1.Pod, container *v1.Container, containerID string) error
// PreStopContainer receives only the container ID.
PreStopContainer(containerID string) error
// PostStopContainer receives only the container ID.
PostStopContainer(containerID string) error
}
// internalContainerLifecycleImpl implements the InternalContainerLifecycle
// interface by forwarding to the CPU manager when the CPUManager feature gate
// is enabled.
type internalContainerLifecycleImpl struct {
// cpuManager is only used when the CPUManager feature gate is enabled.
cpuManager cpumanager.Manager
}
// PreStartContainer registers the container with the CPU manager and returns
// its result. It is a no-op returning nil when the CPUManager feature gate is
// disabled.
func (i *internalContainerLifecycleImpl) PreStartContainer(pod *v1.Pod, container *v1.Container, containerID string) error {
	if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUManager) {
		return nil
	}
	return i.cpuManager.AddContainer(pod, container, containerID)
}
// PreStopContainer removes the container from the CPU manager and returns its
// result. It is a no-op returning nil when the CPUManager feature gate is
// disabled.
func (i *internalContainerLifecycleImpl) PreStopContainer(containerID string) error {
	if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUManager) {
		return nil
	}
	return i.cpuManager.RemoveContainer(containerID)
}
// PostStopContainer removes the container from the CPU manager and returns
// its result, exactly as PreStopContainer does. It is a no-op returning nil
// when the CPUManager feature gate is disabled.
func (i *internalContainerLifecycleImpl) PostStopContainer(containerID string) error {
	if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUManager) {
		return nil
	}
	return i.cpuManager.RemoveContainer(containerID)
}

View file

@ -0,0 +1,262 @@
// +build linux
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
"strings"
"time"
"github.com/golang/glog"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/types"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/kubernetes/pkg/api"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/events"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
)
const (
// defaultNodeAllocatableCgroupName is presumably the default name of the
// cgroup that holds all pods; it is not referenced in this part of the file —
// confirm against its users.
defaultNodeAllocatableCgroupName = "kubepods"
)
// createNodeAllocatableCgroups creates the cgroup named by cm.cgroupRoot,
// sized from the node capacity, unless it already exists. A creation failure
// is logged and returned.
func (cm *containerManagerImpl) createNodeAllocatableCgroups() error {
	cgroupConfig := &CgroupConfig{
		Name: CgroupName(cm.cgroupRoot),
		// The default limits for cpu shares can be very low which can lead to CPU starvation for pods.
		ResourceParameters: getCgroupConfig(cm.capacity),
	}

	// Nothing to do when the cgroup is already there.
	if cm.cgroupManager.Exists(cgroupConfig.Name) {
		return nil
	}

	err := cm.cgroupManager.Create(cgroupConfig)
	if err != nil {
		glog.Errorf("Failed to create %q cgroup", cm.cgroupRoot)
	}
	return err
}
// enforceNodeAllocatableCgroups enforces Node Allocatable cgroup settings.
//
// It does two things:
//  1. Unless cgroupRoot is "/", it starts a goroutine that updates the limits
//     on the cgroupRoot cgroup, retrying once a minute until the update
//     succeeds. The limits are the node capacity, or the absolute Node
//     Allocatable when per-QoS cgroups are enabled and pod enforcement is
//     requested. Failures of this update are only reported as events.
//  2. It synchronously applies the system-reserved and kube-reserved limits to
//     their cgroups when the respective enforcement key is set.
//
// It returns an error only when step 2 fails; a matching warning event is
// recorded in that case.
func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
	nc := cm.NodeConfig.NodeAllocatableConfig

	// We need to update limits on node allocatable cgroup no matter what because
	// default cpu shares on cgroups are low and can cause cpu starvation.
	nodeAllocatable := cm.capacity
	// Use Node Allocatable limits instead of capacity if the user requested enforcing node allocatable.
	if cm.CgroupsPerQOS && nc.EnforceNodeAllocatable.Has(NodeAllocatableEnforcementKey) {
		nodeAllocatable = cm.getNodeAllocatableAbsolute()
	}

	glog.V(4).Infof("Attempting to enforce Node Allocatable with config: %+v", nc)

	cgroupConfig := &CgroupConfig{
		Name:               CgroupName(cm.cgroupRoot),
		ResourceParameters: getCgroupConfig(nodeAllocatable),
	}

	// Using ObjectReference for events as the node maybe not cached; refer to #42701 for detail.
	nodeRef := &v1.ObjectReference{
		Kind:      "Node",
		Name:      cm.nodeInfo.Name,
		UID:       types.UID(cm.nodeInfo.Name),
		Namespace: "",
	}

	// If Node Allocatable is enforced on a node that has not been drained or is updated on an existing node to a lower value,
	// existing memory usage across pods might be higher than current Node Allocatable Memory Limits.
	// Pod Evictions are expected to bring down memory usage to below Node Allocatable limits.
	// Until evictions happen retry cgroup updates.
	// Update limits on non root cgroup-root to be safe since the default limits for CPU can be too low.
	if cm.cgroupRoot != "/" {
		// NOTE: this goroutine only terminates once an update succeeds; there
		// is no other way to stop it.
		go func() {
			for {
				err := cm.cgroupManager.Update(cgroupConfig)
				if err == nil {
					cm.recorder.Event(nodeRef, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated Node Allocatable limit across pods")
					return
				}
				message := fmt.Sprintf("Failed to update Node Allocatable Limits %q: %v", cm.cgroupRoot, err)
				cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
				time.Sleep(time.Minute)
			}
		}()
	}

	// Now apply kube reserved and system reserved limits if required.
	if nc.EnforceNodeAllocatable.Has(SystemReservedEnforcementKey) {
		glog.V(2).Infof("Enforcing System reserved on cgroup %q with limits: %+v", nc.SystemReservedCgroupName, nc.SystemReserved)
		if err := enforceExistingCgroup(cm.cgroupManager, nc.SystemReservedCgroupName, nc.SystemReserved); err != nil {
			message := fmt.Sprintf("Failed to enforce System Reserved Cgroup Limits on %q: %v", nc.SystemReservedCgroupName, err)
			cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
			// message is already formatted; it must not be used as a format
			// string, or any '%' it contains would be misinterpreted.
			return fmt.Errorf("%s", message)
		}
		cm.recorder.Eventf(nodeRef, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated limits on system reserved cgroup %v", nc.SystemReservedCgroupName)
	}
	if nc.EnforceNodeAllocatable.Has(KubeReservedEnforcementKey) {
		glog.V(2).Infof("Enforcing kube reserved on cgroup %q with limits: %+v", nc.KubeReservedCgroupName, nc.KubeReserved)
		if err := enforceExistingCgroup(cm.cgroupManager, nc.KubeReservedCgroupName, nc.KubeReserved); err != nil {
			message := fmt.Sprintf("Failed to enforce Kube Reserved Cgroup Limits on %q: %v", nc.KubeReservedCgroupName, err)
			cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
			return fmt.Errorf("%s", message)
		}
		cm.recorder.Eventf(nodeRef, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated limits on kube reserved cgroup %v", nc.KubeReservedCgroupName)
	}
	return nil
}
// enforceExistingCgroup updates the limits `rl` on existing cgroup `cName` using `cgroupManager` interface.
//
// It returns an error when `rl` is nil (there is nothing to enforce, and the
// resulting nil resource parameters must not be dereferenced), when the
// cgroup does not exist, or when the update itself fails.
func enforceExistingCgroup(cgroupManager CgroupManager, cName string, rl v1.ResourceList) error {
	cgroupConfig := &CgroupConfig{
		Name:               CgroupName(cName),
		ResourceParameters: getCgroupConfig(rl),
	}
	params := cgroupConfig.ResourceParameters
	if params == nil {
		// getCgroupConfig returns nil for a nil resource list.
		return fmt.Errorf("%q cgroup is not configured properly: no limits were provided", cgroupConfig.Name)
	}

	// CpuShares and Memory are pointers and are only set when the resource
	// list carries the matching entry, so render the values, not the pointers.
	cpuShares, memory := "unset", "unset"
	if params.CpuShares != nil {
		cpuShares = fmt.Sprint(*params.CpuShares)
	}
	if params.Memory != nil {
		memory = fmt.Sprint(*params.Memory)
	}
	glog.V(4).Infof("Enforcing limits on cgroup %q with %s cpu shares and %s bytes of memory", cName, cpuShares, memory)

	if !cgroupManager.Exists(cgroupConfig.Name) {
		return fmt.Errorf("%q cgroup does not exist", cgroupConfig.Name)
	}
	return cgroupManager.Update(cgroupConfig)
}
// getCgroupConfig returns a ResourceConfig object that can be used to create
// or update cgroups via the CgroupManager interface. It returns nil for a nil
// resource list. Memory and CPU shares are set only when the list contains
// the matching resource; huge page limits are set only when the HugePages
// feature gate is enabled.
func getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
	// TODO(vishh): Set CPU Quota if necessary.
	if rl == nil {
		return nil
	}

	rc := &ResourceConfig{}
	if mem, ok := rl[v1.ResourceMemory]; ok {
		// Memory is defined in bytes.
		memBytes := mem.Value()
		rc.Memory = &memBytes
	}
	if cpu, ok := rl[v1.ResourceCPU]; ok {
		// CPU is defined in milli-cores.
		shares := MilliCPUToShares(cpu.MilliValue())
		rc.CpuShares = &shares
	}
	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
		rc.HugePageLimit = HugePageLimits(rl)
	}
	return rc
}
// getNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement.
// For every resource in the node capacity it subtracts the system-reserved
// and kube-reserved amounts and clamps the result at zero.
// Note that not all resources that are available on the node are included in the returned list of resources.
// Returns a ResourceList.
func (cm *containerManagerImpl) getNodeAllocatableAbsolute() v1.ResourceList {
	systemReserved := cm.NodeConfig.SystemReserved
	kubeReserved := cm.NodeConfig.KubeReserved

	allocatable := make(v1.ResourceList)
	for name, capacity := range cm.capacity {
		remaining := *(capacity.Copy())
		if systemReserved != nil {
			remaining.Sub(systemReserved[name])
		}
		if kubeReserved != nil {
			remaining.Sub(kubeReserved[name])
		}
		// Negative Allocatable resources don't make sense.
		if remaining.Sign() < 0 {
			remaining.Set(0)
		}
		allocatable[name] = remaining
	}
	return allocatable
}
// GetNodeAllocatableReservation returns the amount of compute or storage
// resources that have to be reserved on this node from scheduling: the sum of
// the system reservation, the kube reservation and the hard eviction
// reservation for every resource in the node capacity. Resources whose total
// reservation is zero are omitted.
func (cm *containerManagerImpl) GetNodeAllocatableReservation() v1.ResourceList {
	reservations := []v1.ResourceList{
		cm.NodeConfig.SystemReserved,
		cm.NodeConfig.KubeReserved,
		hardEvictionReservation(cm.HardEvictionThresholds, cm.capacity),
	}
	reserved := make(v1.ResourceList)
	for name := range cm.capacity {
		total := resource.NewQuantity(0, resource.DecimalSI)
		for _, list := range reservations {
			if list != nil {
				total.Add(list[name])
			}
		}
		if total.IsZero() {
			continue
		}
		reserved[name] = *total
	}
	return reserved
}
// hardEvictionReservation returns a resourcelist that includes reservation of
// resources based on hard eviction thresholds. Only "less than" thresholds on
// the memory-available and nodefs-available signals contribute. It returns
// nil when no thresholds are given.
func hardEvictionReservation(thresholds []evictionapi.Threshold, capacity v1.ResourceList) v1.ResourceList {
	if len(thresholds) == 0 {
		return nil
	}
	reserved := v1.ResourceList{}
	for _, threshold := range thresholds {
		if threshold.Operator != evictionapi.OpLessThan {
			continue
		}
		// Map the eviction signal onto the resource it reserves.
		var name v1.ResourceName
		switch threshold.Signal {
		case evictionapi.SignalMemoryAvailable:
			name = v1.ResourceMemory
		case evictionapi.SignalNodeFsAvailable:
			name = v1.ResourceEphemeralStorage
		default:
			continue
		}
		resourceCapacity := capacity[name]
		reserved[name] = *evictionapi.GetThresholdQuantity(threshold.Value, &resourceCapacity)
	}
	return reserved
}
// validateNodeAllocatable ensures that the user specified Node Allocatable Configuration doesn't reserve more than the node capacity.
// Returns error if the configuration is invalid, nil otherwise.
//
// Every resource returned by GetNodeAllocatableReservation is compared with
// the node capacity for that resource; all violations are collected and
// reported together in a single error.
func (cm *containerManagerImpl) validateNodeAllocatable() error {
	var errors []string
	nar := cm.GetNodeAllocatableReservation()
	for k, v := range nar {
		capacity := cm.capacity[k]
		capacityClone, err := api.Scheme.DeepCopy(capacity)
		if err != nil {
			// Record the cause and move on to the next resource; the clone
			// is unusable, so the checks below must not run for it.
			errors = append(errors, fmt.Sprintf("DeepCopy capacity error for resource %q: %v", k, err))
			continue
		}
		remaining, ok := capacityClone.(resource.Quantity)
		if !ok {
			return fmt.Errorf(
				"failed to cast object %#v to Quantity",
				capacityClone)
		}
		remaining.Sub(v)
		if remaining.Sign() < 0 {
			// Report the reservation and the real capacity, not the
			// (negative) remainder left after the subtraction.
			errors = append(errors, fmt.Sprintf("Resource %q has a reservation of %v but capacity of %v. Expected capacity >= reservation.", k, v.String(), capacity.String()))
		}
	}
	if len(errors) > 0 {
		return fmt.Errorf("Invalid Node Allocatable configuration. %s", strings.Join(errors, " "))
	}
	return nil
}

View file

@ -0,0 +1,370 @@
// +build linux
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"testing"
"github.com/stretchr/testify/assert"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
)
// TestNodeAllocatableReservationForScheduling verifies the reservation
// computed by GetNodeAllocatableReservation for combinations of kube
// reservation, system reservation and a hard memory eviction threshold.
// Only resources present in the returned list are compared; an expected
// resource that is absent from the result does not fail the test.
func TestNodeAllocatableReservationForScheduling(t *testing.T) {
	memoryEvictionThreshold := resource.MustParse("100Mi")
	testCases := []struct {
		kubeReserved   v1.ResourceList
		systemReserved v1.ResourceList
		expected       v1.ResourceList
		capacity       v1.ResourceList
		hardThreshold  evictionapi.ThresholdValue
	}{
		{
			kubeReserved:   getResourceList("100m", "100Mi"),
			systemReserved: getResourceList("50m", "50Mi"),
			capacity:       getResourceList("10", "10Gi"),
			expected:       getResourceList("150m", "150Mi"),
		},
		{
			kubeReserved:   getResourceList("100m", "100Mi"),
			systemReserved: getResourceList("50m", "50Mi"),
			hardThreshold:  evictionapi.ThresholdValue{Quantity: &memoryEvictionThreshold},
			capacity:       getResourceList("10", "10Gi"),
			expected:       getResourceList("150m", "250Mi"),
		},
		{
			kubeReserved:   getResourceList("100m", "100Mi"),
			systemReserved: getResourceList("50m", "50Mi"),
			capacity:       getResourceList("10", "10Gi"),
			hardThreshold:  evictionapi.ThresholdValue{Percentage: 0.05},
			expected:       getResourceList("150m", "694157320"),
		},
		{
			kubeReserved:   v1.ResourceList{},
			systemReserved: v1.ResourceList{},
			capacity:       getResourceList("10", "10Gi"),
			expected:       getResourceList("", ""),
		},
		{
			kubeReserved:   getResourceList("", "100Mi"),
			systemReserved: getResourceList("50m", "50Mi"),
			capacity:       getResourceList("10", "10Gi"),
			expected:       getResourceList("50m", "150Mi"),
		},
		{
			kubeReserved:   getResourceList("50m", "100Mi"),
			systemReserved: getResourceList("", "50Mi"),
			capacity:       getResourceList("10", "10Gi"),
			expected:       getResourceList("50m", "150Mi"),
		},
		{
			kubeReserved:   getResourceList("", "100Mi"),
			systemReserved: getResourceList("", "50Mi"),
			capacity:       getResourceList("10", ""),
			expected:       getResourceList("", "150Mi"),
		},
	}
	for idx, tc := range testCases {
		caseNumber := idx + 1
		cm := &containerManagerImpl{
			NodeConfig: NodeConfig{
				NodeAllocatableConfig: NodeAllocatableConfig{
					KubeReserved:   tc.kubeReserved,
					SystemReserved: tc.systemReserved,
					HardEvictionThresholds: []evictionapi.Threshold{
						{
							Signal:   evictionapi.SignalMemoryAvailable,
							Operator: evictionapi.OpLessThan,
							Value:    tc.hardThreshold,
						},
					},
				},
			},
			capacity: tc.capacity,
		}
		for name, got := range cm.GetNodeAllocatableReservation() {
			want, found := tc.expected[name]
			assert.True(t, found, "test case %d expected resource %q", caseNumber, name)
			assert.Equal(t, want.MilliValue(), got.MilliValue(), "test case %d failed for resource %q", caseNumber, name)
		}
	}
}
func TestNodeAllocatableWithNilHardThreshold(t *testing.T) {
nc := NodeConfig{
NodeAllocatableConfig: NodeAllocatableConfig{
KubeReserved: getResourceList("100m", "100Mi"),
SystemReserved: getResourceList("50m", "50Mi"),
},
}
cm := &containerManagerImpl{
NodeConfig: nc,
capacity: getResourceList("10", "10Gi"),
}
expected := getResourceList("150m", "150Mi")
for k, v := range cm.GetNodeAllocatableReservation() {
expected, exists := expected[k]
assert.True(t, exists)
assert.Equal(t, expected.MilliValue(), v.MilliValue(), "failed for resource %q", k)
}
}
// TestNodeAllocatableForEnforcement verifies getNodeAllocatableAbsolute:
// capacity minus kube and system reservations. The expected values in the
// table are the same whether or not a hard eviction threshold is set.
// Only resources present in the returned list are compared.
func TestNodeAllocatableForEnforcement(t *testing.T) {
	memoryEvictionThreshold := resource.MustParse("100Mi")
	testCases := []struct {
		kubeReserved   v1.ResourceList
		systemReserved v1.ResourceList
		capacity       v1.ResourceList
		expected       v1.ResourceList
		hardThreshold  evictionapi.ThresholdValue
	}{
		{
			kubeReserved:   getResourceList("100m", "100Mi"),
			systemReserved: getResourceList("50m", "50Mi"),
			capacity:       getResourceList("10", "10Gi"),
			expected:       getResourceList("9850m", "10090Mi"),
		},
		{
			kubeReserved:   getResourceList("100m", "100Mi"),
			systemReserved: getResourceList("50m", "50Mi"),
			hardThreshold:  evictionapi.ThresholdValue{Quantity: &memoryEvictionThreshold},
			capacity:       getResourceList("10", "10Gi"),
			expected:       getResourceList("9850m", "10090Mi"),
		},
		{
			kubeReserved:   getResourceList("100m", "100Mi"),
			systemReserved: getResourceList("50m", "50Mi"),
			hardThreshold:  evictionapi.ThresholdValue{Percentage: 0.05},
			capacity:       getResourceList("10", "10Gi"),
			expected:       getResourceList("9850m", "10090Mi"),
		},
		{
			kubeReserved:   v1.ResourceList{},
			systemReserved: v1.ResourceList{},
			capacity:       getResourceList("10", "10Gi"),
			expected:       getResourceList("10", "10Gi"),
		},
		{
			kubeReserved:   getResourceList("", "100Mi"),
			systemReserved: getResourceList("50m", "50Mi"),
			capacity:       getResourceList("10", "10Gi"),
			expected:       getResourceList("9950m", "10090Mi"),
		},
		{
			kubeReserved:   getResourceList("50m", "100Mi"),
			systemReserved: getResourceList("", "50Mi"),
			capacity:       getResourceList("10", "10Gi"),
			expected:       getResourceList("9950m", "10090Mi"),
		},
		{
			kubeReserved:   getResourceList("", "100Mi"),
			systemReserved: getResourceList("", "50Mi"),
			capacity:       getResourceList("10", ""),
			expected:       getResourceList("10", ""),
		},
	}
	for idx, tc := range testCases {
		cm := &containerManagerImpl{
			NodeConfig: NodeConfig{
				NodeAllocatableConfig: NodeAllocatableConfig{
					KubeReserved:   tc.kubeReserved,
					SystemReserved: tc.systemReserved,
					HardEvictionThresholds: []evictionapi.Threshold{
						{
							Signal:   evictionapi.SignalMemoryAvailable,
							Operator: evictionapi.OpLessThan,
							Value:    tc.hardThreshold,
						},
					},
				},
			},
			capacity: tc.capacity,
		}
		for name, got := range cm.getNodeAllocatableAbsolute() {
			want, found := tc.expected[name]
			assert.True(t, found)
			assert.Equal(t, want.MilliValue(), got.MilliValue(), "test case %d failed for resource %q", idx+1, name)
		}
	}
}
func TestNodeAllocatableInputValidation(t *testing.T) {
memoryEvictionThreshold := resource.MustParse("100Mi")
highMemoryEvictionThreshold := resource.MustParse("2Gi")
cpuMemTestCases := []struct {
kubeReserved v1.ResourceList
systemReserved v1.ResourceList
capacity v1.ResourceList
hardThreshold evictionapi.ThresholdValue
invalidConfiguration bool
}{
{
kubeReserved: getResourceList("100m", "100Mi"),
systemReserved: getResourceList("50m", "50Mi"),
capacity: getResourceList("10", "10Gi"),
},
{
kubeReserved: getResourceList("100m", "100Mi"),
systemReserved: getResourceList("50m", "50Mi"),
hardThreshold: evictionapi.ThresholdValue{
Quantity: &memoryEvictionThreshold,
},
capacity: getResourceList("10", "10Gi"),
},
{
kubeReserved: getResourceList("100m", "100Mi"),
systemReserved: getResourceList("50m", "50Mi"),
hardThreshold: evictionapi.ThresholdValue{
Percentage: 0.05,
},
capacity: getResourceList("10", "10Gi"),
},
{
kubeReserved: v1.ResourceList{},
systemReserved: v1.ResourceList{},
capacity: getResourceList("10", "10Gi"),
},
{
kubeReserved: getResourceList("", "100Mi"),
systemReserved: getResourceList("50m", "50Mi"),
capacity: getResourceList("10", "10Gi"),
},
{
kubeReserved: getResourceList("50m", "100Mi"),
systemReserved: getResourceList("", "50Mi"),
capacity: getResourceList("10", "10Gi"),
},
{
kubeReserved: getResourceList("", "100Mi"),
systemReserved: getResourceList("", "50Mi"),
capacity: getResourceList("10", ""),
},
{
kubeReserved: getResourceList("5", "10Gi"),
systemReserved: getResourceList("5", "10Gi"),
hardThreshold: evictionapi.ThresholdValue{
Quantity: &highMemoryEvictionThreshold,
},
capacity: getResourceList("10", "11Gi"),
invalidConfiguration: true,
},
}
for _, tc := range cpuMemTestCases {
nc := NodeConfig{
NodeAllocatableConfig: NodeAllocatableConfig{
KubeReserved: tc.kubeReserved,
SystemReserved: tc.systemReserved,
HardEvictionThresholds: []evictionapi.Threshold{
{
Signal: evictionapi.SignalMemoryAvailable,
Operator: evictionapi.OpLessThan,
Value: tc.hardThreshold,
},
},
},
}
cm := &containerManagerImpl{
NodeConfig: nc,
capacity: tc.capacity,
}
err := cm.validateNodeAllocatable()
if err == nil && tc.invalidConfiguration {
t.Logf("Expected invalid node allocatable configuration")
t.FailNow()
} else if err != nil && !tc.invalidConfiguration {
t.Logf("Expected valid node allocatable configuration: %v", err)
t.FailNow()
}
}
storageEvictionThreshold := resource.MustParse("100Mi")
storageTestCases := []struct {
kubeReserved v1.ResourceList
systemReserved v1.ResourceList
capacity v1.ResourceList
hardThreshold evictionapi.ThresholdValue
invalidConfiguration bool
}{
{
kubeReserved: getEphemeralStorageResourceList("100Mi"),
systemReserved: getEphemeralStorageResourceList("50Mi"),
capacity: getEphemeralStorageResourceList("500Mi"),
},
{
kubeReserved: getEphemeralStorageResourceList("10Gi"),
systemReserved: getEphemeralStorageResourceList("10Gi"),
hardThreshold: evictionapi.ThresholdValue{
Quantity: &storageEvictionThreshold,
},
capacity: getEphemeralStorageResourceList("20Gi"),
invalidConfiguration: true,
},
}
for _, tc := range storageTestCases {
nc := NodeConfig{
NodeAllocatableConfig: NodeAllocatableConfig{
KubeReserved: tc.kubeReserved,
SystemReserved: tc.systemReserved,
HardEvictionThresholds: []evictionapi.Threshold{
{
Signal: evictionapi.SignalNodeFsAvailable,
Operator: evictionapi.OpLessThan,
Value: tc.hardThreshold,
},
},
},
}
cm := &containerManagerImpl{
NodeConfig: nc,
capacity: tc.capacity,
}
err := cm.validateNodeAllocatable()
if err == nil && tc.invalidConfiguration {
t.Logf("Expected invalid node allocatable configuration")
t.FailNow()
} else if err != nil && !tc.invalidConfiguration {
t.Logf("Expected valid node allocatable configuration: %v", err)
t.FailNow()
}
}
}
// getEphemeralStorageResourceList builds a ResourceList holding the given
// ephemeral storage quantity. An empty string produces an empty list.
func getEphemeralStorageResourceList(storage string) v1.ResourceList {
	if storage == "" {
		return v1.ResourceList{}
	}
	return v1.ResourceList{
		v1.ResourceEphemeralStorage: resource.MustParse(storage),
	}
}

View file

@ -0,0 +1,271 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
"io/ioutil"
"os"
"path"
"strings"
"github.com/golang/glog"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
v1qos "k8s.io/kubernetes/pkg/api/v1/helper/qos"
)
const (
// podCgroupNamePrefix is prepended to a pod's UID to form the base name of
// the pod's cgroup (i.e. "pod<uid>").
podCgroupNamePrefix = "pod"
)
// podContainerManagerImpl implements podContainerManager interface.
// It is the general implementation which allows pod level container
// management if qos Cgroup is enabled.
type podContainerManagerImpl struct {
// qosContainersInfo holds the absolute paths of the top level qos containers;
// a pod's cgroup is placed under the entry matching the pod's QoS class.
qosContainersInfo QOSContainersInfo
// subsystems stores the mounted cgroup subsystems; its mount points are
// scanned when looking for existing pod cgroups.
subsystems *CgroupSubsystems
// cgroupManager is the cgroup Manager Object responsible for managing all
// pod cgroups.
cgroupManager CgroupManager
}
// Make sure that podContainerManagerImpl implements the PodContainerManager interface
var _ PodContainerManager = &podContainerManagerImpl{}
// applyLimits is intended to set pod cgroup resource limits and to update
// the resource limits on top level qos containers.
// It is currently a placeholder: it performs no work and always returns nil.
func (m *podContainerManagerImpl) applyLimits(pod *v1.Pod) error {
// This function will house the logic for setting the resource parameters
// on the pod container config and updating top level qos container configs
return nil
}
// Exists reports whether the cgroup for the given pod is already present,
// as determined by the cgroup manager.
func (m *podContainerManagerImpl) Exists(pod *v1.Pod) bool {
	cgroupName, _ := m.GetPodContainerName(pod)
	return m.cgroupManager.Exists(cgroupName)
}
// EnsureExists makes sure the pod level cgroup for the given pod exists,
// creating it with the pod's resource configuration when it is missing, and
// then applies the pod's resource limits.
func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error {
	cgroupName, _ := m.GetPodContainerName(pod)
	if !m.Exists(pod) {
		// The pod container is missing: create it.
		config := &CgroupConfig{
			Name:               cgroupName,
			ResourceParameters: ResourceConfigForPod(pod),
		}
		if err := m.cgroupManager.Create(config); err != nil {
			return fmt.Errorf("failed to create container for %v : %v", cgroupName, err)
		}
	}
	// Apply appropriate resource limits on the pod container.
	// Top level qos containers limits are not updated
	// until we figure how to maintain the desired state in the kubelet,
	// because maintaining the desired state is difficult without checkpointing.
	err := m.applyLimits(pod)
	if err != nil {
		return fmt.Errorf("failed to apply resource limits on container for %v : %v", cgroupName, err)
	}
	return nil
}
// GetPodContainerName returns the pod's CgroupName identifier together with
// its literal cgroupfs form on the host. The cgroup is named "pod<uid>" and
// lives under the top level container of the pod's QoS class.
func (m *podContainerManagerImpl) GetPodContainerName(pod *v1.Pod) (CgroupName, string) {
	// Pick the parent QOS container for the pod's QoS class.
	var qosParent string
	switch v1qos.GetPodQOS(pod) {
	case v1.PodQOSGuaranteed:
		qosParent = m.qosContainersInfo.Guaranteed
	case v1.PodQOSBurstable:
		qosParent = m.qosContainersInfo.Burstable
	case v1.PodQOSBestEffort:
		qosParent = m.qosContainersInfo.BestEffort
	}
	// Absolute name of the pod cgroup.
	name := CgroupName(path.Join(qosParent, podCgroupNamePrefix+string(pod.UID)))
	// The second result is the literal cgroupfs name.
	return name, m.cgroupManager.Name(name)
}
// tryKillingCgroupProcesses kills all processes either attached to the pod
// cgroup or to a container cgroup under the pod cgroup, as reported by the
// cgroup manager.
//
// Killing is attempted up to five times. A process that has already exited
// is treated as successfully killed. If some processes still cannot be
// killed after the last attempt, the errors of that attempt are returned as
// an aggregate.
func (m *podContainerManagerImpl) tryKillingCgroupProcesses(podCgroup CgroupName) error {
	pidsToKill := m.cgroupManager.Pids(podCgroup)
	// No pids charged to the terminated pod cgroup return
	if len(pidsToKill) == 0 {
		return nil
	}
	var errlist []error
	// os.Kill often errors out,
	// We try killing all the pids multiple times
	for i := 0; i < 5; i++ {
		if i != 0 {
			glog.V(3).Infof("Attempt %v failed to kill all unwanted process. Retyring", i)
		}
		errlist = []error{}
		for _, pid := range pidsToKill {
			p, err := os.FindProcess(pid)
			if err != nil {
				// Process not running anymore, do nothing
				continue
			}
			glog.V(3).Infof("Attempt to kill process with pid: %v", pid)
			if err := p.Kill(); err != nil {
				// A process that is already gone needs no further action.
				// The os package reports this case only through the error
				// text, so match on it instead of counting it as a failure
				// (which would make every remaining attempt fail as well).
				if strings.Contains(err.Error(), "process already finished") {
					glog.V(3).Infof("process with pid %v no longer exists", pid)
					continue
				}
				glog.V(3).Infof("failed to kill process with pid: %v", pid)
				errlist = append(errlist, err)
			}
		}
		if len(errlist) == 0 {
			glog.V(3).Infof("successfully killed all unwanted processes.")
			return nil
		}
	}
	return utilerrors.NewAggregate(errlist)
}
// Destroy removes the pod's cgroup: it first kills every process still
// attached to the cgroup and then deletes the cgroup paths through the
// cgroup manager.
func (m *podContainerManagerImpl) Destroy(podCgroup CgroupName) error {
	// Try killing all the processes attached to the pod cgroup
	killErr := m.tryKillingCgroupProcesses(podCgroup)
	if killErr != nil {
		glog.V(3).Infof("failed to kill all the processes attached to the %v cgroups", podCgroup)
		return fmt.Errorf("failed to kill all the processes attached to the %v cgroups : %v", podCgroup, killErr)
	}
	// Now its safe to remove the pod's cgroup
	config := &CgroupConfig{
		Name:               podCgroup,
		ResourceParameters: &ResourceConfig{},
	}
	destroyErr := m.cgroupManager.Destroy(config)
	if destroyErr != nil {
		return fmt.Errorf("failed to delete cgroup paths for %v : %v", podCgroup, destroyErr)
	}
	return nil
}
// ReduceCPULimits reduces the CPU CFS values to the minimum amount of shares.
// It delegates directly to the cgroup manager for the given pod cgroup and
// returns whatever error the manager reports.
func (m *podContainerManagerImpl) ReduceCPULimits(podCgroup CgroupName) error {
return m.cgroupManager.ReduceCPULimits(podCgroup)
}
// GetAllPodsFromCgroups scans the QoS cgroup directories of every subsystem
// mount and returns the pods whose cgroup still exists on disk, keyed by pod
// UID. A QoS directory that does not exist is skipped; any other read error
// aborts the scan.
func (m *podContainerManagerImpl) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
	// Pods discovered on disk so far.
	podsByUID := make(map[types.UID]CgroupName)
	qosCgroups := [3]string{
		m.qosContainersInfo.BestEffort,
		m.qosContainersInfo.Burstable,
		m.qosContainersInfo.Guaranteed,
	}
	// Walk every subsystem mount and, within it, every QoS cgroup directory.
	// A pod cgroup found in even a single subsystem mount is reported.
	for _, mountPoint := range m.subsystems.MountPoints {
		for _, qosCgroup := range qosCgroups {
			// cgroupfs form of the QoS cgroup name within this subsystem
			qosCgroupfsName := m.cgroupManager.Name(CgroupName(qosCgroup))
			qosDir := path.Join(mountPoint, qosCgroupfsName)
			entries, err := ioutil.ReadDir(qosDir)
			if err != nil {
				if os.IsNotExist(err) {
					continue
				}
				return nil, fmt.Errorf("failed to read the cgroup directory %v : %v", qosDir, err)
			}
			for _, entry := range entries {
				// only directories can be cgroups
				if !entry.IsDir() {
					continue
				}
				// Convert the concrete cgroupfs name back to an internal
				// identifier; this is needed to handle path conversion for
				// systemd environments. The fully qualified path is passed
				// so decoding can work as expected, since systemd encodes
				// the path in each segment.
				cgroupfsPath := path.Join(qosCgroupfsName, entry.Name())
				internalPath := m.cgroupManager.CgroupName(cgroupfsPath)
				// Only the base segment of the converted path matters for
				// deciding whether this is a pod cgroup.
				base := path.Base(string(internalPath))
				if !strings.Contains(base, podCgroupNamePrefix) {
					continue
				}
				// Split on the pod prefix to recover the uid; anything not
				// of the form pod<uid> is logged and ignored.
				pieces := strings.Split(base, podCgroupNamePrefix)
				if len(pieces) != 2 {
					glog.Errorf("pod cgroup manager ignoring unexpected cgroup %v because it is not a pod", cgroupfsPath)
					continue
				}
				podsByUID[types.UID(pieces[1])] = internalPath
			}
		}
	}
	return podsByUID, nil
}
// podContainerManagerNoop implements podContainerManager interface.
// It is a no-op implementation and basically does nothing
// podContainerManagerNoop is used in case the QoS cgroup Hierarchy is not
// enabled, so Exists() returns true always as the cgroupRoot
// is expected to always exist.
type podContainerManagerNoop struct {
// cgroupRoot is the name returned for every pod by GetPodContainerName.
cgroupRoot CgroupName
}
// Make sure that podContainerManagerNoop implements the PodContainerManager interface
var _ PodContainerManager = &podContainerManagerNoop{}
// Exists always reports true.
func (m *podContainerManagerNoop) Exists(_ *v1.Pod) bool {
return true
}
// EnsureExists does nothing and always returns nil.
func (m *podContainerManagerNoop) EnsureExists(_ *v1.Pod) error {
return nil
}
// GetPodContainerName returns the cgroup root, both as a CgroupName and as a
// plain string, regardless of the pod.
func (m *podContainerManagerNoop) GetPodContainerName(_ *v1.Pod) (CgroupName, string) {
return m.cgroupRoot, string(m.cgroupRoot)
}
// GetPodContainerNameForDriver always returns the empty string.
func (m *podContainerManagerNoop) GetPodContainerNameForDriver(_ *v1.Pod) string {
return ""
}
// Destroy does nothing and always returns nil.
func (m *podContainerManagerNoop) Destroy(_ CgroupName) error {
return nil
}
// ReduceCPULimits does nothing and always returns nil.
func (m *podContainerManagerNoop) ReduceCPULimits(_ CgroupName) error {
return nil
}
// GetAllPodsFromCgroups reports no pods: it returns a nil map and a nil error.
func (m *podContainerManagerNoop) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
return nil, nil
}

View file

@ -0,0 +1,51 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
)
// podContainerManagerStub is a PodContainerManager whose methods perform no
// work and return fixed values.
type podContainerManagerStub struct {
}
// Make sure that podContainerManagerStub implements the PodContainerManager interface
var _ PodContainerManager = &podContainerManagerStub{}
// Exists always reports true.
func (m *podContainerManagerStub) Exists(_ *v1.Pod) bool {
return true
}
// EnsureExists does nothing and always returns nil.
func (m *podContainerManagerStub) EnsureExists(_ *v1.Pod) error {
return nil
}
// GetPodContainerName always returns an empty CgroupName and an empty string.
func (m *podContainerManagerStub) GetPodContainerName(_ *v1.Pod) (CgroupName, string) {
return "", ""
}
// Destroy does nothing and always returns nil.
func (m *podContainerManagerStub) Destroy(_ CgroupName) error {
return nil
}
// ReduceCPULimits does nothing and always returns nil.
func (m *podContainerManagerStub) ReduceCPULimits(_ CgroupName) error {
return nil
}
// GetAllPodsFromCgroups reports no pods: it returns a nil map and a nil error.
func (m *podContainerManagerStub) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
return nil, nil
}

View file

@ -0,0 +1,53 @@
// +build !linux
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
)
// unsupportedPodContainerManager is the PodContainerManager built on
// non-linux platforms (see the build constraint of this file). Its methods
// perform no work and return fixed values.
type unsupportedPodContainerManager struct {
}
// Make sure that unsupportedPodContainerManager implements the PodContainerManager interface
var _ PodContainerManager = &unsupportedPodContainerManager{}
// Exists always reports true.
func (m *unsupportedPodContainerManager) Exists(_ *v1.Pod) bool {
return true
}
// EnsureExists does nothing and always returns nil.
func (m *unsupportedPodContainerManager) EnsureExists(_ *v1.Pod) error {
return nil
}
// GetPodContainerName always returns an empty CgroupName and an empty string.
func (m *unsupportedPodContainerManager) GetPodContainerName(_ *v1.Pod) (CgroupName, string) {
return "", ""
}
// ReduceCPULimits does nothing and always returns nil.
func (m *unsupportedPodContainerManager) ReduceCPULimits(_ CgroupName) error {
return nil
}
// GetAllPodsFromCgroups reports no pods: it returns a nil map and a nil error.
func (m *unsupportedPodContainerManager) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
return nil, nil
}
// Destroy does nothing and always returns nil.
func (m *unsupportedPodContainerManager) Destroy(name CgroupName) error {
return nil
}

View file

@ -0,0 +1,362 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
"path"
"strings"
"sync"
"time"
"github.com/golang/glog"
"k8s.io/apimachinery/pkg/util/wait"
units "github.com/docker/go-units"
cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
"k8s.io/api/core/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
v1qos "k8s.io/kubernetes/pkg/api/v1/helper/qos"
"k8s.io/kubernetes/pkg/api/v1/resource"
kubefeatures "k8s.io/kubernetes/pkg/features"
)
const (
// periodicQOSCgroupUpdateInterval is how often the qos cgroup manager will
// perform periodic update of the qos level cgroup resource constraints.
periodicQOSCgroupUpdateInterval = 1 * time.Minute
)
// QOSContainerManager manages the top level QoS cgroups.
type QOSContainerManager interface {
// Start begins management of the QoS cgroups. It receives a function that
// returns the node allocatable resources and a function that returns the
// active pods.
Start(func() v1.ResourceList, ActivePodsFunc) error
// GetQOSContainersInfo returns the names of the top level QoS containers.
GetQOSContainersInfo() QOSContainersInfo
// UpdateCgroups updates the QoS level cgroup settings.
UpdateCgroups() error
}
// qosContainerManagerImpl is the QOSContainerManager returned by
// NewQOSContainerManager when cgroups per QoS are enabled.
type qosContainerManagerImpl struct {
// The embedded mutex is held for the duration of UpdateCgroups.
sync.Mutex
nodeInfo *v1.Node
// qosContainersInfo holds the names of the top level QoS containers; it is
// populated by Start.
qosContainersInfo QOSContainersInfo
subsystems *CgroupSubsystems
// cgroupManager is used to check, create and update the QoS cgroups.
cgroupManager CgroupManager
// activePods returns the pods considered when computing QoS cgroup settings;
// it is set by Start.
activePods ActivePodsFunc
// getNodeAllocatable returns the node allocatable resources; it is set by Start.
getNodeAllocatable func() v1.ResourceList
// cgroupRoot is the root container under which the QoS cgroups are created.
cgroupRoot string
// qosReserved is taken from NodeConfig.ExperimentalQOSReserved.
qosReserved map[v1.ResourceName]int64
}
// NewQOSContainerManager returns the QOSContainerManager matching the node
// configuration: a real manager rooted at cgroupRoot when cgroups per QoS
// are enabled, and a no-op manager otherwise.
func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot string, nodeConfig NodeConfig) (QOSContainerManager, error) {
	if nodeConfig.CgroupsPerQOS {
		manager := &qosContainerManagerImpl{
			subsystems:    subsystems,
			cgroupManager: NewCgroupManager(subsystems, nodeConfig.CgroupDriver),
			cgroupRoot:    cgroupRoot,
			qosReserved:   nodeConfig.ExperimentalQOSReserved,
		}
		return manager, nil
	}
	noop := &qosContainerManagerNoop{
		cgroupRoot: CgroupName(nodeConfig.CgroupRoot),
	}
	return noop, nil
}
// GetQOSContainersInfo returns the names of the top level QoS containers
// recorded by Start.
func (m *qosContainerManagerImpl) GetQOSContainersInfo() QOSContainersInfo {
return m.qosContainersInfo
}
// Start creates the top level Burstable and BestEffort QoS cgroups under the
// cgroup root (or updates them when they already exist), records their
// names, stores the getNodeAllocatable and activePods callbacks, and launches
// a goroutine that calls UpdateCgroups immediately and then once per
// periodicQOSCgroupUpdateInterval.
//
// It returns an error when the root container does not exist, when the
// hugepage limits cannot be computed, or when a QoS cgroup cannot be created
// or updated.
func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceList, activePods ActivePodsFunc) error {
	cm := m.cgroupManager
	rootContainer := m.cgroupRoot
	if !cm.Exists(CgroupName(rootContainer)) {
		return fmt.Errorf("root container %s doesn't exist", rootContainer)
	}
	// Top level for Qos containers are created only for Burstable
	// and Best Effort classes
	qosClasses := map[v1.PodQOSClass]string{
		v1.PodQOSBurstable:  path.Join(rootContainer, strings.ToLower(string(v1.PodQOSBurstable))),
		v1.PodQOSBestEffort: path.Join(rootContainer, strings.ToLower(string(v1.PodQOSBestEffort))),
	}
	// Create containers for both qos classes
	for qosClass, containerName := range qosClasses {
		// get the container's absolute name
		absoluteContainerName := CgroupName(containerName)
		resourceParameters := &ResourceConfig{}
		// the BestEffort QoS class has a statically configured minShares value
		if qosClass == v1.PodQOSBestEffort {
			minShares := uint64(MinShares)
			resourceParameters.CpuShares = &minShares
		}
		// containerConfig object stores the cgroup specifications
		containerConfig := &CgroupConfig{
			Name:               absoluteContainerName,
			ResourceParameters: resourceParameters,
		}
		// for each enumerated huge page size, the qos tiers are unbounded
		if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
			// Do not silently drop the error: on failure the config would be
			// applied without any hugepage limit set.
			if err := m.setHugePagesUnbounded(containerConfig); err != nil {
				return fmt.Errorf("failed to set unbounded hugepage limits for top level %v QOS cgroup : %v", qosClass, err)
			}
		}
		// check if it exists
		if !cm.Exists(absoluteContainerName) {
			if err := cm.Create(containerConfig); err != nil {
				return fmt.Errorf("failed to create top level %v QOS cgroup : %v", qosClass, err)
			}
		} else {
			// to ensure we actually have the right state, we update the config on startup
			if err := cm.Update(containerConfig); err != nil {
				return fmt.Errorf("failed to update top level %v QOS cgroup : %v", qosClass, err)
			}
		}
	}
	// Store the top level qos container names
	m.qosContainersInfo = QOSContainersInfo{
		Guaranteed: rootContainer,
		Burstable:  qosClasses[v1.PodQOSBurstable],
		BestEffort: qosClasses[v1.PodQOSBestEffort],
	}
	m.getNodeAllocatable = getNodeAllocatable
	m.activePods = activePods
	// update qos cgroup tiers on startup and in periodic intervals
	// to ensure desired state is in sync with actual state.
	go wait.Until(func() {
		err := m.UpdateCgroups()
		if err != nil {
			glog.Warningf("[ContainerManager] Failed to reserve QoS requests: %v", err)
		}
	}, periodicQOSCgroupUpdateInterval, wait.NeverStop)
	return nil
}
// setHugePagesUnbounded makes hugetlb effectively unbounded for the given
// cgroup config by setting a limit of 1<<62 for every known huge page size.
// It returns an error, leaving the config untouched, when a page size cannot
// be parsed.
func (m *qosContainerManagerImpl) setHugePagesUnbounded(cgroupConfig *CgroupConfig) error {
	limits := make(map[int64]int64)
	for _, size := range cgroupfs.HugePageSizes {
		sizeInBytes, parseErr := units.RAMInBytes(size)
		if parseErr != nil {
			return parseErr
		}
		limits[sizeInBytes] = int64(1 << 62)
	}
	cgroupConfig.ResourceParameters.HugePageLimit = limits
	return nil
}
// setHugePagesConfig applies unbounded hugepage limits to every QoS cgroup
// config, stopping at (and returning) the first error.
func (m *qosContainerManagerImpl) setHugePagesConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
	for _, config := range configs {
		err := m.setHugePagesUnbounded(config)
		if err != nil {
			return err
		}
	}
	return nil
}
// setCPUCgroupConfig sets the cpu shares of the BestEffort and Burstable QoS
// cgroup configs. BestEffort always gets MinShares; Burstable gets the shares
// matching the summed CPU requests of all active Burstable pods, but never
// fewer than MinShares. It always returns nil.
func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
	var burstableMilliCPU int64
	for _, pod := range m.activePods() {
		// we only care about the burstable qos tier
		if v1qos.GetPodQOS(pod) != v1.PodQOSBurstable {
			continue
		}
		requests, _ := resource.PodRequestsAndLimits(pod)
		if cpu, ok := requests[v1.ResourceCPU]; ok {
			burstableMilliCPU += cpu.MilliValue()
		}
	}
	// make sure best effort is always 2 shares
	bestEffortShares := uint64(MinShares)
	configs[v1.PodQOSBestEffort].ResourceParameters.CpuShares = &bestEffortShares
	// set burstable shares based on current observe state
	burstableShares := MilliCPUToShares(burstableMilliCPU)
	if floor := uint64(MinShares); burstableShares < floor {
		burstableShares = floor
	}
	configs[v1.PodQOSBurstable].ResourceParameters.CpuShares = &burstableShares
	return nil
}
// setMemoryReserve sums the memory requests of all active pods per QOS
// class, derives the Burstable and BestEffort memory limits from the node
// allocatable memory and percentReserve, and stores those limits in the
// CgroupConfig of each class. Nothing is set when allocatable memory is
// unknown or zero.
func (m *qosContainerManagerImpl) setMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
	requested := map[v1.PodQOSClass]int64{
		v1.PodQOSGuaranteed: 0,
		v1.PodQOSBurstable:  0,
	}
	// Sum the pod requests for pods in each QOS class
	for _, pod := range m.activePods() {
		class := v1qos.GetPodQOS(pod)
		// limits are not set for Best Effort pods
		if class == v1.PodQOSBestEffort {
			continue
		}
		var podRequest int64
		podRequests, _ := resource.PodRequestsAndLimits(pod)
		if memory, ok := podRequests[v1.ResourceMemory]; ok {
			podRequest += memory.Value()
		}
		requested[class] += podRequest
	}
	allocatableMemory, known := m.getNodeAllocatable()[v1.ResourceMemory]
	if !known {
		glog.V(2).Infof("[Container Manager] Allocatable memory value could not be determined. Not setting QOS memory limts.")
		return
	}
	allocatable := allocatableMemory.Value()
	if allocatable == 0 {
		glog.V(2).Infof("[Container Manager] Memory allocatable reported as 0, might be in standalone mode. Not setting QOS memory limts.")
		return
	}
	for class, total := range requested {
		glog.V(2).Infof("[Container Manager] %s pod requests total %d bytes (reserve %d%%)", class, total, percentReserve)
	}
	// Calculate QOS memory limits
	guaranteedReserve := requested[v1.PodQOSGuaranteed] * percentReserve / 100
	burstableReserve := requested[v1.PodQOSBurstable] * percentReserve / 100
	burstableLimit := allocatable - guaranteedReserve
	bestEffortLimit := burstableLimit - burstableReserve
	configs[v1.PodQOSBurstable].ResourceParameters.Memory = &burstableLimit
	configs[v1.PodQOSBestEffort].ResourceParameters.Memory = &bestEffortLimit
}
// retrySetMemoryReserve revisits the memory limits in configs after the first
// Update() attempt. For every QoS cgroup whose current memory usage is above
// the limit recorded in its config, the limit is replaced by that usage so
// that a second Update() can apply pressure without asking for less memory
// than the cgroup already holds.
//
// If the stats of any cgroup cannot be read, the error is logged and the
// function returns immediately, leaving the remaining configs unchanged.
// percentReserve is accepted but not used.
func (m *qosContainerManagerImpl) retrySetMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
	for _, config := range configs {
		stats, err := m.cgroupManager.GetResourceStats(config.Name)
		if err != nil {
			glog.V(2).Infof("[Container Manager] %v", err)
			return
		}
		usage := stats.MemoryStats.Usage

		// There is no direct way to tell whether the first Update() of the
		// memory limit succeeded. Usage above the attempted limit is taken
		// as the sign that it failed; anything else leaves the config as is.
		attempted := config.ResourceParameters.Memory
		if attempted == nil || usage <= *attempted {
			continue
		}
		config.ResourceParameters.Memory = &usage
	}
}
// UpdateCgroups recomputes the settings of the Burstable and BestEffort QoS
// cgroups and applies them through the cgroup manager.
//
// The manager's lock is held for the whole call. CPU shares are always
// recomputed, huge page settings only when the HugePages feature gate is
// enabled, and memory limits only when memory is listed in m.qosReserved.
// If any cgroup update fails, the memory limits are adjusted by
// retrySetMemoryReserve (again only when memory is reserved) and every cgroup
// is updated a second time; the first error of that second pass is returned.
func (m *qosContainerManagerImpl) UpdateCgroups() error {
	m.Lock()
	defer m.Unlock()

	burstableConfig := &CgroupConfig{
		Name:               CgroupName(m.qosContainersInfo.Burstable),
		ResourceParameters: &ResourceConfig{},
	}
	bestEffortConfig := &CgroupConfig{
		Name:               CgroupName(m.qosContainersInfo.BestEffort),
		ResourceParameters: &ResourceConfig{},
	}
	qosConfigs := map[v1.PodQOSClass]*CgroupConfig{
		v1.PodQOSBurstable:  burstableConfig,
		v1.PodQOSBestEffort: bestEffortConfig,
	}

	// CPU shares for the QoS level cgroups.
	if err := m.setCPUCgroupConfig(qosConfigs); err != nil {
		return err
	}

	// Huge pages for the QoS level cgroups (ensure they remain unbounded).
	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
		if err := m.setHugePagesConfig(qosConfigs); err != nil {
			return err
		}
	}

	// Memory is the only reserved resource that is acted upon.
	for reservedName, percentReserve := range m.qosReserved {
		if reservedName == v1.ResourceMemory {
			m.setMemoryReserve(qosConfigs, percentReserve)
		}
	}

	// First pass: try every cgroup, remembering only whether any update failed.
	anyFailed := false
	for _, config := range qosConfigs {
		if err := m.cgroupManager.Update(config); err != nil {
			anyFailed = true
		}
	}
	if !anyFailed {
		glog.V(2).Infof("[ContainerManager]: Updated QoS cgroup configuration")
		return nil
	}

	// Give each reserved resource a chance to adjust its ResourceConfig so
	// the retry is more likely to succeed. Resources without an adjustment
	// are simply retried with the same values.
	for reservedName, percentReserve := range m.qosReserved {
		if reservedName == v1.ResourceMemory {
			m.retrySetMemoryReserve(qosConfigs, percentReserve)
		}
	}

	// Second pass: the first failure is final.
	for _, config := range qosConfigs {
		if err := m.cgroupManager.Update(config); err != nil {
			glog.V(2).Infof("[ContainerManager]: Failed to update QoS cgroup configuration")
			return err
		}
	}

	glog.V(2).Infof("[ContainerManager]: Updated QoS cgroup configuration on retry")
	return nil
}
// qosContainerManagerNoop is a QOSContainerManager whose methods do no work.
type qosContainerManagerNoop struct {
	cgroupRoot CgroupName
}

// Compile-time check that *qosContainerManagerNoop satisfies QOSContainerManager.
var _ QOSContainerManager = (*qosContainerManagerNoop)(nil)

// GetQOSContainersInfo returns a zero-valued QOSContainersInfo.
func (m *qosContainerManagerNoop) GetQOSContainersInfo() QOSContainersInfo {
	var info QOSContainersInfo
	return info
}

// Start ignores both arguments and always returns nil.
func (m *qosContainerManagerNoop) Start(_ func() v1.ResourceList, _ ActivePodsFunc) error {
	return nil
}

// UpdateCgroups does nothing and always returns nil.
func (m *qosContainerManagerNoop) UpdateCgroups() error {
	return nil
}

123
vendor/k8s.io/kubernetes/pkg/kubelet/cm/types.go generated vendored Normal file
View file

@ -0,0 +1,123 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
)
// ResourceConfig holds information about all the supported cgroup resource parameters.
//
// The scalar parameters are pointers.
// NOTE(review): a nil pointer presumably means "leave this parameter unset";
// confirm against the CgroupManager implementations.
type ResourceConfig struct {
// Memory limit (in bytes).
Memory *int64
// CPU shares (relative weight vs. other containers).
CpuShares *uint64
// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
CpuQuota *int64
// CPU quota period.
// NOTE(review): the unit is not stated here; presumably usecs like CpuQuota - confirm.
CpuPeriod *uint64
// HugePageLimit maps a huge page size (in bytes) to the limit (in bytes)
// for pages of that size.
HugePageLimit map[int64]int64
}
// CgroupName is the abstract name of a cgroup prior to any driver specific conversion.
// It is a plain string type; CgroupManager.Name and CgroupManager.CgroupName
// convert between it and the literal cgroupfs name on the host.
type CgroupName string
// CgroupConfig holds the cgroup configuration information.
// This is the common object used to specify cgroup information
// to both the systemd and the raw cgroup fs implementations
// of the CgroupManager interface.
type CgroupConfig struct {
// Fully qualified name prior to any driver specific conversions.
Name CgroupName
// ResourceParameters contains various cgroups settings to apply.
ResourceParameters *ResourceConfig
}
// MemoryStats holds the on-demand statistics from the memory cgroup.
type MemoryStats struct {
// Memory usage (in bytes).
Usage int64
}
// ResourceStats holds on-demand statistics from various cgroup subsystems.
// Only memory statistics are carried at present.
type ResourceStats struct {
// Memory statistics.
// NOTE(review): this is a pointer and retrySetMemoryReserve dereferences it
// without a nil check; confirm that GetResourceStats implementations always
// populate it when they return a nil error.
MemoryStats *MemoryStats
}
// CgroupManager allows for cgroup management.
// It supports cgroup creation, deletion and updates, plus lookups by name.
type CgroupManager interface {
// Create creates and applies the cgroup configurations on the cgroup.
// It just creates the leaf cgroups.
// It expects the parent cgroup to already exist.
Create(*CgroupConfig) error
// Destroy the cgroup.
Destroy(*CgroupConfig) error
// Update cgroup configuration.
Update(*CgroupConfig) error
// Exists checks if the cgroup already exists.
Exists(name CgroupName) bool
// Name returns the literal cgroupfs name on the host after any driver specific conversions.
// We would expect the systemd implementation to make the appropriate name conversion.
// For example, if we pass /foo/bar
// then systemd should convert the name to something like
// foo.slice/foo-bar.slice
Name(name CgroupName) string
// CgroupName converts the literal cgroupfs name on the host to an internal identifier.
// It is the inverse direction of Name.
CgroupName(name string) CgroupName
// Pids scans through all subsystems to find pids associated with specified cgroup.
Pids(name CgroupName) []int
// ReduceCPULimits reduces the CPU CFS values to the minimum amount of shares.
ReduceCPULimits(cgroupName CgroupName) error
// GetResourceStats returns statistics of the specified cgroup as read from the cgroup fs.
GetResourceStats(name CgroupName) (*ResourceStats, error)
}
// QOSContainersInfo stores the names of the containers (cgroups) per QoS class.
// The names are plain strings; qosContainerManagerImpl.UpdateCgroups converts
// Burstable and BestEffort to CgroupName before using them.
type QOSContainersInfo struct {
Guaranteed string
BestEffort string
Burstable string
}
// PodContainerManager stores and manages pod level containers.
// The Pod workers interact with the PodContainerManager to create and destroy
// containers for the pod.
type PodContainerManager interface {
// GetPodContainerName returns the CgroupName identifier, and its literal cgroupfs form on the host.
GetPodContainerName(*v1.Pod) (CgroupName, string)
// EnsureExists takes a pod as argument and makes sure that
// pod cgroup exists if qos cgroup hierarchy flag is enabled.
// If the pod cgroup doesn't already exist this method creates it.
EnsureExists(*v1.Pod) error
// Exists returns true if the pod cgroup exists.
Exists(*v1.Pod) bool
// Destroy takes a pod Cgroup name as argument and destroys the pod's container.
Destroy(name CgroupName) error
// ReduceCPULimits reduces the CPU CFS values to the minimum amount of shares.
ReduceCPULimits(name CgroupName) error
// GetAllPodsFromCgroups returns a map from pod UID to the pod's cgroup,
// built from the state of the cgroupfs system.
GetAllPodsFromCgroups() (map[types.UID]CgroupName, error)
}

38
vendor/k8s.io/kubernetes/pkg/kubelet/cm/util/BUILD generated vendored Normal file
View file

@ -0,0 +1,38 @@
# Every target in this package is publicly visible unless it says otherwise.
package(default_visibility = ["//visibility:public"])
load(
"@io_bazel_rules_go//go:def.bzl",
"go_library",
)
# Go library for this package. cgroups_unsupported.go is always listed;
# cgroups_linux.go and the runc libcontainer dependencies are added only
# for the linux_amd64 platform.
go_library(
name = "go_default_library",
srcs = [
"cgroups_unsupported.go",
] + select({
"@io_bazel_rules_go//go/platform:linux_amd64": [
"cgroups_linux.go",
],
"//conditions:default": [],
}),
deps = select({
"@io_bazel_rules_go//go/platform:linux_amd64": [
"//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/utils:go_default_library",
],
"//conditions:default": [],
}),
)
# All files of this package; private to the package.
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
# Publicly visible wrapper around :package-srcs.
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
)

View file

@ -0,0 +1,76 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package util
import (
"path/filepath"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
libcontainerutils "github.com/opencontainers/runc/libcontainer/utils"
)
// GetPids resolves cgroupPath to its directory under the "devices" subsystem
// (see getCgroupPath) and returns whatever libcontainer's GetPids reports for
// that directory. An error from the path resolution is returned unchanged.
//
// Forked from opencontainers/runc/libcontainer/cgroup/fs.Manager.GetPids().
func GetPids(cgroupPath string) ([]int, error) {
	devicesDir, err := getCgroupPath(cgroupPath)
	if err != nil {
		return nil, err
	}
	return libcontainercgroups.GetPids(devicesDir)
}
// getCgroupPath gets the file path to the "devices" subsystem of the desired
// cgroup. cgroupPath is the path in the cgroup hierarchy; it is cleaned
// before use.
//
// An absolute cgroupPath is joined onto the subsystem's root and mountpoint.
// A relative cgroupPath is resolved against the directory returned by
// getCgroupParentPath instead.
func getCgroupPath(cgroupPath string) (string, error) {
	cleaned := libcontainerutils.CleanPath(cgroupPath)

	mountpoint, hierarchyRoot, err := libcontainercgroups.FindCgroupMountpointAndRoot("devices")
	if err != nil {
		// Without a mounted subsystem there is nothing to build the path from.
		return "", err
	}

	if !filepath.IsAbs(cleaned) {
		parentDir, err := getCgroupParentPath(mountpoint, hierarchyRoot)
		if err != nil {
			return "", err
		}
		return filepath.Join(parentDir, cleaned), nil
	}

	// An absolute cgroup path is not looked up relative to this process's
	// own cgroup.
	return filepath.Join(hierarchyRoot, mountpoint, cleaned), nil
}
// getCgroupParentPath gets the parent filepath used for resolving relative
// cgroup paths: the calling process's own "devices" cgroup, expressed
// relative to root and re-anchored under mountpoint.
func getCgroupParentPath(mountpoint, root string) (string, error) {
	// The process's own cgroup is used rather than that of the init process:
	// the creating process could be in a container that shares the pid
	// namespace with the host, and /proc/1/cgroup could point to a whole
	// other world of cgroups.
	ownCgroup, err := libcontainercgroups.GetOwnCgroup("devices")
	if err != nil {
		return "", err
	}

	// Needed for nested containers: /proc/self/cgroup shows paths from the
	// host, which don't exist in the container.
	relative, err := filepath.Rel(root, ownCgroup)
	if err != nil {
		return "", err
	}

	return filepath.Join(mountpoint, relative), nil
}

View file

@ -0,0 +1,23 @@
// +build !linux
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package util
// GetPids is the stand-in compiled on platforms other than Linux (see the
// "!linux" build constraint on this file). It ignores cgroupPath and reports
// no PIDs and no error.
func GetPids(cgroupPath string) ([]int, error) {
	var pids []int
	return pids, nil
}