Replace godep with dep
This commit is contained in:
parent
1e7489927c
commit
bf5616c65b
14883 changed files with 3937406 additions and 361781 deletions
31
vendor/k8s.io/kubernetes/pkg/kubelet/gpu/BUILD
generated
vendored
Normal file
31
vendor/k8s.io/kubernetes/pkg/kubelet/gpu/BUILD
generated
vendored
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
# Bazel targets for the kubelet GPU manager package.
package(default_visibility = ["//visibility:public"])

load("@io_bazel_rules_go//go:def.bzl", "go_library")

# Go library built from the GPU manager stub and the shared types.
go_library(
    name = "go_default_library",
    srcs = [
        "gpu_manager_stub.go",
        "types.go",
    ],
    deps = [
        "//vendor/k8s.io/api/core/v1:go_default_library",
    ],
)

# Every file in this package; private to this BUILD file.
filegroup(
    name = "package-srcs",
    srcs = glob(["**"]),
    tags = ["automanaged"],
    visibility = ["//visibility:private"],
)

# This package's sources plus those of the nvidia sub-package.
filegroup(
    name = "all-srcs",
    srcs = [
        ":package-srcs",
        "//pkg/kubelet/gpu/nvidia:all-srcs",
    ],
    tags = ["automanaged"],
)
|
||||
8
vendor/k8s.io/kubernetes/pkg/kubelet/gpu/OWNERS
generated
vendored
Normal file
8
vendor/k8s.io/kubernetes/pkg/kubelet/gpu/OWNERS
generated
vendored
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
approvers:
|
||||
- dchen1107
|
||||
- derekwaynecarr
|
||||
- vishh
|
||||
- yujuhong
|
||||
reviewers:
|
||||
- cmluciano
|
||||
- sig-node-reviewers
|
||||
41
vendor/k8s.io/kubernetes/pkg/kubelet/gpu/gpu_manager_stub.go
generated
vendored
Normal file
41
vendor/k8s.io/kubernetes/pkg/kubelet/gpu/gpu_manager_stub.go
generated
vendored
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package gpu
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"k8s.io/api/core/v1"
|
||||
)
|
||||
|
||||
type gpuManagerStub struct{}
|
||||
|
||||
func (gms *gpuManagerStub) Start() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (gms *gpuManagerStub) Capacity() v1.ResourceList {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (gms *gpuManagerStub) AllocateGPU(_ *v1.Pod, _ *v1.Container) ([]string, error) {
|
||||
return nil, fmt.Errorf("GPUs are not supported")
|
||||
}
|
||||
|
||||
func NewGPUManagerStub() GPUManager {
|
||||
return &gpuManagerStub{}
|
||||
}
|
||||
51
vendor/k8s.io/kubernetes/pkg/kubelet/gpu/nvidia/BUILD
generated
vendored
Normal file
51
vendor/k8s.io/kubernetes/pkg/kubelet/gpu/nvidia/BUILD
generated
vendored
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
# Bazel targets for the NVIDIA implementation of the kubelet GPU manager.
package(default_visibility = ["//visibility:public"])

load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")

# Go library with the NVIDIA GPU manager and its bookkeeping helpers.
go_library(
    name = "go_default_library",
    srcs = [
        "helpers.go",
        "nvidia_gpu_manager.go",
    ],
    deps = [
        "//pkg/kubelet/dockershim/libdocker:go_default_library",
        "//pkg/kubelet/gpu:go_default_library",
        "//vendor/github.com/golang/glog:go_default_library",
        "//vendor/k8s.io/api/core/v1:go_default_library",
        "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
        "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
    ],
)

# Every file in this package; private to this BUILD file.
filegroup(
    name = "package-srcs",
    srcs = glob(["**"]),
    tags = ["automanaged"],
    visibility = ["//visibility:private"],
)

# Publicly visible alias for this package's sources.
filegroup(
    name = "all-srcs",
    srcs = [":package-srcs"],
    tags = ["automanaged"],
)

# Unit tests, compiled together with the library sources.
go_test(
    name = "go_default_test",
    srcs = ["nvidia_gpu_manager_test.go"],
    library = ":go_default_library",
    deps = [
        "//pkg/kubelet/dockershim/libdocker:go_default_library",
        "//vendor/github.com/stretchr/testify/assert:go_default_library",
        "//vendor/k8s.io/api/core/v1:go_default_library",
        "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
        "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
        "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
        "//vendor/k8s.io/apimachinery/pkg/util/uuid:go_default_library",
    ],
)
|
||||
77
vendor/k8s.io/kubernetes/pkg/kubelet/gpu/nvidia/helpers.go
generated
vendored
Normal file
77
vendor/k8s.io/kubernetes/pkg/kubelet/gpu/nvidia/helpers.go
generated
vendored
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package nvidia
|
||||
|
||||
import "k8s.io/apimachinery/pkg/util/sets"
|
||||
|
||||
type containerToGPU map[string]sets.String
|
||||
|
||||
// podGPUs represents a list of pod to GPU mappings.
|
||||
type podGPUs struct {
|
||||
podGPUMapping map[string]containerToGPU
|
||||
}
|
||||
|
||||
func newPodGPUs() *podGPUs {
|
||||
return &podGPUs{
|
||||
podGPUMapping: make(map[string]containerToGPU),
|
||||
}
|
||||
}
|
||||
func (pgpu *podGPUs) pods() sets.String {
|
||||
ret := sets.NewString()
|
||||
for k := range pgpu.podGPUMapping {
|
||||
ret.Insert(k)
|
||||
}
|
||||
return ret
|
||||
}
|
||||
|
||||
func (pgpu *podGPUs) insert(podUID, contName string, device string) {
|
||||
if _, exists := pgpu.podGPUMapping[podUID]; !exists {
|
||||
pgpu.podGPUMapping[podUID] = make(containerToGPU)
|
||||
}
|
||||
if _, exists := pgpu.podGPUMapping[podUID][contName]; !exists {
|
||||
pgpu.podGPUMapping[podUID][contName] = sets.NewString()
|
||||
}
|
||||
pgpu.podGPUMapping[podUID][contName].Insert(device)
|
||||
}
|
||||
|
||||
func (pgpu *podGPUs) getGPUs(podUID, contName string) sets.String {
|
||||
containers, exists := pgpu.podGPUMapping[podUID]
|
||||
if !exists {
|
||||
return nil
|
||||
}
|
||||
devices, exists := containers[contName]
|
||||
if !exists {
|
||||
return nil
|
||||
}
|
||||
return devices
|
||||
}
|
||||
|
||||
func (pgpu *podGPUs) delete(pods []string) {
|
||||
for _, uid := range pods {
|
||||
delete(pgpu.podGPUMapping, uid)
|
||||
}
|
||||
}
|
||||
|
||||
func (pgpu *podGPUs) devices() sets.String {
|
||||
ret := sets.NewString()
|
||||
for _, containerToGPU := range pgpu.podGPUMapping {
|
||||
for _, deviceSet := range containerToGPU {
|
||||
ret = ret.Union(deviceSet)
|
||||
}
|
||||
}
|
||||
return ret
|
||||
}
|
||||
275
vendor/k8s.io/kubernetes/pkg/kubelet/gpu/nvidia/nvidia_gpu_manager.go
generated
vendored
Normal file
275
vendor/k8s.io/kubernetes/pkg/kubelet/gpu/nvidia/nvidia_gpu_manager.go
generated
vendored
Normal file
|
|
@ -0,0 +1,275 @@
|
|||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/golang/glog"
|
||||
|
||||
"k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
"k8s.io/kubernetes/pkg/kubelet/dockershim/libdocker"
|
||||
"k8s.io/kubernetes/pkg/kubelet/gpu"
|
||||
)
|
||||
|
||||
// TODO: rework to use Nvidia's NVML, which is more complex, but also provides more fine-grained information and stats.
|
||||
const (
	// All NVIDIA GPU cards should be mounted with nvidiactl and nvidia-uvm.
	// If the driver is installed correctly, these two devices will be there.
	nvidiaCtlDevice string = "/dev/nvidiactl"
	nvidiaUVMDevice string = "/dev/nvidia-uvm"

	// Optional device.
	nvidiaUVMToolsDevice string = "/dev/nvidia-uvm-tools"

	// devDirectory is the directory scanned for GPU device nodes.
	devDirectory = "/dev"

	// nvidiaDeviceRE matches the file name of a GPU device node inside
	// devDirectory. At least one digit is required so that an index-less
	// "nvidia" entry is never counted as a GPU.
	nvidiaDeviceRE = `^nvidia[0-9]+$`

	// nvidiaFullpathRE is the absolute-path counterpart of nvidiaDeviceRE.
	nvidiaFullpathRE = `^/dev/nvidia[0-9]+$`
)
|
||||
|
||||
type activePodsLister interface {
|
||||
// Returns a list of active pods on the node.
|
||||
GetActivePods() []*v1.Pod
|
||||
}
|
||||
|
||||
// nvidiaGPUManager manages nvidia gpu devices.
|
||||
type nvidiaGPUManager struct {
|
||||
sync.Mutex
|
||||
// All gpus available on the Node
|
||||
allGPUs sets.String
|
||||
allocated *podGPUs
|
||||
defaultDevices []string
|
||||
// The interface which could get GPU mapping from all the containers.
|
||||
// TODO: Should make this independent of Docker in the future.
|
||||
dockerClient libdocker.Interface
|
||||
activePodsLister activePodsLister
|
||||
}
|
||||
|
||||
// NewNvidiaGPUManager returns a GPUManager that manages local Nvidia GPUs.
|
||||
// TODO: Migrate to use pod level cgroups and make it generic to all runtimes.
|
||||
func NewNvidiaGPUManager(activePodsLister activePodsLister, dockerClient libdocker.Interface) (gpu.GPUManager, error) {
|
||||
if dockerClient == nil {
|
||||
return nil, fmt.Errorf("invalid docker client specified")
|
||||
}
|
||||
return &nvidiaGPUManager{
|
||||
allGPUs: sets.NewString(),
|
||||
dockerClient: dockerClient,
|
||||
activePodsLister: activePodsLister,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Initialize the GPU devices, so far only needed to discover the GPU paths.
|
||||
func (ngm *nvidiaGPUManager) Start() error {
|
||||
if ngm.dockerClient == nil {
|
||||
return fmt.Errorf("Invalid docker client specified in GPU Manager")
|
||||
}
|
||||
ngm.Lock()
|
||||
defer ngm.Unlock()
|
||||
|
||||
if _, err := os.Stat(nvidiaCtlDevice); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if _, err := os.Stat(nvidiaUVMDevice); err != nil {
|
||||
return err
|
||||
}
|
||||
ngm.defaultDevices = []string{nvidiaCtlDevice, nvidiaUVMDevice}
|
||||
_, err := os.Stat(nvidiaUVMToolsDevice)
|
||||
if !os.IsNotExist(err) {
|
||||
ngm.defaultDevices = append(ngm.defaultDevices, nvidiaUVMToolsDevice)
|
||||
}
|
||||
|
||||
if err := ngm.discoverGPUs(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// We ignore errors when identifying allocated GPUs because it is possible that the runtime interfaces may be not be logically up.
|
||||
return nil
|
||||
}
|
||||
|
||||
// Get how many GPU cards we have.
|
||||
func (ngm *nvidiaGPUManager) Capacity() v1.ResourceList {
|
||||
gpus := resource.NewQuantity(int64(len(ngm.allGPUs)), resource.DecimalSI)
|
||||
return v1.ResourceList{
|
||||
v1.ResourceNvidiaGPU: *gpus,
|
||||
}
|
||||
}
|
||||
|
||||
// AllocateGPUs returns `num` GPUs if available, error otherwise.
|
||||
// Allocation is made thread safe using the following logic.
|
||||
// A list of all GPUs allocated is maintained along with their respective Pod UIDs.
|
||||
// It is expected that the list of active pods will not return any false positives.
|
||||
// As part of initialization or allocation, the list of GPUs in use will be computed once.
|
||||
// Whenever an allocation happens, the list of GPUs allocated is updated based on the list of currently active pods.
|
||||
// GPUs allocated to terminated pods are freed up lazily as part of allocation.
|
||||
// GPUs are allocated based on the internal list of allocatedGPUs.
|
||||
// It is not safe to generate a list of GPUs in use by inspecting active containers because of the delay between GPU allocation and container creation.
|
||||
// A GPU allocated to a container might be re-allocated to a subsequent container because the original container wasn't started quick enough.
|
||||
// The current algorithm scans containers only once and then uses a list of active pods to track GPU usage.
|
||||
// This is a sub-optimal solution and a better alternative would be that of using pod level cgroups instead.
|
||||
// GPUs allocated to containers should be reflected in pod level device cgroups before completing allocations.
|
||||
// The pod level cgroups will then serve as a checkpoint of GPUs in use.
|
||||
func (ngm *nvidiaGPUManager) AllocateGPU(pod *v1.Pod, container *v1.Container) ([]string, error) {
|
||||
gpusNeeded := container.Resources.Limits.NvidiaGPU().Value()
|
||||
if gpusNeeded == 0 {
|
||||
return []string{}, nil
|
||||
}
|
||||
ngm.Lock()
|
||||
defer ngm.Unlock()
|
||||
if ngm.allocated == nil {
|
||||
// Initialization is not complete. Try now. Failures can no longer be tolerated.
|
||||
ngm.allocated = ngm.gpusInUse()
|
||||
} else {
|
||||
// update internal list of GPUs in use prior to allocating new GPUs.
|
||||
ngm.updateAllocatedGPUs()
|
||||
}
|
||||
// Check if GPUs have already been allocated. If so return them right away.
|
||||
// This can happen if a container restarts for example.
|
||||
if devices := ngm.allocated.getGPUs(string(pod.UID), container.Name); devices != nil {
|
||||
glog.V(2).Infof("Found pre-allocated GPUs for container %q in Pod %q: %v", container.Name, pod.UID, devices.List())
|
||||
return append(devices.List(), ngm.defaultDevices...), nil
|
||||
}
|
||||
// Get GPU devices in use.
|
||||
devicesInUse := ngm.allocated.devices()
|
||||
glog.V(5).Infof("gpus in use: %v", devicesInUse.List())
|
||||
// Get a list of available GPUs.
|
||||
available := ngm.allGPUs.Difference(devicesInUse)
|
||||
glog.V(5).Infof("gpus available: %v", available.List())
|
||||
if int64(available.Len()) < gpusNeeded {
|
||||
return nil, fmt.Errorf("requested number of GPUs unavailable. Requested: %d, Available: %d", gpusNeeded, available.Len())
|
||||
}
|
||||
ret := available.UnsortedList()[:gpusNeeded]
|
||||
for _, device := range ret {
|
||||
// Update internal allocated GPU cache.
|
||||
ngm.allocated.insert(string(pod.UID), container.Name, device)
|
||||
}
|
||||
// Add standard devices files that needs to be exposed.
|
||||
ret = append(ret, ngm.defaultDevices...)
|
||||
|
||||
return ret, nil
|
||||
}
|
||||
|
||||
// updateAllocatedGPUs updates the list of GPUs in use.
|
||||
// It gets a list of active pods and then frees any GPUs that are bound to terminated pods.
|
||||
// Returns error on failure.
|
||||
func (ngm *nvidiaGPUManager) updateAllocatedGPUs() {
|
||||
activePods := ngm.activePodsLister.GetActivePods()
|
||||
activePodUids := sets.NewString()
|
||||
for _, pod := range activePods {
|
||||
activePodUids.Insert(string(pod.UID))
|
||||
}
|
||||
allocatedPodUids := ngm.allocated.pods()
|
||||
podsToBeRemoved := allocatedPodUids.Difference(activePodUids)
|
||||
glog.V(5).Infof("pods to be removed: %v", podsToBeRemoved.List())
|
||||
ngm.allocated.delete(podsToBeRemoved.List())
|
||||
}
|
||||
|
||||
// discoverGPUs identifies allGPUs NVIDIA GPU devices available on the local node by walking `/dev` directory.
|
||||
// TODO: Without NVML support we only can check whether there has GPU devices, but
|
||||
// could not give a health check or get more information like GPU cores, memory, or
|
||||
// family name. Need to support NVML in the future. But we do not need NVML until
|
||||
// we want more features, features like schedule containers according to GPU family
|
||||
// name.
|
||||
func (ngm *nvidiaGPUManager) discoverGPUs() error {
|
||||
reg := regexp.MustCompile(nvidiaDeviceRE)
|
||||
files, err := ioutil.ReadDir(devDirectory)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, f := range files {
|
||||
if f.IsDir() {
|
||||
continue
|
||||
}
|
||||
if reg.MatchString(f.Name()) {
|
||||
glog.V(2).Infof("Found Nvidia GPU %q", f.Name())
|
||||
ngm.allGPUs.Insert(path.Join(devDirectory, f.Name()))
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// gpusInUse returns a list of GPUs in use along with the respective pods that are using it.
|
||||
func (ngm *nvidiaGPUManager) gpusInUse() *podGPUs {
|
||||
pods := ngm.activePodsLister.GetActivePods()
|
||||
type containerIdentifier struct {
|
||||
id string
|
||||
name string
|
||||
}
|
||||
type podContainers struct {
|
||||
uid string
|
||||
containers []containerIdentifier
|
||||
}
|
||||
// List of containers to inspect.
|
||||
podContainersToInspect := []podContainers{}
|
||||
for _, pod := range pods {
|
||||
containers := sets.NewString()
|
||||
for _, container := range pod.Spec.Containers {
|
||||
// GPUs are expected to be specified only in limits.
|
||||
if !container.Resources.Limits.NvidiaGPU().IsZero() {
|
||||
containers.Insert(container.Name)
|
||||
}
|
||||
}
|
||||
// If no GPUs were requested skip this pod.
|
||||
if containers.Len() == 0 {
|
||||
continue
|
||||
}
|
||||
// TODO: If kubelet restarts right after allocating a GPU to a pod, the container might not have started yet and so container status might not be available yet.
|
||||
// Use an internal checkpoint instead or try using the CRI if its checkpoint is reliable.
|
||||
var containersToInspect []containerIdentifier
|
||||
for _, container := range pod.Status.ContainerStatuses {
|
||||
if containers.Has(container.Name) {
|
||||
containersToInspect = append(containersToInspect, containerIdentifier{strings.Replace(container.ContainerID, "docker://", "", 1), container.Name})
|
||||
}
|
||||
}
|
||||
// add the pod and its containers that need to be inspected.
|
||||
podContainersToInspect = append(podContainersToInspect, podContainers{string(pod.UID), containersToInspect})
|
||||
}
|
||||
ret := newPodGPUs()
|
||||
for _, podContainer := range podContainersToInspect {
|
||||
for _, containerIdentifier := range podContainer.containers {
|
||||
containerJSON, err := ngm.dockerClient.InspectContainer(containerIdentifier.id)
|
||||
if err != nil {
|
||||
glog.V(3).Infof("Failed to inspect container %q in pod %q while attempting to reconcile nvidia gpus in use", containerIdentifier.id, podContainer.uid)
|
||||
continue
|
||||
}
|
||||
|
||||
devices := containerJSON.HostConfig.Devices
|
||||
if devices == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
for _, device := range devices {
|
||||
if isValidPath(device.PathOnHost) {
|
||||
glog.V(4).Infof("Nvidia GPU %q is in use by Docker Container: %q", device.PathOnHost, containerJSON.ID)
|
||||
ret.insert(podContainer.uid, containerIdentifier.name, device.PathOnHost)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret
|
||||
}
|
||||
|
||||
func isValidPath(path string) bool {
|
||||
return regexp.MustCompile(nvidiaFullpathRE).MatchString(path)
|
||||
}
|
||||
211
vendor/k8s.io/kubernetes/pkg/kubelet/gpu/nvidia/nvidia_gpu_manager_test.go
generated
vendored
Normal file
211
vendor/k8s.io/kubernetes/pkg/kubelet/gpu/nvidia/nvidia_gpu_manager_test.go
generated
vendored
Normal file
|
|
@ -0,0 +1,211 @@
|
|||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"os"
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
|
||||
"k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
"k8s.io/apimachinery/pkg/util/uuid"
|
||||
"k8s.io/kubernetes/pkg/kubelet/dockershim/libdocker"
|
||||
)
|
||||
|
||||
type testActivePodsLister struct {
|
||||
activePods []*v1.Pod
|
||||
}
|
||||
|
||||
func (tapl *testActivePodsLister) GetActivePods() []*v1.Pod {
|
||||
return tapl.activePods
|
||||
}
|
||||
|
||||
func makeTestPod(numContainers, gpusPerContainer int) *v1.Pod {
|
||||
quantity := resource.NewQuantity(int64(gpusPerContainer), resource.DecimalSI)
|
||||
resources := v1.ResourceRequirements{
|
||||
Limits: v1.ResourceList{
|
||||
v1.ResourceNvidiaGPU: *quantity,
|
||||
},
|
||||
}
|
||||
pod := &v1.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
UID: uuid.NewUUID(),
|
||||
},
|
||||
Spec: v1.PodSpec{
|
||||
Containers: []v1.Container{},
|
||||
},
|
||||
}
|
||||
for ; numContainers > 0; numContainers-- {
|
||||
pod.Spec.Containers = append(pod.Spec.Containers, v1.Container{
|
||||
Name: string(uuid.NewUUID()),
|
||||
Resources: resources,
|
||||
})
|
||||
}
|
||||
return pod
|
||||
}
|
||||
|
||||
func TestNewNvidiaGPUManager(t *testing.T) {
|
||||
podLister := &testActivePodsLister{}
|
||||
|
||||
// Expects nil GPUManager and an error with nil dockerClient.
|
||||
testGpuManager1, err := NewNvidiaGPUManager(podLister, nil)
|
||||
as := assert.New(t)
|
||||
as.Nil(testGpuManager1)
|
||||
as.NotNil(err)
|
||||
|
||||
// Expects a GPUManager to be created with non-nil dockerClient.
|
||||
fakeDocker := libdocker.NewFakeDockerClient()
|
||||
testGpuManager2, err := NewNvidiaGPUManager(podLister, fakeDocker)
|
||||
as.NotNil(testGpuManager2)
|
||||
as.Nil(err)
|
||||
|
||||
// Expects zero capacity without any GPUs.
|
||||
gpuCapacity := testGpuManager2.Capacity()
|
||||
as.Equal(len(gpuCapacity), 1)
|
||||
rgpu := gpuCapacity[v1.ResourceNvidiaGPU]
|
||||
as.Equal(rgpu.Value(), int64(0))
|
||||
|
||||
err2 := testGpuManager2.Start()
|
||||
if !os.IsNotExist(err2) {
|
||||
gpus := reflect.ValueOf(testGpuManager2).Elem().FieldByName("allGPUs").Len()
|
||||
as.NotZero(gpus)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMultiContainerPodGPUAllocation(t *testing.T) {
|
||||
podLister := &testActivePodsLister{}
|
||||
|
||||
testGpuManager := &nvidiaGPUManager{
|
||||
activePodsLister: podLister,
|
||||
allGPUs: sets.NewString("/dev/nvidia0", "/dev/nvidia1"),
|
||||
allocated: newPodGPUs(),
|
||||
}
|
||||
|
||||
// Expect that no devices are in use.
|
||||
gpusInUse := testGpuManager.gpusInUse()
|
||||
as := assert.New(t)
|
||||
as.Equal(len(gpusInUse.devices()), 0)
|
||||
|
||||
// Allocated GPUs for a pod with two containers.
|
||||
pod := makeTestPod(2, 1)
|
||||
// Allocate for the first container.
|
||||
devices1, err := testGpuManager.AllocateGPU(pod, &pod.Spec.Containers[0])
|
||||
as.Nil(err)
|
||||
as.Equal(len(devices1), 1)
|
||||
|
||||
podLister.activePods = append(podLister.activePods, pod)
|
||||
// Allocate for the second container.
|
||||
devices2, err := testGpuManager.AllocateGPU(pod, &pod.Spec.Containers[1])
|
||||
as.Nil(err)
|
||||
as.Equal(len(devices2), 1)
|
||||
|
||||
as.NotEqual(devices1, devices2, "expected containers to get different devices")
|
||||
|
||||
// further allocations should fail.
|
||||
newPod := makeTestPod(2, 1)
|
||||
devices1, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[0])
|
||||
as.NotNil(err, "expected gpu allocation to fail. got: %v", devices1)
|
||||
|
||||
// Now terminate the original pod and observe that GPU allocation for new pod succeeds.
|
||||
podLister.activePods = podLister.activePods[:0]
|
||||
|
||||
devices1, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[0])
|
||||
as.Nil(err)
|
||||
as.Equal(len(devices1), 1)
|
||||
|
||||
podLister.activePods = append(podLister.activePods, newPod)
|
||||
|
||||
devices2, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[1])
|
||||
as.Nil(err)
|
||||
as.Equal(len(devices2), 1)
|
||||
|
||||
as.NotEqual(devices1, devices2, "expected containers to get different devices")
|
||||
}
|
||||
|
||||
func TestMultiPodGPUAllocation(t *testing.T) {
|
||||
podLister := &testActivePodsLister{}
|
||||
|
||||
testGpuManager := &nvidiaGPUManager{
|
||||
activePodsLister: podLister,
|
||||
allGPUs: sets.NewString("/dev/nvidia0", "/dev/nvidia1"),
|
||||
allocated: newPodGPUs(),
|
||||
}
|
||||
|
||||
// Expect that no devices are in use.
|
||||
gpusInUse := testGpuManager.gpusInUse()
|
||||
as := assert.New(t)
|
||||
as.Equal(len(gpusInUse.devices()), 0)
|
||||
|
||||
// Allocated GPUs for a pod with two containers.
|
||||
podA := makeTestPod(1, 1)
|
||||
// Allocate for the first container.
|
||||
devicesA, err := testGpuManager.AllocateGPU(podA, &podA.Spec.Containers[0])
|
||||
as.Nil(err)
|
||||
as.Equal(len(devicesA), 1)
|
||||
|
||||
podLister.activePods = append(podLister.activePods, podA)
|
||||
|
||||
// further allocations should fail.
|
||||
podB := makeTestPod(1, 1)
|
||||
// Allocate for the first container.
|
||||
devicesB, err := testGpuManager.AllocateGPU(podB, &podB.Spec.Containers[0])
|
||||
as.Nil(err)
|
||||
as.Equal(len(devicesB), 1)
|
||||
as.NotEqual(devicesA, devicesB, "expected pods to get different devices")
|
||||
}
|
||||
|
||||
func TestPodContainerRestart(t *testing.T) {
|
||||
podLister := &testActivePodsLister{}
|
||||
|
||||
testGpuManager := &nvidiaGPUManager{
|
||||
activePodsLister: podLister,
|
||||
allGPUs: sets.NewString("/dev/nvidia0", "/dev/nvidia1"),
|
||||
allocated: newPodGPUs(),
|
||||
defaultDevices: []string{"/dev/nvidia-smi"},
|
||||
}
|
||||
|
||||
// Expect that no devices are in use.
|
||||
gpusInUse := testGpuManager.gpusInUse()
|
||||
as := assert.New(t)
|
||||
as.Equal(len(gpusInUse.devices()), 0)
|
||||
|
||||
// Make a pod with one containers that requests two GPUs.
|
||||
podA := makeTestPod(1, 2)
|
||||
// Allocate GPUs
|
||||
devicesA, err := testGpuManager.AllocateGPU(podA, &podA.Spec.Containers[0])
|
||||
as.Nil(err)
|
||||
as.Equal(len(devicesA), 3)
|
||||
|
||||
podLister.activePods = append(podLister.activePods, podA)
|
||||
|
||||
// further allocations should fail.
|
||||
podB := makeTestPod(1, 1)
|
||||
_, err = testGpuManager.AllocateGPU(podB, &podB.Spec.Containers[0])
|
||||
as.NotNil(err)
|
||||
|
||||
// Allcate GPU for existing Pod A.
|
||||
// The same gpus must be returned.
|
||||
devicesAretry, err := testGpuManager.AllocateGPU(podA, &podA.Spec.Containers[0])
|
||||
as.Nil(err)
|
||||
as.Equal(len(devicesA), 3)
|
||||
as.True(sets.NewString(devicesA...).Equal(sets.NewString(devicesAretry...)))
|
||||
}
|
||||
32
vendor/k8s.io/kubernetes/pkg/kubelet/gpu/types.go
generated
vendored
Normal file
32
vendor/k8s.io/kubernetes/pkg/kubelet/gpu/types.go
generated
vendored
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package gpu
|
||||
|
||||
import "k8s.io/api/core/v1"
|
||||
|
||||
// GPUManager manages GPUs on a local node.
|
||||
// Implementations are expected to be thread safe.
|
||||
type GPUManager interface {
|
||||
// Start logically initializes GPUManager
|
||||
Start() error
|
||||
// Capacity returns the total number of GPUs on the node.
|
||||
Capacity() v1.ResourceList
|
||||
// AllocateGPU attempts to allocate GPUs for input container.
|
||||
// Returns paths to allocated GPUs and nil on success.
|
||||
// Returns an error on failure.
|
||||
AllocateGPU(*v1.Pod, *v1.Container) ([]string, error)
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue