Skip to content

Commit ddc2ee5

Browse files
committed
feat: change the logic for docker stop and patch container or volume
1. The stop API now lets you choose whether to restore GPU or port resources by using the RestoreGpus and RestorePorts parameters. 2. When you patch a container, the older version of the container is stopped and its port resources are restored.
1 parent 3dc98aa commit ddc2ee5

File tree

3 files changed

+88
-36
lines changed

3 files changed

+88
-36
lines changed

internal/api/container.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,14 @@ func (ch *ContainerHandler) stop(c *gin.Context) {
248248
ResponseError(c, CodeContainerNameMustContainVersion)
249249
}
250250

251-
if err := cs.StopContainer(name); err != nil {
251+
var spec model.ContainerStop
252+
if err := c.ShouldBindJSON(&spec); err != nil {
253+
log.Error("failed to stop container, error:", err.Error())
254+
ResponseError(c, CodeInvalidParams)
255+
return
256+
}
257+
258+
if err := cs.StopContainer(name, &spec); err != nil {
252259
log.Errorf("service.StopContainer failed, original error: %T %v", errors.Cause(err), err)
253260
log.Errorf("stack trace: \n%+v\n", err)
254261
ResponseError(c, CodeContainerStopFailed)

internal/model/container.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,15 @@ type ContainerVolumePatch struct {
3030
}
3131

3232
type ContainerDelete struct {
33-
Force bool `json:"force"`
34-
DelEtcdInfoAndVersionRecord bool `json:"delEtcdInfoAndVersionRecord"`
33+
Force bool `json:"force,omitempty"`
34+
DelEtcdInfoAndVersionRecord bool `json:"delEtcdInfoAndVersionRecord,omitempty"`
3535
}
3636

3737
// ContainerCommit is the JSON payload accepted when committing a container.
// NewImageName is decoded from the "newImageName" key; presumably it is the
// name given to the image produced by the commit — confirm against
// ContainerService.CommitContainer.
type ContainerCommit struct {
3838
NewImageName string `json:"newImageName"`
3939
}
40+
41+
// ContainerStop carries the options for stopping a container.
// It is bound from the JSON request body of the stop API and passed to
// ContainerService.StopContainer. RestoreGpus controls whether the GPUs used
// by the container are handed back to the GPU scheduler, and RestorePorts
// controls whether its bound host ports are handed back to the port
// scheduler. Both fields default to false when omitted from the JSON.
type ContainerStop struct {
42+
RestoreGpus bool `json:"restoreGpus,omitempty"`
43+
RestorePorts bool `json:"restorePorts,omitempty"`
44+
}

internal/service/container.go

Lines changed: 73 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -56,14 +56,16 @@ func (cs *ContainerService) RunGpuContainer(spec *model.ContainerRun) (id, conta
5656
Tty: true,
5757
}
5858

59-
// 只想容器要暴露的端口,添加到创建容器的信息中
59+
// 只将容器要暴露的端口,添加到创建容器的信息中
6060
// 具体这个端口要映射到宿主机的哪个端口,交给 runContainer 方法
6161
// 这样做的好处就是,不管是创建容器、变更容器 GPU/Volume、重启动容器,都无需关心端口的配置
62-
hostConfig.PortBindings = make(nat.PortMap, len(spec.ContainerPorts))
63-
config.ExposedPorts = make(nat.PortSet, len(spec.ContainerPorts))
64-
for _, port := range spec.ContainerPorts {
65-
config.ExposedPorts[nat.Port(port+"/tcp")] = struct{}{}
66-
hostConfig.PortBindings[nat.Port(port+"/tcp")] = nil
62+
if len(spec.ContainerPorts) > 0 {
63+
hostConfig.PortBindings = make(nat.PortMap, len(spec.ContainerPorts))
64+
config.ExposedPorts = make(nat.PortSet, len(spec.ContainerPorts))
65+
for _, port := range spec.ContainerPorts {
66+
config.ExposedPorts[nat.Port(port+"/tcp")] = struct{}{}
67+
hostConfig.PortBindings[nat.Port(port+"/tcp")] = nil
68+
}
6769
}
6870

6971
// 绑定 GPU 资源信息
@@ -188,7 +190,8 @@ func (cs *ContainerService) PatchContainerGpuInfo(name string, spec *model.Conta
188190
return id, newContainerName, errors.WithMessage(err, "json.Unmarshal failed")
189191
}
190192

191-
// 只有 etcd 中的 Volume 对象的版本和要修改的 Volume 版本一致时,才能更新
193+
// 只有 etcd 中的 Container 对象的版本和要修改的 Container 版本一致时,才能更新
194+
// 也就是说,当前容器经过更新,版本号已经为 2 了,此时你就不能对版本号为 0、1 的容器进行更新操作
192195
if strconv.FormatInt(info.Version, 10) != strings.Split(name, "-")[1] {
193196
return id, newContainerName, errors.Wrapf(xerrors.NewVersionNotMatchError(),
194197
"container: %s, etcd version: %d, patch version: %s", name, info.Version, strings.Split(name, "-")[1])
@@ -254,6 +257,14 @@ func (cs *ContainerService) PatchContainerGpuInfo(name string, spec *model.Conta
254257
OldResource: info.ContainerName,
255258
NewResource: newContainerName,
256259
}
260+
261+
// 停止旧的容器
262+
// 选择不归还 GPU 资源,因为降低 GPU 配置时,已经归还了 GPU 资源。而升级配置时,会使用原有的卡,所以不需要归还
263+
_ = cs.StopContainer(name, &model.ContainerStop{
264+
RestoreGpus: false,
265+
RestorePorts: true,
266+
})
267+
257268
log.Infof("service.PatchContainerGpuInfo, container: %s patch gpu info successfully", name)
258269
return
259270
}
@@ -278,7 +289,7 @@ func (cs *ContainerService) PatchContainerVolumeInfo(name string, spec *model.Co
278289
return id, newContainerName, errors.WithMessage(err, "json.Unmarshal failed")
279290
}
280291

281-
// 只有 etcd 中的 Volume 对象的版本和要修改的 Volume 版本一致时,才能更新
292+
// 只有 etcd 中的 Container 对象的版本和要修改的 Container 版本一致时,才能更新
282293
if strconv.FormatInt(info.Version, 10) != strings.Split(name, "-")[1] {
283294
return id, newContainerName, errors.Wrapf(xerrors.NewVersionNotMatchError(),
284295
"container: %s, etcd version: %d, patch version: %s", name, info.Version, strings.Split(name, "-")[1])
@@ -305,27 +316,39 @@ func (cs *ContainerService) PatchContainerVolumeInfo(name string, spec *model.Co
305316
NewResource: newContainerName,
306317
}
307318

319+
// 停止旧的容器
320+
// 选择不归还 GPU 资源,因为更改 Volume 配置不涉及 GPU 资源的申请与释放
321+
_ = cs.StopContainer(name, &model.ContainerStop{
322+
RestoreGpus: false,
323+
RestorePorts: true,
324+
})
325+
308326
log.Infof("service.PatchContainerVolumeInfo, container: %s patch volume info successfully", name)
309327
return
310328
}
311329

312-
// StopContainer 停止容器,会归还端口资源,如果是 GPU 容器,会归还使用的资源
313-
func (cs *ContainerService) StopContainer(name string) error {
314-
// 归还 gpu 资源
315-
uuids, err := cs.containerDeviceRequestsDeviceIDs(name)
316-
if err != nil {
317-
return errors.WithMessage(err, "service.containerDeviceRequestsDeviceIDs failed")
330+
// StopContainer 停止容器
331+
// restoreGpus 是否释放 gpu 资源
332+
// restorePorts 是否释放端口资源
333+
func (cs *ContainerService) StopContainer(name string, spec *model.ContainerStop) error {
334+
if spec.RestoreGpus {
335+
// 归还 gpu 资源
336+
uuids, err := cs.containerDeviceRequestsDeviceIDs(name)
337+
if err != nil {
338+
return errors.WithMessage(err, "service.containerDeviceRequestsDeviceIDs failed")
339+
}
340+
gpuscheduler.Scheduler.RestoreGpus(uuids)
341+
log.Infof("service.StopContainer, container: %s restore %d gpus, uuids: %+v",
342+
name, len(uuids), uuids)
318343
}
319-
gpuscheduler.Scheduler.RestoreGpus(uuids)
320-
log.Infof("service.StopContainer, container: %s restore %d gpus, uuids: %+v",
321-
name, len(uuids), uuids)
322-
323-
// 归还端口资源
324-
ports, err := cs.containerPortBindings(name)
325-
if err != nil {
326-
return errors.WithMessage(err, "service.containerPortBindings failed")
344+
if spec.RestorePorts {
345+
// 归还端口资源
346+
ports, err := cs.containerPortBindings(name)
347+
if err != nil {
348+
return errors.WithMessage(err, "service.containerPortBindings failed")
349+
}
350+
portscheduler.Scheduler.RestorePorts(ports)
327351
}
328-
portscheduler.Scheduler.RestorePorts(ports)
329352

330353
// 停止容器
331354
ctx := context.Background()
@@ -423,6 +446,18 @@ func (cs *ContainerService) CommitContainer(name string, spec model.ContainerCom
423446
return imageName, err
424447
}
425448

449+
// GetContainerInfo reads the record stored for name under etcd.Containers and
// decodes it from JSON into a model.EtcdContainerInfo.
//
// It returns a wrapped error if the etcd lookup fails ("etcd.Get failed") or
// if the stored bytes cannot be unmarshalled ("json.Unmarshal failed"); in
// either case the returned info must not be relied upon.
func (cs *ContainerService) GetContainerInfo(name string) (info model.EtcdContainerInfo, err error) {
450+
infoBytes, err := etcd.Get(etcd.Containers, name)
451+
if err != nil {
452+
return info, errors.WithMessage(err, "etcd.Get failed")
453+
}
454+
455+
if err = json.Unmarshal(infoBytes, &info); err != nil {
456+
return info, errors.WithMessage(err, "json.Unmarshal failed")
457+
}
458+
return
459+
}
460+
426461
// 真正创建容器和启动容器的方法,这个方法不区分是用来创建 GPU 容器还是普通容器,因为它只会根据入参来创建容器
427462
// 用于创建容器、变更容器的 GPU 信息、变更容器的 Volume 信息、重启动 GPU 容器等
428463
func (cs *ContainerService) runContainer(ctx context.Context, name string, info model.EtcdContainerInfo) (id, containerName string, err error) {
@@ -450,18 +485,22 @@ func (cs *ContainerService) runContainer(ctx context.Context, name string, info
450485
// 生成此次要创建的容器的名称
451486
containerName = fmt.Sprintf("%s-%d", name, version)
452487

453-
availableOSPorts, err := portscheduler.Scheduler.ApplyPorts(len(info.HostConfig.PortBindings))
454-
if err != nil {
455-
return id, containerName, errors.Wrapf(err, "portscheduler.ApplyPorts failed, info: %+v", info)
488+
// 申请宿主机端口
489+
if info.HostConfig.PortBindings != nil && len(info.HostConfig.PortBindings) > 0 {
490+
availableOSPorts, err := portscheduler.Scheduler.ApplyPorts(len(info.HostConfig.PortBindings))
491+
if err != nil {
492+
return id, containerName, errors.Wrapf(err, "portscheduler.ApplyPorts failed, info: %+v", info)
493+
}
494+
var index int
495+
for k := range info.HostConfig.PortBindings {
496+
info.HostConfig.PortBindings[k] = []nat.PortBinding{{
497+
HostPort: strconv.Itoa(availableOSPorts[index]),
498+
}}
499+
index++
500+
}
456501
}
457502

458-
var index int
459-
for k := range info.HostConfig.PortBindings {
460-
info.HostConfig.PortBindings[k] = []nat.PortBinding{{
461-
HostPort: strconv.Itoa(availableOSPorts[index]),
462-
}}
463-
index++
464-
}
503+
// 创建容器
465504
resp, err := docker.Cli.ContainerCreate(ctx, info.Config, info.HostConfig, info.NetworkingConfig, info.Platform, containerName)
466505
if err != nil {
467506
return id, containerName, errors.Wrapf(err, "docker.ContainerCreate failed, name: %s", containerName)
@@ -521,6 +560,7 @@ func (cs *ContainerService) containerDeviceRequestsDeviceIDs(name string) ([]str
521560
return resp.HostConfig.DeviceRequests[0].DeviceIDs, nil
522561
}
523562

563+
// 获取容器的端口绑定信息
524564
func (cs *ContainerService) containerPortBindings(name string) ([]int, error) {
525565
ctx := context.Background()
526566
resp, err := docker.Cli.ContainerInspect(ctx, name)

0 commit comments

Comments
 (0)