@@ -56,14 +56,16 @@ func (cs *ContainerService) RunGpuContainer(spec *model.ContainerRun) (id, conta
56
56
Tty : true ,
57
57
}
58
58
59
- // 只想容器要暴露的端口 ,添加到创建容器的信息中
59
+ // 只将容器要暴露的端口 ,添加到创建容器的信息中
60
60
// 具体这个端口要映射到宿主机的哪个端口,交给 runContainer 方法
61
61
// 这样做的好处就是,不管是创建容器、变更容器 GPU/Volume、重启动容器,都无需关心端口的配置
62
- hostConfig .PortBindings = make (nat.PortMap , len (spec .ContainerPorts ))
63
- config .ExposedPorts = make (nat.PortSet , len (spec .ContainerPorts ))
64
- for _ , port := range spec .ContainerPorts {
65
- config .ExposedPorts [nat .Port (port + "/tcp" )] = struct {}{}
66
- hostConfig .PortBindings [nat .Port (port + "/tcp" )] = nil
62
+ if len (spec .ContainerPorts ) > 0 {
63
+ hostConfig .PortBindings = make (nat.PortMap , len (spec .ContainerPorts ))
64
+ config .ExposedPorts = make (nat.PortSet , len (spec .ContainerPorts ))
65
+ for _ , port := range spec .ContainerPorts {
66
+ config .ExposedPorts [nat .Port (port + "/tcp" )] = struct {}{}
67
+ hostConfig .PortBindings [nat .Port (port + "/tcp" )] = nil
68
+ }
67
69
}
68
70
69
71
// 绑定 GPU 资源信息
@@ -188,7 +190,8 @@ func (cs *ContainerService) PatchContainerGpuInfo(name string, spec *model.Conta
188
190
return id , newContainerName , errors .WithMessage (err , "json.Unmarshal failed" )
189
191
}
190
192
191
- // 只有 etcd 中的 Volume 对象的版本和要修改的 Volume 版本一致时,才能更新
193
+ // 只有 etcd 中的 Container 对象的版本和要修改的 Container 版本一致时,才能更新
194
+ // 也就是说,当前容器经过更新,版本号已经为 2 了,此时你就不能对版本号为 0、1 的容器进行更新操作
192
195
if strconv .FormatInt (info .Version , 10 ) != strings .Split (name , "-" )[1 ] {
193
196
return id , newContainerName , errors .Wrapf (xerrors .NewVersionNotMatchError (),
194
197
"container: %s, etcd version: %d, patch version: %s" , name , info .Version , strings .Split (name , "-" )[1 ])
@@ -254,6 +257,14 @@ func (cs *ContainerService) PatchContainerGpuInfo(name string, spec *model.Conta
254
257
OldResource : info .ContainerName ,
255
258
NewResource : newContainerName ,
256
259
}
260
+
261
+ // 停止旧的容器
262
+ // 选择不归还 GPU 资源,因为降低 GPU 配置时,已经归还了 GPU 资源。而升级配置时,会使用原有的卡,所以不需要归还
263
+ _ = cs .StopContainer (name , & model.ContainerStop {
264
+ RestoreGpus : false ,
265
+ RestorePorts : true ,
266
+ })
267
+
257
268
log .Infof ("service.PatchContainerGpuInfo, container: %s patch gpu info successfully" , name )
258
269
return
259
270
}
@@ -278,7 +289,7 @@ func (cs *ContainerService) PatchContainerVolumeInfo(name string, spec *model.Co
278
289
return id , newContainerName , errors .WithMessage (err , "json.Unmarshal failed" )
279
290
}
280
291
281
- // 只有 etcd 中的 Volume 对象的版本和要修改的 Volume 版本一致时,才能更新
292
+ // 只有 etcd 中的 Container 对象的版本和要修改的 Container 版本一致时,才能更新
282
293
if strconv .FormatInt (info .Version , 10 ) != strings .Split (name , "-" )[1 ] {
283
294
return id , newContainerName , errors .Wrapf (xerrors .NewVersionNotMatchError (),
284
295
"container: %s, etcd version: %d, patch version: %s" , name , info .Version , strings .Split (name , "-" )[1 ])
@@ -305,27 +316,39 @@ func (cs *ContainerService) PatchContainerVolumeInfo(name string, spec *model.Co
305
316
NewResource : newContainerName ,
306
317
}
307
318
319
+ // 停止旧的容器
320
+ // 选择不归还 GPU 资源,因为更改 Volume 配置不涉及 GPU 资源的申请与释放
321
+ _ = cs .StopContainer (name , & model.ContainerStop {
322
+ RestoreGpus : false ,
323
+ RestorePorts : true ,
324
+ })
325
+
308
326
log .Infof ("service.PatchContainerVolumeInfo, container: %s patch volume info successfully" , name )
309
327
return
310
328
}
311
329
312
- // StopContainer 停止容器,会归还端口资源,如果是 GPU 容器,会归还使用的资源
313
- func (cs * ContainerService ) StopContainer (name string ) error {
314
- // 归还 gpu 资源
315
- uuids , err := cs .containerDeviceRequestsDeviceIDs (name )
316
- if err != nil {
317
- return errors .WithMessage (err , "service.containerDeviceRequestsDeviceIDs failed" )
330
+ // StopContainer 停止容器
331
+ // spec.RestoreGpus 是否释放 gpu 资源
332
+ // spec.RestorePorts 是否释放端口资源
333
+ func (cs * ContainerService ) StopContainer (name string , spec * model.ContainerStop ) error {
334
+ if spec .RestoreGpus {
335
+ // 归还 gpu 资源
336
+ uuids , err := cs .containerDeviceRequestsDeviceIDs (name )
337
+ if err != nil {
338
+ return errors .WithMessage (err , "service.containerDeviceRequestsDeviceIDs failed" )
339
+ }
340
+ gpuscheduler .Scheduler .RestoreGpus (uuids )
341
+ log .Infof ("service.StopContainer, container: %s restore %d gpus, uuids: %+v" ,
342
+ name , len (uuids ), uuids )
318
343
}
319
- gpuscheduler .Scheduler .RestoreGpus (uuids )
320
- log .Infof ("service.StopContainer, container: %s restore %d gpus, uuids: %+v" ,
321
- name , len (uuids ), uuids )
322
-
323
- // 归还端口资源
324
- ports , err := cs .containerPortBindings (name )
325
- if err != nil {
326
- return errors .WithMessage (err , "service.containerPortBindings failed" )
344
+ if spec .RestorePorts {
345
+ // 归还端口资源
346
+ ports , err := cs .containerPortBindings (name )
347
+ if err != nil {
348
+ return errors .WithMessage (err , "service.containerPortBindings failed" )
349
+ }
350
+ portscheduler .Scheduler .RestorePorts (ports )
327
351
}
328
- portscheduler .Scheduler .RestorePorts (ports )
329
352
330
353
// 停止容器
331
354
ctx := context .Background ()
@@ -423,6 +446,18 @@ func (cs *ContainerService) CommitContainer(name string, spec model.ContainerCom
423
446
return imageName , err
424
447
}
425
448
449
+ func (cs * ContainerService ) GetContainerInfo (name string ) (info model.EtcdContainerInfo , err error ) {
450
+ infoBytes , err := etcd .Get (etcd .Containers , name )
451
+ if err != nil {
452
+ return info , errors .WithMessage (err , "etcd.Get failed" )
453
+ }
454
+
455
+ if err = json .Unmarshal (infoBytes , & info ); err != nil {
456
+ return info , errors .WithMessage (err , "json.Unmarshal failed" )
457
+ }
458
+ return
459
+ }
460
+
426
461
// 真正创建容器和启动容器的方法,这个方法不区分是用来创建 GPU 容器还是普通容器,因为它只会根据入参来创建容器
427
462
// 用于创建容器、变更容器的 GPU 信息、变更容器的 Volume 信息、重启动 GPU 容器等
428
463
func (cs * ContainerService ) runContainer (ctx context.Context , name string , info model.EtcdContainerInfo ) (id , containerName string , err error ) {
@@ -450,18 +485,22 @@ func (cs *ContainerService) runContainer(ctx context.Context, name string, info
450
485
// 生成此次要创建的容器的名称
451
486
containerName = fmt .Sprintf ("%s-%d" , name , version )
452
487
453
- availableOSPorts , err := portscheduler .Scheduler .ApplyPorts (len (info .HostConfig .PortBindings ))
454
- if err != nil {
455
- return id , containerName , errors .Wrapf (err , "portscheduler.ApplyPorts failed, info: %+v" , info )
488
+ // 申请宿主机端口
489
+ if info .HostConfig .PortBindings != nil && len (info .HostConfig .PortBindings ) > 0 {
490
+ availableOSPorts , err := portscheduler .Scheduler .ApplyPorts (len (info .HostConfig .PortBindings ))
491
+ if err != nil {
492
+ return id , containerName , errors .Wrapf (err , "portscheduler.ApplyPorts failed, info: %+v" , info )
493
+ }
494
+ var index int
495
+ for k := range info .HostConfig .PortBindings {
496
+ info .HostConfig .PortBindings [k ] = []nat.PortBinding {{
497
+ HostPort : strconv .Itoa (availableOSPorts [index ]),
498
+ }}
499
+ index ++
500
+ }
456
501
}
457
502
458
- var index int
459
- for k := range info .HostConfig .PortBindings {
460
- info .HostConfig .PortBindings [k ] = []nat.PortBinding {{
461
- HostPort : strconv .Itoa (availableOSPorts [index ]),
462
- }}
463
- index ++
464
- }
503
+ // 创建容器
465
504
resp , err := docker .Cli .ContainerCreate (ctx , info .Config , info .HostConfig , info .NetworkingConfig , info .Platform , containerName )
466
505
if err != nil {
467
506
return id , containerName , errors .Wrapf (err , "docker.ContainerCreate failed, name: %s" , containerName )
@@ -521,6 +560,7 @@ func (cs *ContainerService) containerDeviceRequestsDeviceIDs(name string) ([]str
521
560
return resp .HostConfig .DeviceRequests [0 ].DeviceIDs , nil
522
561
}
523
562
563
+ // 获取容器的端口绑定信息
524
564
func (cs * ContainerService ) containerPortBindings (name string ) ([]int , error ) {
525
565
ctx := context .Background ()
526
566
resp , err := docker .Cli .ContainerInspect (ctx , name )
0 commit comments