@@ -56,14 +56,16 @@ func (cs *ContainerService) RunGpuContainer(spec *model.ContainerRun) (id, conta
5656		Tty :       true ,
5757	}
5858
59- 	// 只想容器要暴露的端口 ,添加到创建容器的信息中 
59+ 	// 只将容器要暴露的端口 ,添加到创建容器的信息中 
6060	// 具体这个端口要映射到宿主机的哪个端口,交给 runContainer 方法 
6161	// 这样做的好处就是,不管是创建容器、变更容器 GPU/Volume、重启动容器,都无需关心端口的配置 
62- 	hostConfig .PortBindings  =  make (nat.PortMap , len (spec .ContainerPorts ))
63- 	config .ExposedPorts  =  make (nat.PortSet , len (spec .ContainerPorts ))
64- 	for  _ , port  :=  range  spec .ContainerPorts  {
65- 		config .ExposedPorts [nat .Port (port + "/tcp" )] =  struct {}{}
66- 		hostConfig .PortBindings [nat .Port (port + "/tcp" )] =  nil 
62+ 	if  len (spec .ContainerPorts ) >  0  {
63+ 		hostConfig .PortBindings  =  make (nat.PortMap , len (spec .ContainerPorts ))
64+ 		config .ExposedPorts  =  make (nat.PortSet , len (spec .ContainerPorts ))
65+ 		for  _ , port  :=  range  spec .ContainerPorts  {
66+ 			config .ExposedPorts [nat .Port (port + "/tcp" )] =  struct {}{}
67+ 			hostConfig .PortBindings [nat .Port (port + "/tcp" )] =  nil 
68+ 		}
6769	}
6870
6971	// 绑定 GPU 资源信息 
@@ -188,7 +190,8 @@ func (cs *ContainerService) PatchContainerGpuInfo(name string, spec *model.Conta
188190		return  id , newContainerName , errors .WithMessage (err , "json.Unmarshal failed" )
189191	}
190192
191- 	// 只有 etcd 中的 Volume 对象的版本和要修改的 Volume 版本一致时,才能更新 
193+ 	// 只有 etcd 中的 Container 对象的版本和要修改的 Container 版本一致时,才能更新 
194+ 	// 也就是说,当前容器经过更新,版本号已经为 2 了,此时你就不能对版本号为 0、1 的容器进行更新操作 
192195	if  strconv .FormatInt (info .Version , 10 ) !=  strings .Split (name , "-" )[1 ] {
193196		return  id , newContainerName , errors .Wrapf (xerrors .NewVersionNotMatchError (),
194197			"container: %s, etcd version: %d, patch version: %s" , name , info .Version , strings .Split (name , "-" )[1 ])
@@ -254,6 +257,14 @@ func (cs *ContainerService) PatchContainerGpuInfo(name string, spec *model.Conta
254257		OldResource : info .ContainerName ,
255258		NewResource : newContainerName ,
256259	}
260+ 
261+ 	// 停止旧的容器 
262+ 	// 选择不归还 GPU 资源,因为降低 GPU 配置时,已经归还了 GPU 资源。而升级配置时,会使用原有的卡,所以不需要归还 
263+ 	_  =  cs .StopContainer (name , & model.ContainerStop {
264+ 		RestoreGpus :  false ,
265+ 		RestorePorts : true ,
266+ 	})
267+ 
257268	log .Infof ("service.PatchContainerGpuInfo, container: %s patch gpu info successfully" , name )
258269	return 
259270}
@@ -278,7 +289,7 @@ func (cs *ContainerService) PatchContainerVolumeInfo(name string, spec *model.Co
278289		return  id , newContainerName , errors .WithMessage (err , "json.Unmarshal failed" )
279290	}
280291
281- 	// 只有 etcd 中的 Volume  对象的版本和要修改的 Volume  版本一致时,才能更新 
292+ 	// 只有 etcd 中的 Container  对象的版本和要修改的 Container  版本一致时,才能更新 
282293	if  strconv .FormatInt (info .Version , 10 ) !=  strings .Split (name , "-" )[1 ] {
283294		return  id , newContainerName , errors .Wrapf (xerrors .NewVersionNotMatchError (),
284295			"container: %s, etcd version: %d, patch version: %s" , name , info .Version , strings .Split (name , "-" )[1 ])
@@ -305,27 +316,39 @@ func (cs *ContainerService) PatchContainerVolumeInfo(name string, spec *model.Co
305316		NewResource : newContainerName ,
306317	}
307318
319+ 	// 停止旧的容器 
320+ 	// 选择不归还 GPU 资源,因为更改 Volume 配置不涉及 GPU 资源的申请与释放 
321+ 	_  =  cs .StopContainer (name , & model.ContainerStop {
322+ 		RestoreGpus :  false ,
323+ 		RestorePorts : true ,
324+ 	})
325+ 
308326	log .Infof ("service.PatchContainerVolumeInfo, container: %s patch volume info successfully" , name )
309327	return 
310328}
311329
312- // StopContainer 停止容器,会归还端口资源,如果是 GPU 容器,会归还使用的资源 
313- func  (cs  * ContainerService ) StopContainer (name  string ) error  {
314- 	// 归还 gpu 资源 
315- 	uuids , err  :=  cs .containerDeviceRequestsDeviceIDs (name )
316- 	if  err  !=  nil  {
317- 		return  errors .WithMessage (err , "service.containerDeviceRequestsDeviceIDs failed" )
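330+ // StopContainer stops the container.
331+ // spec.RestoreGpus: whether to hand the container's GPU resources back to the GPU scheduler before stopping.
332+ // spec.RestorePorts: whether to hand the container's port resources back to the port scheduler before stopping.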
333+ func  (cs  * ContainerService ) StopContainer (name  string , spec  * model.ContainerStop ) error  {
334+ 	if  spec .RestoreGpus  {
335+ 		// 归还 gpu 资源 
336+ 		uuids , err  :=  cs .containerDeviceRequestsDeviceIDs (name )
337+ 		if  err  !=  nil  {
338+ 			return  errors .WithMessage (err , "service.containerDeviceRequestsDeviceIDs failed" )
339+ 		}
340+ 		gpuscheduler .Scheduler .RestoreGpus (uuids )
341+ 		log .Infof ("service.StopContainer, container: %s restore %d gpus, uuids: %+v" ,
342+ 			name , len (uuids ), uuids )
318343	}
319- 	gpuscheduler .Scheduler .RestoreGpus (uuids )
320- 	log .Infof ("service.StopContainer, container: %s restore %d gpus, uuids: %+v" ,
321- 		name , len (uuids ), uuids )
322- 
323- 	// 归还端口资源 
324- 	ports , err  :=  cs .containerPortBindings (name )
325- 	if  err  !=  nil  {
326- 		return  errors .WithMessage (err , "service.containerPortBindings failed" )
344+ 	if  spec .RestorePorts  {
345+ 		// 归还端口资源 
346+ 		ports , err  :=  cs .containerPortBindings (name )
347+ 		if  err  !=  nil  {
348+ 			return  errors .WithMessage (err , "service.containerPortBindings failed" )
349+ 		}
350+ 		portscheduler .Scheduler .RestorePorts (ports )
327351	}
328- 	portscheduler .Scheduler .RestorePorts (ports )
329352
330353	// 停止容器 
331354	ctx  :=  context .Background ()
@@ -423,6 +446,18 @@ func (cs *ContainerService) CommitContainer(name string, spec model.ContainerCom
423446	return  imageName , err 
424447}
425448
449+ func  (cs  * ContainerService ) GetContainerInfo (name  string ) (info  model.EtcdContainerInfo , err  error ) {
450+ 	infoBytes , err  :=  etcd .Get (etcd .Containers , name )
451+ 	if  err  !=  nil  {
452+ 		return  info , errors .WithMessage (err , "etcd.Get failed" )
453+ 	}
454+ 
455+ 	if  err  =  json .Unmarshal (infoBytes , & info ); err  !=  nil  {
456+ 		return  info , errors .WithMessage (err , "json.Unmarshal failed" )
457+ 	}
458+ 	return 
459+ }
460+ 
426461// 真正创建容器和启动容器的方法,这个方法不区分是用来创建 GPU 容器还是普通容器,因为它只会根据入参来创建容器 
427462// 用于创建容器、变更容器的 GPU 信息、变更容器的 Volume 信息、重启动 GPU 容器等 
428463func  (cs  * ContainerService ) runContainer (ctx  context.Context , name  string , info  model.EtcdContainerInfo ) (id , containerName  string , err  error ) {
@@ -450,18 +485,22 @@ func (cs *ContainerService) runContainer(ctx context.Context, name string, info
450485	// 生成此次要创建的容器的名称 
451486	containerName  =  fmt .Sprintf ("%s-%d" , name , version )
452487
453- 	availableOSPorts , err  :=  portscheduler .Scheduler .ApplyPorts (len (info .HostConfig .PortBindings ))
454- 	if  err  !=  nil  {
455- 		return  id , containerName , errors .Wrapf (err , "portscheduler.ApplyPorts failed, info: %+v" , info )
488+ 	// 申请宿主机端口 
489+ 	if  info .HostConfig .PortBindings  !=  nil  &&  len (info .HostConfig .PortBindings ) >  0  {
490+ 		availableOSPorts , err  :=  portscheduler .Scheduler .ApplyPorts (len (info .HostConfig .PortBindings ))
491+ 		if  err  !=  nil  {
492+ 			return  id , containerName , errors .Wrapf (err , "portscheduler.ApplyPorts failed, info: %+v" , info )
493+ 		}
494+ 		var  index  int 
495+ 		for  k  :=  range  info .HostConfig .PortBindings  {
496+ 			info .HostConfig .PortBindings [k ] =  []nat.PortBinding {{
497+ 				HostPort : strconv .Itoa (availableOSPorts [index ]),
498+ 			}}
499+ 			index ++ 
500+ 		}
456501	}
457502
458- 	var  index  int 
459- 	for  k  :=  range  info .HostConfig .PortBindings  {
460- 		info .HostConfig .PortBindings [k ] =  []nat.PortBinding {{
461- 			HostPort : strconv .Itoa (availableOSPorts [index ]),
462- 		}}
463- 		index ++ 
464- 	}
503+ 	// 创建容器 
465504	resp , err  :=  docker .Cli .ContainerCreate (ctx , info .Config , info .HostConfig , info .NetworkingConfig , info .Platform , containerName )
466505	if  err  !=  nil  {
467506		return  id , containerName , errors .Wrapf (err , "docker.ContainerCreate failed, name: %s" , containerName )
@@ -521,6 +560,7 @@ func (cs *ContainerService) containerDeviceRequestsDeviceIDs(name string) ([]str
521560	return  resp .HostConfig .DeviceRequests [0 ].DeviceIDs , nil 
522561}
523562
563+ // 获取容器的端口绑定信息 
524564func  (cs  * ContainerService ) containerPortBindings (name  string ) ([]int , error ) {
525565	ctx  :=  context .Background ()
526566	resp , err  :=  docker .Cli .ContainerInspect (ctx , name )
0 commit comments