Skip to content

Commit 50ad821

Browse files
committed
[Feat] ✨ Add Mock GPU for OnlyCPU Node #7
1 parent 6305d0f commit 50ad821

File tree

9 files changed

+609
-128
lines changed

9 files changed

+609
-128
lines changed

.github/workflows/build.yml

Lines changed: 47 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,17 @@ jobs:
2727
COMMIT=$(git rev-parse --short HEAD)
2828
GO_VERSION=$(go env GOVERSION)
2929
BUILD_TIME=$(date +%FT%T%z)
30-
GOOS=linux GOARCH=amd64 go build -ldflags="-s -w -X main.BRANCH=${BRANCH} -X main.VERSION=${VERSION} -X main.COMMIT=${COMMIT} -X main.GoVersion=${GO_VERSION} -X main.BuildTime=${BUILD_TIME}" -trimpath -o gpu-docker-api cmd/gpu-docker-api/main.go && tar -zcvf gpu-docker-api-linux-amd64.tar.gz gpu-docker-api
30+
GOOS=linux GOARCH=amd64 go build -ldflags="-s -w -X main.BRANCH=${BRANCH} -X main.VERSION=${VERSION} -X main.COMMIT=${COMMIT} -X main.GoVersion=${GO_VERSION} -X main.BuildTime=${BUILD_TIME}" -tags "nvidia" -trimpath -o gpu-docker-api cmd/gpu-docker-api/main.go && tar -zcvf gpu-docker-api-nvidia-linux-amd64.tar.gz gpu-docker-api
31+
GOOS=linux GOARCH=amd64 go build -ldflags="-s -w -X main.BRANCH=${BRANCH} -X main.VERSION=${VERSION} -X main.COMMIT=${COMMIT} -X main.GoVersion=${GO_VERSION} -X main.BuildTime=${BUILD_TIME}" -tags "mock" -trimpath -o gpu-docker-api cmd/gpu-docker-api/main.go && tar -zcvf gpu-docker-api-mock-linux-amd64.tar.gz gpu-docker-api
3132
- name: Create Release and Upload Release Asset
3233
uses: softprops/action-gh-release@v1
3334
with:
3435
files: |
35-
gpu-docker-api-linux-amd64.tar.gz
36+
gpu-docker-api-nvidia-linux-amd64.tar.gz
37+
gpu-docker-api-mock-linux-amd64.tar.gz
3638
37-
docker-build:
38-
name: docker-build
39+
docker-build-nvidia:
40+
name: docker-build-nvidia
3941
runs-on: ubuntu-latest
4042
permissions:
4143
packages: write
@@ -69,4 +71,45 @@ jobs:
6971
push: true
7072
tags: |
7173
docker.io/xshengtech/gpu-docker-api:${{ github.ref_name }}
74+
docker.io/xshengtech/gpu-docker-api:${{ github.ref_name }}-nvidia
7275
docker.io/xshengtech/gpu-docker-api:latest
76+
docker.io/xshengtech/gpu-docker-api:latest-nvidia
77+
docker.io/xshengtech/gpu-docker-api:nvidia
78+
79+
docker-build-mock:
80+
name: docker-build-mock
81+
runs-on: ubuntu-latest
82+
permissions:
83+
packages: write
84+
contents: read
85+
steps:
86+
- uses: actions/checkout@v4
87+
88+
- name: Setup timezone
89+
uses: zcong1993/setup-timezone@master
90+
with:
91+
timezone: Asia/Shanghai
92+
93+
- name: Login to DockerHub
94+
uses: docker/login-action@v2
95+
with:
96+
username: ${{ secrets.DOCKERHUB_USERNAME }}
97+
password: ${{ secrets.DOCKERHUB_TOKEN }}
98+
99+
- name: Set up QEMU
100+
uses: docker/setup-qemu-action@v2
101+
102+
- name: Set up Docker Buildx
103+
uses: docker/setup-buildx-action@v2
104+
105+
- name: Build and push
106+
uses: docker/build-push-action@v4
107+
with:
108+
platforms: linux/amd64
109+
context: .
110+
file: ./Dockerfile.mock
111+
push: true
112+
tags: |
113+
docker.io/xshengtech/gpu-docker-api:${{ github.ref_name }}-mock
114+
docker.io/xshengtech/gpu-docker-api:latest-mock
115+
docker.io/xshengtech/gpu-docker-api:mock

Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,14 @@ COPY go.sum .
1515
RUN go mod download
1616

1717
COPY . .
18-
RUN make linux
18+
RUN make nvidia_linux
1919

2020
FROM ubuntu:22.04
2121

2222
VOLUME /data
2323
WORKDIR /data
2424

25-
COPY --from=builder /build/bin/gpu-docker-api-linux-amd64 /data/gpu-docker-api
25+
COPY --from=builder /build/bin/gpu-docker-api-nvidia-linux-amd64 /data/gpu-docker-api
2626

2727
EXPOSE 2378
2828

Dockerfile.mock

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
FROM golang:1.23.6-alpine AS builder
2+
LABEL stage=gobuilder \
3+
maintainer=https://github.com/XShengTech/gpu-docker-api
4+
5+
# RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.tuna.tsinghua.edu.cn/g' /etc/apk/repositories
6+
RUN apk add gcc g++ make libffi-dev openssl-dev libtool git
7+
8+
ENV CGO_ENABLED=0
9+
# ENV GOPROXY=https://goproxy.cn,direct
10+
11+
WORKDIR /build
12+
13+
COPY go.mod .
14+
COPY go.sum .
15+
RUN go mod download
16+
17+
COPY . .
18+
RUN make mock_linux
19+
20+
FROM ubuntu:22.04
21+
22+
VOLUME /data
23+
WORKDIR /data
24+
25+
COPY --from=builder /build/bin/gpu-docker-api-mock-linux-amd64 /data/gpu-docker-api
26+
27+
EXPOSE 2378
28+
29+
ENTRYPOINT ["./gpu-docker-api"]

Makefile

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,30 +13,38 @@ CURRENT_DIR = $(shell pwd)
1313
BUILD_DIR = ${CURRENT_DIR}/cmd/${BINARY}
1414
BIN_DIR= ${CURRENT_DIR}/bin
1515

16+
NVIDIA = nvidia
17+
MOCK = mock
18+
1619
LDFLAGS = -ldflags "-s -w -X main.BRANCH=${BRANCH} -X main.VERSION=${VERSION} -X main.COMMIT=${COMMIT} -X main.GoVersion=${GO_VERSION} -X main.BuildTime=${BUILD_TIME}"
1720

1821
all: fmt imports clean nvidia_linux nvidia_darwin nvidia_windows mock_linux
1922

2023
build: clean nvidia_linux nvidia_darwin nvidia_windows mock_linux
2124

22-
linux:
25+
nvidia_linux:
26+
cd ${BUILD_DIR}; \
27+
GOOS=linux GOARCH=${GOARCH} go build ${LDFLAGS} -tags "${NVIDIA}" -o ${BIN_DIR}/${BINARY}-${NVIDIA}-linux-${GOARCH} . ; \
28+
cd - >/dev/null
29+
30+
nvidia_linux_no_ldflags:
2331
cd ${BUILD_DIR}; \
24-
GOOS=linux GOARCH=${GOARCH} go build ${LDFLAGS} -o ${BIN_DIR}/${BINARY}-linux-${GOARCH} . ; \
32+
GOOS=linux GOARCH=${GOARCH} go build -tags "${NVIDIA}" -o ${BIN_DIR}/${BINARY}-${NVIDIA}-linux-${GOARCH} . ; \
2533
cd - >/dev/null
2634

27-
linux_no_ldflags:
35+
nvidia_darwin:
2836
cd ${BUILD_DIR}; \
29-
GOOS=linux GOARCH=${GOARCH} go build -o ${BIN_DIR}/${BINARY}-linux-${GOARCH} . ; \
37+
GOOS=darwin GOARCH=${GOARCH} go build ${LDFLAGS} -tags "${NVIDIA}" -o ${BIN_DIR}/${BINARY}-${NVIDIA}-darwin-${GOARCH} . ; \
3038
cd - >/dev/null
3139

32-
darwin:
40+
nvidia_windows:
3341
cd ${BUILD_DIR}; \
34-
GOOS=darwin GOARCH=${GOARCH} go build ${LDFLAGS} -o ${BIN_DIR}/${BINARY}-darwin-${GOARCH} . ; \
42+
GOOS=windows GOARCH=${GOARCH} go build ${LDFLAGS} -tags "${NVIDIA}" -o ${BIN_DIR}/${BINARY}-${NVIDIA}-windows-${GOARCH}.exe . ; \
3543
cd - >/dev/null
3644

37-
windows:
45+
mock_linux:
3846
cd ${BUILD_DIR}; \
39-
GOOS=windows GOARCH=${GOARCH} go build ${LDFLAGS} -o ${BIN_DIR}/${BINARY}-windows-${GOARCH}.exe . ; \
47+
GOOS=linux GOARCH=${GOARCH} go build ${LDFLAGS} -tags "${MOCK}" -o ${BIN_DIR}/${BINARY}-${MOCK}-linux-${GOARCH} . ; \
4048
cd - >/dev/null
4149

4250
docker_build:

internal/schedulers/gpuscheduler.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
//go:build !mock
2+
13
package schedulers
24

35
import (
Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
//go:build mock
2+
3+
package schedulers
4+
5+
import (
6+
"encoding/json"
7+
"strconv"
8+
"strings"
9+
"sync"
10+
11+
"github.com/pkg/errors"
12+
13+
"github.com/mayooot/gpu-docker-api/internal/etcd"
14+
"github.com/mayooot/gpu-docker-api/internal/workQueue"
15+
"github.com/mayooot/gpu-docker-api/internal/xerrors"
16+
)
17+
18+
// gpuStatusMapKey is the key under which the serialized scheduler state is
// read from and written to the etcd.Gpus resource.
const gpuStatusMapKey = "gpuStatusMapKey"

// GpuScheduler is the package-level scheduler instance. It is assigned by
// InitGPuScheduler and is nil until then.
var GpuScheduler *gpuScheduler

// gpu describes a single GPU device by its index and UUID.
type gpu struct {
	Index int     `json:"index"`
	UUID  *string `json:"uuid"`
}

// gpuScheduler hands out GPUs by UUID. The embedded RWMutex guards
// GpuStatusMap in the methods of this type.
type gpuScheduler struct {
	sync.RWMutex

	// AvailableGpuNums is set at initialization to the number of GPUs found.
	AvailableGpuNums int `json:"availableGpuNums"`
	// GpuStatusMap maps a GPU UUID to its status: 0 means free, 1 means in use.
	GpuStatusMap map[string]byte `json:"gpuStatusMap"`
}
35+
36+
func InitGPuScheduler() error {
37+
var err error
38+
GpuScheduler, err = initGpuFormEtcd()
39+
if err != nil {
40+
return errors.Wrap(err, "initFormEtcd failed")
41+
}
42+
43+
if GpuScheduler.AvailableGpuNums == 0 || len(GpuScheduler.GpuStatusMap) == 0 {
44+
// if it has not been initialized
45+
gpus, err := getAllGpuUUID()
46+
if err != nil {
47+
return errors.Wrap(err, "getAllGpuUUID failed")
48+
}
49+
50+
GpuScheduler.AvailableGpuNums = len(gpus)
51+
for i := 0; i < len(gpus); i++ {
52+
GpuScheduler.GpuStatusMap[*gpus[i].UUID] = 0
53+
}
54+
}
55+
return nil
56+
}
57+
58+
func CloseGpuScheduler() error {
59+
return etcd.Put(etcd.Gpus, gpuStatusMapKey, GpuScheduler.serialize())
60+
}
61+
62+
func initGpuFormEtcd() (s *gpuScheduler, err error) {
63+
bytes, err := etcd.GetValue(etcd.Gpus, gpuStatusMapKey)
64+
if err != nil {
65+
if xerrors.IsNotExistInEtcdError(err) {
66+
err = nil
67+
} else {
68+
return s, err
69+
}
70+
}
71+
72+
s = &gpuScheduler{
73+
GpuStatusMap: make(map[string]byte),
74+
}
75+
if len(bytes) != 0 {
76+
err = json.Unmarshal(bytes, &s)
77+
}
78+
return s, err
79+
}
80+
81+
// Apply for a specified number of gpus
82+
func (gs *gpuScheduler) Apply(num int) ([]string, error) {
83+
if num <= 0 || num > gs.AvailableGpuNums {
84+
return nil, errors.New("num must be greater than 0 and less than " + strconv.Itoa(gs.AvailableGpuNums))
85+
}
86+
87+
gs.Lock()
88+
defer gs.Unlock()
89+
90+
var availableGpus []string
91+
for k, v := range gs.GpuStatusMap {
92+
if v == 0 {
93+
gs.GpuStatusMap[k] = 1
94+
availableGpus = append(availableGpus, k)
95+
if len(availableGpus) == num {
96+
break
97+
}
98+
}
99+
}
100+
101+
if len(availableGpus) < num {
102+
gs.restore(availableGpus)
103+
return nil, xerrors.NewGpuNotEnoughError()
104+
}
105+
106+
go gs.putToEtcd()
107+
108+
return availableGpus, nil
109+
}
110+
111+
// Restore a specified number of gpu
112+
func (gs *gpuScheduler) Restore(gpus []string) {
113+
if len(gpus) <= 0 || len(gpus) > gs.AvailableGpuNums {
114+
return
115+
}
116+
117+
gs.Lock()
118+
defer gs.Unlock()
119+
120+
gs.restore(gpus)
121+
122+
go gs.putToEtcd()
123+
}
124+
125+
func (gs *gpuScheduler) restore(gpus []string) {
126+
if len(gpus) <= 0 || len(gpus) > gs.AvailableGpuNums {
127+
return
128+
}
129+
130+
for _, gpu := range gpus {
131+
gs.GpuStatusMap[gpu] = 0
132+
}
133+
}
134+
135+
func (gs *gpuScheduler) serialize() *string {
136+
gs.RLock()
137+
defer gs.RUnlock()
138+
139+
bytes, _ := json.Marshal(gs)
140+
tmp := string(bytes)
141+
return &tmp
142+
}
143+
144+
func (gs *gpuScheduler) GetGpuStatus() map[string]byte {
145+
gs.RLock()
146+
defer gs.RUnlock()
147+
148+
copyMap := make(map[string]byte, len(gs.GpuStatusMap))
149+
for k, v := range gs.GpuStatusMap {
150+
copyMap[k] = v
151+
}
152+
153+
return copyMap
154+
}
155+
156+
func (gs *gpuScheduler) putToEtcd() {
157+
workQueue.Queue <- etcd.PutKeyValue{
158+
Resource: etcd.Gpus,
159+
Key: gpuStatusMapKey,
160+
Value: GpuScheduler.serialize(),
161+
}
162+
}
163+
164+
func getAllGpuUUID() ([]*gpu, error) {
165+
uuids := []string{
166+
"MockGPU-0",
167+
"MockGPU-1",
168+
"MockGPU-2",
169+
"MockGPU-3",
170+
"MockGPU-4",
171+
"MockGPU-5",
172+
"MockGPU-6",
173+
"MockGPU-7",
174+
}
175+
gpuList := []*gpu{}
176+
for i, uuid := range uuids {
177+
gpuList = append(gpuList, &gpu{
178+
Index: i,
179+
UUID: &uuid,
180+
})
181+
}
182+
183+
return gpuList, nil
184+
}
185+
186+
func parseOutput(output string) (gpuList []*gpu, err error) {
187+
lines := strings.Split(output, "\n")
188+
gpuList = make([]*gpu, 0, len(lines))
189+
for _, line := range lines {
190+
if line == "" {
191+
continue
192+
}
193+
194+
fields := strings.Split(line, ", ")
195+
if len(fields) == 2 {
196+
index, err := strconv.Atoi(fields[0])
197+
if err != nil {
198+
return gpuList, errors.Errorf("invaild index: %s, ", fields[0])
199+
}
200+
uuid := "nvidia.com/gpu=" + fields[1]
201+
gpuList = append(gpuList, &gpu{
202+
Index: index,
203+
UUID: &uuid,
204+
})
205+
}
206+
}
207+
return
208+
}

0 commit comments

Comments
 (0)