Skip to content

Commit 652a009

Browse files
Distinguishing between replica rank and group rank across the project (#181) (#187)
* Distinguishing between replica rank and group rank across the project (#181)
* lint

---------

Co-authored-by: Tristan Rice <[email protected]>
1 parent 93c230b commit 652a009

File tree

9 files changed

+162
-154
lines changed

9 files changed

+162
-154
lines changed

proto/torchft.proto

+5-5
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ service LighthouseService {
7373
}
7474

7575
message ManagerQuorumRequest {
76-
int64 rank = 1;
76+
int64 group_rank = 1;
7777
int64 step = 2;
7878
string checkpoint_metadata = 3;
7979
bool shrink_only = 4;
@@ -84,12 +84,12 @@ message ManagerQuorumRequest {
8484
message ManagerQuorumResponse {
8585
int64 quorum_id = 1;
8686
string recover_src_manager_address = 2;
87-
optional int64 recover_src_rank = 3;
88-
repeated int64 recover_dst_ranks = 4;
87+
optional int64 recover_src_replica_rank = 3;
88+
repeated int64 recover_dst_replica_ranks = 4;
8989
string store_address = 5;
9090
// This information is for the replicas that are at the max step.
9191
int64 max_step = 6;
92-
optional int64 max_rank = 7;
92+
optional int64 max_replica_rank = 7;
9393
int64 max_world_size = 8;
9494
// This information is for all replicas, including replicas that are behind.
9595
int64 replica_rank = 9;
@@ -108,7 +108,7 @@ message CheckpointMetadataResponse {
108108

109109
message ShouldCommitRequest {
110110
bool should_commit = 1;
111-
int64 rank = 2;
111+
int64 group_rank = 2;
112112
int64 step = 3;
113113
}
114114
message ShouldCommitResponse {

src/lib.rs

+13-13
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ impl ManagerClient {
172172
fn _quorum(
173173
&self,
174174
py: Python<'_>,
175-
rank: i64,
175+
group_rank: i64,
176176
step: i64,
177177
checkpoint_metadata: String,
178178
shrink_only: bool,
@@ -182,7 +182,7 @@ impl ManagerClient {
182182
) -> Result<QuorumResult, StatusError> {
183183
py.allow_threads(move || {
184184
let mut request = tonic::Request::new(ManagerQuorumRequest {
185-
rank: rank,
185+
group_rank: group_rank,
186186
step: step,
187187
checkpoint_metadata: checkpoint_metadata,
188188
shrink_only: shrink_only,
@@ -201,11 +201,11 @@ impl ManagerClient {
201201
replica_rank: resp.replica_rank,
202202
replica_world_size: resp.replica_world_size,
203203
recover_src_manager_address: resp.recover_src_manager_address,
204-
recover_src_rank: resp.recover_src_rank,
205-
recover_dst_ranks: resp.recover_dst_ranks,
204+
recover_src_replica_rank: resp.recover_src_replica_rank,
205+
recover_dst_replica_ranks: resp.recover_dst_replica_ranks,
206206
store_address: resp.store_address,
207207
max_step: resp.max_step,
208-
max_rank: resp.max_rank,
208+
max_replica_rank: resp.max_replica_rank,
209209
max_world_size: resp.max_world_size,
210210
heal: resp.heal,
211211
})
@@ -250,14 +250,14 @@ impl ManagerClient {
250250
fn should_commit(
251251
&self,
252252
py: Python<'_>,
253-
rank: i64,
253+
group_rank: i64,
254254
step: i64,
255255
should_commit: bool,
256256
timeout: Duration,
257257
) -> Result<bool, StatusError> {
258258
py.allow_threads(move || {
259259
let mut request = tonic::Request::new(ShouldCommitRequest {
260-
rank: rank,
260+
group_rank: group_rank,
261261
step: step,
262262
should_commit: should_commit,
263263
});
@@ -281,11 +281,11 @@ struct QuorumResult {
281281
replica_rank: i64,
282282
replica_world_size: i64,
283283
recover_src_manager_address: String,
284-
recover_src_rank: Option<i64>,
285-
recover_dst_ranks: Vec<i64>,
284+
recover_src_replica_rank: Option<i64>,
285+
recover_dst_replica_ranks: Vec<i64>,
286286
store_address: String,
287287
max_step: i64,
288-
max_rank: Option<i64>,
288+
max_replica_rank: Option<i64>,
289289
max_world_size: i64,
290290
heal: bool,
291291
}
@@ -299,11 +299,11 @@ impl QuorumResult {
299299
replica_rank: 0,
300300
replica_world_size: 1,
301301
recover_src_manager_address: "".to_string(),
302-
recover_src_rank: None,
303-
recover_dst_ranks: Vec::new(),
302+
recover_src_replica_rank: None,
303+
recover_dst_replica_ranks: Vec::new(),
304304
store_address: "".to_string(),
305305
max_step: 0,
306-
max_rank: None,
306+
max_replica_rank: None,
307307
max_world_size: 1,
308308
heal: false,
309309
}

0 commit comments

Comments
 (0)