Skip to content

Commit 8aed1fc

Browse files
committed
Merge remote-tracking branch 'origin' into HTTP/2_Multi_Room_Lighthouse
2 parents 273d3ee + dafb968 commit 8aed1fc

21 files changed

+1381
-192
lines changed

.lintrunner.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,5 +65,6 @@ init_command = [
6565
'pip_init',
6666
'--dry-run={{DRYRUN}}',
6767
'pyre-check==0.9.23',
68+
'click==8.1.0',
6869
]
6970
is_formatter = false

proto/torchft.proto

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ service LighthouseService {
7373
}
7474

7575
message ManagerQuorumRequest {
76-
int64 rank = 1;
76+
int64 group_rank = 1;
7777
int64 step = 2;
7878
string checkpoint_metadata = 3;
7979
bool shrink_only = 4;
@@ -84,12 +84,12 @@ message ManagerQuorumRequest {
8484
message ManagerQuorumResponse {
8585
int64 quorum_id = 1;
8686
string recover_src_manager_address = 2;
87-
optional int64 recover_src_rank = 3;
88-
repeated int64 recover_dst_ranks = 4;
87+
optional int64 recover_src_replica_rank = 3;
88+
repeated int64 recover_dst_replica_ranks = 4;
8989
string store_address = 5;
9090
// These are information for the replicas which are at the max step.
9191
int64 max_step = 6;
92-
optional int64 max_rank = 7;
92+
optional int64 max_replica_rank = 7;
9393
int64 max_world_size = 8;
9494
// These are information for all replicas including behind replicas.
9595
int64 replica_rank = 9;
@@ -108,7 +108,7 @@ message CheckpointMetadataResponse {
108108

109109
message ShouldCommitRequest {
110110
bool should_commit = 1;
111-
int64 rank = 2;
111+
int64 group_rank = 2;
112112
int64 step = 3;
113113
}
114114
message ShouldCommitResponse {

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ Issues = "https://github.com/pytorch-labs/torchft/issues"
2222

2323
[project.optional-dependencies]
2424
dev = [
25-
"pytest",
25+
"pytest==8.3.4",
2626
"pytest-timeout",
2727
"black",
2828
"pyre-check",

src/lib.rs

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ impl ManagerClient {
181181
fn _quorum(
182182
&self,
183183
py: Python<'_>,
184-
rank: i64,
184+
group_rank: i64,
185185
step: i64,
186186
checkpoint_metadata: String,
187187
shrink_only: bool,
@@ -191,7 +191,7 @@ impl ManagerClient {
191191
) -> Result<QuorumResult, StatusError> {
192192
py.allow_threads(move || {
193193
let mut request = tonic::Request::new(ManagerQuorumRequest {
194-
rank: rank,
194+
group_rank: group_rank,
195195
step: step,
196196
checkpoint_metadata: checkpoint_metadata,
197197
shrink_only: shrink_only,
@@ -210,11 +210,11 @@ impl ManagerClient {
210210
replica_rank: resp.replica_rank,
211211
replica_world_size: resp.replica_world_size,
212212
recover_src_manager_address: resp.recover_src_manager_address,
213-
recover_src_rank: resp.recover_src_rank,
214-
recover_dst_ranks: resp.recover_dst_ranks,
213+
recover_src_replica_rank: resp.recover_src_replica_rank,
214+
recover_dst_replica_ranks: resp.recover_dst_replica_ranks,
215215
store_address: resp.store_address,
216216
max_step: resp.max_step,
217-
max_rank: resp.max_rank,
217+
max_replica_rank: resp.max_replica_rank,
218218
max_world_size: resp.max_world_size,
219219
heal: resp.heal,
220220
})
@@ -259,14 +259,14 @@ impl ManagerClient {
259259
fn should_commit(
260260
&self,
261261
py: Python<'_>,
262-
rank: i64,
262+
group_rank: i64,
263263
step: i64,
264264
should_commit: bool,
265265
timeout: Duration,
266266
) -> Result<bool, StatusError> {
267267
py.allow_threads(move || {
268268
let mut request = tonic::Request::new(ShouldCommitRequest {
269-
rank: rank,
269+
group_rank: group_rank,
270270
step: step,
271271
should_commit: should_commit,
272272
});
@@ -290,11 +290,11 @@ struct QuorumResult {
290290
replica_rank: i64,
291291
replica_world_size: i64,
292292
recover_src_manager_address: String,
293-
recover_src_rank: Option<i64>,
294-
recover_dst_ranks: Vec<i64>,
293+
recover_src_replica_rank: Option<i64>,
294+
recover_dst_replica_ranks: Vec<i64>,
295295
store_address: String,
296296
max_step: i64,
297-
max_rank: Option<i64>,
297+
max_replica_rank: Option<i64>,
298298
max_world_size: i64,
299299
heal: bool,
300300
}
@@ -308,11 +308,11 @@ impl QuorumResult {
308308
replica_rank: 0,
309309
replica_world_size: 1,
310310
recover_src_manager_address: "".to_string(),
311-
recover_src_rank: None,
312-
recover_dst_ranks: Vec::new(),
311+
recover_src_replica_rank: None,
312+
recover_dst_replica_ranks: Vec::new(),
313313
store_address: "".to_string(),
314314
max_step: 0,
315-
max_rank: None,
315+
max_replica_rank: None,
316316
max_world_size: 1,
317317
heal: false,
318318
}

0 commit comments

Comments
 (0)