Merge pull request #83 from smartnodes-lab/testnet-v2

mattjhawken · web-flow · commit 5aa9c0073291 · 2025-10-15T12:52:09.000-04:00
Added expected model output for HF models
diff --git a/tensorlink/ml/module.py b/tensorlink/ml/module.py
@@ -734,7 +734,11 @@ def _initialize_distribution(self):
                     module["training"] = self.training
 
         else:
-            distribution = {"model_name": self.model}
+            distribution = {
+                "model_name": self.model,
+                "training": self.training,
+                "optimizer": optimizer_type,
+            }
 
         # Request job from network
         distributed_config = self.node.send_request(
diff --git a/tensorlink/ml/utils.py b/tensorlink/ml/utils.py
@@ -873,25 +873,49 @@ def combine_micro_batches(micro_batches):
 def replace_output_with_custom_grad(combined_output, custom_grad_output):
     """
     Replace the main output tensor (logits, last_hidden_state, etc.) in the combined_output
-    with the custom_grad_output, preserving the original structure.
+    with the custom_grad_output, preserving structure and returning a ModelOutput when possible.
+
+    A tensor input is returned as the replacement tensor, a ModelOutput subclass is rebuilt
+    with its own class, and a plain dict is wrapped in a generic ModelOutput.
+    Raises TypeError for any other output type.
     """
-    if hasattr(combined_output, "logits"):
-        return combined_output.__class__(
-            **{**combined_output, "logits": custom_grad_output}
-        )
-    elif hasattr(combined_output, "last_hidden_state"):
-        return combined_output.__class__(
-            **{**combined_output, "last_hidden_state": custom_grad_output}
-        )
-    elif isinstance(combined_output, torch.Tensor):
+    # If the combined output is already a tensor
+    if isinstance(combined_output, torch.Tensor):
         return custom_grad_output
-    else:
-        # For custom ModelOutput-like structures, replace the first tensor found
-        for key, value in combined_output.items():
-            if isinstance(value, torch.Tensor):
-                combined_output[key] = custom_grad_output
-                break
-        return combined_output
+
+    # Handle ModelOutput subclasses (SequenceClassifierOutput, etc.)
+    if isinstance(combined_output, ModelOutput):
+        # Copy through the mapping interface (the same one the previous implementation
+        # unpacked with **); the Hugging Face ModelOutput API has to_tuple() but no to_dict().
+        data = dict(combined_output.items())
+        if "logits" in data:
+            data["logits"] = custom_grad_output
+        elif "last_hidden_state" in data:
+            data["last_hidden_state"] = custom_grad_output
+        else:
+            # No well-known field: replace the first tensor-valued entry
+            for k, v in data.items():
+                if isinstance(v, torch.Tensor):
+                    data[k] = custom_grad_output
+                    break
+        return combined_output.__class__(**data)
+
+    # Handle dict outputs
+    if isinstance(combined_output, dict):
+        new_output = dict(combined_output)
+        if "logits" in new_output:
+            new_output["logits"] = custom_grad_output
+        elif "last_hidden_state" in new_output:
+            new_output["last_hidden_state"] = custom_grad_output
+        else:
+            for k, v in new_output.items():
+                if isinstance(v, torch.Tensor):
+                    new_output[k] = custom_grad_output
+                    break
+        # Wrap dict in a generic ModelOutput for consistency
+        return ModelOutput(**new_output)
+
+    raise TypeError(f"Unsupported output type: {type(combined_output)}")
 
 
 def split_into_micro_batches(combined_output, n_micro_batch):
diff --git a/tensorlink/nodes/user.py b/tensorlink/nodes/user.py
@@ -293,7 +293,7 @@ def request_job(self, n_pipelines, dp_factor, distribution, training):
         # self.debug_print("request_job: Job requested on Smart Contract!")
         # validator_ids = self.contract.functions.getJobValidators(job_id).call()
         validator_ids = [random.choice(self.validators)]
-        if len(distribution) != 1:
+        if not distribution.get("model_name"):
             # The case where we have a custom model with distributed config
             distribution = {
                 k: v for k, v in distribution.items() if v["type"] == "offloaded"
@@ -322,6 +322,7 @@ def request_job(self, n_pipelines, dp_factor, distribution, training):
             }
         else:
             # The case where we have a huggingface model name for inference
+            optimizer_type = distribution.get("optimizer")
             job_request = {
                 "author": self.rsa_key_hash,
                 "active": True,
@@ -334,6 +335,7 @@ def request_job(self, n_pipelines, dp_factor, distribution, training):
                 "distribution": {},
                 "n_workers": 0,
                 "model_name": distribution.get("model_name"),
+                "optimizer": f"{optimizer_type.__module__}.{optimizer_type.__name__}",
                 "seed_validators": validator_ids,
             }
 
diff --git a/tensorlink/nodes/validator.py b/tensorlink/nodes/validator.py
@@ -355,7 +355,7 @@ def create_hf_job(self, job_info: dict, requesters_ip: str = None):
 
         # Huggingface model info checks
         (vram, ram) = estimate_hf_model_memory(
-            job_info.get("model_name"), training=False
+            job_info.get("model_name"), training=job_info.get("training", False)
         )
 
         if job_info.get("payment", 0) == 0:
@@ -367,6 +367,7 @@ def create_hf_job(self, job_info: dict, requesters_ip: str = None):
         job_data["ram"] = ram
         job_data["vram"] = vram
         job_data["time"] = _time
+
         # Hand off model dissection and worker assignment to DistributedValidator process
         request_value = "HF-JOB-REQ" + json.dumps(job_data)
         self._store_request(self.rsa_key_hash, request_value)
@@ -527,7 +528,6 @@ def check_job_availability(self, job_data: dict):
                         tag="Validator",
                     )
                     return False
-
         return assigned_workers
 
     def create_base_job(self, job_data: dict):
diff --git a/tensorlink/p2p/torch_node.py b/tensorlink/p2p/torch_node.py
@@ -157,7 +157,7 @@ def _handle_optimizer_response(self, data: bytes, node: Connection):
             node.ghosts += 1
             return False
         else:
-            module_id, response_type = json.dumps(data[18:]).encode()
+            module_id, response_type = json.loads(data[18:])
 
             if response_type == "loaded":
                 self.debug_print(