Skip to content

feat: add metadata field to scores #1153

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions langfuse/api/resources/commons/types/base_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class BaseScore(pydantic_v1.BaseModel):
alias="authorUserId", default=None
)
comment: typing.Optional[str] = None
metadata: typing.Optional[typing.Any] = None
config_id: typing.Optional[str] = pydantic_v1.Field(alias="configId", default=None)
"""
Reference a score config on a score. When set, config and score name must be equal and value must comply to optionally defined numerical range
Expand Down
3 changes: 3 additions & 0 deletions langfuse/api/resources/commons/types/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ class Score_Numeric(pydantic_v1.BaseModel):
alias="authorUserId", default=None
)
comment: typing.Optional[str] = None
metadata: typing.Optional[typing.Any] = None
config_id: typing.Optional[str] = pydantic_v1.Field(alias="configId", default=None)
queue_id: typing.Optional[str] = pydantic_v1.Field(alias="queueId", default=None)
environment: typing.Optional[str] = None
Expand Down Expand Up @@ -84,6 +85,7 @@ class Score_Categorical(pydantic_v1.BaseModel):
alias="authorUserId", default=None
)
comment: typing.Optional[str] = None
metadata: typing.Optional[typing.Any] = None
config_id: typing.Optional[str] = pydantic_v1.Field(alias="configId", default=None)
queue_id: typing.Optional[str] = pydantic_v1.Field(alias="queueId", default=None)
environment: typing.Optional[str] = None
Expand Down Expand Up @@ -142,6 +144,7 @@ class Score_Boolean(pydantic_v1.BaseModel):
alias="authorUserId", default=None
)
comment: typing.Optional[str] = None
metadata: typing.Optional[typing.Any] = None
config_id: typing.Optional[str] = pydantic_v1.Field(alias="configId", default=None)
queue_id: typing.Optional[str] = pydantic_v1.Field(alias="queueId", default=None)
environment: typing.Optional[str] = None
Expand Down
1 change: 1 addition & 0 deletions langfuse/api/resources/ingestion/types/score_body.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class ScoreBody(pydantic_v1.BaseModel):
alias="observationId", default=None
)
comment: typing.Optional[str] = None
metadata: typing.Optional[typing.Any] = None
data_type: typing.Optional[ScoreDataType] = pydantic_v1.Field(
alias="dataType", default=None
)
Expand Down
1 change: 1 addition & 0 deletions langfuse/api/resources/score/types/create_score_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class CreateScoreRequest(pydantic_v1.BaseModel):
alias="observationId", default=None
)
comment: typing.Optional[str] = None
metadata: typing.Optional[typing.Any] = None
environment: typing.Optional[str] = pydantic_v1.Field(default=None)
"""
The environment of the score. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class GetScoresResponseData_Numeric(pydantic_v1.BaseModel):
alias="authorUserId", default=None
)
comment: typing.Optional[str] = None
metadata: typing.Optional[typing.Any] = None
config_id: typing.Optional[str] = pydantic_v1.Field(alias="configId", default=None)
queue_id: typing.Optional[str] = pydantic_v1.Field(alias="queueId", default=None)
environment: typing.Optional[str] = None
Expand Down Expand Up @@ -87,6 +88,7 @@ class GetScoresResponseData_Categorical(pydantic_v1.BaseModel):
alias="authorUserId", default=None
)
comment: typing.Optional[str] = None
metadata: typing.Optional[typing.Any] = None
config_id: typing.Optional[str] = pydantic_v1.Field(alias="configId", default=None)
queue_id: typing.Optional[str] = pydantic_v1.Field(alias="queueId", default=None)
environment: typing.Optional[str] = None
Expand Down Expand Up @@ -146,6 +148,7 @@ class GetScoresResponseData_Boolean(pydantic_v1.BaseModel):
alias="authorUserId", default=None
)
comment: typing.Optional[str] = None
metadata: typing.Optional[typing.Any] = None
config_id: typing.Optional[str] = pydantic_v1.Field(alias="configId", default=None)
queue_id: typing.Optional[str] = pydantic_v1.Field(alias="queueId", default=None)
environment: typing.Optional[str] = None
Expand Down
10 changes: 10 additions & 0 deletions langfuse/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -1573,6 +1573,7 @@ def score(
trace_id: typing.Optional[str] = None,
id: typing.Optional[str] = None,
comment: typing.Optional[str] = None,
metadata: typing.Optional[typing.Any] = None,
observation_id: typing.Optional[str] = None,
config_id: typing.Optional[str] = None,
**kwargs,
Expand All @@ -1588,6 +1589,7 @@ def score(
trace_id: typing.Optional[str] = None,
id: typing.Optional[str] = None,
comment: typing.Optional[str] = None,
metadata: typing.Optional[typing.Any] = None,
observation_id: typing.Optional[str] = None,
config_id: typing.Optional[str] = None,
**kwargs,
Expand All @@ -1602,6 +1604,7 @@ def score(
trace_id: typing.Optional[str] = None,
id: typing.Optional[str] = None,
comment: typing.Optional[str] = None,
metadata: typing.Optional[typing.Any] = None,
observation_id: typing.Optional[str] = None,
config_id: typing.Optional[str] = None,
**kwargs,
Expand All @@ -1616,6 +1619,7 @@ def score(
trace_id (str): The id of the trace to which the score should be attached.
id (Optional[str]): The id of the score. If not provided, a new UUID is generated.
comment (Optional[str]): Additional context/explanation of the score.
metadata (Optional[Any]): Additional metadata of the score. Can be any JSON object. Metadata is merged when being updated via the API.
observation_id (Optional[str]): The id of the observation to which the score should be attached.
config_id (Optional[str]): The id of the score config. When set, the score value is validated against the config. Defaults to None.
**kwargs: Additional keyword arguments to include in the score.
Expand Down Expand Up @@ -1655,6 +1659,7 @@ def score(
"value": value,
"data_type": data_type,
"comment": comment,
"metadata": metadata,
"config_id": config_id,
"environment": self.environment,
**kwargs,
Expand Down Expand Up @@ -2415,6 +2420,7 @@ def score(
value: float,
data_type: typing.Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
comment: typing.Optional[str] = None,
metadata: typing.Optional[typing.Any] = None,
config_id: typing.Optional[str] = None,
**kwargs,
) -> "StatefulClient": ...
Expand All @@ -2428,6 +2434,7 @@ def score(
value: str,
data_type: typing.Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
comment: typing.Optional[str] = None,
metadata: typing.Optional[typing.Any] = None,
config_id: typing.Optional[str] = None,
**kwargs,
) -> "StatefulClient": ...
Expand All @@ -2440,6 +2447,7 @@ def score(
value: typing.Union[float, str],
data_type: typing.Optional[ScoreDataType] = None,
comment: typing.Optional[str] = None,
metadata: typing.Optional[typing.Any] = None,
config_id: typing.Optional[str] = None,
**kwargs,
) -> "StatefulClient":
Expand All @@ -2451,6 +2459,7 @@ def score(
data_type (Optional[ScoreDataType]): The data type of the score. When not set, the data type is inferred from the score config's data type, when present.
When no config is set, the data type is inferred from the value's type, i.e. float values are categorized as numeric scores and string values as categorical scores.
comment (Optional[str]): Additional context/explanation of the score.
metadata (Optional[Any]): Additional metadata of the score. Can be any JSON object. Metadata is merged when being updated via the API.
id (Optional[str]): The id of the score. If not provided, a new UUID is generated.
config_id (Optional[str]): The id of the score config. When set, the score value is validated against the config. Defaults to None.
**kwargs: Additional keyword arguments to include in the score.
Expand Down Expand Up @@ -2484,6 +2493,7 @@ def score(
"value": value,
"data_type": data_type,
"comment": comment,
"metadata": metadata,
"config_id": config_id,
"environment": self.environment,
**kwargs,
Expand Down
5 changes: 4 additions & 1 deletion langfuse/extract_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,10 @@ def _extract_model_name(


def _extract_model_from_repr_by_pattern(
id: str, serialized: Optional[Dict[str, Any]], pattern: str, default: Optional[str] = None
id: str,
serialized: Optional[Dict[str, Any]],
pattern: str,
default: Optional[str] = None,
):
if serialized is None:
return None
Expand Down
59 changes: 51 additions & 8 deletions tests/test_core_sdk.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,7 +502,8 @@ def test_score_trace():
trace_id=langfuse.get_trace_id(),
name="valuation",
value=0.5,
comment="This is a comment",
comment="tests/test_core_sdk.py::test_score_trace",
metadata={"key": "value"},
)

langfuse.flush()
Expand All @@ -519,9 +520,10 @@ def test_score_trace():

assert score["name"] == "valuation"
assert score["value"] == 0.5
assert score["comment"] == "This is a comment"
assert score["comment"] == "tests/test_core_sdk.py::test_score_trace"
assert score["observationId"] is None
assert score["dataType"] == "NUMERIC"
assert score["metadata"] == {"key": "value"}


def test_score_trace_nested_trace():
Expand All @@ -534,7 +536,8 @@ def test_score_trace_nested_trace():
trace.score(
name="valuation",
value=0.5,
comment="This is a comment",
comment="tests/test_core_sdk.py::test_score_trace_nested_trace",
metadata={"key": "value"},
)

langfuse.flush()
Expand All @@ -551,9 +554,10 @@ def test_score_trace_nested_trace():

assert score.name == "valuation"
assert score.value == 0.5
assert score.comment == "This is a comment"
assert score.comment == "tests/test_core_sdk.py::test_score_trace_nested_trace"
assert score.observation_id is None
assert score.data_type == "NUMERIC"
assert score.metadata == {"key": "value"}


def test_score_trace_nested_observation():
Expand All @@ -567,7 +571,8 @@ def test_score_trace_nested_observation():
span.score(
name="valuation",
value=0.5,
comment="This is a comment",
comment="tests/test_core_sdk.py::test_score_trace_nested_observation",
metadata={"key": "value"},
)

langfuse.flush()
Expand All @@ -584,9 +589,12 @@ def test_score_trace_nested_observation():

assert score.name == "valuation"
assert score.value == 0.5
assert score.comment == "This is a comment"
assert (
score.comment == "tests/test_core_sdk.py::test_score_trace_nested_observation"
)
assert score.observation_id == span.id
assert score.data_type == "NUMERIC"
assert score.metadata == {"key": "value"}


def test_score_span():
Expand All @@ -610,7 +618,8 @@ def test_score_span():
observation_id=spanId,
name="valuation",
value=1,
comment="This is a comment",
comment="tests/test_core_sdk.py::test_score_span",
metadata={"key": "value"},
)

langfuse.flush()
Expand All @@ -626,9 +635,10 @@ def test_score_span():

assert score["name"] == "valuation"
assert score["value"] == 1
assert score["comment"] == "This is a comment"
assert score["comment"] == "tests/test_core_sdk.py::test_score_span"
assert score["observationId"] == spanId
assert score["dataType"] == "NUMERIC"
assert score["metadata"] == {"key": "value"}


def test_create_trace_and_span():
Expand Down Expand Up @@ -1584,3 +1594,36 @@ def test_environment_from_env_var(monkeypatch):

fetched_trace = api_wrapper.get_trace(trace.id)
assert fetched_trace["environment"] == "testing"


@pytest.mark.parametrize(
    "input_metadata, expected_metadata",
    [
        ("Test Metadata", {"metadata": "Test Metadata"}),
        (1, {"metadata": 1}),
        (1.0, {"metadata": 1.0}),
        ({"key": "value"}, {"key": "value"}),
        (["value1", "value2"], {"metadata": ["value1", "value2"]}),
    ],
)
def test_metadata(input_metadata, expected_metadata):
    """Metadata set on a trace, an observation, and a score should round-trip
    through the API.

    The parametrized cases assert that dict metadata is stored as-is, while
    non-dict values (str, int, float, list) come back wrapped under a
    ``metadata`` key — presumably normalized server-side (TODO confirm).
    """
    client = Langfuse(debug=True)
    api = LangfuseAPI()

    # Attach identical metadata at all three levels.
    trace = client.trace(name="test_metadata", metadata=input_metadata)
    generation = trace.generation(name="test_gen", metadata=input_metadata)
    trace.score(name="test_score", value=1, metadata=input_metadata)

    client.flush()
    sleep(1)  # give async ingestion time to land before fetching

    fetched_trace = api.get_trace(trace.id)
    fetched_generation = api.get_observation(generation.id)
    fetched_score = fetched_trace["scores"][0]

    assert fetched_trace["metadata"] == expected_metadata
    assert fetched_generation["metadata"] == expected_metadata
    assert fetched_score["metadata"] == expected_metadata
Loading