diff --git a/CHANGELOG.md b/CHANGELOG.md
index 196f8a26..6a2a7482 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@ Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## Unreleased
 #### Added
+- Add memory analysis utility for trace internal representation
 - Add trace format validator
 - Added multiple trace filter classes and demos.
 - Added enhanced trace call stack graph implementation.
diff --git a/hta/utils/memory_utils.py b/hta/utils/memory_utils.py
new file mode 100644
index 00000000..5de938f2
--- /dev/null
+++ b/hta/utils/memory_utils.py
@@ -0,0 +1,64 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import pandas as pd
+
+# Number of bytes in one megabyte as used in this module (1 MB = 1024 * 1024 B).
+_BYTES_PER_MB = 1024 * 1024
+
+
+def get_memory_usage_in_MB(df: pd.DataFrame) -> float:
+    """Get the memory usage of a trace dataframe in megabytes (MB).
+
+    Args:
+        df (pd.DataFrame): The dataframe to measure.
+
+    Returns:
+        float: Total size of all columns and the index in MB. Sizes are
+        measured with ``deep=True``, i.e. including the objects referenced
+        by object-dtype columns.
+    """
+    total_memory = df.memory_usage(deep=True).sum()
+    return total_memory / _BYTES_PER_MB
+
+
+def analyze_memory_usage(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Analyze the memory usage of a trace dataframe.
+
+    Args:
+        df (pd.DataFrame): The input dataframe to analyze.
+
+    Returns:
+        pd.DataFrame: A new dataframe with one row per column of ``df`` (the
+        index of ``df`` is not reported) and the following columns:
+
+        - "Memory (MB)": deep memory usage of the column, rounded to 2 decimals.
+        - "Count": number of non-null items in the column.
+        - "DType": dtype of the column.
+        - "Memory Per Item (B)": whole bytes per non-null item, or 0 when the
+          column has no non-null items.
+    """
+    # index=False: df.count() and df.dtypes only have entries for the columns,
+    # so the memory of the index is left out to keep all three aligned.
+    memory_bytes = df.memory_usage(deep=True, index=False)
+    counts = df.count().astype(int)
+
+    # A column without non-null items (empty dataframe, all-NaN column) has a
+    # count of 0. Dividing by it yields inf/NaN, which cannot be cast to int,
+    # so divide by 1 instead and report 0 bytes per item for such columns.
+    has_items = counts > 0
+    safe_counts = counts.where(has_items, 1)
+    memory_per_item = (memory_bytes // safe_counts).where(has_items, 0)
+
+    _df = pd.DataFrame(
+        {
+            "Memory (MB)": memory_bytes / _BYTES_PER_MB,
+            "Count": counts,
+            "DType": df.dtypes,
+            "Memory Per Item (B)": memory_per_item.astype(int),
+        }
+    )
+
+    return _df.round(2)
diff --git a/tests/test_memory_utils.py b/tests/test_memory_utils.py
new file mode 100644
index 00000000..afb80622
--- /dev/null
+++ b/tests/test_memory_utils.py
@@ -0,0 +1,39 @@
+import unittest
+
+import pandas as pd
+
+from hta.utils.memory_utils import analyze_memory_usage
+
+
+class TestMemoryUtility(unittest.TestCase):
+    def test_analyze_memory_usage(self):
+        n = 1024 * 1024
+        data = {
+            "A": [1] * n,
+            "B": [2.3] * n,
+            "C": ["7"] * n,
+        }
+        df = pd.DataFrame(data)
+
+        result = analyze_memory_usage(df)
+        self.assertIsInstance(result, pd.DataFrame)
+        self.assertEqual(
+            set(result.columns),
+            {"Memory (MB)", "Count", "DType", "Memory Per Item (B)"},
+        )
+        self.assertListEqual(result["Memory (MB)"].tolist(), [8.0, 8.0, 58.0])
+        self.assertListEqual(result["Count"].tolist(), [n, n, n])
+        self.assertListEqual(result["DType"].tolist(), ["int64", "float64", "object"])
+        self.assertListEqual(result["Memory Per Item (B)"].tolist(), [8, 8, 58])
+
+    def test_analyze_memory_usage_without_valid_items(self):
+        # Columns without non-null items must not break the per-item division.
+        df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [float("nan")] * 4})
+
+        result = analyze_memory_usage(df)
+        self.assertListEqual(result["Count"].tolist(), [4, 0])
+        self.assertListEqual(result["Memory Per Item (B)"].tolist(), [8, 0])
+
+        empty_result = analyze_memory_usage(df.iloc[0:0])
+        self.assertListEqual(empty_result["Count"].tolist(), [0, 0])
+        self.assertListEqual(empty_result["Memory Per Item (B)"].tolist(), [0, 0])