Skip to content

Commit 413d41e

Browse files
perf: fix pandas PerformanceWarning caused due to frame.insert
1 parent 981ace8 commit 413d41e

File tree

1 file changed

+37
-24
lines changed
  • packages/python/plotly/plotly/express

1 file changed

+37
-24
lines changed

packages/python/plotly/plotly/express/_core.py

+37-24
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,6 @@ def make_trace_kwargs(args, trace_spec, trace_data, mapping_labels, sizeref):
321321
and args["y"]
322322
and len(trace_data[[args["x"], args["y"]]].dropna()) > 1
323323
):
324-
325324
# sorting is bad but trace_specs with "trendline" have no other attrs
326325
sorted_trace_data = trace_data.sort_values(by=args["x"])
327326
y = sorted_trace_data[args["y"]].values
@@ -562,7 +561,6 @@ def set_cartesian_axis_opts(args, axis, letter, orders):
562561

563562

564563
def configure_cartesian_marginal_axes(args, fig, orders):
565-
566564
if "histogram" in [args["marginal_x"], args["marginal_y"]]:
567565
fig.layout["barmode"] = "overlay"
568566

@@ -885,8 +883,8 @@ def make_trace_spec(args, constructor, attrs, trace_patch):
885883
def make_trendline_spec(args, constructor):
886884
trace_spec = TraceSpec(
887885
constructor=go.Scattergl
888-
if constructor == go.Scattergl # could be contour
889-
else go.Scatter,
886+
if constructor == go.Scattergl
887+
else go.Scatter, # could be contour
890888
attrs=["trendline"],
891889
trace_patch=dict(mode="lines"),
892890
marginal=None,
@@ -1064,14 +1062,25 @@ def _escape_col_name(df_input, col_name, extra):
10641062
return col_name
10651063

10661064

1067-
def to_unindexed_series(x):
1065+
def to_unindexed_series(x, name=None):
10681066
"""
1069-
assuming x is list-like or even an existing pd.Series, return a new pd.Series with
1070-
no index, without extracting the data from an existing Series via numpy, which
1067+
assuming x is list-like or even an existing pd.Series, return a new pd.DataFrame
1068+
with no index, without extracting the data from an existing Series via numpy, which
10711069
seems to mangle datetime columns. Stripping the index from existing pd.Series is
1072-
required to get things to match up right in the new DataFrame we're building
1070+
required to get things to match up right in the new DataFrame we're building.
1071+
It's converted to a frame so that it can be concatenated easily and it has a
1072+
`columns` attribute, so `_get_cols` can be used.
10731073
"""
1074-
return pd.Series(x).reset_index(drop=True)
1074+
return pd.Series(x, name=name).reset_index(drop=True).to_frame()
1075+
1076+
1077+
def _get_cols(df_list):
1078+
"""
1079+
get all the columns in the current df_list.
1080+
Since this function is only called while building an error message, it runs at most once.
1081+
So inefficiency here can be tolerated.
1082+
"""
1083+
return [column for df in df_list for column in df.columns]
10751084

10761085

10771086
def process_args_into_dataframe(args, wide_mode, var_name, value_name):
@@ -1086,9 +1095,11 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
10861095
df_input = args["data_frame"]
10871096
df_provided = df_input is not None
10881097

1089-
df_output = pd.DataFrame()
1090-
constants = dict()
1091-
ranges = list()
1098+
# we collect the pieces in a list (and concat once at the end) to avoid performance issues in pandas
1099+
# when dealing with large dataframes.
1100+
df_outputs = []
1101+
constants = {}
1102+
ranges = []
10921103
wide_id_vars = set()
10931104
reserved_names = _get_reserved_col_names(args) if df_provided else set()
10941105

@@ -1099,7 +1110,7 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
10991110
"No data were provided. Please provide data either with the `data_frame` or with the `dimensions` argument."
11001111
)
11011112
else:
1102-
df_output[df_input.columns] = df_input[df_input.columns]
1113+
df_outputs.append(df_input[df_input.columns])
11031114

11041115
# hover_data is a dict
11051116
hover_data_is_dict = (
@@ -1140,7 +1151,7 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
11401151
# argument_list and field_list ready, iterate over them
11411152
# Core of the loop starts here
11421153
for i, (argument, field) in enumerate(zip(argument_list, field_list)):
1143-
length = len(df_output)
1154+
length = len(df_outputs[0]) if len(df_outputs) else 0
11441155
if argument is None:
11451156
continue
11461157
col_name = None
@@ -1181,11 +1192,11 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
11811192
% (
11821193
argument,
11831194
len(real_argument),
1184-
str(list(df_output.columns)),
1195+
str(_get_cols(df_outputs)),
11851196
length,
11861197
)
11871198
)
1188-
df_output[col_name] = to_unindexed_series(real_argument)
1199+
df_outputs.append(to_unindexed_series(real_argument, col_name))
11891200
elif not df_provided:
11901201
raise ValueError(
11911202
"String or int arguments are only possible when a "
@@ -1214,13 +1225,13 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
12141225
% (
12151226
field,
12161227
len(df_input[argument]),
1217-
str(list(df_output.columns)),
1228+
str(_get_cols(df_outputs)),
12181229
length,
12191230
)
12201231
)
12211232
else:
12221233
col_name = str(argument)
1223-
df_output[col_name] = to_unindexed_series(df_input[argument])
1234+
df_outputs.append(to_unindexed_series(df_input[argument], col_name))
12241235
# ----------------- argument is likely a column / array / list.... -------
12251236
else:
12261237
if df_provided and hasattr(argument, "name"):
@@ -1247,9 +1258,9 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
12471258
"All arguments should have the same length. "
12481259
"The length of argument `%s` is %d, whereas the "
12491260
"length of previously-processed arguments %s is %d"
1250-
% (field, len(argument), str(list(df_output.columns)), length)
1261+
% (field, len(argument), str(_get_cols(df_outputs)), length)
12511262
)
1252-
df_output[str(col_name)] = to_unindexed_series(argument)
1263+
df_outputs.append(to_unindexed_series(argument, str(col_name)))
12531264

12541265
# Finally, update argument with column name now that column exists
12551266
assert col_name is not None, (
@@ -1267,12 +1278,14 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
12671278
if field_name != "wide_variable":
12681279
wide_id_vars.add(str(col_name))
12691280

1270-
for col_name in ranges:
1271-
df_output[col_name] = range(len(df_output))
1281+
length = len(df_outputs[0])
1282+
df_outputs.extend([pd.Series(range(length), name=col_name) for col_name in ranges])
12721283

1273-
for col_name in constants:
1274-
df_output[col_name] = constants[col_name]
1284+
df_outputs.extend(
1285+
[pd.Series(constants[col_name], name=col_name) for col_name in constants]
1286+
)
12751287

1288+
df_output = pd.concat(df_outputs, axis=1)
12761289
return df_output, wide_id_vars
12771290

12781291

0 commit comments

Comments
 (0)