@@ -321,7 +321,6 @@ def make_trace_kwargs(args, trace_spec, trace_data, mapping_labels, sizeref):
                 and args["y"]
                 and len(trace_data[[args["x"], args["y"]]].dropna()) > 1
             ):
-
                 # sorting is bad but trace_specs with "trendline" have no other attrs
                 sorted_trace_data = trace_data.sort_values(by=args["x"])
                 y = sorted_trace_data[args["y"]].values
@@ -562,7 +561,6 @@ def set_cartesian_axis_opts(args, axis, letter, orders):
 
 
 def configure_cartesian_marginal_axes(args, fig, orders):
-
     if "histogram" in [args["marginal_x"], args["marginal_y"]]:
         fig.layout["barmode"] = "overlay"
 
@@ -885,8 +883,8 @@ def make_trace_spec(args, constructor, attrs, trace_patch):
 def make_trendline_spec(args, constructor):
     trace_spec = TraceSpec(
         constructor=go.Scattergl
-        if constructor == go.Scattergl  # could be contour
-        else go.Scatter,
+        if constructor == go.Scattergl
+        else go.Scatter,  # could be contour
         attrs=["trendline"],
         trace_patch=dict(mode="lines"),
         marginal=None,
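The relocated comment annotates the `else` branch: the trendline overlay keeps `go.Scattergl` only when the base trace is itself `Scattergl`; any other base constructor (the comment notes it could be a contour) falls back to a plain `go.Scatter`. A minimal sketch of the equivalent selection logic, using a hypothetical helper name:

import plotly.graph_objects as go

def _pick_trendline_constructor(base_constructor):
    # Hypothetical helper mirroring the ternary in make_trendline_spec:
    # keep WebGL rendering only when the base trace is already Scattergl;
    # any other base constructor gets a plain go.Scatter trendline.
    return go.Scattergl if base_constructor == go.Scattergl else go.Scatter

assert _pick_trendline_constructor(go.Scattergl) is go.Scattergl
assert _pick_trendline_constructor(go.Contour) is go.Scatter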
@@ -1064,14 +1062,25 @@ def _escape_col_name(df_input, col_name, extra):
     return col_name
 
 
-def to_unindexed_series(x):
+def to_unindexed_series(x, name=None):
     """
-    assuming x is list-like or even an existing pd.Series, return a new pd.Series with
-    no index, without extracting the data from an existing Series via numpy, which
+    assuming x is list-like or even an existing pd.Series, return a new pd.DataFrame
+    with no index, without extracting the data from an existing Series via numpy, which
     seems to mangle datetime columns. Stripping the index from existing pd.Series is
-    required to get things to match up right in the new DataFrame we're building
+    required to get things to match up right in the new DataFrame we're building.
+    It's converted to a frame so that it can be concatenated easily, and it exposes a
+    `columns` attribute, so `_get_cols` can be used on it.
     """
-    return pd.Series(x).reset_index(drop=True)
+    return pd.Series(x, name=name).reset_index(drop=True).to_frame()
+
+
+def _get_cols(df_list):
+    """
+    Get all the columns across the frames currently in df_list.
+    Since this function is only called when we are about to raise an error,
+    it runs at most once, so the inefficiency here can be tolerated.
+    """
+    return [column for df in df_list for column in df.columns]
 
 
 def process_args_into_dataframe(args, wide_mode, var_name, value_name):
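As a rough illustration of what the reworked helper now returns (a sketch against current pandas behavior, not part of the diff): the original index is dropped, the optional `name` becomes the column label, and the result is a one-column frame, which is what lets `pd.concat` and `_get_cols` operate on it downstream.

import pandas as pd

s = pd.Series([10, 20, 30], index=[7, 8, 9], name="ignored")
out = to_unindexed_series(s, name="x")   # helper defined in the diff above

print(out.index.tolist())      # [0, 1, 2] -- original index stripped
print(list(out.columns))       # ['x']     -- name becomes the column label
print(_get_cols([out, out]))   # ['x', 'x'] -- columns gathered across frames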
@@ -1086,9 +1095,11 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
     df_input = args["data_frame"]
     df_provided = df_input is not None
 
-    df_output = pd.DataFrame()
-    constants = dict()
-    ranges = list()
+    # we collect the columns in a list and concatenate them at the end, to avoid
+    # performance issues in pandas when dealing with large dataframes
+    df_outputs = []
+    constants = {}
+    ranges = []
     wide_id_vars = set()
     reserved_names = _get_reserved_col_names(args) if df_provided else set()
 
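The new comment refers to the fact that growing a DataFrame one column at a time re-inserts into an existing frame on every assignment, which recent pandas versions flag with a PerformanceWarning about fragmented frames once the column count gets large; collecting the pieces in a plain Python list and concatenating once sidesteps that. A rough sketch of the two patterns with made-up column data:

import numpy as np
import pandas as pd

cols = {f"c{i}": np.arange(1_000) for i in range(300)}

# Pattern the diff moves away from: one insertion per column.
slow = pd.DataFrame()
for name, values in cols.items():
    slow[name] = values   # each new column re-touches the existing frame

# Pattern the diff moves toward: collect the pieces, concatenate once.
pieces = [pd.Series(values, name=name).to_frame() for name, values in cols.items()]
fast = pd.concat(pieces, axis=1)

assert slow.equals(fast)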
@@ -1099,7 +1110,7 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
                 "No data were provided. Please provide data either with the `data_frame` or with the `dimensions` argument."
             )
         else:
-            df_output[df_input.columns] = df_input[df_input.columns]
+            df_outputs.append(df_input[df_input.columns])
 
     # hover_data is a dict
     hover_data_is_dict = (
@@ -1140,7 +1151,7 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
     # argument_list and field_list ready, iterate over them
     # Core of the loop starts here
     for i, (argument, field) in enumerate(zip(argument_list, field_list)):
-        length = len(df_output)
+        length = len(df_outputs[0]) if len(df_outputs) else 0
         if argument is None:
             continue
         col_name = None
@@ -1181,11 +1192,11 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
                         % (
                             argument,
                             len(real_argument),
-                            str(list(df_output.columns)),
+                            str(_get_cols(df_outputs)),
                             length,
                         )
                     )
-                df_output[col_name] = to_unindexed_series(real_argument)
+                df_outputs.append(to_unindexed_series(real_argument, col_name))
             elif not df_provided:
                 raise ValueError(
                     "String or int arguments are only possible when a "
@@ -1214,13 +1225,13 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
                         % (
                             field,
                             len(df_input[argument]),
-                            str(list(df_output.columns)),
+                            str(_get_cols(df_outputs)),
                             length,
                         )
                     )
                 else:
                     col_name = str(argument)
-                    df_output[col_name] = to_unindexed_series(df_input[argument])
+                    df_outputs.append(to_unindexed_series(df_input[argument], col_name))
         # ----------------- argument is likely a column / array / list.... -------
         else:
             if df_provided and hasattr(argument, "name"):
@@ -1247,9 +1258,9 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
                     "All arguments should have the same length. "
                     "The length of argument `%s` is %d, whereas the "
                     "length of previously-processed arguments %s is %d"
-                    % (field, len(argument), str(list(df_output.columns)), length)
+                    % (field, len(argument), str(_get_cols(df_outputs)), length)
                 )
-            df_output[str(col_name)] = to_unindexed_series(argument)
+            df_outputs.append(to_unindexed_series(argument, str(col_name)))
 
         # Finally, update argument with column name now that column exists
         assert col_name is not None, (
@@ -1267,12 +1278,14 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name):
         if field_name != "wide_variable":
             wide_id_vars.add(str(col_name))
 
-    for col_name in ranges:
-        df_output[col_name] = range(len(df_output))
+    length = len(df_outputs[0])
+    df_outputs.extend([pd.Series(range(length), name=col_name) for col_name in ranges])
 
-    for col_name in constants:
-        df_output[col_name] = constants[col_name]
+    df_outputs.extend(
+        [pd.Series(constants[col_name], name=col_name) for col_name in constants]
+    )
 
+    df_output = pd.concat(df_outputs, axis=1)
     return df_output, wide_id_vars
 
 
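Put together, the final assembly works roughly like the following sketch (hypothetical data, only the ranges path shown): previously collected one-column frames and generated range columns all end up as named pieces that a single axis=1 concat turns into the output frame.

import pandas as pd

# Hypothetical miniature of the assembly step: two data columns collected
# earlier as one-column frames, plus one generated range column.
df_outputs = [
    pd.Series([1.0, 2.0, 3.0], name="x").to_frame(),
    pd.Series([4.0, 5.0, 6.0], name="y").to_frame(),
]
ranges = ["index"]

length = len(df_outputs[0])
df_outputs.extend([pd.Series(range(length), name=col_name) for col_name in ranges])

df_output = pd.concat(df_outputs, axis=1)
print(df_output.columns.tolist())   # ['x', 'y', 'index']
print(len(df_output))               # 3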