Skip to content

SNOW-1897441: Fix missing row position sort in DataFrame.describe #2950

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 28, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13558,8 +13558,17 @@ def describe(
axis=0,
)
# Compute top (the mode of each column) + freq (the number of times this mode appears).
top_freq_identifiers = padded_qc._modin_frame.ordered_dataframe.generate_snowflake_quoted_identifiers(
pandas_labels=["top", "freq"]
# Also create a new identifier to track min(__row_position__) of each group to ensure stability
(
top_identifier,
freq_identifier,
min_row_position_identifier,
) = padded_qc._modin_frame.ordered_dataframe.generate_snowflake_quoted_identifiers(
pandas_labels=["top", "freq", "min_row_position"]
)
top_freq_identifiers = (top_identifier, freq_identifier)
row_position_identifier = (
padded_qc._modin_frame.ordered_dataframe.row_position_snowflake_quoted_identifier
)
# To accommodate multi-level columns in the source frame, we generate a new index column
# in the top/freq frame for each level. We transpose this frame later, so the columns
Expand All @@ -13573,7 +13582,7 @@ def describe(
)

def count_freqs(
col_labels: Union[str, tuple[str, ...]], col_ident: str
col_labels: Union[str, tuple[str, ...]], col_identifier: str
) -> OrderedDataFrame:
"""
Helper function to compute the mode ("top") and frequency with which the mode
Expand All @@ -13595,7 +13604,6 @@ def count_freqs(
QC.value_counts(dropna=False) would correctly report NULL as the `top` item, but
reports `freq` as the number of times NULL appears, which we do not want.
"""
top_ident, freq_ident = top_freq_identifiers
col_labels_tuple = (
col_labels if is_list_like(col_labels) else (col_labels,)
)
Expand All @@ -13608,7 +13616,7 @@ def count_freqs(
# IFF(a IS NULL, NULL, COUNT(a)) AS freq
# FROM df
# GROUP BY a
# ORDER BY freq DESC NULLS LAST
# ORDER BY freq DESC, MIN(__row_position__) ASC NULLS LAST
# LIMIT 1
#
# The resulting 1-row frame for column "a": [1, 1, 2] will have the form
Expand Down Expand Up @@ -13649,30 +13657,38 @@ def count_freqs(
# +------+---+
return (
padded_qc._modin_frame.ordered_dataframe.group_by(
[col_ident],
[col_identifier],
[
iff(
col(col_ident).is_null(),
col(col_identifier).is_null(),
pandas_lit(None),
count(col(col_ident)),
).as_(freq_ident),
count(col(col_identifier)),
).as_(freq_identifier),
min_(row_position_identifier).as_(
min_row_position_identifier
),
],
)
.sort(OrderingColumn(freq_ident, ascending=False, na_last=True))
.sort(
OrderingColumn(freq_identifier, ascending=False, na_last=True),
OrderingColumn(min_row_position_identifier, ascending=True),
)
.limit(1)
.select(
*(
# If the original frame had multi-level columns, we must create
# a multi-level index to transpose this frame later.
[
pandas_lit(col_label).as_(index_ident)
for col_label, index_ident in zip(
pandas_lit(col_label).as_(index_identifier)
for col_label, index_identifier in zip(
col_labels_tuple, new_index_identifiers
)
]
+ [
col(col_ident).cast(VariantType()).as_(top_ident),
freq_ident,
col(col_identifier)
.cast(VariantType())
.as_(top_identifier),
freq_identifier,
]
)
)
Expand Down
Loading