diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 22c66719f7..c5ae9813c6 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -3485,16 +3485,22 @@ def join( *, on: Optional[str] = None, how: str = "left", + lsuffix: str = "", + rsuffix: str = "", ) -> DataFrame: if isinstance(other, bigframes.series.Series): other = other.to_frame() left, right = self, other - if not left.columns.intersection(right.columns).empty: - raise NotImplementedError( - f"Deduping column names is not implemented. {constants.FEEDBACK_LINK}" - ) + col_intersection = left.columns.intersection(right.columns) + + if not col_intersection.empty: + if lsuffix == rsuffix == "": + raise ValueError( + f"columns overlap but no suffix specified: {col_intersection}" + ) + if how == "cross": if on is not None: raise ValueError("'on' is not supported for cross join.") @@ -3502,7 +3508,7 @@ def join( right._block, left_join_ids=[], right_join_ids=[], - suffixes=("", ""), + suffixes=(lsuffix, rsuffix), how="cross", sort=True, ) @@ -3510,45 +3516,107 @@ def join( # Join left columns with right index if on is not None: + if left._has_index and (on in left.index.names): + if on in left.columns: + raise ValueError( + f"'{on}' is both an index level and a column label, which is ambiguous." + ) + else: + raise NotImplementedError( + f"Joining on index level '{on}' is not yet supported. {constants.FEEDBACK_LINK}" + ) + if (left.columns == on).sum() > 1: + raise ValueError(f"The column label '{on}' is not unique.") + if other._block.index.nlevels != 1: raise ValueError( "Join on columns must match the index level of the other DataFrame. Join on column with multi-index haven't been supported." ) - # Switch left index with on column - left_columns = left.columns - left_idx_original_names = left.index.names if left._has_index else () - left_idx_names_in_cols = [ - f"bigframes_left_idx_name_{i}" - for i in range(len(left_idx_original_names)) - ] - if left._has_index: - left.index.names = left_idx_names_in_cols - left = left.reset_index(drop=False) - left = left.set_index(on) - - # Join on index and switch back - combined_df = left._perform_join_by_index(right, how=how) - combined_df.index.name = on - combined_df = combined_df.reset_index(drop=False) - combined_df = combined_df.set_index(left_idx_names_in_cols) - - # To be consistent with Pandas - if combined_df._has_index: - combined_df.index.names = ( - left_idx_original_names - if how in ("inner", "left") - else ([None] * len(combined_df.index.names)) - ) - # Reorder columns - combined_df = combined_df[list(left_columns) + list(right.columns)] - return combined_df + return self._join_on_key( + other, + on=on, + how=how, + lsuffix=lsuffix, + rsuffix=rsuffix, + should_duplicate_on_key=(on in col_intersection), + ) # Join left index with right index if left._block.index.nlevels != right._block.index.nlevels: raise ValueError("Index to join on must have the same number of levels.") - return left._perform_join_by_index(right, how=how) + return left._perform_join_by_index(right, how=how)._add_join_suffix( + left.columns, right.columns, lsuffix=lsuffix, rsuffix=rsuffix + ) + + def _join_on_key( + self, + other: DataFrame, + on: str, + how: str, + lsuffix: str, + rsuffix: str, + should_duplicate_on_key: bool, + ) -> DataFrame: + left, right = self, other + # Replace all columns names with unique names for reordering. + left_col_original_names = left.columns + on_col_name = "bigframes_left_col_on" + dup_on_col_name = "bigframes_left_col_on_dup" + left_col_temp_names = [ + f"bigframes_left_col_name_{i}" if col_name != on else on_col_name + for i, col_name in enumerate(left_col_original_names) + ] + left.columns = pandas.Index(left_col_temp_names) + # if on column is also in right df, we need to duplicate the column + # and set it to be the first column + if should_duplicate_on_key: + left[dup_on_col_name] = left[on_col_name] + on_col_name = dup_on_col_name + left_col_temp_names = [on_col_name] + left_col_temp_names + left = left[left_col_temp_names] + + # Switch left index with on column + left_idx_original_names = left.index.names if left._has_index else () + left_idx_names_in_cols = [ + f"bigframes_left_idx_name_{i}" for i in range(len(left_idx_original_names)) + ] + if left._has_index: + left.index.names = left_idx_names_in_cols + left = left.reset_index(drop=False) + left = left.set_index(on_col_name) + + right_col_original_names = right.columns + right_col_temp_names = [ + f"bigframes_right_col_name_{i}" + for i in range(len(right_col_original_names)) + ] + right.columns = pandas.Index(right_col_temp_names) + + # Join on index and switch back + combined_df = left._perform_join_by_index(right, how=how) + combined_df.index.name = on_col_name + combined_df = combined_df.reset_index(drop=False) + combined_df = combined_df.set_index(left_idx_names_in_cols) + + # To be consistent with Pandas + if combined_df._has_index: + combined_df.index.names = ( + left_idx_original_names + if how in ("inner", "left") + else ([None] * len(combined_df.index.names)) + ) + + # Reorder columns + combined_df = combined_df[left_col_temp_names + right_col_temp_names] + return combined_df._add_join_suffix( + left_col_original_names, + right_col_original_names, + lsuffix=lsuffix, + rsuffix=rsuffix, + extra_col=on if on_col_name == dup_on_col_name else None, + ) def _perform_join_by_index( self, @@ -3562,6 +3630,30 @@ def _perform_join_by_index( ) return DataFrame(block) + def _add_join_suffix( + self, + left_columns, + right_columns, + lsuffix: str = "", + rsuffix: str = "", + extra_col: typing.Optional[str] = None, + ): + col_intersection = left_columns.intersection(right_columns) + final_col_names = [] if extra_col is None else [extra_col] + for col_name in left_columns: + if col_name in col_intersection: + final_col_names.append(f"{col_name}{lsuffix}") + else: + final_col_names.append(col_name) + + for col_name in right_columns: + if col_name in col_intersection: + final_col_names.append(f"{col_name}{rsuffix}") + else: + final_col_names.append(col_name) + self.columns = pandas.Index(final_col_names) + return self + @validations.requires_ordering() def rolling( self, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index caf39bd9e9..1c92565ab6 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2899,12 +2899,99 @@ def test_join_different_table( assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) -def test_join_duplicate_columns_raises_not_implemented(scalars_dfs): - scalars_df, _ = scalars_dfs - df_a = scalars_df[["string_col", "float64_col"]] - df_b = scalars_df[["float64_col"]] - with pytest.raises(NotImplementedError): - df_a.join(df_b, how="outer").to_pandas() +@all_joins +def test_join_different_table_with_duplicate_column_name( + scalars_df_index, scalars_pandas_df_index, how +): + bf_df_a = scalars_df_index[["string_col", "int64_col", "int64_too"]].rename( + columns={"int64_too": "int64_col"} + ) + bf_df_b = scalars_df_index.dropna()[ + ["string_col", "int64_col", "int64_too"] + ].rename(columns={"int64_too": "int64_col"}) + bf_result = bf_df_a.join(bf_df_b, how=how, lsuffix="_l", rsuffix="_r").to_pandas() + pd_df_a = scalars_pandas_df_index[["string_col", "int64_col", "int64_too"]].rename( + columns={"int64_too": "int64_col"} + ) + pd_df_b = scalars_pandas_df_index.dropna()[ + ["string_col", "int64_col", "int64_too"] + ].rename(columns={"int64_too": "int64_col"}) + pd_result = pd_df_a.join(pd_df_b, how=how, lsuffix="_l", rsuffix="_r") + + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +@all_joins +def test_join_param_on_with_duplicate_column_name_not_on_col( + scalars_df_index, scalars_pandas_df_index, how +): + # This test is for duplicate column names, but the 'on' column is not duplicated. + if how == "cross": + return + bf_df_a = scalars_df_index[ + ["string_col", "datetime_col", "timestamp_col", "int64_too"] + ].rename(columns={"timestamp_col": "datetime_col"}) + bf_df_b = scalars_df_index.dropna()[ + ["string_col", "datetime_col", "timestamp_col"] + ].rename(columns={"timestamp_col": "datetime_col"}) + bf_result = bf_df_a.join( + bf_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" + ).to_pandas() + pd_df_a = scalars_pandas_df_index[ + ["string_col", "datetime_col", "timestamp_col", "int64_too"] + ].rename(columns={"timestamp_col": "datetime_col"}) + pd_df_b = scalars_pandas_df_index.dropna()[ + ["string_col", "datetime_col", "timestamp_col"] + ].rename(columns={"timestamp_col": "datetime_col"}) + pd_result = pd_df_a.join( + pd_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" + ) + pd.testing.assert_frame_equal( + bf_result.sort_index(), + pd_result.sort_index(), + check_like=True, + check_index_type=False, + check_names=False, + ) + pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + + +@pytest.mark.skipif( + pandas.__version__.startswith("1."), reason="bad left join in pandas 1.x" +) +@all_joins +def test_join_param_on_with_duplicate_column_name_on_col( + scalars_df_index, scalars_pandas_df_index, how +): + # This test is for duplicate column names, and the 'on' column is duplicated. + if how == "cross": + return + bf_df_a = scalars_df_index[ + ["string_col", "datetime_col", "timestamp_col", "int64_too"] + ].rename(columns={"timestamp_col": "datetime_col"}) + bf_df_b = scalars_df_index.dropna()[ + ["string_col", "datetime_col", "timestamp_col", "int64_too"] + ].rename(columns={"timestamp_col": "datetime_col"}) + bf_result = bf_df_a.join( + bf_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" + ).to_pandas() + pd_df_a = scalars_pandas_df_index[ + ["string_col", "datetime_col", "timestamp_col", "int64_too"] + ].rename(columns={"timestamp_col": "datetime_col"}) + pd_df_b = scalars_pandas_df_index.dropna()[ + ["string_col", "datetime_col", "timestamp_col", "int64_too"] + ].rename(columns={"timestamp_col": "datetime_col"}) + pd_result = pd_df_a.join( + pd_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" + ) + pd.testing.assert_frame_equal( + bf_result.sort_index(), + pd_result.sort_index(), + check_like=True, + check_index_type=False, + check_names=False, + ) + pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) @all_joins diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py index eae800d409..f52188d34f 100644 --- a/tests/unit/test_dataframe_polars.py +++ b/tests/unit/test_dataframe_polars.py @@ -2444,12 +2444,40 @@ def test_join_different_table( assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) -def test_join_duplicate_columns_raises_not_implemented(scalars_dfs): +@all_joins +def test_join_raise_when_param_on_duplicate_with_column(scalars_df_index, how): + if how == "cross": + return + bf_df_a = scalars_df_index[["string_col", "int64_col"]].rename( + columns={"int64_col": "string_col"} + ) + bf_df_b = scalars_df_index.dropna()["string_col"] + with pytest.raises( + ValueError, match="The column label 'string_col' is not unique." + ): + bf_df_a.join(bf_df_b, on="string_col", how=how, lsuffix="_l", rsuffix="_r") + + +def test_join_duplicate_columns_raises_value_error(scalars_dfs): scalars_df, _ = scalars_dfs df_a = scalars_df[["string_col", "float64_col"]] df_b = scalars_df[["float64_col"]] - with pytest.raises(NotImplementedError): - df_a.join(df_b, how="outer").to_pandas() + with pytest.raises(ValueError, match="columns overlap but no suffix specified"): + df_a.join(df_b, how="outer") + + +@all_joins +def test_join_param_on_duplicate_with_index_raises_value_error(scalars_df_index, how): + if how == "cross": + return + bf_df_a = scalars_df_index[["string_col"]] + bf_df_a.index.name = "string_col" + bf_df_b = scalars_df_index.dropna()["string_col"] + with pytest.raises( + ValueError, + match="'string_col' is both an index level and a column label, which is ambiguous.", + ): + bf_df_a.join(bf_df_b, on="string_col", how=how, lsuffix="_l", rsuffix="_r") @all_joins @@ -2461,7 +2489,7 @@ def test_join_param_on(scalars_dfs, how): bf_df_b = bf_df[["float64_col"]] if how == "cross": - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="'on' is not supported for cross join."): bf_df_a.join(bf_df_b, on="rowindex_2", how=how) else: bf_result = bf_df_a.join(bf_df_b, on="rowindex_2", how=how).to_pandas() diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 731e9a24eb..1f79c428c1 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4574,7 +4574,15 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: # ---------------------------------------------------------------------- # Merging / joining methods - def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame: + def join( + self, + other, + *, + on: Optional[str] = None, + how: str, + lsuffix: str = "", + rsuffix: str = "", + ) -> DataFrame: """Join columns of another DataFrame. Join columns with `other` DataFrame on index @@ -4647,6 +4655,19 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame: [2 rows x 4 columns] + If there are overlapping columns, `lsuffix` and `rsuffix` can be used: + + >>> df1 = bpd.DataFrame({'key': ['K0', 'K1', 'K2'], 'A': ['A0', 'A1', 'A2']}) + >>> df2 = bpd.DataFrame({'key': ['K0', 'K1', 'K2'], 'A': ['B0', 'B1', 'B2']}) + >>> df1.set_index('key').join(df2.set_index('key'), lsuffix='_left', rsuffix='_right') + A_left A_right + key + K0 A0 B0 + K1 A1 B1 + K2 A2 B2 + + [3 rows x 2 columns] + Args: other: DataFrame or Series with an Index similar to the Index of this one. @@ -4663,6 +4684,10 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame: index, preserving the order of the calling's one. ``cross``: creates the cartesian product from both frames, preserves the order of the left keys. + lsuffix(str, default ''): + Suffix to use from left frame's overlapping columns. + rsuffix(str, default ''): + Suffix to use from right frame's overlapping columns. Returns: bigframes.pandas.DataFrame: @@ -4677,6 +4702,10 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame: ValueError: If left index to join on does not have the same number of levels as the right index. + ValueError: + If columns overlap but no suffix is specified. + ValueError: + If `on` column is not unique. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)