diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 31ae5182ee9e..c4022e7fdd9a 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -789,6 +789,10 @@ def _data_from_pandas( if len(data.shape) != 2 or data.shape[0] < 1: raise ValueError('Input data must be 2 dimensional and non empty.') + # take shallow copy in case we modify categorical columns + # whole column modifications don't change the original df + data = data.copy(deep=False) + # determine feature names if feature_name == 'auto': feature_name = [str(col) for col in data.columns] @@ -805,7 +809,6 @@ def _data_from_pandas( if list(data[col].cat.categories) != list(category): data[col] = data[col].cat.set_categories(category) if len(cat_cols): # cat_cols is list - data = data.copy(deep=False) # not alter origin DataFrame data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan}) if categorical_feature == 'auto': # use cat cols from DataFrame categorical_feature = cat_cols_not_ordered diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 2f6b07e7a77f..b8ef43e41397 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -822,21 +822,34 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name): @pytest.mark.parametrize('feature_name', [['x1'], [42], 'auto']) -def test_categorical_code_conversion_doesnt_modify_original_data(feature_name): +@pytest.mark.parametrize('categories', ['seen', 'unseen']) +def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, categories): pd = pytest.importorskip('pandas') X = np.random.choice(['a', 'b'], 100).reshape(-1, 1) column_name = 'a' if feature_name == 'auto' else feature_name[0] df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category') + if categories == 'seen': + pandas_categorical = [['a', 'b']] + else: + pandas_categorical = [['a']] data = lgb.basic._data_from_pandas( data=df, feature_name=feature_name, categorical_feature="auto", - pandas_categorical=None + pandas_categorical=pandas_categorical, )[0] # check that the original data wasn't modified np.testing.assert_equal(df[column_name], X[:, 0]) # check that the built data has the codes - np.testing.assert_equal(df[column_name].cat.codes, data[:, 0]) + if categories == 'seen': + # if all categories were seen during training we just take the codes + codes = df[column_name].cat.codes + else: + # if we only saw 'a' during training we just replace its code + # and leave the rest as nan + a_code = df[column_name].cat.categories.get_loc('a') + codes = np.where(df[column_name] == 'a', a_code, np.nan) + np.testing.assert_equal(codes, data[:, 0]) @pytest.mark.parametrize('min_data_in_bin', [2, 10])