From f17b96cba715ef71b71510b51556ce6685488edf Mon Sep 17 00:00:00 2001 From: stevenae Date: Wed, 21 May 2025 10:46:37 -0400 Subject: [PATCH 1/3] Update indexing.rst Add pd_lookup_het() and pd_lookup_hom() --- doc/source/user_guide/indexing.rst | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index ed5c7806b2e23..e8e6ac0dd3ce9 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -1461,16 +1461,32 @@ Looking up values by index/column labels Sometimes you want to extract a set of values given a sequence of row labels and column labels, this can be achieved by ``pandas.factorize`` and NumPy indexing. -For instance: + +For heterogeneous column types, we subset columns to avoid unnecessary numpy conversions: + +.. ipython:: python + + def pd_lookup_het(df, row_labels, col_labels): + rows = df.index.get_indexer(row_labels) + cols = df.columns.get_indexer(col_labels) + sub = df.take(np.unique(cols), axis=1) + sub = sub.take(np.unique(rows), axis=0) + rows = sub.index.get_indexer(row_labels) + values = sub.melt()["value"] + cols = sub.columns.get_indexer(col_labels) + flat_index = rows + cols * len(sub) + result = values[flat_index] + return result + +For homogeneous column types, it is fastest to skip column subsetting and go directly to numpy: .. ipython:: python - df = pd.DataFrame({'col': ["A", "A", "B", "B"], - 'A': [80, 23, np.nan, 22], - 'B': [80, 55, 76, 67]}) - df - idx, cols = pd.factorize(df['col']) - df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx] + def pd_lookup_hom(df, row_labels, col_labels): + rows = df.index.get_indexer(row_labels) + cols = df.columns.get_indexer(col_labels) + result = df.to_numpy()[rows, cols] + return result Formerly this could be achieved with the dedicated ``DataFrame.lookup`` method which was deprecated in version 1.2.0 and removed in version 2.0.0. 
From 12b65725ebea2b37b582f84d2a2598c18170ef49 Mon Sep 17 00:00:00 2001 From: stevenae Date: Wed, 21 May 2025 10:59:35 -0400 Subject: [PATCH 2/3] Update indexing.rst --- doc/source/user_guide/indexing.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index e8e6ac0dd3ce9..587f80ecc5c4d 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -1464,7 +1464,7 @@ and column labels, this can be achieved by ``pandas.factorize`` and NumPy index For heterogeneous column types, we subset columns to avoid unnecessary numpy conversions: -.. ipython:: python +.. code-block:: python def pd_lookup_het(df, row_labels, col_labels): rows = df.index.get_indexer(row_labels) @@ -1480,7 +1480,7 @@ For heterogeneous column types, we subset columns to avoid unnecessary numpy con For homogeneous column types, it is fastest to skip column subsetting and go directly to numpy: -.. ipython:: python +.. code-block:: python def pd_lookup_hom(df, row_labels, col_labels): rows = df.index.get_indexer(row_labels) From 7292c170b6951a7793563966ada8547f714d9d4a Mon Sep 17 00:00:00 2001 From: stevenae Date: Fri, 23 May 2025 10:47:41 -0400 Subject: [PATCH 3/3] address https://github.com/pandas-dev/pandas/pull/61471#pullrequestreview-2859035990 --- doc/source/user_guide/indexing.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 587f80ecc5c4d..270030a06a41b 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -1462,7 +1462,7 @@ Looking up values by index/column labels Sometimes you want to extract a set of values given a sequence of row labels and column labels, this can be achieved by ``pandas.factorize`` and NumPy indexing. 
-For heterogeneous column types, we subset columns to avoid unnecessary numpy conversions: +For heterogeneous column types, we subset columns to avoid unnecessary NumPy conversions: .. code-block:: python @@ -1478,12 +1478,13 @@ For heterogeneous column types, we subset columns to avoid unnecessary numpy con result = values[flat_index] return result -For homogeneous column types, it is fastest to skip column subsetting and go directly to numpy: +For homogeneous column types, it is fastest to select only the requested columns and then index the NumPy array directly: .. code-block:: python def pd_lookup_hom(df, row_labels, col_labels): rows = df.index.get_indexer(row_labels) + df = df.loc[:, sorted(set(col_labels))] cols = df.columns.get_indexer(col_labels) result = df.to_numpy()[rows, cols] return result