diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index d908a68a8be4a..d2899e3838683 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -30,7 +30,7 @@ jobs: env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml, actions-313.yaml] # Prevent the include jobs from overriding other jobs pattern: [""] - pandas_future_infer_string: ["0"] + pandas_future_infer_string: ["1"] include: - name: "Downstream Compat" env_file: actions-311-downstream_compat.yaml @@ -45,6 +45,10 @@ jobs: env_file: actions-313-freethreading.yaml pattern: "not slow and not network and not single_cpu" platform: ubuntu-24.04 + - name: "Without PyArrow" + env_file: actions-312.yaml + pattern: "not slow and not network and not single_cpu" + platform: ubuntu-24.04 - name: "Locale: it_IT" env_file: actions-311.yaml pattern: "not slow and not network and not single_cpu" @@ -67,18 +71,9 @@ jobs: # It will be temporarily activated during tests with locale.setlocale extra_loc: "zh_CN" platform: ubuntu-24.04 - - name: "Future infer strings" + - name: "Past no infer strings" env_file: actions-312.yaml - pandas_future_infer_string: "1" - platform: ubuntu-24.04 - - name: "Future infer strings (without pyarrow)" - env_file: actions-311.yaml - pandas_future_infer_string: "1" - platform: ubuntu-24.04 - - name: "Pypy" - env_file: actions-pypy-39.yaml - pattern: "not slow and not network and not single_cpu" - test_args: "--max-worker-restart 0" + pandas_future_infer_string: "0" platform: ubuntu-24.04 - name: "Numpy Dev" env_file: actions-311-numpydev.yaml @@ -88,7 +83,6 @@ jobs: - name: "Pyarrow Nightly" env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" - pandas_future_infer_string: "1" platform: ubuntu-24.04 fail-fast: false name: ${{ matrix.name || format('{0} {1}', matrix.platform, matrix.env_file) }} @@ -103,7 +97,7 @@ jobs: PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} # Clipboard tests QT_QPA_PLATFORM: offscreen - REMOVE_PYARROW: ${{ matrix.name == 'Future infer strings (without pyarrow)' && '1' || '0' }} + REMOVE_PYARROW: ${{ matrix.name == 'Without PyArrow' && '1' || '0' }} concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_future_infer_string }}-${{ matrix.platform }} @@ -169,12 +163,9 @@ jobs: with: # xref https://github.com/cython/cython/issues/6870 werror: ${{ matrix.name != 'Freethreading' }} - # TODO: Re-enable once Pypy has Pypy 3.10 on conda-forge - if: ${{ matrix.name != 'Pypy' }} - name: Test (not single_cpu) uses: ./.github/actions/run-tests - if: ${{ matrix.name != 'Pypy' }} env: # Set pattern to not single_cpu if not already set PATTERN: ${{ env.PATTERN == '' && 'not single_cpu' || matrix.pattern }} diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 4de7aec4f551a..a38ec5ee359d9 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -101,7 +101,6 @@ jobs: - [macos-14, macosx_arm64] - [windows-2022, win_amd64] - [windows-11-arm, win_arm64] - # TODO: support PyPy? 
python: [["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"], ["cp313", "3.13"], ["cp313t", "3.13"]] include: # Build Pyodide wheels and upload them to Anaconda.org diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b5856810b749e..8174c5515af1f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ ci: skip: [pyright, mypy] repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.12 + rev: v0.12.2 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -47,7 +47,7 @@ repos: types_or: [python, rst, markdown, cython, c] additional_dependencies: [tomli] - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.16.6 + rev: v0.16.7 hooks: - id: cython-lint - id: double-quote-cython-strings @@ -95,14 +95,14 @@ repos: - id: sphinx-lint args: ["--enable", "all", "--disable", "line-too-long"] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v20.1.5 + rev: v20.1.7 hooks: - id: clang-format files: ^pandas/_libs/src|^pandas/_libs/include args: [-i] types_or: [c, c++] - repo: https://github.com/trim21/pre-commit-mirror-meson - rev: v1.8.1 + rev: v1.8.2 hooks: - id: meson-fmt args: ['--inplace'] diff --git a/README.md b/README.md index ebab2e6016850..895cfb69e5edd 100644 --- a/README.md +++ b/README.md @@ -175,7 +175,7 @@ All contributions, bug reports, bug fixes, documentation improvements, enhanceme A detailed overview on how to contribute can be found in the **[contributing guide](https://pandas.pydata.org/docs/dev/development/contributing.html)**. -If you are simply looking to start working with the pandas codebase, navigate to the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [good first issue](https://github.com/pandas-dev/pandas/issues?labels=good+first+issue&sort=updated&state=open) where you could start out. +If you are simply looking to start working with the pandas codebase, navigate to the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?q=is%3Aissue%20state%3Aopen%20label%3ADocs%20sort%3Aupdated-desc) and [good first issue](https://github.com/pandas-dev/pandas/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22%20sort%3Aupdated-desc) where you could start out. You can also triage issues which may include reproducing bug reports, or asking for vital information such as version numbers or reproduction instructions. If you would like to start triaging issues, one easy way to get started is to [subscribe to pandas on CodeTriage](https://www.codetriage.com/pandas-dev/pandas). diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index a0c4189c72d0e..a2f1db9ef6b87 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -36,7 +36,7 @@ from .pandas_vb_common import BaseIO # isort:skip -def test_parallel(num_threads=2, kwargs_list=None): +def run_parallel(num_threads=2, kwargs_list=None): """ Decorator to run the same function multiple times in parallel. 
@@ -95,7 +95,7 @@ def setup(self, threads, method): {"key": np.random.randint(0, ngroups, size=N), "data": np.random.randn(N)} ) - @test_parallel(num_threads=threads) + @run_parallel(num_threads=threads) def parallel(): getattr(df.groupby("key")["data"], method)() @@ -123,7 +123,7 @@ def setup(self, threads): ngroups = 10**3 data = Series(np.random.randint(0, ngroups, size=size)) - @test_parallel(num_threads=threads) + @run_parallel(num_threads=threads) def get_groups(): data.groupby(data).groups @@ -142,7 +142,7 @@ def setup(self, dtype): df = DataFrame({"col": np.arange(N, dtype=dtype)}) indexer = np.arange(100, len(df) - 100) - @test_parallel(num_threads=2) + @run_parallel(num_threads=2) def parallel_take1d(): take_nd(df["col"].values, indexer) @@ -163,7 +163,7 @@ def setup(self): k = 5 * 10**5 kwargs_list = [{"arr": np.random.randn(N)}, {"arr": np.random.randn(N)}] - @test_parallel(num_threads=2, kwargs_list=kwargs_list) + @run_parallel(num_threads=2, kwargs_list=kwargs_list) def parallel_kth_smallest(arr): algos.kth_smallest(arr, k) @@ -180,42 +180,42 @@ def setup(self): self.period = self.dti.to_period("D") def time_datetime_field_year(self): - @test_parallel(num_threads=2) + @run_parallel(num_threads=2) def run(dti): dti.year run(self.dti) def time_datetime_field_day(self): - @test_parallel(num_threads=2) + @run_parallel(num_threads=2) def run(dti): dti.day run(self.dti) def time_datetime_field_daysinmonth(self): - @test_parallel(num_threads=2) + @run_parallel(num_threads=2) def run(dti): dti.days_in_month run(self.dti) def time_datetime_field_normalize(self): - @test_parallel(num_threads=2) + @run_parallel(num_threads=2) def run(dti): dti.normalize() run(self.dti) def time_datetime_to_period(self): - @test_parallel(num_threads=2) + @run_parallel(num_threads=2) def run(dti): dti.to_period("s") run(self.dti) def time_period_to_datetime(self): - @test_parallel(num_threads=2) + @run_parallel(num_threads=2) def run(period): period.to_timestamp() @@ -232,7 +232,7 @@ def setup(self, method): if hasattr(DataFrame, "rolling"): df = DataFrame(arr).rolling(win) - @test_parallel(num_threads=2) + @run_parallel(num_threads=2) def parallel_rolling(): getattr(df, method)() @@ -249,7 +249,7 @@ def parallel_rolling(): "std": rolling_std, } - @test_parallel(num_threads=2) + @run_parallel(num_threads=2) def parallel_rolling(): rolling[method](arr, win) @@ -286,7 +286,7 @@ def setup(self, dtype): self.fname = f"__test_{dtype}__.csv" df.to_csv(self.fname) - @test_parallel(num_threads=2) + @run_parallel(num_threads=2) def parallel_read_csv(): read_csv(self.fname) @@ -305,7 +305,7 @@ class ParallelFactorize: def setup(self, threads): strings = Index([f"i-{i}" for i in range(100000)], dtype=object) - @test_parallel(num_threads=threads) + @run_parallel(num_threads=threads) def parallel(): factorize(strings) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 3a15f754ae523..9ee867260aa39 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -53,6 +53,25 @@ def time_frame(self, kind): self.df.to_csv(self.fname) +class ToCSVFloatFormatVariants(BaseIO): + fname = "__test__.csv" + + def setup(self): + self.df = DataFrame(np.random.default_rng(seed=42).random((1000, 1000))) + + def time_old_style_percent_format(self): + self.df.to_csv(self.fname, float_format="%.6f") + + def time_new_style_brace_format(self): + self.df.to_csv(self.fname, float_format="{:.6f}") + + def time_new_style_thousands_format(self): + self.df.to_csv(self.fname, 
float_format="{:,.2f}") + + def time_callable_format(self): + self.df.to_csv(self.fname, float_format=lambda x: f"{x:.6f}") + + class ToCSVMultiIndexUnusedLevels(BaseIO): fname = "__test__.csv" diff --git a/ci/code_checks.sh b/ci/code_checks.sh index a0d23aa0478d2..3a941deb2c68d 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -58,7 +58,9 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then MSG='Python and Cython Doctests' ; echo "$MSG" python -c 'import pandas as pd; pd.test(run_doctests=True)' - RET=$(($RET + $?)) ; echo "$MSG" "DONE" + # TEMP don't let doctests fail the build until all string dtype changes are fixed + # RET=$(($RET + $?)) ; echo "$MSG" "DONE" + echo "$MSG" "DONE" fi @@ -72,6 +74,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ -i "pandas.Period.freq GL08" \ -i "pandas.Period.ordinal GL08" \ + -i "pandas.errors.IncompatibleFrequency SA01,SS06,EX01" \ -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ -i "pandas.core.resample.Resampler.quantile PR01,PR07" \ diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml index 9f12fe941d488..ee2d083ffc56f 100644 --- a/ci/deps/actions-310-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -22,7 +22,7 @@ dependencies: # required dependencies - python-dateutil=2.8.2 - - numpy=1.23.5 + - numpy=1.26.0 # optional dependencies - beautifulsoup4=4.12.3 @@ -41,7 +41,7 @@ dependencies: - qtpy=2.3.0 - openpyxl=3.1.2 - psycopg2=2.9.6 - - pyarrow=10.0.1 + - pyarrow=12.0.1 - pyiceberg=0.7.1 - pymysql=1.1.0 - pyqt=5.15.9 @@ -62,4 +62,4 @@ dependencies: - pip: - adbc-driver-postgresql==0.10.0 - adbc-driver-sqlite==0.8.0 - - tzdata==2022.7 + - tzdata==2023.3 diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 66d49475bf34b..83386f07b631c 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -39,7 +39,7 @@ dependencies: - qtpy>=2.3.0 - openpyxl>=3.1.2 - psycopg2>=2.9.6 - - pyarrow>=10.0.1 + - pyarrow>=12.0.1 - pyiceberg>=0.7.1 - pymysql>=1.1.0 - pyqt>=5.15.9 @@ -60,4 +60,4 @@ dependencies: - pip: - adbc-driver-postgresql>=0.10.0 - adbc-driver-sqlite>=0.8.0 - - tzdata>=2022.7 + - tzdata>=2023.3 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 70e66a18daba9..f96e148c41e6d 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -40,7 +40,7 @@ dependencies: - qtpy>=2.3.0 - openpyxl>=3.1.2 - psycopg2>=2.9.6 - - pyarrow>=10.0.1 + - pyarrow>=12.0.1 - pyiceberg>=0.7.1 - pymysql>=1.1.0 - pyqt>=5.15.9 @@ -73,4 +73,4 @@ dependencies: - pip: - adbc-driver-postgresql>=0.10.0 - adbc-driver-sqlite>=0.8.0 - - tzdata>=2022.7 + - tzdata>=2023.3 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 99cbe0415b4f9..f8a84441ddb3b 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -24,4 +24,4 @@ dependencies: - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" - "--pre" - "numpy" - - "tzdata>=2022.7" + - "tzdata>=2023.3" diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index da0cecda0fb46..5c74c243f0f6c 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -22,7 +22,7 @@ 
dependencies: - pip - pip: - - "tzdata>=2022.7" + - "tzdata>=2023.3" - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" - "--prefer-binary" - "--pre" diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 9669c1e29a435..deb646a7ba86a 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -40,7 +40,7 @@ dependencies: - pyqt>=5.15.9 - openpyxl>=3.1.2 - psycopg2>=2.9.6 - - pyarrow>=10.0.1 + - pyarrow>=12.0.1 - pyiceberg>=0.7.1 - pymysql>=1.1.0 - pyreadstat>=1.2.6 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index 61f1d602bb241..5a24b0c1077d0 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -40,7 +40,7 @@ dependencies: - pyqt>=5.15.9 - openpyxl>=3.1.2 - psycopg2>=2.9.6 - - pyarrow>=10.0.1 + - pyarrow>=12.0.1 - pyiceberg>=0.7.1 - pymysql>=1.1.0 - pyreadstat>=1.2.6 @@ -60,4 +60,4 @@ dependencies: - pip: - adbc-driver-postgresql>=0.10.0 - adbc-driver-sqlite>=0.8.0 - - tzdata>=2022.7 + - tzdata>=2023.3 diff --git a/ci/deps/actions-313-freethreading.yaml b/ci/deps/actions-313-freethreading.yaml index 14e3ade976b01..e118080bc4c40 100644 --- a/ci/deps/actions-313-freethreading.yaml +++ b/ci/deps/actions-313-freethreading.yaml @@ -25,5 +25,5 @@ dependencies: - pip: # No free-threaded coveragepy (with the C-extension) on conda-forge yet - pytest-cov - - "tzdata>=2022.7" + - tzdata>=2023.3 - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" diff --git a/ci/deps/actions-313.yaml b/ci/deps/actions-313.yaml index 11f4428be27e5..ffca09b901852 100644 --- a/ci/deps/actions-313.yaml +++ b/ci/deps/actions-313.yaml @@ -41,7 +41,7 @@ dependencies: - pyqt>=5.15.9 - openpyxl>=3.1.2 - psycopg2>=2.9.6 - - pyarrow>=10.0.1 + - pyarrow>=12.0.1 - pymysql>=1.1.0 - pyreadstat>=1.2.6 - pytables>=3.8.0 @@ -60,4 +60,4 @@ dependencies: - pip: - adbc-driver-postgresql>=0.10.0 - adbc-driver-sqlite>=0.8.0 - - tzdata>=2022.7 + - tzdata>=2023.3 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml deleted file mode 100644 index e0ddc6954e4a4..0000000000000 --- a/ci/deps/actions-pypy-39.yaml +++ /dev/null @@ -1,26 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - # TODO: Add the rest of the dependencies in here - # once the other plentiful failures/segfaults - # with base pandas has been dealt with - - python=3.9[build=*_pypy] - - # build dependencies - - versioneer - - cython<4.0.0a0 - - meson=1.2.1 - - meson-python=0.13.1 - - # test dependencies - - pytest>=7.3.2 - - pytest-cov - - pytest-xdist>=3.4.0 - - hypothesis>=6.84.0 - - # required - - numpy - - python-dateutil - - pip: - - tzdata>=2022.7 diff --git a/ci/meta.yaml b/ci/meta.yaml index a4c9e8189f082..853c3093fa5bc 100644 --- a/ci/meta.yaml +++ b/ci/meta.yaml @@ -37,7 +37,7 @@ requirements: - numpy >=1.21.6 # [py<311] - numpy >=1.23.2 # [py>=311] - python-dateutil >=2.8.2 - - python-tzdata >=2022.7 + - python-tzdata >=2023.3 test: imports: diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 1589fea5f8953..8bb93406f617d 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -148,9 +148,9 @@ pandas requires the following dependencies. 
================================================================ ========================== Package Minimum supported version ================================================================ ========================== -`NumPy `__ 1.23.5 +`NumPy `__ 1.26.0 `python-dateutil `__ 2.8.2 -`tzdata `__ 2022.7 +`tzdata `__ 2023.3 ================================================================ ========================== .. _install.optional_dependencies: @@ -307,7 +307,7 @@ Dependency Minimum Version pip ex `PyTables `__ 3.8.0 hdf5 HDF5-based reading / writing `zlib `__ hdf5 Compression for HDF5 `fastparquet `__ 2024.2.0 - Parquet reading / writing (pyarrow is default) -`pyarrow `__ 10.0.1 parquet, feather Parquet, ORC, and feather reading / writing +`pyarrow `__ 12.0.1 parquet, feather Parquet, ORC, and feather reading / writing `PyIceberg `__ 0.7.1 iceberg Apache Iceberg reading / writing `pyreadstat `__ 1.2.6 spss SPSS files (.sav) reading `odfpy `__ 1.4.1 excel Open document format (.odf, .ods, .odt) reading / writing diff --git a/doc/source/reference/testing.rst b/doc/source/reference/testing.rst index 1f164d1aa98b4..2c9c2dcae0f69 100644 --- a/doc/source/reference/testing.rst +++ b/doc/source/reference/testing.rst @@ -36,6 +36,7 @@ Exceptions and warnings errors.DuplicateLabelError errors.EmptyDataError errors.IncompatibilityWarning + errors.IncompatibleFrequency errors.IndexingError errors.InvalidColumnName errors.InvalidComparison diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 8155aa0ae03fa..3fdd15462b51e 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -590,7 +590,7 @@ arguments. The special value ``all`` can also be used: .. ipython:: python - frame.describe(include=["object"]) + frame.describe(include=["str"]) frame.describe(include=["number"]) frame.describe(include="all") diff --git a/doc/source/user_guide/index.rst b/doc/source/user_guide/index.rst index 230b2b86b2ffd..85e91859b90d0 100644 --- a/doc/source/user_guide/index.rst +++ b/doc/source/user_guide/index.rst @@ -87,5 +87,6 @@ Guides enhancingperf scale sparse + migration-3-strings gotchas cookbook diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 605f9501c5b23..47ff92c163b01 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -700,7 +700,7 @@ to have different probabilities, you can pass the ``sample`` function sampling w s = pd.Series([0, 1, 2, 3, 4, 5]) example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4] - s.sample(n=3, weights=example_weights) + s.sample(n=2, weights=example_weights) # Weights will be re-normalized automatically example_weights2 = [0.5, 0, 0, 0, 0, 0] @@ -714,7 +714,7 @@ as a string. df2 = pd.DataFrame({'col1': [9, 8, 7, 6], 'weight_column': [0.5, 0.4, 0.1, 0]}) - df2.sample(n=3, weights='weight_column') + df2.sample(n=2, weights='weight_column') ``sample`` also allows users to sample columns instead of rows using the ``axis`` argument. diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 25f1e11e6b603..52038ad4b66c1 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5228,33 +5228,32 @@ languages easy. Parquet can use a variety of compression techniques to shrink th while still maintaining good read performance. Parquet is designed to faithfully serialize and de-serialize ``DataFrame`` s, supporting all of the pandas -dtypes, including extension dtypes such as datetime with tz. 
+dtypes, including extension dtypes such as datetime with timezone. Several caveats. * Duplicate column names and non-string columns names are not supported. -* The ``pyarrow`` engine always writes the index to the output, but ``fastparquet`` only writes non-default - indexes. This extra column can cause problems for non-pandas consumers that are not expecting it. You can - force including or omitting indexes with the ``index`` argument, regardless of the underlying engine. +* The DataFrame index is written as separate column(s) when it is a non-default range index. + This extra column can cause problems for non-pandas consumers that are not expecting it. You can + force including or omitting indexes with the ``index`` argument. * Index level names, if specified, must be strings. * In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype. -* The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag. -* Non supported types include ``Interval`` and actual Python object types. These will raise a helpful error message - on an attempt at serialization. ``Period`` type is supported with pyarrow >= 0.16.0. +* The ``pyarrow`` engine supports the ``Period`` and ``Interval`` dtypes. ``fastparquet`` does not support those. +* Unsupported types include actual Python object types. These will raise a helpful error message + on an attempt at serialization. * The ``pyarrow`` engine preserves extension data types such as the nullable integer and string data - type (requiring pyarrow >= 0.16.0, and requiring the extension type to implement the needed protocols, + type (this can also work for external extension types, requiring the extension type to implement the needed protocols, see the :ref:`extension types documentation `). You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``. If the engine is NOT specified, then the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``, -then ``pyarrow`` is tried, and falling back to ``fastparquet``. +then ``pyarrow`` is used when installed, falling back to ``fastparquet`` otherwise. See the documentation for `pyarrow `__ and `fastparquet `__. .. note:: - These engines are very similar and should read/write nearly identical parquet format files. - ``pyarrow>=8.0.0`` supports timedelta data, ``fastparquet>=0.1.4`` supports timezone aware datetimes. + These engines are very similar and should read/write nearly identical parquet format files for most cases. These libraries differ by having different underlying dependencies (``fastparquet`` by using ``numba``, while ``pyarrow`` uses a c-library). .. ipython:: python @@ -5280,24 +5279,21 @@ Write to a parquet file. .. ipython:: python - df.to_parquet("example_pa.parquet", engine="pyarrow") - df.to_parquet("example_fp.parquet", engine="fastparquet") + # specify engine="pyarrow" or engine="fastparquet" to use a specific engine + df.to_parquet("example.parquet") Read from a parquet file. .. ipython:: python - result = pd.read_parquet("example_fp.parquet", engine="fastparquet") - result = pd.read_parquet("example_pa.parquet", engine="pyarrow") - + result = pd.read_parquet("example.parquet") result.dtypes By setting the ``dtype_backend`` argument you can control the default dtypes used for the resulting DataFrame. ..
ipython:: python - result = pd.read_parquet("example_pa.parquet", engine="pyarrow", dtype_backend="pyarrow") - + result = pd.read_parquet("example.parquet", dtype_backend="pyarrow") result.dtypes .. note:: @@ -5309,41 +5305,36 @@ Read only certain columns of a parquet file. .. ipython:: python - result = pd.read_parquet( - "example_fp.parquet", - engine="fastparquet", - columns=["a", "b"], - ) - result = pd.read_parquet( - "example_pa.parquet", - engine="pyarrow", - columns=["a", "b"], - ) + result = pd.read_parquet("example.parquet", columns=["a", "b"]) result.dtypes .. ipython:: python :suppress: - os.remove("example_pa.parquet") - os.remove("example_fp.parquet") + os.remove("example.parquet") Handling indexes '''''''''''''''' Serializing a ``DataFrame`` to parquet may include the implicit index as one or -more columns in the output file. Thus, this code: +more columns in the output file. For example, this code: .. ipython:: python - df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[1, 2]) df.to_parquet("test.parquet", engine="pyarrow") -creates a parquet file with *three* columns if you use ``pyarrow`` for serialization: -``a``, ``b``, and ``__index_level_0__``. If you're using ``fastparquet``, the -index `may or may not `_ -be written to the file. +creates a parquet file with *three* columns (``a``, ``b``, and +``__index_level_0__`` when using the ``pyarrow`` engine, or ``index``, ``a``, +and ``b`` when using the ``fastparquet`` engine) because the index in this case +is not a default range index. In general, the index *may or may not* be written +to the file (see the +`preserve_index keyword for pyarrow `__ +or the +`write_index keyword for fastparquet `__ +to check the default behaviour). This unexpected extra column causes some databases like Amazon Redshift to reject the file, because that column doesn't exist in the target table. @@ -5432,7 +5423,7 @@ A simple example loading all data from an Iceberg table ``my_table`` defined in df = pd.read_iceberg("my_table", catalog_name="my_catalog") Catalogs must be defined in the ``.pyiceberg.yaml`` file, usually in the home directory. -It is possible to to change properties of the catalog definition with the +It is possible to change properties of the catalog definition with the ``catalog_properties`` parameter: .. code-block:: python diff --git a/doc/source/user_guide/migration-3-strings.rst b/doc/source/user_guide/migration-3-strings.rst new file mode 100644 index 0000000000000..c415f8f43d3c8 --- /dev/null +++ b/doc/source/user_guide/migration-3-strings.rst @@ -0,0 +1,386 @@ +{{ header }} + +.. _string_migration_guide: + +========================================================= +Migration guide for the new string data type (pandas 3.0) +========================================================= + +The upcoming pandas 3.0 release introduces a new, default string data type. This +will most likely cause some work when upgrading to pandas 3.0, and this page +provides an overview of the issues you might run into and gives guidance on how +to address them. + +This new dtype is already available in the pandas 2.3 release, and you can +enable it with: + +.. code-block:: python + + pd.options.future.infer_string = True + +This allows you to test your code before the final 3.0 release. + +Background +---------- + +Historically, pandas has always used the NumPy ``object`` dtype as the default +to store text data. This has two primary drawbacks. 
First, ``object`` dtype is +not specific to strings: any Python object can be stored in an ``object``-dtype +array, not just strings, and seeing ``object`` as the dtype for a column with +strings is confusing for users. Second, this is not always very efficient (both +performance-wise and for memory usage). + +Since pandas 1.0, an opt-in string data type has been available, but it has +not yet been made the default, and it uses the ``pd.NA`` scalar to represent +missing values. + +Pandas 3.0 changes the default dtype for strings to a new string data type, +a variant of the existing optional string data type but using ``NaN`` as the +missing value indicator, to be consistent with the other default data types. + +To improve performance, the new string data type will use the ``pyarrow`` +package by default, if installed (and otherwise it uses object dtype under the +hood as a fallback). + +See `PDEP-14: Dedicated string data type for pandas 3.0 `__ +for more background and details. + +.. - brief primer on the new dtype + +.. - Main characteristics: +.. - inferred by default (Default inference of a string dtype) +.. - only strings (setitem with non string fails) +.. - missing values sentinel is always NaN and uses NaN semantics + +.. - Breaking changes: +.. - dtype is no longer object dtype +.. - None gets coerced to NaN +.. - setitem raises an error for non-string data + +Brief introduction to the new default string dtype +-------------------------------------------------- + +By default, pandas will infer this new string dtype instead of object dtype for +string data (when creating pandas objects, such as in constructors or IO +functions). + +Being a default dtype means that the string dtype will be used in IO methods or +constructors when the dtype is being inferred and the input is inferred to be +string data: + +.. code-block:: python + + >>> pd.Series(["a", "b", None]) + 0 a + 1 b + 2 NaN + dtype: str + +It can also be specified explicitly using the ``"str"`` alias: + +.. code-block:: python + + >>> pd.Series(["a", "b", None], dtype="str") + 0 a + 1 b + 2 NaN + dtype: str + +Similarly, functions like :func:`read_csv`, :func:`read_parquet`, and others +will now use the new string dtype when reading string data. + +In contrast to the current object dtype, the new string dtype will only store +strings. This also means that it will raise an error if you try to store a +non-string value in it (see below for more details). + +Missing values with the new string dtype are always represented as ``NaN`` (``np.nan``), +and the missing value behavior is similar to other default dtypes. + +This new string dtype should otherwise behave the same as the existing +``object`` dtype users are used to. For example, all string-specific methods +through the ``str`` accessor will work the same: + +.. code-block:: python + + >>> ser = pd.Series(["a", "b", None], dtype="str") + >>> ser.str.upper() + 0 A + 1 B + 2 NaN + dtype: str + +.. note:: + + The new default string dtype is an instance of the :class:`pandas.StringDtype` + class. The dtype can be constructed as ``pd.StringDtype(na_value=np.nan)``, + but for general usage we recommend using the shorter ``"str"`` alias.
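+
+A quick check of these characteristics (a minimal sketch, assuming pandas 2.3
+with ``pd.options.future.infer_string = True`` enabled, or pandas 3.x where
+this is the default; the ``assert`` lines are editorial illustrations):
+
+.. code-block:: python
+
+    import numpy as np
+    import pandas as pd
+
+    ser = pd.Series(["a", "b", None])
+    # the inferred dtype is the new string dtype, spelled "str" ...
+    assert ser.dtype == "str"
+    # ... which is the same dtype as the explicit constructor form
+    assert ser.dtype == pd.StringDtype(na_value=np.nan)
+    # missing values are NaN, and str accessor results keep the dtype
+    assert pd.isna(ser[2])
+    assert ser.str.upper().dtype == "str"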
+ +Overview of behavior differences and how to address them +--------------------------------------------------------- + +The dtype is no longer object dtype +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When inferring or reading string data, the data type of the resulting DataFrame +column or Series will silently start being the new ``"str"`` dtype instead of +``"object"`` dtype, and this can have some impact on your code. + +Checking the dtype +^^^^^^^^^^^^^^^^^^ + +When checking the dtype, code might currently do something like: + +.. code-block:: python + + >>> ser = pd.Series(["a", "b", "c"]) + >>> ser.dtype == "object" + +to check for columns with string data (by checking for the dtype being +``"object"``). This will no longer work in pandas 3+, since ``ser.dtype`` will +now be ``"str"`` with the new default string dtype, and the above check will +return ``False``. + +To check for columns with string data, you should instead use: + +.. code-block:: python + + >>> ser.dtype == "str" + +**How to write compatible code** + +For code that should work on both pandas 2.x and 3.x, you can use the +:func:`pandas.api.types.is_string_dtype` function: + +.. code-block:: python + + >>> pd.api.types.is_string_dtype(ser.dtype) + True + +This will return ``True`` for both the object dtype and the string dtypes. + +Hardcoded use of object dtype +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you have code where the dtype is hardcoded in constructors, like + +.. code-block:: python + + >>> pd.Series(["a", "b", "c"], dtype="object") + +this will keep using the object dtype. You will want to update this code to +ensure you get the benefits of the new string dtype. + +**How to write compatible code?** + +First, in many cases it can be sufficient to remove the specific data type, and +let pandas do the inference. But if you want to be specific, you can specify the +``"str"`` dtype: + +.. code-block:: python + + >>> pd.Series(["a", "b", "c"], dtype="str") + +This is actually compatible with pandas 2.x as well, since in pandas < 3, +``dtype="str"`` was essentially treated as an alias for object dtype. + +The missing value sentinel is now always NaN +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When using object dtype, multiple possible missing value sentinels are +supported, including ``None`` and ``np.nan``. With the new default string dtype, +the missing value sentinel is always NaN (``np.nan``): + +.. code-block:: python + + # with object dtype, None is preserved as None and seen as missing + >>> ser = pd.Series(["a", "b", None], dtype="object") + >>> ser + 0 a + 1 b + 2 None + dtype: object + >>> print(ser[2]) + None + + # with the new string dtype, any missing value like None is coerced to NaN + >>> ser = pd.Series(["a", "b", None], dtype="str") + >>> ser + 0 a + 1 b + 2 NaN + dtype: str + >>> print(ser[2]) + nan + +Generally this should be no problem when relying on missing value behavior in +pandas methods (for example, ``ser.isna()`` will give the same result as before). +But when you relied on the exact value of ``None`` being present, that can +impact your code. + +**How to write compatible code?** + +When checking for a missing value, instead of checking for the exact value of +``None`` or ``np.nan``, you should use the :func:`pandas.isna` function. This is +the most robust way to check for missing values, as it will work regardless of +the dtype and the exact missing value sentinel: + +.. 
code-block:: python + + >>> pd.isna(ser[2]) + True + +One caveat: this function works both on scalars and on array-likes, and in the +latter case it will return an array of bools. When using it in a Boolean context +(for example, ``if pd.isna(..): ..``), be sure to pass only a scalar to it. + +"setitem" operations will now raise an error for non-string data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +With the new string dtype, any attempt to set a non-string value in a Series or +DataFrame will raise an error: + +.. code-block:: python + + >>> ser = pd.Series(["a", "b", None], dtype="str") + >>> ser[1] = 2.5 + --------------------------------------------------------------------------- + TypeError Traceback (most recent call last) + ... + TypeError: Invalid value '2.5' for dtype 'str'. Value should be a string or missing value, got 'float' instead. + +If you relied on the flexible nature of object dtype being able to hold any +Python object, but your initial data was inferred as strings, your code might be +impacted by this change. + +**How to write compatible code?** + +You can update your code to ensure you only set string values in such columns, +or otherwise you can explicitly ensure the column has object dtype first. This +can be done by specifying the dtype explicitly in the constructor, or by using +the :meth:`~pandas.Series.astype` method: + +.. code-block:: python + + >>> ser = pd.Series(["a", "b", None], dtype="str") + >>> ser = ser.astype("object") + >>> ser[1] = 2.5 + +This ``astype("object")`` call will be redundant when using pandas 2.x, but +this code will work for all versions. + +Invalid unicode input +~~~~~~~~~~~~~~~~~~~~~ + +Python allows built-in ``str`` objects to hold invalid unicode +data, and since the ``object`` dtype can hold any Python object, you can have a +pandas Series with such invalid unicode data: + +.. code-block:: python + + >>> ser = pd.Series(["\u2600", "\ud83d"], dtype=object) + >>> ser + 0 ☀ + 1 \ud83d + dtype: object + +However, the string dtype, when using ``pyarrow`` under the hood, can +only store valid unicode data, and will otherwise raise an error: + +.. code-block:: python + + >>> ser = pd.Series(["\u2600", "\ud83d"]) + --------------------------------------------------------------------------- + UnicodeEncodeError Traceback (most recent call last) + ... + UnicodeEncodeError: 'utf-8' codec can't encode character '\ud83d' in position 0: surrogates not allowed + +If you want to keep the previous behaviour, you can explicitly specify +``dtype=object`` to keep working with object dtype. + +When you have byte data that you want to convert to strings using ``decode()``, +the :meth:`~pandas.Series.str.decode` method now has a ``dtype`` parameter so you +can specify object dtype instead of the default string dtype for this use +case. + +Notable bug fixes +~~~~~~~~~~~~~~~~~ + +``astype(str)`` preserving missing values +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This is a long-standing "bug" or misfeature, as discussed in https://github.com/pandas-dev/pandas/issues/25353. + +With pandas < 3, when using ``astype(str)`` (using the built-in :func:`str`, not +``astype("str")``!), the operation would convert every element to a string, +including the missing values: + +..
code-block:: python + + # OLD behavior in pandas < 3 + >>> ser = pd.Series(["a", np.nan], dtype=object) + >>> ser + 0 a + 1 NaN + dtype: object + >>> ser.astype(str) + 0 a + 1 nan + dtype: object + >>> ser.astype(str).to_numpy() + array(['a', 'nan'], dtype=object) + +Note how ``NaN`` (``np.nan``) was converted to the string ``"nan"``. This was +not the intended behavior, and it was inconsistent with how other dtypes handled +missing values. + +With pandas 3, this behavior has been fixed, and now ``astype(str)`` is an alias +for ``astype("str")``, i.e. casting to the new string dtype, which will preserve +the missing values: + +.. code-block:: python + + # NEW behavior in pandas 3 + >>> pd.options.future.infer_string = True + >>> ser = pd.Series(["a", np.nan], dtype=object) + >>> ser.astype(str) + 0 a + 1 NaN + dtype: str + >>> ser.astype(str).values + array(['a', nan], dtype=object) + +If you want to preserve the old behaviour of converting every object to a +string, you can use ``ser.map(str)`` instead. + + +``prod()`` raising for string data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In pandas < 3, calling the :meth:`~pandas.Series.prod` method on a Series with +string data would generally raise an error, except when the Series was empty or +contained only a single string (potentially with missing values): + +.. code-block:: python + + >>> ser = pd.Series(["a", None], dtype=object) + >>> ser.prod() + 'a' + +When the Series contains multiple strings, it will raise a ``TypeError``. This +behaviour stays the same in pandas 3 when using the flexible ``object`` dtype. +But by virtue of using the new string dtype, this will generally consistently +raise an error regardless of the number of strings: + +.. code-block:: python + + >>> ser = pd.Series(["a", None], dtype="str") + >>> ser.prod() + --------------------------------------------------------------------------- + TypeError Traceback (most recent call last) + ... + TypeError: Cannot perform reduction 'prod' with string dtype + +.. For existing users of the nullable ``StringDtype`` +.. -------------------------------------------------- + +.. TODO diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index ac0fc9e53ee94..66b560ea2b902 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -2541,7 +2541,7 @@ Fold is supported only for constructing from naive ``datetime.datetime`` or for constructing from components (see below). Only ``dateutil`` timezones are supported (see `dateutil documentation `__ for ``dateutil`` methods that deal with ambiguous datetimes) as ``pytz`` -timezones do not support fold (see `pytz documentation `__ +timezones do not support fold (see `pytz documentation `__ for details on how ``pytz`` deals with ambiguous datetimes). To localize an ambiguous datetime with ``pytz``, please use :meth:`Timestamp.tz_localize`. In general, we recommend to rely on :meth:`Timestamp.tz_localize` when localizing ambiguous datetimes if you need direct diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst index 8e323d8aac5e3..30f7f4dc97e63 100644 --- a/doc/source/whatsnew/v0.13.0.rst +++ b/doc/source/whatsnew/v0.13.0.rst @@ -184,7 +184,7 @@ API changes .. ipython:: python :okwarning: - dfc.loc[0]['A'] = 1111 + dfc.loc[0]['B'] = 1111 :: @@ -198,7 +198,7 @@ API changes .. 
ipython:: python - dfc.loc[0, 'A'] = 11 + dfc.loc[0, 'B'] = 1111 dfc - ``Panel.reindex`` has the following call signature ``Panel.reindex(items=None, major_axis=None, minor_axis=None, **kwargs)`` diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index 1ee7c5cbc6b9e..4e745f042d5c8 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -1025,20 +1025,49 @@ Other: - :func:`describe` on mixed-types DataFrames is more flexible. Type-based column filtering is now possible via the ``include``/``exclude`` arguments. See the :ref:`docs ` (:issue:`8164`). - .. ipython:: python + .. code-block:: python - df = pd.DataFrame({'catA': ['foo', 'foo', 'bar'] * 8, - 'catB': ['a', 'b', 'c', 'd'] * 6, - 'numC': np.arange(24), - 'numD': np.arange(24.) + .5}) - df.describe(include=["object"]) - df.describe(include=["number", "object"], exclude=["float"]) + >>> df = pd.DataFrame({'catA': ['foo', 'foo', 'bar'] * 8, + ... 'catB': ['a', 'b', 'c', 'd'] * 6, + ... 'numC': np.arange(24), + ... 'numD': np.arange(24.) + .5}) + >>> df.describe(include=["object"]) + catA catB + count 24 24 + unique 2 4 + top foo a + freq 16 6 + >>> df.describe(include=["number", "object"], exclude=["float"]) + catA catB numC + count 24 24 24.000000 + unique 2 4 NaN + top foo a NaN + freq 16 6 NaN + mean NaN NaN 11.500000 + std NaN NaN 7.071068 + min NaN NaN 0.000000 + 25% NaN NaN 5.750000 + 50% NaN NaN 11.500000 + 75% NaN NaN 17.250000 + max NaN NaN 23.000000 Requesting all columns is possible with the shorthand 'all' - .. ipython:: python + .. code-block:: python - df.describe(include='all') + >>> df.describe(include='all') + catA catB numC numD + count 24 24 24.000000 24.000000 + unique 2 4 NaN NaN + top foo a NaN NaN + freq 16 6 NaN NaN + mean NaN NaN 11.500000 12.000000 + std NaN NaN 7.071068 7.071068 + min NaN NaN 0.000000 0.500000 + 25% NaN NaN 5.750000 6.250000 + 50% NaN NaN 11.500000 12.000000 + 75% NaN NaN 17.250000 17.750000 + max NaN NaN 23.000000 23.500000 Without those arguments, ``describe`` will behave as before, including only numerical columns or, if none are, only categorical columns. See also the :ref:`docs ` diff --git a/doc/source/whatsnew/v0.16.1.rst b/doc/source/whatsnew/v0.16.1.rst index b376530358f53..c15f56ba61447 100644 --- a/doc/source/whatsnew/v0.16.1.rst +++ b/doc/source/whatsnew/v0.16.1.rst @@ -196,7 +196,7 @@ facilitate replication. (:issue:`2419`) # weights are accepted. example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4] - example_series.sample(n=3, weights=example_weights) + example_series.sample(n=2, weights=example_weights) # weights will also be normalized if they do not sum to one, # and missing values will be treated as zeros. @@ -210,7 +210,7 @@ when sampling from rows. .. ipython:: python df = pd.DataFrame({"col1": [9, 8, 7, 6], "weight_column": [0.5, 0.4, 0.1, 0]}) - df.sample(n=3, weights="weight_column") + df.sample(n=2, weights="weight_column") .. 
_whatsnew_0161.enhancements.string: diff --git a/doc/source/whatsnew/v0.4.x.rst b/doc/source/whatsnew/v0.4.x.rst index 83f6a6907f33c..631a6a6411440 100644 --- a/doc/source/whatsnew/v0.4.x.rst +++ b/doc/source/whatsnew/v0.4.x.rst @@ -11,7 +11,7 @@ New features - Added Python 3 support using 2to3 (:issue:`200`) - :ref:`Added ` ``name`` attribute to ``Series``, now prints as part of ``Series.__repr__`` -- :meth:`Series.isnull`` and :meth:`Series.notnull` (:issue:`209`, :issue:`203`) +- :meth:`Series.isnull` and :meth:`Series.notnull` (:issue:`209`, :issue:`203`) - :ref:`Added ` ``Series.align`` method for aligning two series with choice of join method (ENH56_) - :ref:`Added ` method ``get_level_values`` to diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index ddcd69c3fd962..9fb592d24d54c 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -984,7 +984,7 @@ Removal of prior version deprecations/changes - Removed :meth:`Series.str.__iter__` (:issue:`28277`) - Removed ``pandas.SparseArray`` in favor of :class:`arrays.SparseArray` (:issue:`30642`) - Removed ``pandas.SparseSeries`` and ``pandas.SparseDataFrame``, including pickle support. (:issue:`30642`) -- Enforced disallowing passing an integer ``fill_value`` to :meth:`DataFrame.shift` and :meth:`Series.shift`` with datetime64, timedelta64, or period dtypes (:issue:`32591`) +- Enforced disallowing passing an integer ``fill_value`` to :meth:`DataFrame.shift` and :meth:`Series.shift` with datetime64, timedelta64, or period dtypes (:issue:`32591`) - Enforced disallowing a string column label into ``times`` in :meth:`DataFrame.ewm` (:issue:`43265`) - Enforced disallowing passing ``True`` and ``False`` into ``inclusive`` in :meth:`Series.between` in favor of ``"both"`` and ``"neither"`` respectively (:issue:`40628`) - Enforced disallowing using ``usecols`` with out of bounds indices for ``read_csv`` with ``engine="c"`` (:issue:`25623`) @@ -1045,7 +1045,7 @@ Removal of prior version deprecations/changes - Enforced deprecation of silently dropping columns that raised a ``TypeError`` in :class:`Series.transform` and :class:`DataFrame.transform` when used with a list or dictionary (:issue:`43740`) - Changed behavior of :meth:`DataFrame.apply` with list-like so that any partial failure will raise an error (:issue:`43740`) - Changed behaviour of :meth:`DataFrame.to_latex` to now use the Styler implementation via :meth:`.Styler.to_latex` (:issue:`47970`) -- Changed behavior of :meth:`Series.__setitem__` with an integer key and a :class:`Float64Index` when the key is not present in the index; previously we treated the key as positional (behaving like ``series.iloc[key] = val``), now we treat it is a label (behaving like ``series.loc[key] = val``), consistent with :meth:`Series.__getitem__`` behavior (:issue:`33469`) +- Changed behavior of :meth:`Series.__setitem__` with an integer key and a :class:`Float64Index` when the key is not present in the index; previously we treated the key as positional (behaving like ``series.iloc[key] = val``), now we treat it is a label (behaving like ``series.loc[key] = val``), consistent with :meth:`Series.__getitem__` behavior (:issue:`33469`) - Removed ``na_sentinel`` argument from :func:`factorize`, :meth:`.Index.factorize`, and :meth:`.ExtensionArray.factorize` (:issue:`47157`) - Changed behavior of :meth:`Series.diff` and :meth:`DataFrame.diff` with :class:`ExtensionDtype` dtypes whose arrays do not implement ``diff``, these now raise ``TypeError`` rather than 
casting to numpy (:issue:`31025`) - Enforced deprecation of calling numpy "ufunc"s on :class:`DataFrame` with ``method="outer"``; this now raises ``NotImplementedError`` (:issue:`36955`) diff --git a/doc/source/whatsnew/v2.0.3.rst b/doc/source/whatsnew/v2.0.3.rst index 26e34e0c823ce..e0b795165fd93 100644 --- a/doc/source/whatsnew/v2.0.3.rst +++ b/doc/source/whatsnew/v2.0.3.rst @@ -13,7 +13,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- Bug in :meth:`Timestamp.weekday`` was returning incorrect results before ``'0000-02-29'`` (:issue:`53738`) +- Bug in :meth:`Timestamp.weekday` was returning incorrect results before ``'0000-02-29'`` (:issue:`53738`) - Fixed performance regression in merging on datetime-like columns (:issue:`53231`) - Fixed regression when :meth:`DataFrame.to_string` creates extra space for string dtypes (:issue:`52690`) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 495c8244142f9..2817945c55a86 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -721,7 +721,7 @@ Conversion Strings ^^^^^^^ - Bug in :meth:`Series.str` that did not raise a ``TypeError`` when iterated (:issue:`54173`) -- Bug in ``repr`` for :class:`DataFrame`` with string-dtype columns (:issue:`54797`) +- Bug in ``repr`` for :class:`DataFrame` with string-dtype columns (:issue:`54797`) Interval ^^^^^^^^ diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 8ca6c0006a604..a174b3bc0bea2 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -10,6 +10,104 @@ including other versions of pandas. .. --------------------------------------------------------------------------- +.. _whatsnew_230.upcoming_changes: + +Upcoming changes in pandas 3.0 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +pandas 3.0 will bring two bigger changes to the default behavior of pandas. + +Dedicated string data type by default +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Historically, pandas represented string columns with NumPy ``object`` data type. +This representation has numerous problems: it is not specific to strings (any +Python object can be stored in an ``object``-dtype array, not just strings) and +it is often not very efficient (both performance wise and for memory usage). + +Starting with the upcoming pandas 3.0 release, a dedicated string data type will +be enabled by default (backed by PyArrow under the hood, if installed, otherwise +falling back to NumPy). This means that pandas will start inferring columns +containing string data as the new ``str`` data type when creating pandas +objects, such as in constructors or IO functions. + +Old behavior: + +.. code-block:: python + + >>> ser = pd.Series(["a", "b"]) + 0 a + 1 b + dtype: object + +New behavior: + +.. code-block:: python + + >>> ser = pd.Series(["a", "b"]) + 0 a + 1 b + dtype: str + +The string data type that is used in these scenarios will mostly behave as NumPy +object would, including missing value semantics and general operations on these +columns. + +However, the introduction of a new default dtype will also have some breaking +consequences to your code (for example when checking for the ``.dtype`` being +object dtype). To allow testing it in advance of the pandas 3.0 release, this +future dtype inference logic can be enabled in pandas 2.3 with: + +.. 
code-block:: python + + pd.options.future.infer_string = True + +See the :ref:`string_migration_guide` for more details on the behaviour changes +and how to adapt your code to the new default. + +Copy-on-Write +^^^^^^^^^^^^^ + +The currently optional mode Copy-on-Write will be enabled by default in pandas 3.0. There +won't be an option to retain the legacy behavior. + +In summary, the new "copy-on-write" behaviour changes +how pandas operates with respect to copies and views. + +1. The result of *any* indexing operation (subsetting a DataFrame or Series in any way, + i.e. including accessing a DataFrame column as a Series) or any method returning a + new DataFrame or Series, always *behaves as if* it were a copy in terms of user + API. +2. As a consequence, if you want to modify an object (DataFrame or Series), the only way + to do this is to directly modify that object itself. + +Because every single indexing step now behaves as a copy, this also means that +"chained assignment" (updating a DataFrame with multiple setitem steps) will +stop working. Because this now consistently never works, the +``SettingWithCopyWarning`` will be removed. + +The new behavioral semantics are explained in more detail in the +:ref:`user guide about Copy-on-Write `. + +The new behavior can be enabled since pandas 2.0 with the following option: + +.. code-block:: python + + pd.options.mode.copy_on_write = True + +Some of the behaviour changes allow a clear deprecation, like the changes in +chained assignment. Other changes are more subtle, and thus the warnings are +hidden behind an option that can be enabled since pandas 2.2: + +.. code-block:: python + + pd.options.mode.copy_on_write = "warn" + +This mode will warn in many different scenarios that aren't actually relevant to +most queries. We recommend exploring this mode, but it is not necessary to get rid +of all of these warnings. The :ref:`migration guide ` +explains the upgrade process in more detail. + .. _whatsnew_230.enhancements: Enhancements @@ -78,4 +176,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v2.2.3..v2.3.0|HEAD +.. contributors:: v2.2.3..v2.3.0 diff --git a/doc/source/whatsnew/v2.3.1.rst b/doc/source/whatsnew/v2.3.1.rst index 64e5c1510e1da..52408fa50d11a 100644 --- a/doc/source/whatsnew/v2.3.1.rst +++ b/doc/source/whatsnew/v2.3.1.rst @@ -1,6 +1,6 @@ .. _whatsnew_231: -What's new in 2.3.1 (Month XX, 2025) +What's new in 2.3.1 (July 7, 2025) ------------------------------------ These are the changes in pandas 2.3.1. See :ref:`release` for a full changelog @@ -14,12 +14,16 @@ including other versions of pandas. Improvements and fixes for the StringDtype ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Most changes in this release are related to :class:`StringDtype`, which will +become the default string dtype in pandas 3.0. See +:ref:`whatsnew_230.upcoming_changes` for more details. + .. _whatsnew_231.string_fixes.string_comparisons: Comparisons between different string dtypes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In previous versions, comparing :class:`Series` of different string dtypes (e.g. 
``pd.StringDtype("pyarrow", na_value=pd.NA)`` against ``pd.StringDtype("python", na_value=np.nan)``) would result in inconsistent resulting dtype or incorrectly raise (:issue:`60639`). pandas will now use the hierarchy object < (python, NaN) < (pyarrow, NaN) < (python, NA) < (pyarrow, NA) @@ -44,7 +48,7 @@ correctly, rather than defaulting to ``object`` dtype. For example: .. code-block:: python - >>> pd.options.mode.infer_string = True + >>> pd.options.future.infer_string = True >>> df = pd.DataFrame() >>> df.columns.dtype dtype('int64') # default RangeIndex for empty columns @@ -57,32 +61,16 @@ correctly, rather than defaulting to ``object`` dtype. For example: Bug fixes ^^^^^^^^^ - Bug in :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.Resampler.min`, :meth:`.Resampler.max` where all NA values of string dtype would return float instead of string dtype (:issue:`60810`) +- Bug in :meth:`DataFrame.join` incorrectly downcasting object-dtype indexes (:issue:`61771`) - Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` with all NA values of :class:`StringDtype` resulted in ``0`` instead of the empty string ``""`` (:issue:`60229`) - Fixed bug in :meth:`DataFrame.explode` and :meth:`Series.explode` where methods would fail with ``dtype="str"`` (:issue:`61623`) +- Fixed bug in unpickling objects pickled in pandas versions pre-2.3.0 that used :class:`StringDtype` (:issue:`61763`) -.. _whatsnew_231.regressions: - -Fixed regressions -~~~~~~~~~~~~~~~~~ -- - -.. --------------------------------------------------------------------------- -.. _whatsnew_231.bug_fixes: - -Bug fixes -~~~~~~~~~ -- - -.. --------------------------------------------------------------------------- -.. _whatsnew_231.other: - -Other -~~~~~ -- - .. --------------------------------------------------------------------------- .. _whatsnew_231.contributors: Contributors ~~~~~~~~~~~~ + +.. contributors:: v2.3.0..v2.3.1|HEAD diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 5ff1ea9d194f6..1e613bd562d4d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -76,6 +76,7 @@ Other enhancements - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) - :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`) - :meth:`DataFrame.iloc` and :meth:`Series.iloc` now support boolean masks in ``__getitem__`` for more consistent indexing behavior (:issue:`60994`) +- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support Python's new-style format strings (e.g., ``"{:.6f}"``) for the ``float_format`` parameter, in addition to old-style ``%`` format strings and callables. This allows for more flexible and modern formatting of floating point numbers when exporting to CSV. 
@@ -44,7 +48,7 @@
 correctly, rather than defaulting to ``object`` dtype. For example:

 .. code-block:: python

-    >>> pd.options.mode.infer_string = True
+    >>> pd.options.future.infer_string = True
     >>> df = pd.DataFrame()
     >>> df.columns.dtype
     dtype('int64')  # default RangeIndex for empty columns
@@ -57,32 +61,16 @@ Bug fixes
 ^^^^^^^^^
 - Bug in :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.Resampler.min`, :meth:`.Resampler.max` where all NA values of string dtype would return float instead of string dtype (:issue:`60810`)
+- Bug in :meth:`DataFrame.join` incorrectly downcasting object-dtype indexes (:issue:`61771`)
 - Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` with all NA values of :class:`StringDtype` resulted in ``0`` instead of the empty string ``""`` (:issue:`60229`)
 - Fixed bug in :meth:`DataFrame.explode` and :meth:`Series.explode` where methods would fail with ``dtype="str"`` (:issue:`61623`)
+- Fixed bug in unpickling objects pickled in pandas versions pre-2.3.0 that used :class:`StringDtype` (:issue:`61763`)

-.. _whatsnew_231.regressions:
-
-Fixed regressions
-~~~~~~~~~~~~~~~~~
--
-
-.. ---------------------------------------------------------------------------
-.. _whatsnew_231.bug_fixes:
-
-Bug fixes
-~~~~~~~~~
--
-
-.. ---------------------------------------------------------------------------
-.. _whatsnew_231.other:
-
-Other
-~~~~~
--
-
 .. ---------------------------------------------------------------------------
 .. _whatsnew_231.contributors:

 Contributors
 ~~~~~~~~~~~~
+
+.. contributors:: v2.3.0..v2.3.1|HEAD
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 5ff1ea9d194f6..1e613bd562d4d 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -76,6 +76,7 @@ Other enhancements
 - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
 - :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`)
 - :meth:`DataFrame.iloc` and :meth:`Series.iloc` now support boolean masks in ``__getitem__`` for more consistent indexing behavior (:issue:`60994`)
+- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support Python's new-style format strings (e.g., ``"{:.6f}"``) for the ``float_format`` parameter, in addition to old-style ``%`` format strings and callables. This allows for more flexible and modern formatting of floating point numbers when exporting to CSV. (:issue:`49580`)
 - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
 - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
 - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
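The new-style ``float_format`` support added in the hunk above accepts all three spellings; a minimal sketch (illustrative only; the file names are placeholders):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"x": [0.123456789, 2.5]})

    df.to_csv("old_style.csv", float_format="%.6f")      # old-style % string
    df.to_csv("new_style.csv", float_format="{:.6f}")    # new-style format string
    df.to_csv("callable.csv", float_format=lambda v: f"{v:.6f}")  # callable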
@@ -306,13 +307,15 @@ pandas 3.0.0 supports Python 3.10 and higher.

 Increased minimum versions for dependencies
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 Some minimum supported versions of dependencies were updated.
-If installed, we now require:
+The following required dependencies were updated:

-+-----------------+-----------------+----------+---------+
-| Package         | Minimum Version | Required | Changed |
-+=================+=================+==========+=========+
-| numpy           | 1.23.5          | X        | X       |
-+-----------------+-----------------+----------+---------+
++-----------------+----------------------+
+| Package         | New Minimum Version  |
++=================+======================+
+| numpy           | 1.26.0               |
++-----------------+----------------------+
+| tzdata          | 2023.3               |
++-----------------+----------------------+

 For `optional libraries `_ the general recommendation is to use the latest version.
 The following table lists the lowest version per library that is currently being tested throughout the development of pandas.
@@ -321,6 +324,8 @@ Optional libraries below the lowest tested version may still work, but are not considered supported.

 +------------------------+---------------------+
 | Package                | New Minimum Version |
 +========================+=====================+
+| pyarrow                | 12.0.1              |
++------------------------+---------------------+
 | pytz                   | 2023.4              |
 +------------------------+---------------------+
 | fastparquet            | 2024.2.0            |
@@ -409,6 +414,9 @@ Other API changes
 - Index set operations (like union or intersection) will now ignore the dtype of an empty ``RangeIndex`` or empty ``Index`` with object dtype when determining the dtype of the resulting Index (:issue:`60797`)
+- :class:`IncompatibleFrequency` now subclasses ``TypeError`` instead of ``ValueError``. As a result, joins with mismatched frequencies now cast to object like other non-comparable joins, and arithmetic with indexes with mismatched frequencies aligns (:issue:`55782`)
+- Comparison operations between :class:`Index` and :class:`Series` now consistently return :class:`Series` regardless of which object is on the left or right (:issue:`36759`)
+- Numpy functions like ``np.isinf`` that return a bool dtype when called on an :class:`Index` object now return a bool-dtype :class:`Index` instead of ``np.ndarray`` (:issue:`52676`)

 .. ---------------------------------------------------------------------------
 .. _whatsnew_300.deprecations:
@@ -454,6 +462,7 @@ Other Deprecations
 - Deprecated allowing non-keyword arguments in :meth:`Series.to_string` except ``buf``. (:issue:`57280`)
 - Deprecated behavior of :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups`, in a future version ``groups`` by one element list will return tuple instead of scalar. (:issue:`58858`)
 - Deprecated behavior of :meth:`Series.dt.to_pytimedelta`, in a future version this will return a :class:`Series` containing python ``datetime.timedelta`` objects instead of an ``ndarray`` of timedelta; this matches the behavior of other :meth:`Series.dt` properties. (:issue:`57463`)
+- Deprecated converting object-dtype columns of ``datetime.datetime`` objects to datetime64 when writing to stata (:issue:`56536`)
 - Deprecated lowercase strings ``d``, ``b`` and ``c`` denoting frequencies in :class:`Day`, :class:`BusinessDay` and :class:`CustomBusinessDay` in favour of ``D``, ``B`` and ``C`` (:issue:`58998`)
 - Deprecated lowercase strings ``w``, ``w-mon``, ``w-tue``, etc. denoting frequencies in :class:`Week` in favour of ``W``, ``W-MON``, ``W-TUE``, etc. (:issue:`58998`)
 - Deprecated parameter ``method`` in :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like` (:issue:`58667`)
@@ -686,6 +695,7 @@ Datetimelike
 - Bug in :attr:`is_year_start` where a DateTimeIndex constructed via a date_range with frequency 'MS' wouldn't have the correct year or quarter start attributes (:issue:`57377`)
 - Bug in :class:`DataFrame` raising ``ValueError`` when ``dtype`` is ``timedelta64`` and ``data`` is a list containing ``None`` (:issue:`60064`)
 - Bug in :class:`Timestamp` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``tzinfo`` or data (:issue:`48688`)
+- Bug in :class:`Timestamp` constructor failing to raise when given a ``np.datetime64`` object with non-standard unit (:issue:`25611`)
 - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`)
 - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56147`)
 - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`)
@@ -703,11 +713,14 @@
 - Bug in :meth:`to_datetime` reports incorrect index in case of any failure scenario. (:issue:`58298`)
 - Bug in :meth:`to_datetime` with ``format="ISO8601"`` and ``utc=True`` where naive timestamps incorrectly inherited timezone offset from previous timestamps in a series. (:issue:`61389`)
 - Bug in :meth:`to_datetime` wrongly converts when ``arg`` is a ``np.datetime64`` object with unit of ``ps``.
(:issue:`60341`) +- Bug in constructing arrays with :class:`ArrowDtype` with ``timestamp`` type incorrectly allowing ``Decimal("NaN")`` (:issue:`61773`) +- Bug in constructing arrays with a timezone-aware :class:`ArrowDtype` from timezone-naive datetime objects incorrectly treating those as UTC times instead of wall times like :class:`DatetimeTZDtype` (:issue:`61775`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) Timedelta ^^^^^^^^^ - Accuracy improvement in :meth:`Timedelta.to_pytimedelta` to round microseconds consistently for large nanosecond based Timedelta (:issue:`57841`) +- Bug in :class:`Timedelta` constructor failing to raise when passed an invalid keyword (:issue:`53801`) - Bug in :meth:`DataFrame.cumsum` which was raising ``IndexError`` if dtype is ``timedelta64[ns]`` (:issue:`57956`) Timezones @@ -717,6 +730,7 @@ Timezones Numeric ^^^^^^^ +- Bug in :func:`api.types.infer_dtype` returning "mixed-integer-float" for float and ``pd.NA`` mix (:issue:`61621`) - Bug in :meth:`DataFrame.corr` where numerical precision errors resulted in correlations above ``1.0`` (:issue:`61120`) - Bug in :meth:`DataFrame.cov` raises a ``TypeError`` instead of returning potentially incorrect results or other errors (:issue:`53115`) - Bug in :meth:`DataFrame.quantile` where the column type was not preserved when ``numeric_only=True`` with a list-like ``q`` produced an empty result (:issue:`59035`) @@ -750,6 +764,7 @@ Indexing - Bug in :meth:`DataFrame.from_records` throwing a ``ValueError`` when passed an empty list in ``index`` (:issue:`58594`) - Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.iloc` returning incorrect dtype when selecting from a :class:`DataFrame` with mixed data types. (:issue:`60600`) - Bug in :meth:`DataFrame.loc` with inconsistent behavior of loc-set with 2 given indexes to Series (:issue:`59933`) +- Bug in :meth:`Index.equals` when comparing between :class:`Series` with string dtype :class:`Index` (:issue:`61099`) - Bug in :meth:`Index.get_indexer` and similar methods when ``NaN`` is located at or after position 128 (:issue:`58924`) - Bug in :meth:`MultiIndex.insert` when a new value inserted to a datetime-like level gets cast to ``NaT`` and fails indexing (:issue:`60388`) - Bug in :meth:`Series.__setitem__` when assigning boolean series with boolean indexer will raise ``LossySetitemError`` (:issue:`57338`) @@ -759,6 +774,7 @@ Indexing Missing ^^^^^^^ - Bug in :meth:`DataFrame.fillna` and :meth:`Series.fillna` that would ignore the ``limit`` argument on :class:`.ExtensionArray` dtypes (:issue:`58001`) +- Bug in :meth:`NA.__and__`, :meth:`NA.__or__` and :meth:`NA.__xor__` when operating with ``np.bool_`` objects (:issue:`58427`) - MultiIndex @@ -773,7 +789,7 @@ MultiIndex I/O ^^^ -- Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping`` elements. (:issue:`57915`) +- Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping` elements. (:issue:`57915`) - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. 
Now, this will fail with a ``ValueError`` (:issue:`58925`) - Bug in :meth:`.io.common.is_fsspec_url` not recognizing chained fsspec URLs (:issue:`48978`) - Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`) @@ -858,6 +874,7 @@ Reshaping - Bug in :meth:`DataFrame.merge` when merging two :class:`DataFrame` on ``intc`` or ``uintc`` types on Windows (:issue:`60091`, :issue:`58713`) - Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`) - Bug in :meth:`DataFrame.pivot_table` incorrectly ignoring the ``values`` argument when also supplied to the ``index`` or ``columns`` parameters (:issue:`57876`, :issue:`61292`) +- Bug in :meth:`DataFrame.pivot_table` where ``margins=True`` did not correctly include groups with ``NaN`` values in the index or columns when ``dropna=False`` was explicitly passed. (:issue:`61509`) - Bug in :meth:`DataFrame.stack` with the new implementation where ``ValueError`` is raised when ``level=[]`` (:issue:`60740`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) - Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`) @@ -901,6 +918,7 @@ Other - Bug in :meth:`DataFrame.query` where using duplicate column names led to a ``TypeError``. (:issue:`59950`) - Bug in :meth:`DataFrame.query` which raised an exception or produced incorrect results when expressions contained backtick-quoted column names containing the hash character ``#``, backticks, or characters that fall outside the ASCII range (U+0001..U+007F). (:issue:`59285`) (:issue:`49633`) - Bug in :meth:`DataFrame.query` which raised an exception when querying integer column names using backticks. (:issue:`60494`) +- Bug in :meth:`DataFrame.sample` with ``replace=False`` and ``(n * max(weights) / sum(weights)) > 1``, the method would return biased results. Now raises ``ValueError``. (:issue:`61516`) - Bug in :meth:`DataFrame.shift` where passing a ``freq`` on a DataFrame with no columns did not shift the index correctly. (:issue:`60102`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) - Bug in :meth:`DataFrame.sort_values` where sorting by a column explicitly named ``None`` raised a ``KeyError`` instead of sorting by the column as expected. (:issue:`61512`) @@ -909,6 +927,7 @@ Other - Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`) - Bug in :meth:`MultiIndex.fillna` error message was referring to ``isna`` instead of ``fillna`` (:issue:`60974`) - Bug in :meth:`Series.describe` where median percentile was always included when the ``percentiles`` argument was passed (:issue:`60550`). +- Bug in :meth:`Series.describe` where statistics with multiple dtypes for ExtensionArrays were coerced to ``float64`` which raised a ``DimensionalityError``` (:issue:`61707`) - Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. 
(:issue:`57355`) - Bug in :meth:`Series.isin` raising ``TypeError`` when series is large (>10**6) and ``values`` contains NA (:issue:`60678`) diff --git a/environment.yml b/environment.yml index b698c4c2ec131..2a566773b884a 100644 --- a/environment.yml +++ b/environment.yml @@ -43,7 +43,7 @@ dependencies: - openpyxl>=3.1.2 - odfpy>=1.4.1 - psycopg2>=2.9.6 - - pyarrow>=10.0.1 + - pyarrow>=12.0.1 - pyiceberg>=0.7.1 - pymysql>=1.1.0 - pyreadstat>=1.2.6 @@ -123,4 +123,4 @@ dependencies: - adbc-driver-postgresql>=0.10.0 - adbc-driver-sqlite>=0.8.0 - typing_extensions; python_version<"3.11" - - tzdata>=2022.7 + - tzdata>=2023.3 diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 331233f37f63d..310cd3c3d76ec 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -60,7 +60,7 @@ def is_time_array(values: np.ndarray, skipna: bool = ...): ... def is_date_array(values: np.ndarray, skipna: bool = ...): ... def is_datetime_array(values: np.ndarray, skipna: bool = ...): ... def is_string_array(values: np.ndarray, skipna: bool = ...): ... -def is_float_array(values: np.ndarray): ... +def is_float_array(values: np.ndarray, skipna: bool = ...): ... def is_integer_array(values: np.ndarray, skipna: bool = ...): ... def is_bool_array(values: np.ndarray, skipna: bool = ...): ... def fast_multiget( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 3b7d659c2150e..6bb8e8ab46e59 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1752,7 +1752,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: return "complex" elif util.is_float_object(val): - if is_float_array(values): + if is_float_array(values, skipna=skipna): return "floating" elif is_integer_float_array(values, skipna=skipna): if is_integer_na_array(values, skipna=skipna): @@ -1954,9 +1954,11 @@ cdef class FloatValidator(Validator): # Note: only python-exposed for tests -cpdef bint is_float_array(ndarray values): +cpdef bint is_float_array(ndarray values, bint skipna=True): cdef: - FloatValidator validator = FloatValidator(values.size, values.dtype) + FloatValidator validator = FloatValidator(values.size, + values.dtype, + skipna=skipna) return validator.validate(values) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 390a527c22bbb..c7f905c4d0be0 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -471,6 +471,10 @@ class NAType(C_NAType): return False elif other is True or other is C_NA: return NA + elif util.is_bool_object(other): + if not other: + return False + return NA return NotImplemented __rand__ = __and__ @@ -480,12 +484,16 @@ class NAType(C_NAType): return True elif other is False or other is C_NA: return NA + elif util.is_bool_object(other): + if not other: + return NA + return True return NotImplemented __ror__ = __or__ def __xor__(self, other): - if other is False or other is True or other is C_NA: + if util.is_bool_object(other) or other is C_NA: return NA return NotImplemented diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index c29cdbcf5975e..5b94f45490da4 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -340,7 +340,7 @@ cdef class TextReader: cdef: parser_t *parser object na_fvalues - object true_values, false_values + list true_values, false_values object handle object orig_header bint na_filter, keep_default_na, has_usecols, has_mi_columns @@ -358,7 +358,7 @@ cdef class TextReader: int64_t leading_cols, table_width object delimiter # bytes or str object converters - object na_values + object na_values 
# dict[hashable, set[str]] | list[str] list header # list[list[non-negative integers]] object index_col object skiprows @@ -390,8 +390,8 @@ cdef class TextReader: usecols=None, on_bad_lines=ERROR, bint na_filter=True, - na_values=None, - na_fvalues=None, + na_values=None, # dict[hashable, set[str]] | set[str] + na_fvalues=None, # dict[hashable, set[float]] | set[float] bint keep_default_na=True, true_values=None, false_values=None, @@ -486,9 +486,17 @@ cdef class TextReader: self.delimiter = delimiter + # na_fvalues is created from user-provided na_value in _clean_na_values + # which ensures that either + # a) na_values is set[str] and na_fvalues is set[float] + # b) na_values is dict[Hashable, set[str]] and + # na_fvalues is dict[Hashable, set[float]] + # (tests for this case are in test_na_values.py) + if not isinstance(na_values, dict): + # i.e. it must be a set + na_values = list(na_values) + self.na_values = na_values - if na_fvalues is None: - na_fvalues = set() self.na_fvalues = na_fvalues self.true_values = _maybe_encode(true_values) + _true_values @@ -929,10 +937,12 @@ cdef class TextReader: int nused kh_str_starts_t *na_hashset = NULL int64_t start, end - object name, na_flist, col_dtype = None + object name, col_dtype = None + set na_fset bint na_filter = 0 int64_t num_cols dict results + bint is_default_dict_dtype start = self.parser_start @@ -948,26 +958,7 @@ cdef class TextReader: self.parser.line_fields[i] + \ (num_cols >= self.parser.line_fields[i]) * num_cols - usecols_not_callable_and_exists = not callable(self.usecols) and self.usecols - names_larger_num_cols = (self.names and - len(self.names) - self.leading_cols > num_cols) - - if self.table_width - self.leading_cols > num_cols: - if (usecols_not_callable_and_exists - and self.table_width - self.leading_cols < len(self.usecols) - or names_larger_num_cols): - raise ParserError(f"Too many columns specified: expected " - f"{self.table_width - self.leading_cols} " - f"and found {num_cols}") - - if (usecols_not_callable_and_exists and - all(isinstance(u, int) for u in self.usecols)): - missing_usecols = [col for col in self.usecols if col >= num_cols] - if missing_usecols: - raise ParserError( - "Defining usecols with out-of-bounds indices is not allowed. " - f"{missing_usecols} are out of bounds.", - ) + self._validate_usecols_and_names(num_cols) results = {} nused = 0 @@ -995,22 +986,7 @@ cdef class TextReader: nused += 1 conv = self._get_converter(i, name) - - col_dtype = None - if self.dtype is not None: - if isinstance(self.dtype, dict): - if name in self.dtype: - col_dtype = self.dtype[name] - elif i in self.dtype: - col_dtype = self.dtype[i] - elif is_default_dict_dtype: - col_dtype = self.dtype[name] - else: - if self.dtype.names: - # structured array - col_dtype = np.dtype(self.dtype.descr[i][1]) - else: - col_dtype = self.dtype + col_dtype = self._get_col_dtype(i, is_default_dict_dtype, name) if conv: if col_dtype is not None: @@ -1021,18 +997,15 @@ cdef class TextReader: results[i] = _apply_converter(conv, self.parser, i, start, end) continue - # Collect the list of NaN values associated with the column. + # Collect the set of NaN values associated with the column. # If we aren't supposed to do that, or none are collected, # we set `na_filter` to `0` (`1` otherwise). 
- na_flist = set() + na_fset = set() if self.na_filter: - na_list, na_flist = self._get_na_list(i, name) - if na_list is None: - na_filter = 0 - else: - na_filter = 1 - na_hashset = kset_from_list(na_list) + na_list, na_fset = self._get_na_list(i, name) + na_filter = 1 + na_hashset = kset_from_list(na_list) else: na_filter = 0 @@ -1041,7 +1014,7 @@ cdef class TextReader: try: col_res, na_count = self._convert_tokens( i, start, end, name, na_filter, na_hashset, - na_flist, col_dtype) + na_fset, col_dtype) finally: # gh-21353 # @@ -1075,12 +1048,12 @@ cdef class TextReader: cdef _convert_tokens(self, Py_ssize_t i, int64_t start, int64_t end, object name, bint na_filter, kh_str_starts_t *na_hashset, - object na_flist, object col_dtype): + set na_fset, object col_dtype): if col_dtype is not None: col_res, na_count = self._convert_with_dtype( col_dtype, i, start, end, na_filter, - 1, na_hashset, na_flist) + 1, na_hashset, na_fset) # Fallback on the parse (e.g. we requested int dtype, # but its actually a float). @@ -1094,7 +1067,7 @@ cdef class TextReader: for dt in self.dtype_cast_order: try: col_res, na_count = self._convert_with_dtype( - dt, i, start, end, na_filter, 0, na_hashset, na_flist) + dt, i, start, end, na_filter, 0, na_hashset, na_fset) except ValueError: # This error is raised from trying to convert to uint64, # and we discover that we cannot convert to any numerical @@ -1102,11 +1075,11 @@ cdef class TextReader: # column AS IS with object dtype. col_res, na_count = self._convert_with_dtype( np.dtype("object"), i, start, end, 0, - 0, na_hashset, na_flist) + 0, na_hashset, na_fset) except OverflowError: col_res, na_count = self._convert_with_dtype( np.dtype("object"), i, start, end, na_filter, - 0, na_hashset, na_flist) + 0, na_hashset, na_fset) if col_res is not None: break @@ -1154,7 +1127,7 @@ cdef class TextReader: bint na_filter, bint user_dtype, kh_str_starts_t *na_hashset, - object na_flist): + set na_fset): if isinstance(dtype, CategoricalDtype): # TODO: I suspect that _categorical_convert could be # optimized when dtype is an instance of CategoricalDtype @@ -1212,7 +1185,7 @@ cdef class TextReader: elif dtype.kind == "f": result, na_count = _try_double(self.parser, i, start, end, - na_filter, na_hashset, na_flist) + na_filter, na_hashset, na_fset) if result is not None and dtype != "float64": result = result.astype(dtype) @@ -1261,6 +1234,47 @@ cdef class TextReader: return _string_box_utf8(self.parser, i, start, end, na_filter, na_hashset, self.encoding_errors) + cdef void _validate_usecols_and_names(self, int num_cols): + usecols_not_callable_and_exists = not callable(self.usecols) and self.usecols + names_larger_num_cols = (self.names and + len(self.names) - self.leading_cols > num_cols) + + if self.table_width - self.leading_cols > num_cols: + if (usecols_not_callable_and_exists + and self.table_width - self.leading_cols < len(self.usecols) + or names_larger_num_cols): + raise ParserError(f"Too many columns specified: expected " + f"{self.table_width - self.leading_cols} " + f"and found {num_cols}") + + if (usecols_not_callable_and_exists and + all(isinstance(u, int) for u in self.usecols)): + missing_usecols = [col for col in self.usecols if col >= num_cols] + if missing_usecols: + raise ParserError( + "Defining usecols with out-of-bounds indices is not allowed. 
" + f"{missing_usecols} are out of bounds.", + ) + + # -> DtypeObj + cdef object _get_col_dtype(self, int64_t i, bint is_default_dict_dtype, name): + col_dtype = None + if self.dtype is not None: + if isinstance(self.dtype, dict): + if name in self.dtype: + col_dtype = self.dtype[name] + elif i in self.dtype: + col_dtype = self.dtype[i] + elif is_default_dict_dtype: + col_dtype = self.dtype[name] + else: + if self.dtype.names: + # structured array + col_dtype = np.dtype(self.dtype.descr[i][1]) + else: + col_dtype = self.dtype + return col_dtype + def _get_converter(self, i: int, name): if self.converters is None: return None @@ -1272,10 +1286,6 @@ cdef class TextReader: return self.converters.get(i) cdef _get_na_list(self, Py_ssize_t i, name): - # Note: updates self.na_values, self.na_fvalues - if self.na_values is None: - return None, set() - if isinstance(self.na_values, dict): key = None values = None @@ -1300,11 +1310,6 @@ cdef class TextReader: return _ensure_encoded(values), fvalues else: - if not isinstance(self.na_values, list): - self.na_values = list(self.na_values) - if not isinstance(self.na_fvalues, set): - self.na_fvalues = set(self.na_fvalues) - return _ensure_encoded(self.na_values), self.na_fvalues cdef _free_na_set(self, kh_str_starts_t *table): @@ -1350,8 +1355,8 @@ cdef _close(TextReader reader): cdef: - object _true_values = [b"True", b"TRUE", b"true"] - object _false_values = [b"False", b"FALSE", b"false"] + list _true_values = [b"True", b"TRUE", b"true"] + list _false_values = [b"False", b"FALSE", b"false"] def _ensure_encoded(list lst): @@ -1622,27 +1627,27 @@ cdef: # -> tuple[ndarray[float64_t], int] | tuple[None, None] cdef _try_double(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, - bint na_filter, kh_str_starts_t *na_hashset, object na_flist): + bint na_filter, kh_str_starts_t *na_hashset, set na_fset): cdef: int error, na_count = 0 Py_ssize_t lines float64_t *data float64_t NA = na_values[np.float64] - kh_float64_t *na_fset + kh_float64_t *na_fhashset ndarray[float64_t] result - bint use_na_flist = len(na_flist) > 0 + bint use_na_flist = len(na_fset) > 0 lines = line_end - line_start result = np.empty(lines, dtype=np.float64) data = result.data - na_fset = kset_float64_from_list(na_flist) + na_fhashset = kset_float64_from_set(na_fset) with nogil: error = _try_double_nogil(parser, parser.double_converter, col, line_start, line_end, na_filter, na_hashset, use_na_flist, - na_fset, NA, data, &na_count) + na_fhashset, NA, data, &na_count) - kh_destroy_float64(na_fset) + kh_destroy_float64(na_fhashset) if error != 0: return None, None return result, na_count @@ -1655,7 +1660,7 @@ cdef int _try_double_nogil(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_starts_t *na_hashset, bint use_na_flist, - const kh_float64_t *na_flist, + const kh_float64_t *na_fhashset, float64_t NA, float64_t *data, int *na_count) nogil: cdef: @@ -1694,8 +1699,8 @@ cdef int _try_double_nogil(parser_t *parser, else: return 1 if use_na_flist: - k64 = kh_get_float64(na_flist, data[0]) - if k64 != na_flist.n_buckets: + k64 = kh_get_float64(na_fhashset, data[0]) + if k64 != na_fhashset.n_buckets: na_count[0] += 1 data[0] = NA data += 1 @@ -1977,7 +1982,7 @@ cdef kh_str_starts_t* kset_from_list(list values) except NULL: return table -cdef kh_float64_t* kset_float64_from_list(values) except NULL: +cdef kh_float64_t* kset_float64_from_set(set values) except NULL: # caller takes responsibility for freeing the hash table cdef: 
kh_float64_t *table diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 45552108f8c15..2a080bcb19ae9 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -5,6 +5,7 @@ import numpy as np cimport numpy as cnp from libc.math cimport log10 from numpy cimport ( + PyDatetimeScalarObject, float64_t, int32_t, int64_t, @@ -358,6 +359,7 @@ cdef _TSObject convert_to_tsobject(object ts, tzinfo tz, str unit, cdef: _TSObject obj NPY_DATETIMEUNIT reso + int64_t num obj = _TSObject() @@ -367,6 +369,13 @@ cdef _TSObject convert_to_tsobject(object ts, tzinfo tz, str unit, if checknull_with_nat_and_na(ts): obj.value = NPY_NAT elif cnp.is_datetime64_object(ts): + num = (ts).obmeta.num + if num != 1: + raise ValueError( + # GH#25611 + "np.datetime64 objects with units containing a multiplier are " + "not supported" + ) reso = get_supported_reso(get_datetime64_unit(ts)) obj.creso = reso obj.value = get_datetime64_nanos(ts, reso) diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index 22f3bdbe668de..5cb9f891b312a 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -15,7 +15,7 @@ from pandas._typing import ( INVALID_FREQ_ERR_MSG: str DIFFERENT_FREQ: str -class IncompatibleFrequency(ValueError): ... +class IncompatibleFrequency(TypeError): ... def periodarr_to_dt64arr( periodarr: npt.NDArray[np.int64], # const int64_t[:] diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 350216cf89ce4..df5c17745b8a4 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1625,7 +1625,11 @@ DIFFERENT_FREQ = ("Input has different freq={other_freq} " "from {cls}(freq={own_freq})") -class IncompatibleFrequency(ValueError): +class IncompatibleFrequency(TypeError): + """ + Raised when trying to compare or operate between Periods with different + frequencies. + """ pass diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 222a6070016e0..6c76e05471577 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -2006,6 +2006,20 @@ class Timedelta(_Timedelta): "milliseconds", "microseconds", "nanoseconds"} def __new__(cls, object value=_no_input, unit=None, **kwargs): + unsupported_kwargs = set(kwargs) + unsupported_kwargs.difference_update(cls._req_any_kwargs_new) + if unsupported_kwargs or ( + value is _no_input and + not cls._req_any_kwargs_new.intersection(kwargs) + ): + raise ValueError( + # GH#53801 + "cannot construct a Timedelta from the passed arguments, " + "allowed keywords are " + "[weeks, days, hours, minutes, seconds, " + "milliseconds, microseconds, nanoseconds]" + ) + if value is _no_input: if not len(kwargs): raise ValueError("cannot construct a Timedelta without a " @@ -2014,16 +2028,6 @@ class Timedelta(_Timedelta): kwargs = {key: _to_py_int_float(kwargs[key]) for key in kwargs} - unsupported_kwargs = set(kwargs) - unsupported_kwargs.difference_update(cls._req_any_kwargs_new) - if unsupported_kwargs or not cls._req_any_kwargs_new.intersection(kwargs): - raise ValueError( - "cannot construct a Timedelta from the passed arguments, " - "allowed keywords are " - "[weeks, days, hours, minutes, seconds, " - "milliseconds, microseconds, nanoseconds]" - ) - # GH43764, convert any input to nanoseconds first and then # create the timedelta. 
This ensures that any potential # nanosecond contributions from kwargs parsed as floats diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index ec9b5098c97c9..fc447aaba37db 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -18,7 +18,7 @@ set_locale, ) -from pandas.compat import pa_version_under10p1 +from pandas.compat import HAS_PYARROW import pandas as pd from pandas import ( @@ -183,7 +183,7 @@ ] ] -if not pa_version_under10p1: +if HAS_PYARROW: import pyarrow as pa UNSIGNED_INT_PYARROW_DTYPES = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()] diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 9f3bfdc205498..d5dbcb74d29e4 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -26,8 +26,7 @@ from pandas.compat.numpy import is_numpy_dev from pandas.compat.pyarrow import ( HAS_PYARROW, - pa_version_under10p1, - pa_version_under11p0, + pa_version_under12p1, pa_version_under13p0, pa_version_under14p0, pa_version_under14p1, @@ -36,6 +35,7 @@ pa_version_under18p0, pa_version_under19p0, pa_version_under20p0, + pa_version_under21p0, ) if TYPE_CHECKING: @@ -160,8 +160,7 @@ def is_ci_environment() -> bool: "PYPY", "WASM", "is_numpy_dev", - "pa_version_under10p1", - "pa_version_under11p0", + "pa_version_under12p1", "pa_version_under13p0", "pa_version_under14p0", "pa_version_under14p1", @@ -170,4 +169,5 @@ def is_ci_environment() -> bool: "pa_version_under18p0", "pa_version_under19p0", "pa_version_under20p0", + "pa_version_under21p0", ] diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 07a07ba4ab60c..7e882bc242394 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -38,7 +38,7 @@ "openpyxl": "3.1.2", "psycopg2": "2.9.6", # (dt dec pq3 ext lo64) "pymysql": "1.1.0", - "pyarrow": "10.0.1", + "pyarrow": "12.0.1", "pyiceberg": "0.7.1", "pyreadstat": "1.2.6", "pytest": "7.3.2", @@ -54,7 +54,6 @@ "xlrd": "2.0.1", "xlsxwriter": "3.2.0", "zstandard": "0.22.0", - "tzdata": "2022.7", "qtpy": "2.3.0", "pyqt5": "5.15.9", } diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index e95b44c879940..f9368f7d119d0 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -9,19 +9,15 @@ # numpy versioning _np_version = np.__version__ _nlv = Version(_np_version) -np_version_gte1p24 = _nlv >= Version("1.24") -np_version_gte1p24p3 = _nlv >= Version("1.24.3") -np_version_gte1p25 = _nlv >= Version("1.25") np_version_gt2 = _nlv >= Version("2.0.0") is_numpy_dev = _nlv.dev is not None -_min_numpy_ver = "1.23.5" +_min_numpy_ver = "1.26.0" if _nlv < Version(_min_numpy_ver): raise ImportError( - f"this version of pandas is incompatible with numpy < {_min_numpy_ver}\n" - f"your numpy version is {_np_version}.\n" - f"Please upgrade numpy to >= {_min_numpy_ver} to use this pandas version" + f"Please upgrade numpy to >= {_min_numpy_ver} to use this pandas version.\n" + f"Your numpy version is {_np_version}." 
) @@ -49,5 +45,4 @@ __all__ = [ "_np_version", "is_numpy_dev", - "np", ] diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index beaaa3f8ed3cc..beb4a69232b27 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -36,7 +36,7 @@ "pandas._libs.internals", "_unpickle_block", ), - # Avoid Cython's warning "contradiction to to Python 'class private name' rules" + # Avoid Cython's warning "contradiction to Python 'class private name' rules" ("pandas._libs.tslibs.nattype", "__nat_unpickle"): ( "pandas._libs.tslibs.nattype", "_nat_unpickle", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 163934bee509c..1e1989b276eb6 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -8,9 +8,7 @@ import pyarrow as pa _palv = Version(Version(pa.__version__).base_version) - pa_version_under10p1 = _palv < Version("10.0.1") - pa_version_under11p0 = _palv < Version("11.0.0") - pa_version_under12p0 = _palv < Version("12.0.0") + pa_version_under12p1 = _palv < Version("12.0.1") pa_version_under13p0 = _palv < Version("13.0.0") pa_version_under14p0 = _palv < Version("14.0.0") pa_version_under14p1 = _palv < Version("14.0.1") @@ -20,11 +18,10 @@ pa_version_under18p0 = _palv < Version("18.0.0") pa_version_under19p0 = _palv < Version("19.0.0") pa_version_under20p0 = _palv < Version("20.0.0") - HAS_PYARROW = True + pa_version_under21p0 = _palv < Version("21.0.0") + HAS_PYARROW = _palv >= Version("12.0.1") except ImportError: - pa_version_under10p1 = True - pa_version_under11p0 = True - pa_version_under12p0 = True + pa_version_under12p1 = True pa_version_under13p0 = True pa_version_under14p0 = True pa_version_under14p1 = True @@ -34,4 +31,5 @@ pa_version_under18p0 = True pa_version_under19p0 = True pa_version_under20p0 = True + pa_version_under21p0 = True HAS_PYARROW = False diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 1ca52ce64bd77..07cbf489cfe1c 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -12,13 +12,12 @@ from pandas._libs import lib from pandas.compat import ( - pa_version_under10p1, - pa_version_under11p0, + HAS_PYARROW, pa_version_under13p0, pa_version_under17p0, ) -if not pa_version_under10p1: +if HAS_PYARROW: import pyarrow as pa import pyarrow.compute as pc @@ -132,7 +131,7 @@ def _str_get(self, i: int) -> Self: def _str_slice( self, start: int | None = None, stop: int | None = None, step: int | None = None ) -> Self: - if pa_version_under11p0: + if pa_version_under13p0: # GH#59724 result = self._apply_elementwise(lambda val: val[start:stop:step]) return type(self)(pa.chunked_array(result, type=self._pa_array.type)) diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index b220a94d032b5..7f3da9be0c03d 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -11,14 +11,11 @@ cast, ) -from pandas.compat import ( - pa_version_under10p1, - pa_version_under11p0, -) +from pandas.compat import HAS_PYARROW from pandas.core.dtypes.common import is_list_like -if not pa_version_under10p1: +if HAS_PYARROW: import pyarrow as pa import pyarrow.compute as pc @@ -46,7 +43,7 @@ def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: def _validate(self, data) -> None: dtype = data.dtype - if pa_version_under10p1 or not isinstance(dtype, ArrowDtype): + if not HAS_PYARROW or not isinstance(dtype, ArrowDtype): # Raise AttributeError so that 
inspect can handle non-struct Series. raise AttributeError(self._validation_msg.format(dtype=dtype)) @@ -171,11 +168,6 @@ def __getitem__(self, key: int | slice) -> Series: name=self._data.name, ) elif isinstance(key, slice): - if pa_version_under11p0: - raise NotImplementedError( - f"List slice not supported by pyarrow {pa.__version__}." - ) - # TODO: Support negative start/stop/step, ideally this would be added # upstream in pyarrow. start, stop, step = key.start, key.stop, key.step diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index c18f06c3a126d..919453b29b7f9 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -22,8 +22,8 @@ timezones, ) from pandas.compat import ( - pa_version_under10p1, - pa_version_under11p0, + HAS_PYARROW, + pa_version_under12p1, pa_version_under13p0, ) from pandas.util._decorators import doc @@ -63,6 +63,7 @@ from pandas.core.arrays.masked import BaseMaskedArray from pandas.core.arrays.string_ import StringDtype import pandas.core.common as com +from pandas.core.construction import extract_array from pandas.core.indexers import ( check_array_indexer, unpack_tuple_and_ellipses, @@ -74,7 +75,7 @@ from pandas.io._util import _arrow_dtype_mapping from pandas.tseries.frequencies import to_offset -if not pa_version_under10p1: +if HAS_PYARROW: import pyarrow as pa import pyarrow.compute as pc @@ -208,16 +209,6 @@ def floordiv_compat( from pandas.core.arrays.timedeltas import TimedeltaArray -def get_unit_from_pa_dtype(pa_dtype) -> str: - # https://github.com/pandas-dev/pandas/pull/50998#discussion_r1100344804 - if pa_version_under11p0: - unit = str(pa_dtype).split("[", 1)[-1][:-1] - if unit not in ["s", "ms", "us", "ns"]: - raise ValueError(pa_dtype) - return unit - return pa_dtype.unit - - def to_pyarrow_type( dtype: ArrowDtype | pa.DataType | Dtype | None, ) -> pa.DataType | None: @@ -300,7 +291,7 @@ class ArrowExtensionArray( _dtype: ArrowDtype def __init__(self, values: pa.Array | pa.ChunkedArray) -> None: - if pa_version_under10p1: + if pa_version_under12p1: msg = "pyarrow>=10.0.1 is required for PyArrow backed ArrowExtensionArray." raise ImportError(msg) if isinstance(values, pa.Array): @@ -510,6 +501,33 @@ def _box_pa_array( value = to_timedelta(value, unit=pa_type.unit).as_unit(pa_type.unit) value = value.to_numpy() + if pa_type is not None and pa.types.is_timestamp(pa_type): + # Use DatetimeArray to exclude Decimal(NaN) (GH#61774) and + # ensure constructor treats tznaive the same as non-pyarrow + # dtypes (GH#61775) + from pandas.core.arrays.datetimes import ( + DatetimeArray, + tz_to_dtype, + ) + + pass_dtype = tz_to_dtype(tz=pa_type.tz, unit=pa_type.unit) + value = extract_array(value, extract_numpy=True) + if isinstance(value, DatetimeArray): + dta = value + else: + dta = DatetimeArray._from_sequence( + value, copy=copy, dtype=pass_dtype + ) + dta_mask = dta.isna() + value_i8 = cast("npt.NDArray", dta.view("i8")) + if not value_i8.flags["WRITEABLE"]: + # e.g. 
test_setitem_frame_2d_values + value_i8 = value_i8.copy() + dta = DatetimeArray._from_sequence(value_i8, dtype=dta.dtype) + value_i8[dta_mask] = 0 # GH#61776 avoid __sub__ overflow + pa_array = pa.array(dta._ndarray, type=pa_type, mask=dta_mask) + return pa_array + try: pa_array = pa.array(value, type=pa_type, from_pandas=True) except (pa.ArrowInvalid, pa.ArrowTypeError): @@ -1199,10 +1217,6 @@ def factorize( null_encoding = "mask" if use_na_sentinel else "encode" data = self._pa_array - pa_type = data.type - if pa_version_under11p0 and pa.types.is_duration(pa_type): - # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 - data = data.cast(pa.int64()) if pa.types.is_dictionary(data.type): if null_encoding == "encode": @@ -1227,8 +1241,6 @@ def factorize( ) uniques = type(self)(combined.dictionary) - if pa_version_under11p0 and pa.types.is_duration(pa_type): - uniques = cast(ArrowExtensionArray, uniques.astype(self.dtype)) return indices, uniques def reshape(self, *args, **kwargs): @@ -1515,19 +1527,7 @@ def unique(self) -> Self: ------- ArrowExtensionArray """ - pa_type = self._pa_array.type - - if pa_version_under11p0 and pa.types.is_duration(pa_type): - # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 - data = self._pa_array.cast(pa.int64()) - else: - data = self._pa_array - - pa_result = pc.unique(data) - - if pa_version_under11p0 and pa.types.is_duration(pa_type): - pa_result = pa_result.cast(pa_type) - + pa_result = pc.unique(self._pa_array) return type(self)(pa_result) def value_counts(self, dropna: bool = True) -> Series: @@ -1547,18 +1547,12 @@ def value_counts(self, dropna: bool = True) -> Series: -------- Series.value_counts """ - pa_type = self._pa_array.type - if pa_version_under11p0 and pa.types.is_duration(pa_type): - # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 - data = self._pa_array.cast(pa.int64()) - else: - data = self._pa_array - from pandas import ( Index, Series, ) + data = self._pa_array vc = data.value_counts() values = vc.field(0) @@ -1568,9 +1562,6 @@ def value_counts(self, dropna: bool = True) -> Series: values = values.filter(mask) counts = counts.filter(mask) - if pa_version_under11p0 and pa.types.is_duration(pa_type): - values = values.cast(pa_type) - counts = ArrowExtensionArray(counts) index = Index(type(self)(values)) @@ -1864,8 +1855,7 @@ def pyarrow_meth(data, skip_nulls, min_count=0): # type: ignore[misc] if pa.types.is_duration(pa_type): result = result.cast(pa_type) elif pa.types.is_time(pa_type): - unit = get_unit_from_pa_dtype(pa_type) - result = result.cast(pa.duration(unit)) + result = result.cast(pa.duration(pa_type.unit)) elif pa.types.is_date(pa_type): # go with closest available unit, i.e. "s" result = result.cast(pa.duration("s")) @@ -1946,8 +1936,10 @@ def _explode(self): fill_value = pa.scalar([None], type=self._pa_array.type) mask = counts == 0 if mask.any(): - values = values.copy() - values[mask] = fill_value + # pc.if_else here is similar to `values[mask] = fill_value` + # but this avoids an object-dtype round-trip. 
+ pa_values = pc.if_else(~mask, values._pa_array, fill_value) + values = type(self)(pa_values) counts = counts.copy() counts[mask] = 1 values = values.fillna(fill_value) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 3d2ad109a55ba..4595bc16ef336 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -794,28 +794,28 @@ def categories(self) -> Index: >>> ser = pd.Series(["a", "b", "c", "a"], dtype="category") >>> ser.cat.categories - Index(['a', 'b', 'c'], dtype='object') + Index(['a', 'b', 'c'], dtype='str') >>> raw_cat = pd.Categorical(["a", "b", "c", "a"], categories=["b", "c", "d"]) >>> ser = pd.Series(raw_cat) >>> ser.cat.categories - Index(['b', 'c', 'd'], dtype='object') + Index(['b', 'c', 'd'], dtype='str') For :class:`pandas.Categorical`: >>> cat = pd.Categorical(["a", "b"], ordered=True) >>> cat.categories - Index(['a', 'b'], dtype='object') + Index(['a', 'b'], dtype='str') For :class:`pandas.CategoricalIndex`: >>> ci = pd.CategoricalIndex(["a", "c", "b", "a", "c", "b"]) >>> ci.categories - Index(['a', 'b', 'c'], dtype='object') + Index(['a', 'b', 'c'], dtype='str') >>> ci = pd.CategoricalIndex(["a", "c"], categories=["c", "b", "a"]) >>> ci.categories - Index(['c', 'b', 'a'], dtype='object') + Index(['c', 'b', 'a'], dtype='str') """ return self.dtype.categories diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 9a723a88941b6..50fecc96f8186 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -544,7 +544,7 @@ def _validate_comparison_value(self, other): other = self._scalar_type(other) try: self._check_compatible_with(other) - except (TypeError, IncompatibleFrequency) as err: + except TypeError as err: # e.g. tzawareness mismatch raise InvalidComparison(other) from err @@ -558,7 +558,7 @@ def _validate_comparison_value(self, other): try: other = self._validate_listlike(other, allow_object=True) self._check_compatible_with(other) - except (TypeError, IncompatibleFrequency) as err: + except TypeError as err: if is_object_dtype(getattr(other, "dtype", None)): # We will have to operate element-wise pass @@ -1486,7 +1486,8 @@ def __rsub__(self, other): # GH#19959 datetime - datetime is well-defined as timedelta, # but any other type - datetime is not well-defined. raise TypeError( - f"cannot subtract {type(self).__name__} from {type(other).__name__}" + f"cannot subtract {type(self).__name__} from " + f"{type(other).__name__}[{other.dtype}]" ) elif isinstance(self.dtype, PeriodDtype) and lib.is_np_dtype(other_dtype, "m"): # TODO: Can we simplify/generalize these cases at all? @@ -1495,8 +1496,14 @@ def __rsub__(self, other): self = cast("TimedeltaArray", self) return (-self) + other + flipped = self - other + if flipped.dtype.kind == "M": + # GH#59571 give a more helpful exception message + raise TypeError( + f"cannot subtract {type(self).__name__} from {type(other).__name__}" + ) # We get here with e.g. 
datetime objects - return -(self - other) + return -flipped def __iadd__(self, other) -> Self: result = self + other diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index e7a6b207363c3..fefd70fef35c9 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -59,6 +59,7 @@ masked_reductions, ) from pandas.core.array_algos.quantile import quantile_with_mask +from pandas.core.array_algos.transforms import shift from pandas.core.arraylike import OpsMixin from pandas.core.arrays._utils import to_numpy_dtype_inference from pandas.core.arrays.base import ExtensionArray @@ -361,6 +362,17 @@ def ravel(self, *args, **kwargs) -> Self: mask = self._mask.ravel(*args, **kwargs) return type(self)(data, mask) + def shift(self, periods: int = 1, fill_value=None) -> Self: + # NB: shift is always along axis=0 + axis = 0 + if fill_value is None: + new_data = shift(self._data, periods, axis, 0) + new_mask = shift(self._mask, periods, axis, True) + else: + new_data = shift(self._data, periods, axis, fill_value) + new_mask = shift(self._mask, periods, axis, False) + return type(self)(new_data, new_mask) + @property def T(self) -> Self: return self._simple_new(self._data.T, self._mask.T) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 8048306df91a2..f52b709a59de9 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -25,7 +25,7 @@ from pandas._libs.lib import ensure_string_array from pandas.compat import ( HAS_PYARROW, - pa_version_under10p1, + pa_version_under12p1, ) from pandas.compat.numpy import function as nv from pandas.util._decorators import ( @@ -72,6 +72,8 @@ from pandas.io.formats import printing if TYPE_CHECKING: + from collections.abc import MutableMapping + import pyarrow from pandas._typing import ( @@ -182,9 +184,9 @@ def __init__( raise ValueError( f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." ) - if storage == "pyarrow" and pa_version_under10p1: + if storage == "pyarrow" and pa_version_under12p1: raise ImportError( - "pyarrow>=10.0.1 is required for PyArrow backed StringArray." + "pyarrow>=12.0.1 is required for PyArrow backed StringArray." ) if isinstance(na_value, float) and np.isnan(na_value): @@ -218,6 +220,11 @@ def __eq__(self, other: object) -> bool: return self.storage == other.storage and self.na_value is other.na_value return False + def __setstate__(self, state: MutableMapping[str, Any]) -> None: + # back-compat for pandas < 2.3, where na_value did not yet exist + self.storage = state.pop("storage", "python") + self._na_value = state.pop("_na_value", libmissing.NA) + def __hash__(self) -> int: # need to override __hash__ as well because of overriding __eq__ return super().__hash__() diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 7264efa3298d9..2ca12870709f0 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -14,7 +14,8 @@ missing as libmissing, ) from pandas.compat import ( - pa_version_under10p1, + HAS_PYARROW, + pa_version_under12p1, pa_version_under13p0, pa_version_under16p0, ) @@ -38,7 +39,7 @@ ) from pandas.core.strings.object_array import ObjectStringArrayMixin -if not pa_version_under10p1: +if HAS_PYARROW: import pyarrow as pa import pyarrow.compute as pc @@ -63,8 +64,8 @@ def _chk_pyarrow_available() -> None: - if pa_version_under10p1: - msg = "pyarrow>=10.0.1 is required for PyArrow backed ArrowExtensionArray." 
+ if pa_version_under12p1: + msg = "pyarrow>=12.0.1 is required for PyArrow backed ArrowExtensionArray." raise ImportError(msg) diff --git a/pandas/core/common.py b/pandas/core/common.py index 75f8a56aac5db..f4e971c4b4bd4 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -29,12 +29,10 @@ cast, overload, ) -import warnings import numpy as np from pandas._libs import lib -from pandas.compat.numpy import np_version_gte1p24 from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( @@ -243,12 +241,7 @@ def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLi return construct_1d_object_array_from_listlike(values) try: - with warnings.catch_warnings(): - # Can remove warning filter once NumPy 1.24 is min version - if not np_version_gte1p24: - # np.VisibleDeprecationWarning only in np.exceptions in 2.0 - warnings.simplefilter("ignore", np.VisibleDeprecationWarning) # type: ignore[attr-defined] - result = np.asarray(values, dtype=dtype) + result = np.asarray(values, dtype=dtype) except ValueError: # Using try/except since it's more performant than checking is_list_like # over each element diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 20fe8cbab1c9f..bf7e8fb02b58e 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -880,7 +880,7 @@ def register_converter_cb(key: str) -> None: with cf.config_prefix("future"): cf.register_option( "infer_string", - True if os.environ.get("PANDAS_FUTURE_INFER_STRING", "0") == "1" else False, + False if os.environ.get("PANDAS_FUTURE_INFER_STRING", "1") == "0" else True, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 47e52f36ad121..20fe9b92b4677 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1086,10 +1086,7 @@ def convert_dtypes( elif ( infer_objects and input_array.dtype == object - and ( - isinstance(inferred_dtype, str) - and inferred_dtype == "mixed-integer-float" - ) + and (isinstance(inferred_dtype, str) and inferred_dtype == "floating") ): inferred_dtype = pandas_dtype_func("Float64") diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 570074e047da6..912421dff1026 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -46,7 +46,10 @@ abbrev_to_npy_unit, ) from pandas._libs.tslibs.offsets import BDay -from pandas.compat import pa_version_under10p1 +from pandas.compat import ( + HAS_PYARROW, + pa_version_under12p1, +) from pandas.errors import PerformanceWarning from pandas.util._decorators import set_module from pandas.util._exceptions import find_stack_level @@ -66,7 +69,7 @@ is_list_like, ) -if not pa_version_under10p1: +if HAS_PYARROW: import pyarrow as pa if TYPE_CHECKING: @@ -644,7 +647,7 @@ def categories(self) -> Index: -------- >>> cat_type = pd.CategoricalDtype(categories=["a", "b"], ordered=True) >>> cat_type.categories - Index(['a', 'b'], dtype='object') + Index(['a', 'b'], dtype='str') """ return self._categories @@ -2193,8 +2196,8 @@ class ArrowDtype(StorageExtensionDtype): def __init__(self, pyarrow_dtype: pa.DataType) -> None: super().__init__("pyarrow") - if pa_version_under10p1: - raise ImportError("pyarrow>=10.0.1 is required for ArrowDtype") + if pa_version_under12p1: + raise ImportError("pyarrow>=12.0.1 is required for 
ArrowDtype") if not isinstance(pyarrow_dtype, pa.DataType): raise ValueError( f"pyarrow_dtype ({pyarrow_dtype}) must be an instance " @@ -2346,7 +2349,7 @@ def construct_from_string(cls, string: str) -> ArrowDtype: if string in ("string[pyarrow]", "str[pyarrow]"): # Ensure Registry.find skips ArrowDtype to use StringDtype instead raise TypeError("string[pyarrow] should be constructed by StringDtype") - if pa_version_under10p1: + if pa_version_under12p1: raise ImportError("pyarrow>=10.0.1 is required for ArrowDtype") base_type = string[:-9] # get rid of "[pyarrow]" diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 71fe0f6e4feb0..408c2858aa876 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -158,9 +158,9 @@ def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: >>> df = pd.DataFrame([["ant", "bee", "cat"], ["dog", None, "fly"]]) >>> df - 0 1 2 - 0 ant bee cat - 1 dog None fly + 0 1 2 + 0 ant bee cat + 1 dog NaN fly >>> pd.isna(df) 0 1 2 0 False False False @@ -373,9 +373,9 @@ def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: >>> df = pd.DataFrame([["ant", "bee", "cat"], ["dog", None, "fly"]]) >>> df - 0 1 2 - 0 ant bee cat - 1 dog None fly + 0 1 2 + 0 ant bee cat + 1 dog NaN fly >>> pd.notna(df) 0 1 2 0 True True True diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8053c17437c5e..48a5596e00061 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1015,8 +1015,7 @@ def axes(self) -> list[Index]: -------- >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) >>> df.axes - [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'], - dtype='object')] + [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'], dtype='str')] """ return [self.index, self.columns] @@ -7235,7 +7234,7 @@ def sort_values( indexer = lexsort_indexer( keys_data, orders=ascending, na_position=na_position, key=key ) - elif len(by): + elif by: # len(by) == 1 k = self._get_label_or_level_values(by[0], axis=axis) @@ -14070,7 +14069,7 @@ def values(self) -> np.ndarray: ... columns=("name", "max_speed", "rank"), ... ) >>> df2.dtypes - name object + name str max_speed float64 rank object dtype: object diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 66188d9e91232..6424589843d76 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -236,7 +236,6 @@ class NDFrame(PandasObject, indexing.IndexingMixin): _internal_names: list[str] = [ "_mgr", - "_item_cache", "_cache", "_name", "_metadata", @@ -4570,6 +4569,8 @@ def drop( axis_name = self._get_axis_name(axis) axes = {axis_name: labels} elif index is not None or columns is not None: + if axis == 1: + raise ValueError("Cannot specify both 'axis' and 'index'/'columns'") axes = {"index": index} if self.ndim == 2: axes["columns"] = columns @@ -5815,6 +5816,8 @@ def sample( If weights do not sum to 1, they will be normalized to sum to 1. Missing values in the weights column will be treated as zero. Infinite values not allowed. + When replace = False will not allow ``(n * max(weights) / sum(weights)) > 1`` + in order to avoid biased results. See the Notes below for more details. random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional If int, array-like, or BitGenerator, seed for random number generator. If np.random.RandomState or np.random.Generator, use as given. @@ -5851,6 +5854,11 @@ def sample( ----- If `frac` > 1, `replacement` should be set to `True`. 
+ When replace = False will not allow ``(n * max(weights) / sum(weights)) > 1``, + since that would cause results to be biased. E.g. sampling 2 items without replacement + with weights [100, 1, 1] would yield two last items in 1/2 of cases, instead of 1/102. + This is similar to specifying `n=4` without replacement on a Series with 3 elements. + Examples -------- >>> df = pd.DataFrame( diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f29423ce5e77c..74497ca723edb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4628,13 +4628,13 @@ def ngroup(self, ascending: bool = True): -------- >>> df = pd.DataFrame({"color": ["red", None, "red", "blue", "blue", "red"]}) >>> df - color - 0 red - 1 None - 2 red - 3 blue - 4 blue - 5 red + color + 0 red + 1 NaN + 2 red + 3 blue + 4 blue + 5 red >>> df.groupby("color").ngroup() 0 1.0 1 NaN diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2deaaae85e56b..3efd601545212 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -38,7 +38,6 @@ no_default, ) from pandas._libs.tslibs import ( - IncompatibleFrequency, OutOfBoundsDatetime, Timestamp, tz_compare, @@ -368,7 +367,7 @@ class Index(IndexOpsMixin, PandasObject): Index([1, 2, 3], dtype='int64') >>> pd.Index(list("abc")) - Index(['a', 'b', 'c'], dtype='object') + Index(['a', 'b', 'c'], dtype='str') >>> pd.Index([1, 2, 3], dtype="uint8") Index([1, 2, 3], dtype='uint8') @@ -965,12 +964,8 @@ def __array_wrap__(self, result, context=None, return_scalar=False): Gets called after a ufunc and other functions e.g. np.split. """ result = lib.item_from_zerodim(result) - if (not isinstance(result, Index) and is_bool_dtype(result.dtype)) or np.ndim( - result - ) > 1: - # exclude Index to avoid warning from is_bool_dtype deprecation; - # in the Index case it doesn't matter which path we go down. - # reached in plotting tests with e.g. np.nonzero(index) + if np.ndim(result) > 1: + # Reached in plotting tests with e.g. np.nonzero(index) return result return Index(result, name=self.name) @@ -1420,7 +1415,7 @@ def _format_data(self, name=None) -> str_t: is_justify = False elif isinstance(self.dtype, CategoricalDtype): self = cast("CategoricalIndex", self) - if is_object_dtype(self.categories.dtype): + if is_string_dtype(self.categories.dtype): is_justify = False elif isinstance(self, ABCRangeIndex): # We will do the relevant formatting via attrs @@ -3143,7 +3138,7 @@ def _union(self, other: Index, sort: bool | None): # test_union_same_value_duplicated_in_both fails) try: return self._outer_indexer(other)[0] - except (TypeError, IncompatibleFrequency): + except TypeError: # incomparable objects; should only be for object dtype value_list = list(lvals) @@ -5481,11 +5476,7 @@ def equals(self, other: Any) -> bool: # quickly return if the lengths are different return False - if ( - isinstance(self.dtype, StringDtype) - and self.dtype.na_value is np.nan - and other.dtype != self.dtype - ): + if isinstance(self.dtype, StringDtype) and other.dtype != self.dtype: # TODO(infer_string) can we avoid this special case? 
# special case for object behavior return other.equals(self.astype(object)) @@ -7607,7 +7598,7 @@ def ensure_index(index_like: Axes, copy: bool = False) -> Index: Examples -------- >>> ensure_index(["a", "b"]) - Index(['a', 'b'], dtype='object') + Index(['a', 'b'], dtype='str') >>> ensure_index([("a", "a"), ("b", "c")]) Index([('a', 'a'), ('b', 'c')], dtype='object') @@ -7635,7 +7626,7 @@ def ensure_index(index_like: Axes, copy: bool = False) -> Index: # check in clean_index_list index_like = list(index_like) - if len(index_like) and lib.is_all_arraylike(index_like): + if index_like and lib.is_all_arraylike(index_like): from pandas.core.indexes.multi import MultiIndex return MultiIndex.from_arrays(index_like) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index c2fbef1089d5a..daef8287e3263 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -77,7 +77,7 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame: >>> df_not_necessarily_pandas = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) >>> interchange_object = df_not_necessarily_pandas.__dataframe__() >>> interchange_object.column_names() - Index(['A', 'B'], dtype='object') + Index(['A', 'B'], dtype='str') >>> df_pandas = pd.api.interchange.from_dataframe( ... interchange_object.select_columns_by_name(["A"]) ... ) diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index d64c7e33657d4..12999a44a446b 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -8,6 +8,7 @@ __all__ = [ "Block", # pyright:ignore[reportUnsupportedDunderAll] "BlockManager", + "DatetimeTZBlock", # pyright:ignore[reportUnsupportedDunderAll] "ExtensionBlock", # pyright:ignore[reportUnsupportedDunderAll] "SingleBlockManager", "concatenate_managers", @@ -36,6 +37,7 @@ def __getattr__(name: str): if name in [ "Block", "ExtensionBlock", + "DatetimeTZBlock", ]: warnings.warn( f"{name} is deprecated and will be removed in a future version.
" @@ -45,6 +47,10 @@ def __getattr__(name: str): # on hard-coding stacklevel stacklevel=2, ) + if name == "DatetimeTZBlock": + from pandas.core.internals.api import _DatetimeTZBlock as DatetimeTZBlock + + return DatetimeTZBlock if name == "ExtensionBlock": from pandas.core.internals.blocks import ExtensionBlock diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index 04944db2ebd9c..c5d6a2fe7a6a6 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -29,6 +29,7 @@ ) from pandas.core.construction import extract_array from pandas.core.internals.blocks import ( + DatetimeLikeBlock, check_ndim, ensure_block_shape, extract_pandas_array, @@ -74,6 +75,14 @@ def _make_block(values: ArrayLike, placement: np.ndarray) -> Block: return klass(values, ndim=2, placement=placement_obj) +class _DatetimeTZBlock(DatetimeLikeBlock): + """implement a datetime64 block with a tz attribute""" + + values: DatetimeArray + + __slots__ = () + + def make_block( values, placement, klass=None, ndim=None, dtype: Dtype | None = None ) -> Block: @@ -114,6 +123,16 @@ def make_block( dtype = dtype or values.dtype klass = get_block_type(dtype) + elif klass is _DatetimeTZBlock and not isinstance(values.dtype, DatetimeTZDtype): + # pyarrow calls get here (pyarrow<15) + values = DatetimeArray._simple_new( + # error: Argument "dtype" to "_simple_new" of "DatetimeArray" has + # incompatible type "Union[ExtensionDtype, dtype[Any], None]"; + # expected "Union[dtype[datetime64], DatetimeTZDtype]" + values, + dtype=dtype, # type: ignore[arg-type] + ) + if not isinstance(placement, BlockPlacement): placement = BlockPlacement(placement) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index cb290fde7095c..67d7ffa80462a 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1898,10 +1898,6 @@ def _consolidate_check(self) -> None: self._known_consolidated = True def _consolidate_inplace(self) -> None: - # In general, _consolidate_inplace should only be called via - # DataFrame._consolidate_inplace, otherwise we will fail to invalidate - # the DataFrame's _item_cache. The exception is for newly-created - # BlockManager objects not yet attached to a DataFrame. if not self.is_consolidated(): self.blocks = _consolidate(self.blocks) self._is_consolidated = True diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py index 944e28a9b0238..4d291c0edaa90 100644 --- a/pandas/core/methods/describe.py +++ b/pandas/core/methods/describe.py @@ -12,6 +12,7 @@ ) from typing import ( TYPE_CHECKING, + Any, cast, ) @@ -215,6 +216,14 @@ def reorder_columns(ldesc: Sequence[Series]) -> list[Hashable]: return names +def has_multiple_internal_dtypes(d: list[Any]) -> bool: + """Check if the sequence has multiple internal dtypes.""" + if not d: + return False + + return any(type(item) != type(d[0]) for item in d) + + def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series: """Describe series containing numerical data. 
@@ -251,6 +260,10 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series: import pyarrow as pa dtype = ArrowDtype(pa.float64()) + elif has_multiple_internal_dtypes(d): + # GH61707: describe() doesn't work on EAs + # with multiple internal dtypes, so return object dtype + dtype = None else: dtype = Float64Dtype() elif series.dtype.kind in "iufb": diff --git a/pandas/core/ops/common.py b/pandas/core/ops/common.py index 5cbe1c421e05a..e0aa4f44fe2be 100644 --- a/pandas/core/ops/common.py +++ b/pandas/core/ops/common.py @@ -56,20 +56,14 @@ def _unpack_zerodim_and_defer(method: F, name: str) -> F: ------- method """ - stripped_name = name.removeprefix("__").removesuffix("__") - is_cmp = stripped_name in {"eq", "ne", "lt", "le", "gt", "ge"} @wraps(method) def new_method(self, other): - if is_cmp and isinstance(self, ABCIndex) and isinstance(other, ABCSeries): - # For comparison ops, Index does *not* defer to Series - pass - else: - prio = getattr(other, "__pandas_priority__", None) - if prio is not None: - if prio > self.__pandas_priority__: - # e.g. other is DataFrame while self is Index/Series/EA - return NotImplemented + prio = getattr(other, "__pandas_priority__", None) + if prio is not None: + if prio > self.__pandas_priority__: + # e.g. other is DataFrame while self is Index/Series/EA + return NotImplemented other = item_from_zerodim(other) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index cd7cc33e9ae7f..ef7949b778ff7 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -258,7 +258,7 @@ def concat( 1 b 0 c 1 d - dtype: object + dtype: str Clear the existing index and reset it in the result by setting the ``ignore_index`` option to ``True``. @@ -268,7 +268,7 @@ def concat( 1 b 2 c 3 d - dtype: object + dtype: str Add a hierarchical index at the outermost level of the data with the ``keys`` option. @@ -278,7 +278,7 @@ def concat( 1 b s2 0 c 1 d - dtype: object + dtype: str Label the index keys you create with the ``names`` option. @@ -288,7 +288,7 @@ def concat( 1 b s2 0 c 1 d - dtype: object + dtype: str Combine two ``DataFrame`` objects with identical columns. diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index ad4a5db441b89..67fb075110f0d 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -390,7 +390,9 @@ def from_dummies( The default category is the implied category when a value has none of the listed categories specified with a one, i.e. if all dummies in a row are zero. Can be a single value for all variables or a dict directly mapping - the default categories to a prefix of a variable. + the default categories to a prefix of a variable. The default category + will be coerced to the dtype of ``data.columns`` if such coercion is + lossless, and will raise otherwise. 
Returns ------- diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f762695eedb3d..285256ac7b16a 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1328,13 +1328,13 @@ def _maybe_add_join_keys( # if we have an all missing left_indexer # make sure to just use the right values or vice-versa if left_indexer is not None and (left_indexer == -1).all(): - key_col = Index(rvals) + key_col = Index(rvals, dtype=rvals.dtype, copy=False) result_dtype = rvals.dtype elif right_indexer is not None and (right_indexer == -1).all(): - key_col = Index(lvals) + key_col = Index(lvals, dtype=lvals.dtype, copy=False) result_dtype = lvals.dtype else: - key_col = Index(lvals) + key_col = Index(lvals, dtype=lvals.dtype, copy=False) if left_indexer is not None: mask_left = left_indexer == -1 key_col = key_col.where(~mask_left, rvals) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index ac89f19b80a0f..c80ee69047ea1 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -396,6 +396,7 @@ def __internal_pivot_table( observed=dropna, margins_name=margins_name, fill_value=fill_value, + dropna=dropna, ) # discard the top level @@ -422,6 +423,7 @@ def _add_margins( observed: bool, margins_name: Hashable = "All", fill_value=None, + dropna: bool = True, ): if not isinstance(margins_name, str): raise ValueError("margins_name argument must be a string") @@ -461,6 +463,7 @@ def _add_margins( kwargs, observed, margins_name, + dropna, ) if not isinstance(marginal_result_set, tuple): return marginal_result_set @@ -469,7 +472,7 @@ def _add_margins( # no values, and table is a DataFrame assert isinstance(table, ABCDataFrame) marginal_result_set = _generate_marginal_results_without_values( - table, data, rows, cols, aggfunc, kwargs, observed, margins_name + table, data, rows, cols, aggfunc, kwargs, observed, margins_name, dropna ) if not isinstance(marginal_result_set, tuple): return marginal_result_set @@ -538,6 +541,7 @@ def _generate_marginal_results( kwargs, observed: bool, margins_name: Hashable = "All", + dropna: bool = True, ): margin_keys: list | Index if len(cols) > 0: @@ -551,7 +555,7 @@ def _all_key(key): if len(rows) > 0: margin = ( data[rows + values] - .groupby(rows, observed=observed) + .groupby(rows, observed=observed, dropna=dropna) .agg(aggfunc, **kwargs) ) cat_axis = 1 @@ -567,7 +571,7 @@ def _all_key(key): else: margin = ( data[cols[:1] + values] - .groupby(cols[:1], observed=observed) + .groupby(cols[:1], observed=observed, dropna=dropna) .agg(aggfunc, **kwargs) .T ) @@ -610,7 +614,9 @@ def _all_key(key): if len(cols) > 0: row_margin = ( - data[cols + values].groupby(cols, observed=observed).agg(aggfunc, **kwargs) + data[cols + values] + .groupby(cols, observed=observed, dropna=dropna) + .agg(aggfunc, **kwargs) ) row_margin = row_margin.stack() @@ -633,6 +639,7 @@ def _generate_marginal_results_without_values( kwargs, observed: bool, margins_name: Hashable = "All", + dropna: bool = True, ): margin_keys: list | Index if len(cols) > 0: @@ -645,7 +652,7 @@ def _all_key(): return (margins_name,) + ("",) * (len(cols) - 1) if len(rows) > 0: - margin = data.groupby(rows, observed=observed)[rows].apply( + margin = data.groupby(rows, observed=observed, dropna=dropna)[rows].apply( aggfunc, **kwargs ) all_key = _all_key() @@ -654,7 +661,9 @@ def _all_key(): margin_keys.append(all_key) else: - margin = data.groupby(level=0, observed=observed).apply(aggfunc, **kwargs) + margin = data.groupby(level=0, observed=observed, 
dropna=dropna).apply( + aggfunc, **kwargs + ) all_key = _all_key() table[all_key] = margin result = table @@ -665,7 +674,7 @@ def _all_key(): margin_keys = table.columns if len(cols): - row_margin = data.groupby(cols, observed=observed)[cols].apply( + row_margin = data.groupby(cols, observed=observed, dropna=dropna)[cols].apply( aggfunc, **kwargs ) else: diff --git a/pandas/core/sample.py b/pandas/core/sample.py index 4f12563e3c5e2..4f476540cf406 100644 --- a/pandas/core/sample.py +++ b/pandas/core/sample.py @@ -150,6 +150,14 @@ def sample( else: raise ValueError("Invalid weights: weights sum to zero") + assert weights is not None # for mypy + if not replace and size * weights.max() > 1: + raise ValueError( + "Weighted sampling cannot be achieved with replace=False. Either " + "set replace=True or use smaller weights. See the docstring of " + "sample for details." + ) + return random_state.choice(obj_len, size=size, replace=replace, p=weights).astype( np.intp, copy=False ) diff --git a/pandas/core/series.py b/pandas/core/series.py index 7a26be875e7b5..ce5b2e5ed8de5 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -263,7 +263,6 @@ class Series(base.IndexOpsMixin, NDFrame): # type: ignore[misc] Data type for the output Series. If not specified, this will be inferred from `data`. See the :ref:`user guide ` for more usages. - If ``data`` is Series then is ignored. name : Hashable, default None The name to give to the Series. copy : bool, default False diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 81f7441846589..d1cf1e7504ece 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -305,8 +305,6 @@ def _wrap_result( if isinstance(result.dtype, ArrowDtype): import pyarrow as pa - from pandas.compat import pa_version_under11p0 - from pandas.core.arrays.arrow.array import ArrowExtensionArray value_lengths = pa.compute.list_value_length(result._pa_array) @@ -319,26 +317,14 @@ def _wrap_result( ) if min_len < max_len: # append nulls to each scalar list element up to max_len - if not pa_version_under11p0: - result = ArrowExtensionArray( - pa.compute.list_slice( - result._pa_array, - start=0, - stop=max_len, - return_fixed_size_list=True, - ) + result = ArrowExtensionArray( + pa.compute.list_slice( + result._pa_array, + start=0, + stop=max_len, + return_fixed_size_list=True, ) - else: - all_null = np.full(max_len, fill_value=None, dtype=object) - values = result.to_numpy() - new_values = [] - for row in values: - if len(row) < max_len: - nulls = all_null[: max_len - len(row)] - row = np.append(row, nulls) - new_values.append(row) - pa_type = result._pa_array.type - result = ArrowExtensionArray(pa.array(new_values, type=pa_type)) + ) if name is None: name = range(max_len) result = ( diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index bc45343d6e2d3..c2ffe17adbac5 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -114,6 +114,14 @@ def to_numeric( Numeric if parsing succeeded. Return type depends on input. Series if Series, otherwise ndarray. + Raises + ------ + ValueError + If the input contains non-numeric values and `errors='raise'`. + TypeError + If the input is not list-like, 1D, or scalar convertible to numeric, + such as nested lists or unsupported input types (e.g., dict). + See Also -------- DataFrame.astype : Cast argument to a specified dtype. 
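Reviewer note: a short sketch of the error cases now documented in the to_numeric Raises section above; the input values are illustrative only.

import pandas as pd

pd.to_numeric(["1", "2.5"])  # parses cleanly -> array([1. , 2.5])
try:
    pd.to_numeric(["1", "apple"])  # non-numeric value; errors='raise' is the default
except ValueError as err:
    print(err)
try:
    pd.to_numeric({"a": 1})  # unsupported input type (dict)
except TypeError as err:
    print(err)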
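Similarly, the new weighted-sampling guard added in pandas/core/sample.py above can be exercised as follows. This is a sketch of the patched behavior; the Series values and weights are made up.

import pandas as pd

s = pd.Series(range(3))
# After normalization max(weights) is 100/102, so n=2 gives
# 2 * 100/102 > 1 and the new ValueError is raised.
try:
    s.sample(n=2, weights=[100, 1, 1], replace=False)
except ValueError as err:
    print(err)
# With replacement the same weights remain valid.
s.sample(n=2, weights=[100, 1, 1], replace=True)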
diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 2b5bc450e41d6..a60a75369d0b4 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -9,6 +9,7 @@ from pandas._config.config import OptionError from pandas._libs.tslibs import ( + IncompatibleFrequency, OutOfBoundsDatetime, OutOfBoundsTimedelta, ) @@ -379,7 +380,7 @@ def __init__(self, class_instance, methodtype: str = "method") -> None: types = {"method", "classmethod", "staticmethod", "property"} if methodtype not in types: raise ValueError( - f"methodtype must be one of {methodtype}, got {types} instead." + f"methodtype must be one of {types}, got {methodtype} instead." ) self.methodtype = methodtype self.class_instance = class_instance @@ -917,6 +918,7 @@ class InvalidComparison(Exception): "DuplicateLabelError", "EmptyDataError", "IncompatibilityWarning", + "IncompatibleFrequency", "IndexingError", "IntCastingNaNError", "InvalidColumnName", diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 097e508d4889a..7e0900f64b6bf 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -454,7 +454,7 @@ def __init__( self.na_rep = na_rep self.formatters = self._initialize_formatters(formatters) self.justify = self._initialize_justify(justify) - self.float_format = float_format + self.float_format = self._validate_float_format(float_format) self.sparsify = self._initialize_sparsify(sparsify) self.show_index_names = index_names self.decimal = decimal @@ -849,6 +849,29 @@ def _get_column_name_list(self) -> list[Hashable]: names.append("" if columns.name is None else columns.name) return names + def _validate_float_format( + self, fmt: FloatFormatType | None + ) -> FloatFormatType | None: + """ + Validates and processes the float_format argument. + Converts new-style format strings to callables. + """ + if fmt is None or callable(fmt): + return fmt + + if isinstance(fmt, str): + if "%" in fmt: + # Keeps old-style format strings as they are (C code handles them) + return fmt + else: + try: + _ = fmt.format(1.0) # Test with an arbitrary float + return fmt.format + except (ValueError, KeyError, IndexError) as e: + raise ValueError(f"Invalid new-style format string {fmt!r}") from e + + raise ValueError("float_format must be a string or callable") + class DataFrameRenderer: """Class for creating dataframe output in multiple formats. diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index c283f600eb971..23efc9c87e07c 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -243,7 +243,7 @@ def extract(r): names.insert(single_ic, single_ic) # Clean the column names (if we have an index_col). 
- if len(ic): + if ic: col_names = [ r[ic[0]] if ((r[ic[0]] is not None) and r[ic[0]] not in self.unnamed_cols) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 547d8c1fe3d19..70f0eefc55fd9 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -281,7 +281,7 @@ def read( index: Index | None columns: Sequence[Hashable] = list(self.orig_names) - if not len(content): # pragma: no cover + if not content: # pragma: no cover # DataFrame with the right metadata, even though it's length 0 # error: Cannot determine type of 'index_col' names = dedup_names( diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 67193f930b4dc..4fbd71ed03662 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1666,7 +1666,7 @@ def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = T return na_values, na_fvalues -def _floatify_na_values(na_values): +def _floatify_na_values(na_values) -> set[float]: # create float versions of the na_values result = set() for v in na_values: diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 092c24f0d31c3..08177e76ee237 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -393,14 +393,22 @@ def parse_dates_safe( d["days"] = np.asarray(diff).astype("m8[D]").view("int64") elif infer_dtype(dates, skipna=False) == "datetime": + warnings.warn( + # GH#56536 + "Converting object-dtype columns of datetimes to datetime64 when " + "writing to stata is deprecated. Call " + "`df=df.infer_objects(copy=False)` before writing to stata instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) if delta: delta = dates._values - stata_epoch def f(x: timedelta) -> float: - return US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds + return US_PER_DAY * x.days + 1_000_000 * x.seconds + x.microseconds v = np.vectorize(f) - d["delta"] = v(delta) + d["delta"] = v(delta) // 1_000 # convert back to ms if year: year_month = dates.apply(lambda x: 100 * x.year + x.month) d["year"] = year_month._values // 100 diff --git a/pandas/tests/apply/test_frame_apply_relabeling.py b/pandas/tests/apply/test_frame_apply_relabeling.py index 57c109abba304..86918ec09aa97 100644 --- a/pandas/tests/apply/test_frame_apply_relabeling.py +++ b/pandas/tests/apply/test_frame_apply_relabeling.py @@ -1,7 +1,4 @@ import numpy as np -import pytest - -from pandas.compat.numpy import np_version_gte1p25 import pandas as pd import pandas._testing as tm @@ -45,7 +42,6 @@ def test_agg_relabel_multi_columns_multi_methods(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(np_version_gte1p25, reason="name of min now equals name of np.min") def test_agg_relabel_partial_functions(): # GH 26513, test on partial, functools or more complex cases df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) @@ -57,7 +53,7 @@ def test_agg_relabel_partial_functions(): result = df.agg( foo=("A", min), - bar=("A", np.min), + bar=("B", np.min), cat=("B", max), dat=("C", "min"), f=("B", np.sum), @@ -65,8 +61,8 @@ def test_agg_relabel_partial_functions(): ) expected = pd.DataFrame( { - "A": [1.0, 1.0, np.nan, np.nan, np.nan, np.nan], - "B": [np.nan, np.nan, 4.0, np.nan, 10.0, 1.0], + "A": [1.0, np.nan, np.nan, np.nan, np.nan, np.nan], + "B": [np.nan, 1.0, 4.0, np.nan, 10.0, 1.0], "C": [np.nan, np.nan, np.nan, 3.0, np.nan, np.nan], }, index=pd.Index(["foo", "bar", "cat", "dat", "f", "kk"]), diff --git a/pandas/tests/arithmetic/common.py 
b/pandas/tests/arithmetic/common.py index 0730729e2fd94..7ea9d2b0ee23a 100644 --- a/pandas/tests/arithmetic/common.py +++ b/pandas/tests/arithmetic/common.py @@ -111,24 +111,19 @@ def xbox2(x): return x.astype(bool) return x - # rev_box: box to use for reversed comparisons - rev_box = xbox - if isinstance(right, Index) and isinstance(left, Series): - rev_box = np.array - result = xbox2(left == right) expected = xbox(np.zeros(result.shape, dtype=np.bool_)) tm.assert_equal(result, expected) result = xbox2(right == left) - tm.assert_equal(result, rev_box(expected)) + tm.assert_equal(result, xbox(expected)) result = xbox2(left != right) tm.assert_equal(result, ~expected) result = xbox2(right != left) - tm.assert_equal(result, rev_box(~expected)) + tm.assert_equal(result, xbox(~expected)) msg = "|".join( [ diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 26dfcf088e74b..9251841bdb82f 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -9,7 +9,6 @@ ) from itertools import ( product, - starmap, ) import operator @@ -771,11 +770,18 @@ def test_dti_cmp_tdi_tzawareness(self, other): result = dti == other expected = np.array([False] * 10) - tm.assert_numpy_array_equal(result, expected) + if isinstance(other, Series): + tm.assert_series_equal(result, Series(expected, index=other.index)) + else: + tm.assert_numpy_array_equal(result, expected) result = dti != other expected = np.array([True] * 10) - tm.assert_numpy_array_equal(result, expected) + if isinstance(other, Series): + tm.assert_series_equal(result, Series(expected, index=other.index)) + else: + tm.assert_numpy_array_equal(result, expected) + msg = "Invalid comparison between" with pytest.raises(TypeError, match=msg): dti < other @@ -956,7 +962,12 @@ def test_dt64arr_add_sub_td64ndarray(self, tz_naive_fixture, box_with_array): result = dtarr - tdarr tm.assert_equal(result, expected) - msg = "cannot subtract|(bad|unsupported) operand type for unary" + msg = "|".join( + [ + "cannot subtract DatetimeArray from ndarray", + "cannot subtract a datelike from a TimedeltaArray", + ] + ) with pytest.raises(TypeError, match=msg): tdarr - dtarr @@ -1273,7 +1284,7 @@ def test_dt64arr_series_sub_tick_DateOffset(self, box_with_array): result2 = -pd.offsets.Second(5) + ser tm.assert_equal(result2, expected) - msg = "(bad|unsupported) operand type for unary" + msg = "cannot subtract DatetimeArray from Second" with pytest.raises(TypeError, match=msg): pd.offsets.Second(5) - ser @@ -1318,9 +1329,7 @@ def test_dti_add_tick_tzaware(self, tz_aware_fixture, box_with_array): roundtrip = offset - scalar tm.assert_equal(roundtrip, dates) - msg = "|".join( - ["bad operand type for unary -", "cannot subtract DatetimeArray"] - ) + msg = "cannot subtract DatetimeArray from" with pytest.raises(TypeError, match=msg): scalar - dates @@ -1379,7 +1388,7 @@ def test_dt64arr_add_sub_relativedelta_offsets(self, box_with_array, unit): expected = DatetimeIndex([x - off for x in vec_items]).as_unit(exp_unit) expected = tm.box_expected(expected, box_with_array) tm.assert_equal(expected, vec - off) - msg = "(bad|unsupported) operand type for unary" + msg = "cannot subtract DatetimeArray from" with pytest.raises(TypeError, match=msg): off - vec @@ -1495,7 +1504,7 @@ def test_dt64arr_add_sub_DateOffsets( expected = DatetimeIndex([offset + x for x in vec_items]).as_unit(unit) expected = tm.box_expected(expected, box_with_array) tm.assert_equal(expected, offset + vec) - msg = 
"(bad|unsupported) operand type for unary" + msg = "cannot subtract DatetimeArray from" with pytest.raises(TypeError, match=msg): offset - vec @@ -1984,7 +1993,7 @@ def test_operators_datetimelike_with_timezones(self): result = dt1 - td1[0] exp = (dt1.dt.tz_localize(None) - td1[0]).dt.tz_localize(tz) tm.assert_series_equal(result, exp) - msg = "(bad|unsupported) operand type for unary" + msg = "cannot subtract DatetimeArray from" with pytest.raises(TypeError, match=msg): td1[0] - dt1 @@ -2211,7 +2220,7 @@ def test_timedelta64_equal_timedelta_supported_ops(self, op, box_with_array): def timedelta64(*args): # see casting notes in NumPy gh-12927 - return np.sum(list(starmap(np.timedelta64, zip(args, intervals)))) + return np.sum(list(map(np.timedelta64, args, intervals))) for d, h, m, s, us in product(*([range(2)] * 5)): nptd = timedelta64(d, h, m, s, us) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 87e085fb22878..642420713aeba 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -320,7 +320,7 @@ def test_subtraction_ops(self): with pytest.raises(TypeError, match=msg): td - dt - msg = "(bad|unsupported) operand type for unary" + msg = "cannot subtract DatetimeArray from Timedelta" with pytest.raises(TypeError, match=msg): td - dti diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index d7eb6800e5d07..cf2de894cc0c0 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -6,10 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -444,13 +440,12 @@ def test_constructor_str_unknown(self): with pytest.raises(ValueError, match="Unknown dtype"): Categorical([1, 2], dtype="foo") - @pytest.mark.xfail( - using_string_dtype() and HAS_PYARROW, reason="Can't be NumPy strings" - ) def test_constructor_np_strs(self): # GH#31499 Hashtable.map_locations needs to work on np.str_ objects - cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")]) - assert all(isinstance(x, np.str_) for x in cat.categories) + # We can't pass all-strings because the constructor would cast + # those to StringDtype post-PDEP14 + cat = Categorical(["1", "0", "1", 2], [np.str_("0"), np.str_("1"), 2]) + assert all(isinstance(x, (np.str_, int)) for x in cat.categories) def test_constructor_from_categorical_with_dtype(self): dtype = CategoricalDtype(["a", "b", "c"], ordered=True) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index 3a2c489920eb0..a82ba24a2c732 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -1,7 +1,4 @@ import numpy as np -import pytest - -from pandas._config import using_string_dtype from pandas import ( Categorical, @@ -77,17 +74,19 @@ def test_print_none_width(self): with option_context("display.width", None): assert exp == repr(a) - @pytest.mark.skipif( - using_string_dtype(), - reason="Change once infer_string is set to True by default", - ) - def test_unicode_print(self): + def test_unicode_print(self, using_infer_string): c = Categorical(["aaaaa", "bb", "cccc"] * 20) expected = """\ ['aaaaa', 'bb', 'cccc', 'aaaaa', 'bb', ..., 'bb', 'cccc', 'aaaaa', 'bb', 'cccc'] Length: 60 
Categories (3, object): ['aaaaa', 'bb', 'cccc']""" + if using_infer_string: + expected = expected.replace( + "(3, object): ['aaaaa', 'bb', 'cccc']", + "(3, str): [aaaaa, bb, cccc]", + ) + assert repr(c) == expected c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20) @@ -96,6 +95,12 @@ def test_unicode_print(self): Length: 60 Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa: E501 + if using_infer_string: + expected = expected.replace( + "(3, object): ['ああああ', 'いいいいい', 'ううううううう']", + "(3, str): [ああああ, いいいいい, ううううううう]", + ) + assert repr(c) == expected # unicode option should not affect to Categorical, as it doesn't care @@ -106,6 +111,12 @@ def test_unicode_print(self): Length: 60 Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa: E501 + if using_infer_string: + expected = expected.replace( + "(3, object): ['ああああ', 'いいいいい', 'ううううううう']", + "(3, str): [ああああ, いいいいい, ううううううう]", + ) + assert repr(c) == expected def test_categorical_repr(self): diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py index 431309aca0df2..c1d9ac0d1d273 100644 --- a/pandas/tests/arrays/period/test_arrow_compat.py +++ b/pandas/tests/arrays/period/test_arrow_compat.py @@ -1,7 +1,5 @@ import pytest -from pandas.compat.pyarrow import pa_version_under10p1 - from pandas.core.dtypes.dtypes import PeriodDtype import pandas as pd @@ -33,7 +31,6 @@ def test_arrow_extension_type(): assert hash(p1) != hash(p3) -@pytest.mark.xfail(not pa_version_under10p1, reason="Wrong behavior with pyarrow 10") @pytest.mark.parametrize( "data, freq", [ @@ -60,9 +57,6 @@ def test_arrow_array(data, freq): with pytest.raises(TypeError, match=msg): pa.array(periods, type="float64") - with pytest.raises(TypeError, match="different 'freq'"): - pa.array(periods, type=ArrowPeriodType("T")) - def test_arrow_array_missing(): from pandas.core.arrays.arrow.extension_types import ArrowPeriodType diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 736c0e1782fc0..96e1cc05e284c 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -12,7 +12,7 @@ from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import ( - pa_version_under12p0, + pa_version_under12p1, pa_version_under19p0, ) import pandas.util._test_decorators as td @@ -600,7 +600,7 @@ def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) - if dtype.storage == "pyarrow" and pa_version_under12p0: + if dtype.storage == "pyarrow" and pa_version_under12p1: expected = pa.chunked_array(expected) if dtype.storage == "python": expected = pc.cast(expected, pa.string()) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index e6103da5021bb..2b5f60ce70b4c 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -178,7 +178,7 @@ def test_from_sequence_wrong_dtype_raises(using_infer_string): @td.skip_if_installed("pyarrow") def test_pyarrow_not_installed_raises(): - msg = re.escape("pyarrow>=10.0.1 is required for PyArrow backed") + msg = re.escape("pyarrow>=12.0.1 is required for PyArrow backed") with pytest.raises(ImportError, match=msg): StringDtype(storage="pyarrow") diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 
91f5badeb9728..90f662eeec5ca 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -4,7 +4,7 @@ import pytest from pandas.compat import HAS_PYARROW -from pandas.compat.pyarrow import pa_version_under12p0 +from pandas.compat.pyarrow import pa_version_under12p1 from pandas import ( DataFrame, @@ -196,7 +196,7 @@ def test_astype_arrow_timestamp(): ) result = df.astype("timestamp[ns][pyarrow]") assert not result._mgr._has_no_reference(0) - if pa_version_under12p0: + if pa_version_under12p1: assert not np.shares_memory( get_array(df, "a"), get_array(result, "a")._pa_array ) diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index 32fea794975b6..d23263835c615 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -1,10 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - from pandas import ( DataFrame, Index, @@ -247,13 +243,9 @@ def test_merge_copy_keyword(): assert np.shares_memory(get_array(df2, "b"), get_array(result, "b")) -@pytest.mark.xfail( - using_string_dtype() and HAS_PYARROW, - reason="TODO(infer_string); result.index infers str dtype while both " - "df1 and df2 index are object.", -) -def test_join_on_key(): - df_index = Index(["a", "b", "c"], name="key", dtype=object) +@pytest.mark.parametrize("dtype", [object, "str"]) +def test_join_on_key(dtype): + df_index = Index(["a", "b", "c"], name="key", dtype=dtype) df1 = DataFrame({"a": [1, 2, 3]}, index=df_index.copy(deep=True)) df2 = DataFrame({"b": [4, 5, 6]}, index=df_index.copy(deep=True)) @@ -265,7 +257,7 @@ def test_join_on_key(): assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) assert np.shares_memory(get_array(result, "b"), get_array(df2, "b")) - assert np.shares_memory(get_array(result.index), get_array(df1.index)) + assert tm.shares_memory(get_array(result.index), get_array(df1.index)) assert not np.shares_memory(get_array(result.index), get_array(df2.index)) result.iloc[0, 0] = 0 diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 7fd0395009adb..6e08ebda88420 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1396,6 +1396,15 @@ def test_infer_dtype_period_with_na(self, na_value): arr = np.array([na_value, Period("2011-01", freq="D"), na_value]) assert lib.infer_dtype(arr, skipna=True) == "period" + @pytest.mark.parametrize("na_value", [pd.NA, np.nan]) + def test_infer_dtype_numeric_with_na(self, na_value): + # GH61621 + ser = Series([1, 2, na_value], dtype=object) + assert lib.infer_dtype(ser, skipna=True) == "integer" + + ser = Series([1.0, 2.0, na_value], dtype=object) + assert lib.infer_dtype(ser, skipna=True) == "floating" + def test_infer_dtype_all_nan_nat_like(self): arr = np.array([np.nan, np.nan]) assert lib.infer_dtype(arr, skipna=True) == "floating" diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index 8c7d8ff491cd3..890766acbd610 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -32,6 +32,16 @@ def skip_if_doesnt_support_2d(self, dtype, request): # TODO: is there a less hacky way of checking this? 
pytest.skip(f"{dtype} does not support 2D.") + def test_shift_2d(self, data): + arr2d = data.repeat(2).reshape(-1, 2) + + for n in [1, -2]: + for fill_value in [None, data[0]]: + result = arr2d.shift(n, fill_value=fill_value) + expected_col = data.shift(n, fill_value=fill_value) + tm.assert_extension_array_equal(result[:, 0], expected_col) + tm.assert_extension_array_equal(result[:, 1], expected_col) + def test_transpose(self, data): arr2d = data.repeat(2).reshape(-1, 2) shape = arr2d.shape diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index e0632722df808..4d766d6664218 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -39,11 +39,11 @@ PY312, is_ci_environment, is_platform_windows, - pa_version_under11p0, pa_version_under13p0, pa_version_under14p0, pa_version_under19p0, pa_version_under20p0, + pa_version_under21p0, ) from pandas.core.dtypes.dtypes import ( @@ -68,10 +68,7 @@ pa = pytest.importorskip("pyarrow") -from pandas.core.arrays.arrow.array import ( - ArrowExtensionArray, - get_unit_from_pa_dtype, -) +from pandas.core.arrays.arrow.array import ArrowExtensionArray from pandas.core.arrays.arrow.extension_types import ArrowPeriodType @@ -353,15 +350,6 @@ def test_from_sequence_of_strings_pa_array(self, data, request): reason="Nanosecond time parsing not supported.", ) ) - elif pa_version_under11p0 and ( - pa.types.is_duration(pa_dtype) or pa.types.is_decimal(pa_dtype) - ): - request.applymarker( - pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, - reason=f"pyarrow doesn't support parsing {pa_dtype}", - ) - ) elif pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None: _require_timezone_database(request) @@ -549,14 +537,16 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): elif pa.types.is_date(pa_type): cmp_dtype = ArrowDtype(pa.duration("s")) elif pa.types.is_time(pa_type): - unit = get_unit_from_pa_dtype(pa_type) - cmp_dtype = ArrowDtype(pa.duration(unit)) + cmp_dtype = ArrowDtype(pa.duration(pa_type.unit)) else: cmp_dtype = ArrowDtype(pa.duration(pa_type.unit)) else: cmp_dtype = arr.dtype elif arr.dtype.name == "decimal128(7, 3)[pyarrow]": - if op_name not in ["median", "var", "std", "sem", "skew"]: + if op_name == "sum" and not pa_version_under21p0: + # https://github.com/apache/arrow/pull/44184 + cmp_dtype = ArrowDtype(pa.decimal128(38, 3)) + elif op_name not in ["median", "var", "std", "sem", "skew"]: cmp_dtype = arr.dtype else: cmp_dtype = "float64[pyarrow]" @@ -2696,6 +2686,7 @@ def test_dt_tz_localize_unsupported_tz_options(): ser.dt.tz_localize("UTC", nonexistent="NaT") +@pytest.mark.xfail(reason="Converts to UTC before localizing GH#61780") def test_dt_tz_localize_none(): ser = pd.Series( [datetime(year=2023, month=1, day=2, hour=3), None], @@ -2703,7 +2694,7 @@ def test_dt_tz_localize_none(): ) result = ser.dt.tz_localize(None) expected = pd.Series( - [datetime(year=2023, month=1, day=2, hour=3), None], + [ser[0].tz_localize(None), None], dtype=ArrowDtype(pa.timestamp("ns")), ) tm.assert_series_equal(result, expected) @@ -2763,7 +2754,7 @@ def test_dt_tz_convert_none(): ) result = ser.dt.tz_convert(None) expected = pd.Series( - [datetime(year=2023, month=1, day=2, hour=3), None], + [ser[0].tz_convert(None), None], dtype=ArrowDtype(pa.timestamp("ns")), ) tm.assert_series_equal(result, expected) @@ -2777,7 +2768,7 @@ def test_dt_tz_convert(unit): ) result = ser.dt.tz_convert("US/Eastern") expected = pd.Series( - [datetime(year=2023, month=1, day=2, 
hour=3), None], + [ser[0].tz_convert("US/Eastern"), None], dtype=ArrowDtype(pa.timestamp(unit, "US/Eastern")), ) tm.assert_series_equal(result, expected) @@ -3090,7 +3081,7 @@ def test_infer_dtype_pyarrow_dtype(data, request): res = lib.infer_dtype(data) assert res != "unknown-array" - if data._hasna and res in ["floating", "datetime64", "timedelta64"]: + if data._hasna and res in ["datetime64", "timedelta64"]: mark = pytest.mark.xfail( reason="in infer_dtype pd.NA is not ignored in these cases " "even with skipna=True in the list(data) check below" @@ -3288,9 +3279,6 @@ def test_pow_missing_operand(): tm.assert_series_equal(result, expected) -@pytest.mark.skipif( - pa_version_under11p0, reason="Decimal128 to string cast implemented in pyarrow 11" -) def test_decimal_parse_raises(): # GH 56984 ser = pd.Series(["1.2345"], dtype=ArrowDtype(pa.string())) @@ -3300,9 +3288,6 @@ def test_decimal_parse_raises(): ser.astype(ArrowDtype(pa.decimal128(1, 0))) -@pytest.mark.skipif( - pa_version_under11p0, reason="Decimal128 to string cast implemented in pyarrow 11" -) def test_decimal_parse_succeeds(): # GH 56984 ser = pd.Series(["1.2345"], dtype=ArrowDtype(pa.string())) @@ -3564,3 +3549,30 @@ def test_arrow_json_type(): dtype = ArrowDtype(pa.json_(pa.string())) result = dtype.type assert result == str + + +def test_timestamp_dtype_disallows_decimal(): + # GH#61773 constructing with pyarrow timestamp dtype should disallow + # Decimal NaN, just like pd.to_datetime + vals = [pd.Timestamp("2016-01-02 03:04:05"), Decimal("NaN")] + + msg = " is not convertible to datetime" + with pytest.raises(TypeError, match=msg): + # Check that the non-pyarrow version raises as expected + pd.to_datetime(vals) + + with pytest.raises(TypeError, match=msg): + pd.array(vals, dtype=ArrowDtype(pa.timestamp("us"))) + + +def test_timestamp_dtype_matches_to_datetime(): + # GH#61775 + dtype1 = "datetime64[ns, US/Eastern]" + dtype2 = "timestamp[ns, US/Eastern][pyarrow]" + + ts = pd.Timestamp("2025-07-03 18:10") + + result = pd.Series([ts], dtype=dtype2) + expected = pd.Series([ts], dtype=dtype1).convert_dtypes(dtype_backend="pyarrow") + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index 3b9079d06e231..c7fe9e99ec6e5 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -168,6 +168,13 @@ def data_for_grouping(dtype): class TestMaskedArrays(base.ExtensionTests): + @pytest.fixture(autouse=True) + def skip_if_doesnt_support_2d(self, dtype, request): + # Override the fixture so that we run these tests. + assert not dtype._supports_2d + # If dtype._supports_2d is ever changed to True, then this fixture + # override becomes unnecessary. 
+ @pytest.mark.parametrize("na_action", [None, "ignore"]) def test_map(self, data_missing, na_action): result = data_missing.map(lambda x: x, na_action=na_action) @@ -402,7 +409,3 @@ def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): else: raise NotImplementedError(f"{op_name} not supported") - - -class Test2DCompat(base.Dim2CompatTests): - pass diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index 142bad6db4f95..2e6fe12cbbd13 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -26,7 +26,6 @@ iNaT, ) from pandas.compat import is_platform_windows -from pandas.compat.numpy import np_version_gte1p24 from pandas.core.dtypes.dtypes import PeriodDtype @@ -104,7 +103,7 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): @pytest.mark.parametrize("periods", [1, -2]) def test_diff(self, data, periods): - if is_platform_windows() and np_version_gte1p24: + if is_platform_windows(): with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False): super().test_diff(data, periods) else: diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py index b530cb98ef46c..761daf0e985cc 100644 --- a/pandas/tests/frame/indexing/test_insert.py +++ b/pandas/tests/frame/indexing/test_insert.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas.errors import PerformanceWarning - from pandas import ( DataFrame, Index, @@ -72,19 +70,6 @@ def test_insert_with_columns_dups(self): ) tm.assert_frame_equal(df, exp) - def test_insert_item_cache(self, performance_warning): - df = DataFrame(np.random.default_rng(2).standard_normal((4, 3))) - ser = df[0] - expected_warning = PerformanceWarning if performance_warning else None - - with tm.assert_produces_warning(expected_warning): - for n in range(100): - df[n + 3] = df[1] * n - - ser.iloc[0] = 99 - assert df.iloc[0, 0] == df[0][0] - assert df.iloc[0, 0] != 99 - def test_insert_EA_no_warning(self): # PerformanceWarning about fragmented frame should not be raised when # using EAs (https://github.com/pandas-dev/pandas/issues/44098) diff --git a/pandas/tests/frame/indexing/test_mask.py b/pandas/tests/frame/indexing/test_mask.py index ac6f0a1ac0f73..e4036efeab7ff 100644 --- a/pandas/tests/frame/indexing/test_mask.py +++ b/pandas/tests/frame/indexing/test_mask.py @@ -105,7 +105,7 @@ def test_mask_stringdtype(frame_or_series): {"A": ["this", "that"]}, index=["id2", "id3"], dtype=StringDtype() ) expected = DataFrame( - {"A": [NA, "this", "that", NA]}, + {"A": ["foo", "this", "that", NA]}, index=["id1", "id2", "id3", "id4"], dtype=StringDtype(), ) @@ -114,7 +114,10 @@ def test_mask_stringdtype(frame_or_series): filtered_obj = filtered_obj["A"] expected = expected["A"] - filter_ser = Series([False, True, True, False]) + filter_ser = Series( + [False, True, True, False], + index=["id1", "id2", "id3", "id4"], + ) result = obj.mask(filter_ser, filtered_obj) tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index eb1ee4e7b2970..c428bd1820cb1 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -745,10 +743,7 @@ def test_astype_tz_object_conversion(self, tz): result = result.astype({"tz": "datetime64[ns, 
Europe/London]"}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) GH#60639") - def test_astype_dt64_to_string( - self, frame_or_series, tz_naive_fixture, using_infer_string - ): + def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture): # GH#41409 tz = tz_naive_fixture @@ -766,10 +761,7 @@ def test_astype_dt64_to_string( item = result.iloc[0] if frame_or_series is DataFrame: item = item.iloc[0] - if using_infer_string: - assert item is np.nan - else: - assert item is pd.NA + assert item is pd.NA # For non-NA values, we should match what we get for non-EA str alt = obj.astype(str) diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index 2ffc3f933e246..aea1a24097206 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas.compat.numpy import np_version_gte1p25 - import pandas as pd import pandas._testing as tm @@ -270,7 +268,7 @@ def test_compare_ea_and_np_dtype(val1, val2): # GH#18463 TODO: is this really the desired behavior? expected.loc[1, ("a", "self")] = np.nan - if val1 is pd.NA and np_version_gte1p25: + if val1 is pd.NA: # can't compare with numpy array if it contains pd.NA with pytest.raises(TypeError, match="boolean value of NA is ambiguous"): result = df1.compare(df2, keep_shape=True) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 304638a3a7dcf..a5ed2e86283e9 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -207,20 +207,6 @@ def test_corr_nullable_integer(self, nullable_column, other_column, method): expected = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"]) tm.assert_frame_equal(result, expected) - def test_corr_item_cache(self): - # Check that corr does not lead to incorrect entries in item_cache - - df = DataFrame({"A": range(10)}) - df["B"] = range(10)[::-1] - - ser = df["A"] # populate item_cache - assert len(df._mgr.blocks) == 2 - - _ = df.corr(numeric_only=True) - - ser.iloc[0] = 99 - assert df.loc[0, "A"] == 0 - @pytest.mark.parametrize("length", [2, 20, 200, 2000]) def test_corr_for_constant_columns(self, length): # GH: 37448 diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index d9668ce46c943..48c5d3a2e982b 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -346,6 +346,18 @@ def test_drop_multiindex_other_level_nan(self): ) tm.assert_frame_equal(result, expected) + def test_drop_raise_with_both_axis_and_index(self): + # GH#61823 + df = DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7]], + index=["a", "b", "c"], + columns=["d", "e", "f"], + ) + + msg = "Cannot specify both 'axis' and 'index'/'columns'" + with pytest.raises(ValueError, match=msg): + df.drop(index="b", axis=1) + def test_drop_nonunique(self): df = DataFrame( [ diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index d7baac7264a1d..631742d43263f 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -721,22 +721,6 @@ def test_quantile_empty_no_columns(self, interp_method): expected.columns.name = "captain tightpants" tm.assert_frame_equal(result, expected) - def test_quantile_item_cache(self, interp_method): - # previous behavior incorrect 
retained an invalid _item_cache entry - interpolation, method = interp_method - df = DataFrame( - np.random.default_rng(2).standard_normal((4, 3)), columns=["A", "B", "C"] - ) - df["D"] = df["A"] * 2 - ser = df["A"] - assert len(df._mgr.blocks) == 2 - - df.quantile(numeric_only=False, interpolation=interpolation, method=method) - - ser.iloc[0] = 99 - assert df.iloc[0, 0] == df["A"][0] - assert df.iloc[0, 0] != 99 - def test_invalid_method(self): with pytest.raises(ValueError, match="Invalid method: foo"): DataFrame(range(1)).quantile(0.5, method="foo") diff --git a/pandas/tests/frame/methods/test_sample.py b/pandas/tests/frame/methods/test_sample.py index a9d56cbfd2b46..9b6660778508e 100644 --- a/pandas/tests/frame/methods/test_sample.py +++ b/pandas/tests/frame/methods/test_sample.py @@ -113,9 +113,6 @@ def test_sample_invalid_weight_lengths(self, obj): with pytest.raises(ValueError, match=msg): obj.sample(n=3, weights=[0.5] * 11) - with pytest.raises(ValueError, match="Fewer non-zero entries in p than size"): - obj.sample(n=4, weights=Series([0, 0, 0.2])) - def test_sample_negative_weights(self, obj): # Check won't accept negative weights bad_weights = [-0.1] * 10 @@ -137,6 +134,33 @@ def test_sample_inf_weights(self, obj): with pytest.raises(ValueError, match=msg): obj.sample(n=3, weights=weights_with_ninf) + def test_sample_unit_probabilities_raises(self, obj): + # GH#61516 + high_variance_weights = [1] * 10 + high_variance_weights[0] = 100 + msg = ( + "Weighted sampling cannot be achieved with replace=False. Either " + "set replace=True or use smaller weights. See the docstring of " + "sample for details." + ) + with pytest.raises(ValueError, match=msg): + obj.sample(n=2, weights=high_variance_weights, replace=False) + + def test_sample_unit_probabilities_edge_case_do_not_raise(self, obj): + # GH#61516 + # edge case, n*max(weights)/sum(weights) == 1 + edge_variance_weights = [1] * 10 + edge_variance_weights[0] = 9 + # should not raise + obj.sample(n=2, weights=edge_variance_weights, replace=False) + + def test_sample_unit_normal_probabilities_do_not_raise(self, obj): + # GH#61516 + low_variance_weights = [1] * 10 + low_variance_weights[0] = 8 + # should not raise + obj.sample(n=2, weights=low_variance_weights, replace=False) + def test_sample_zero_weights(self, obj): # All zeros raises errors diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index 9a628c2ee9f73..9abe0c97c3260 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -592,21 +592,6 @@ def test_sort_values_nat_na_position_default(self): result = expected.sort_values(["A", "date"]) tm.assert_frame_equal(result, expected) - def test_sort_values_item_cache(self): - # previous behavior incorrect retained an invalid _item_cache entry - df = DataFrame( - np.random.default_rng(2).standard_normal((4, 3)), columns=["A", "B", "C"] - ) - df["D"] = df["A"] * 2 - ser = df["A"] - assert len(df._mgr.blocks) == 2 - - df.sort_values(by="A") - - ser.iloc[0] = 99 - assert df.iloc[0, 0] == df["A"][0] - assert df.iloc[0, 0] != 99 - def test_sort_values_reshaping(self): # GH 39426 values = list(range(21)) diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py index 4f621b4643b70..a6b99a70d6ecd 100644 --- a/pandas/tests/frame/methods/test_to_dict_of_blocks.py +++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py @@ -1,14 +1,8 @@ -import numpy as np -import 
pytest - -from pandas._config import using_string_dtype - from pandas import ( DataFrame, MultiIndex, ) import pandas._testing as tm -from pandas.core.arrays import NumpyExtensionArray class TestToDictOfBlocks: @@ -27,22 +21,6 @@ def test_no_copy_blocks(self, float_frame): assert _last_df is not None and not _last_df[column].equals(df[column]) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_to_dict_of_blocks_item_cache(): - # Calling to_dict_of_blocks should not poison item_cache - df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) - df["c"] = NumpyExtensionArray(np.array([1, 2, None, 3], dtype=object)) - mgr = df._mgr - assert len(mgr.blocks) == 3 # i.e. not consolidated - - ser = df["b"] # populations item_cache["b"] - - df._to_dict_of_blocks() - - with pytest.raises(ValueError, match="read-only"): - ser.values[0] = "foo" - - def test_set_change_dtype_slice(): # GH#8850 cols = MultiIndex.from_tuples([("1st", "a"), ("2nd", "b"), ("3rd", "c")]) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index bc69ec388bf0c..a9a98a5005bb3 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,8 +11,6 @@ import numpy as np import pytest -from pandas.compat import HAS_PYARROW - import pandas as pd from pandas import ( DataFrame, @@ -2183,19 +2181,14 @@ def test_enum_column_equality(): tm.assert_series_equal(result, expected) -def test_mixed_col_index_dtype(using_infer_string): +def test_mixed_col_index_dtype(string_dtype_no_object): # GH 47382 df1 = DataFrame(columns=list("abc"), data=1.0, index=[0]) df2 = DataFrame(columns=list("abc"), data=0.0, index=[0]) - df1.columns = df2.columns.astype("string") + df1.columns = df2.columns.astype(string_dtype_no_object) result = df1 + df2 expected = DataFrame(columns=list("abc"), data=1.0, index=[0]) - if using_infer_string: - # df2.columns.dtype will be "str" instead of object, - # so the aligned result will be "string", not object - if HAS_PYARROW: - dtype = "string[pyarrow]" - else: - dtype = "string" - expected.columns = expected.columns.astype(dtype) + + expected.columns = expected.columns.astype(string_dtype_no_object) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 6fdbfac8f4e0a..f084d16e387a8 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -381,30 +381,3 @@ def test_update_inplace_sets_valid_block_values(): # check we haven't put a Series into any block.values assert isinstance(df._mgr.blocks[0].values, Categorical) - - -def test_nonconsolidated_item_cache_take(): - # https://github.com/pandas-dev/pandas/issues/35521 - - # create non-consolidated dataframe with object dtype columns - df = DataFrame( - { - "col1": Series(["a"], dtype=object), - } - ) - df["col2"] = Series([0], dtype=object) - assert not df._mgr.is_consolidated() - - # access column (item cache) - df["col1"] == "A" - # take operation - # (regression was that this consolidated but didn't reset item cache, - # resulting in an invalid cache and the .at operation not working properly) - df[df["col2"] == 0] - - # now setting value should update actual dataframe - df.at[0, "col1"] = "A" - - expected = DataFrame({"col1": ["A"], "col2": [0]}, dtype=object) - tm.assert_frame_equal(df, expected) - assert df.at[0, "col1"] == "A" diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py 
index 652f52bd226af..034a43ac40bba 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.compat.numpy import np_version_gte1p25 - import pandas as pd import pandas._testing as tm @@ -123,13 +121,10 @@ def test_pos_object(self, df_data): def test_pos_object_raises(self): # GH#21380 df = pd.DataFrame({"a": ["a", "b"]}) - if np_version_gte1p25: - with pytest.raises( - TypeError, match=r"^bad operand type for unary \+: \'str\'$" - ): - tm.assert_frame_equal(+df, df) - else: - tm.assert_series_equal(+df["a"], df["a"]) + with pytest.raises( + TypeError, match=r"^bad operand type for unary \+: \'str\'$" + ): + tm.assert_frame_equal(+df, df) def test_pos_raises(self): df = pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])}) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 8c4ab42b7be7a..724ee0489f0a0 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas.compat.pyarrow import pa_version_under10p1 +import pandas.util._test_decorators as td from pandas.core.dtypes.missing import na_value_for_dtype @@ -411,12 +411,7 @@ def test_groupby_drop_nan_with_multi_index(): "Float64", "category", "string", - pytest.param( - "string[pyarrow]", - marks=pytest.mark.skipif( - pa_version_under10p1, reason="pyarrow is not installed" - ), - ), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), "datetime64[ns]", "period[D]", "Sparse[float]", diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 550efe9187fe8..a64b15c211908 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -11,8 +11,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -76,10 +74,7 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper): class TestGroupBy: - # TODO(infer_string) resample sum introduces 0's - # https://github.com/pandas-dev/pandas/issues/60229 - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_groupby_with_timegrouper(self): + def test_groupby_with_timegrouper(self, using_infer_string): # GH 4161 # TimeGrouper requires a sorted index # also verifies that the resultant index has the correct name @@ -116,8 +111,11 @@ def test_groupby_with_timegrouper(self): {"Buyer": 0, "Quantity": 0}, index=exp_dti, ) - # Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl" + # Cast to object/str to avoid implicit cast when setting + # entry to "CarlCarlCarl" expected = expected.astype({"Buyer": object}) + if using_infer_string: + expected = expected.astype({"Buyer": "str"}) expected.iloc[0, 0] = "CarlCarlCarl" expected.iloc[6, 0] = "CarlCarl" expected.iloc[18, 0] = "Joe" diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py index 260b4203a4f04..2368b8bce2d9e 100644 --- a/pandas/tests/indexes/base_class/test_formats.py +++ b/pandas/tests/indexes/base_class/test_formats.py @@ -1,7 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype import pandas._config.config as cf from pandas import Index @@ -16,7 +15,6 @@ def test_repr_is_valid_construction_code(self): res = eval(repr(idx)) tm.assert_index_equal(res, idx) - 
@pytest.mark.xfail(using_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ @@ -77,11 +75,13 @@ def test_repr_is_valid_construction_code(self): ), ], ) - def test_string_index_repr(self, index, expected): + def test_string_index_repr(self, index, expected, using_infer_string): result = repr(index) + if using_infer_string: + expected = expected.replace("dtype='object'", "dtype='str'") + assert result == expected - @pytest.mark.xfail(using_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ @@ -121,11 +121,16 @@ def test_string_index_repr(self, index, expected): ), ], ) - def test_string_index_repr_with_unicode_option(self, index, expected): + def test_string_index_repr_with_unicode_option( + self, index, expected, using_infer_string + ): # Enable Unicode option ----------------------------------------- with cf.option_context("display.unicode.east_asian_width", True): result = repr(index) - assert result == expected + + if using_infer_string: + expected = expected.replace("dtype='object'", "dtype='str'") + assert result == expected def test_repr_summary(self): with cf.option_context("display.max_seq_items", 10): diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index d9c9fdc62b0bc..262b043adaf58 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -199,6 +199,7 @@ def test_unique(self, data, categories, expected_data, ordered): expected = CategoricalIndex(expected_data, dtype=dtype) tm.assert_index_equal(idx.unique(), expected) + # TODO(3.0): remove this test once using_string_dtype() is always True @pytest.mark.xfail(using_string_dtype(), reason="repr doesn't roundtrip") def test_repr_roundtrip(self): ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py index b1361b3e8106e..2308a62bc44a4 100644 --- a/pandas/tests/indexes/categorical/test_formats.py +++ b/pandas/tests/indexes/categorical/test_formats.py @@ -2,86 +2,132 @@ Tests for CategoricalIndex.__repr__ and related methods. 
""" -import pytest - -from pandas._config import using_string_dtype import pandas._config.config as cf from pandas import CategoricalIndex -class TestCategoricalIndexRepr: - @pytest.mark.xfail(using_string_dtype(), reason="repr different") - def test_string_categorical_index_repr(self): +class TestCategoricalIndexReprStringCategories: + def test_string_categorical_index_repr(self, using_infer_string): # short idx = CategoricalIndex(["a", "bb", "ccc"]) expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa: E501 + if using_infer_string: + expected = expected.replace( + "categories=['a', 'bb', 'ccc']", + "categories=[a, bb, ccc]", + ) assert repr(idx) == expected + def test_categorical_index_repr_multiline(self, using_infer_string): # multiple lines idx = CategoricalIndex(["a", "bb", "ccc"] * 10) expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa: E501 - + if using_infer_string: + expected = expected.replace( + "categories=['a', 'bb', 'ccc']", + "categories=[a, bb, ccc]", + ) assert repr(idx) == expected + def test_categorical_index_repr_truncated(self, using_infer_string): # truncated idx = CategoricalIndex(["a", "bb", "ccc"] * 100) expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', ... 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category', length=300)""" # noqa: E501 - + if using_infer_string: + expected = expected.replace( + "categories=['a', 'bb', 'ccc']", + "categories=[a, bb, ccc]", + ) assert repr(idx) == expected + def test_categorical_index_repr_many_categories(self, using_infer_string): # larger categories idx = CategoricalIndex(list("abcdefghijklmmo")) expected = """CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'm', 'o'], categories=['a', 'b', 'c', 'd', ..., 'k', 'l', 'm', 'o'], ordered=False, dtype='category')""" # noqa: E501 - + if using_infer_string: + expected = expected.replace( + "categories=['a', 'b', 'c', 'd', ..., 'k', 'l', 'm', 'o']", + "categories=[a, b, c, d, ..., k, l, m, o]", + ) assert repr(idx) == expected + def test_categorical_index_repr_unicode(self, using_infer_string): # short idx = CategoricalIndex(["あ", "いい", "ううう"]) expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 + if using_infer_string: + expected = expected.replace( + "categories=['あ', 'いい', 'ううう']", + "categories=[あ, いい, ううう]", + ) assert repr(idx) == expected + def test_categorical_index_repr_unicode_multiline(self, using_infer_string): # multiple lines idx = CategoricalIndex(["あ", "いい", "ううう"] * 10) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 - + if using_infer_string: + expected = expected.replace( + "categories=['あ', 'いい', 'ううう']", + "categories=[あ, いい, ううう]", + ) assert repr(idx) == expected + def test_categorical_index_repr_unicode_truncated(self, using_infer_string): # truncated idx = CategoricalIndex(["あ", "いい", "ううう"] * 100) 
expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', ... 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa: E501 - + if using_infer_string: + expected = expected.replace( + "categories=['あ', 'いい', 'ううう']", + "categories=[あ, いい, ううう]", + ) assert repr(idx) == expected + def test_categorical_index_repr_unicode_many_categories(self, using_infer_string): # larger categories idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ")) expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ'], categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ'], ordered=False, dtype='category')""" # noqa: E501 - + if using_infer_string: + expected = expected.replace( + "categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ']", + "categories=[あ, い, う, え, ..., し, す, せ, そ]", + ) assert repr(idx) == expected - # Enable Unicode option ----------------------------------------- + def test_categorical_index_repr_east_asian_width(self, using_infer_string): with cf.option_context("display.unicode.east_asian_width", True): # short idx = CategoricalIndex(["あ", "いい", "ううう"]) expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 + if using_infer_string: + expected = expected.replace( + "categories=['あ', 'いい', 'ううう']", + "categories=[あ, いい, ううう]", + ) assert repr(idx) == expected + def test_categorical_index_repr_east_asian_width_multiline( + self, using_infer_string + ): + with cf.option_context("display.unicode.east_asian_width", True): # multiple lines idx = CategoricalIndex(["あ", "いい", "ううう"] * 10) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', @@ -90,8 +136,17 @@ def test_string_categorical_index_repr(self): 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 + if using_infer_string: + expected = expected.replace( + "categories=['あ', 'いい', 'ううう']", + "categories=[あ, いい, ううう]", + ) assert repr(idx) == expected + def test_categorical_index_repr_east_asian_width_truncated( + self, using_infer_string + ): + with cf.option_context("display.unicode.east_asian_width", True): # truncated idx = CategoricalIndex(["あ", "いい", "ううう"] * 100) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', @@ -101,12 +156,25 @@ def test_string_categorical_index_repr(self): 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa: E501 + if using_infer_string: + expected = expected.replace( + "categories=['あ', 'いい', 'ううう']", + "categories=[あ, いい, ううう]", + ) assert repr(idx) == expected - # larger categories + def test_categorical_index_repr_east_asian_width_many_categories( + self, using_infer_string + ): + with cf.option_context("display.unicode.east_asian_width", True): idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ")) expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ'], categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ'], ordered=False, dtype='category')""" # noqa: E501 + if using_infer_string: + expected = expected.replace( + "categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ']", + "categories=[あ, い, う, え, ..., し, す, せ, そ]", + ) assert repr(idx) == expected diff --git a/pandas/tests/indexes/multi/test_equivalence.py 
b/pandas/tests/indexes/multi/test_equivalence.py index 9babbd5b8d56d..ca155b0e3639d 100644 --- a/pandas/tests/indexes/multi/test_equivalence.py +++ b/pandas/tests/indexes/multi/test_equivalence.py @@ -64,8 +64,8 @@ def test_equals_op(idx): with pytest.raises(ValueError, match="Lengths must match"): index_a == series_b - tm.assert_numpy_array_equal(index_a == series_a, expected1) - tm.assert_numpy_array_equal(index_a == series_c, expected2) + tm.assert_series_equal(index_a == series_a, Series(expected1)) + tm.assert_series_equal(index_a == series_c, Series(expected2)) # cases where length is 1 for one of them with pytest.raises(ValueError, match="Lengths must match"): diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 00e8262ddfa4c..75382cb735288 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -502,7 +502,7 @@ def test_get_indexer2(self): ) msg = "Input has different freq=None from PeriodArray\\(freq=h\\)" - with pytest.raises(ValueError, match=msg): + with pytest.raises(libperiod.IncompatibleFrequency, match=msg): idx.get_indexer(target, "nearest", tolerance="1 minute") tm.assert_numpy_array_equal( diff --git a/pandas/tests/indexes/period/test_join.py b/pandas/tests/indexes/period/test_join.py index 3e659c1a63266..9f733b358f772 100644 --- a/pandas/tests/indexes/period/test_join.py +++ b/pandas/tests/indexes/period/test_join.py @@ -1,7 +1,4 @@ import numpy as np -import pytest - -from pandas._libs.tslibs import IncompatibleFrequency from pandas import ( DataFrame, @@ -51,8 +48,9 @@ def test_join_does_not_recur(self): tm.assert_index_equal(res, expected) def test_join_mismatched_freq_raises(self): + # pre-GH#55782 this raises IncompatibleFrequency index = period_range("1/1/2000", "1/20/2000", freq="D") index3 = period_range("1/1/2000", "1/20/2000", freq="2D") - msg = r".*Input has different freq=2D from Period\(freq=D\)" - with pytest.raises(IncompatibleFrequency, match=msg): - index.join(index3) + result = index.join(index3) + expected = index.astype(object).join(index3.astype(object)) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 77b8e76894647..d465225da7f24 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.errors import IncompatibleFrequency + from pandas import ( Index, NaT, @@ -198,7 +200,7 @@ def test_maybe_convert_timedelta(): offset = offsets.BusinessDay() msg = r"Input has different freq=B from PeriodIndex\(freq=D\)" - with pytest.raises(ValueError, match=msg): + with pytest.raises(IncompatibleFrequency, match=msg): pi._maybe_convert_timedelta(offset) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 5b75bd9afd6df..e734878e6a102 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -40,6 +40,7 @@ ensure_index, ensure_index_from_sequences, ) +from pandas.testing import assert_series_equal class TestIndex: @@ -1717,3 +1718,27 @@ def test_is_monotonic_pyarrow_list_type(): idx = Index([[1], [2, 3]], dtype=pd.ArrowDtype(pa.list_(pa.int64()))) assert not idx.is_monotonic_increasing assert not idx.is_monotonic_decreasing + + +def test_index_equals_different_string_dtype(string_dtype_no_object): + # GH 61099 + idx_obj = Index(["a", "b", "c"]) + idx_str = Index(["a", "b", "c"], 
dtype=string_dtype_no_object) + + assert idx_obj.equals(idx_str) + assert idx_str.equals(idx_obj) + + +def test_index_comparison_different_string_dtype(string_dtype_no_object): + # GH 61099 + idx = Index(["a", "b", "c"]) + s_obj = Series([1, 2, 3], index=idx) + s_str = Series([4, 5, 6], index=idx.astype(string_dtype_no_object)) + + expected = Series([True, True, True], index=["a", "b", "c"]) + result = s_obj < s_str + assert_series_equal(result, expected) + + result = s_str > s_obj + expected.index = idx.astype(string_dtype_no_object) + assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index bf16554871efc..a842d174a4894 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -14,7 +14,6 @@ import pytest from pandas.compat import IS64 -from pandas.compat.numpy import np_version_gte1p25 from pandas.core.dtypes.common import ( is_integer_dtype, @@ -381,13 +380,11 @@ def test_astype_preserves_name(self, index, dtype): else: index.name = "idx" - warn = None - if index.dtype.kind == "c" and dtype in ["float64", "int64", "uint64"]: - # imaginary components discarded - if np_version_gte1p25: - warn = np.exceptions.ComplexWarning - else: - warn = np.ComplexWarning + warn = ( + np.exceptions.ComplexWarning + if index.dtype.kind == "c" and dtype in ["float64", "int64", "uint64"] + else None + ) is_pyarrow_str = str(index.dtype) == "string[pyarrow]" and dtype == "category" try: diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index ace78d77350cb..86d0ca1280596 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -2,6 +2,7 @@ import pytest from pandas import ( + BooleanDtype, CategoricalIndex, DatetimeIndex, Index, @@ -14,7 +15,6 @@ is_complex_dtype, is_numeric_dtype, ) -from pandas.core.arrays import BooleanArray from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin @@ -111,11 +111,10 @@ def test_numpy_ufuncs_other(index, func): if func in (np.isfinite, np.isinf, np.isnan): # numpy 1.18 changed isinf and isnan to not raise on dt64/td64 result = func(index) - assert isinstance(result, np.ndarray) out = np.empty(index.shape, dtype=bool) func(index, out=out) - tm.assert_numpy_array_equal(out, result) + tm.assert_index_equal(Index(out), result) else: with tm.external_error_raised(TypeError): func(index) @@ -129,19 +128,20 @@ def test_numpy_ufuncs_other(index, func): ): # Results in bool array result = func(index) + assert isinstance(result, Index) if not isinstance(index.dtype, np.dtype): # e.g. 
Int64 we expect to get BooleanArray back - assert isinstance(result, BooleanArray) + assert isinstance(result.dtype, BooleanDtype) else: - assert isinstance(result, np.ndarray) + assert isinstance(result.dtype, np.dtype) out = np.empty(index.shape, dtype=bool) func(index, out=out) if not isinstance(index.dtype, np.dtype): - tm.assert_numpy_array_equal(out, result._data) + tm.assert_index_equal(result, Index(out, dtype="boolean")) else: - tm.assert_numpy_array_equal(out, result) + tm.assert_index_equal(result, Index(out)) elif len(index) == 0: pass diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 5f36b8c3f5dbf..3ba19b2a4b254 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -560,8 +560,8 @@ def test_equals_op(self, simple_index): with pytest.raises(ValueError, match=msg): index_a == series_b - tm.assert_numpy_array_equal(index_a == series_a, expected1) - tm.assert_numpy_array_equal(index_a == series_c, expected2) + tm.assert_series_equal(index_a == series_a, Series(expected1)) + tm.assert_series_equal(index_a == series_c, Series(expected2)) # cases where length is 1 for one of them with pytest.raises(ValueError, match="Lengths must match"): diff --git a/pandas/tests/indexing/test_at.py b/pandas/tests/indexing/test_at.py index e80acc230a320..d24d343332669 100644 --- a/pandas/tests/indexing/test_at.py +++ b/pandas/tests/indexing/test_at.py @@ -49,29 +49,6 @@ def test_selection_methods_of_assigned_col(): class TestAtSetItem: - def test_at_setitem_item_cache_cleared(self): - # GH#22372 Note the multi-step construction is necessary to trigger - # the original bug. pandas/issues/22372#issuecomment-413345309 - df = DataFrame(index=[0]) - df["x"] = 1 - df["cost"] = 2 - - # accessing df["cost"] adds "cost" to the _item_cache - df["cost"] - - # This loc[[0]] lookup used to call _consolidate_inplace at the - # BlockManager level, which failed to clear the _item_cache - df.loc[[0]] - - df.at[0, "x"] = 4 - df.at[0, "cost"] = 789 - - expected = DataFrame({"x": [4], "cost": 789}, index=[0]) - tm.assert_frame_equal(df, expected) - - # And in particular, check that the _item_cache has updated correctly. 
- tm.assert_series_equal(df["cost"], expected["cost"]) - def test_at_setitem_mixed_index_assignment(self): # GH#19860 ser = Series([1, 2, 3, 4, 5], index=["a", "b", "c", 1, 2]) diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 64d8068fa9291..266e35ac9088f 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -18,23 +18,6 @@ class TestCaching: - def test_slice_consolidate_invalidate_item_cache(self): - # this is chained assignment, but will 'work' - with option_context("chained_assignment", None): - # #3970 - df = DataFrame({"aa": np.arange(5), "bb": [2.2] * 5}) - - # Creates a second float block - df["cc"] = 0.0 - - # caches a reference to the 'bb' series - df["bb"] - - # Assignment to wrong series - with tm.raises_chained_assignment_error(): - df["bb"].iloc[0] = 0.17 - tm.assert_almost_equal(df["bb"][0], 2.2) - @pytest.mark.parametrize("do_ref", [True, False]) def test_setitem_cache_updating(self, do_ref): # GH 5424 @@ -89,18 +72,6 @@ def test_setitem_cache_updating_slices(self): tm.assert_frame_equal(out, expected) tm.assert_series_equal(out["A"], expected["A"]) - def test_altering_series_clears_parent_cache(self): - # GH #33675 - df = DataFrame([[1, 2], [3, 4]], index=["a", "b"], columns=["A", "B"]) - ser = df["A"] - - # Adding a new entry to ser swaps in a new array, so "A" needs to - # be removed from df._item_cache - ser["c"] = 5 - assert len(ser) == 3 - assert df["A"] is not ser - assert len(df["A"]) == 2 - class TestChaining: def test_setitem_chained_setfault(self): diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index a41d7dec8b496..bf746a9eaa976 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -282,7 +282,7 @@ def test_empty_pyarrow(data): def test_multi_chunk_pyarrow() -> None: - pa = pytest.importorskip("pyarrow", "11.0.0") + pa = pytest.importorskip("pyarrow", "14.0.0") n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) names = ["n_legs"] table = pa.table([n_legs], names=names) @@ -488,7 +488,7 @@ def test_pandas_nullable_with_missing_values( ) -> None: # https://github.com/pandas-dev/pandas/issues/57643 # https://github.com/pandas-dev/pandas/issues/57664 - pa = pytest.importorskip("pyarrow", "11.0.0") + pa = pytest.importorskip("pyarrow", "14.0.0") import pyarrow.interchange as pai if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]": @@ -554,7 +554,7 @@ def test_pandas_nullable_without_missing_values( data: list, dtype: str, expected_dtype: str ) -> None: # https://github.com/pandas-dev/pandas/issues/57643 - pa = pytest.importorskip("pyarrow", "11.0.0") + pa = pytest.importorskip("pyarrow", "14.0.0") import pyarrow.interchange as pai if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]": diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index ac8ac0766f04d..11e6b99204aee 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -735,8 +735,6 @@ def test_reindex_items(self): mgr = create_mgr("a: f8; b: i8; c: f8; d: i8; e: f8; f: bool; g: f8-2") reindexed = mgr.reindex_axis(["g", "c", "a", "d"], axis=0) - # reindex_axis does not consolidate_inplace, as that risks failing to - # invalidate _item_cache assert not reindexed.is_consolidated() tm.assert_index_equal(reindexed.items, Index(["g", "c", "a", "d"])) diff --git 
a/pandas/tests/io/data/legacy_pickle/1.5.3/1.5.3_x86_64_win_3.11.13.pickle b/pandas/tests/io/data/legacy_pickle/1.5.3/1.5.3_x86_64_win_3.11.13.pickle new file mode 100644 index 0000000000000..d12fc5929ea5b Binary files /dev/null and b/pandas/tests/io/data/legacy_pickle/1.5.3/1.5.3_x86_64_win_3.11.13.pickle differ diff --git a/pandas/tests/io/data/legacy_pickle/2.0.3/2.0.3_AMD64_windows_3.11.12.pickle b/pandas/tests/io/data/legacy_pickle/2.0.3/2.0.3_AMD64_windows_3.11.12.pickle new file mode 100644 index 0000000000000..f508272d058f9 Binary files /dev/null and b/pandas/tests/io/data/legacy_pickle/2.0.3/2.0.3_AMD64_windows_3.11.12.pickle differ diff --git a/pandas/tests/io/data/legacy_pickle/2.1.4/2.1.4_AMD64_windows_3.11.12.pickle b/pandas/tests/io/data/legacy_pickle/2.1.4/2.1.4_AMD64_windows_3.11.12.pickle new file mode 100644 index 0000000000000..6f838839c2937 Binary files /dev/null and b/pandas/tests/io/data/legacy_pickle/2.1.4/2.1.4_AMD64_windows_3.11.12.pickle differ diff --git a/pandas/tests/io/data/legacy_pickle/2.2.3/2.2.3_AMD64_windows_3.11.12.pickle b/pandas/tests/io/data/legacy_pickle/2.2.3/2.2.3_AMD64_windows_3.11.12.pickle new file mode 100644 index 0000000000000..f98766fd4e05d Binary files /dev/null and b/pandas/tests/io/data/legacy_pickle/2.2.3/2.2.3_AMD64_windows_3.11.12.pickle differ diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 86682e8160762..a485578b139dc 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -11,8 +11,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -1395,8 +1393,7 @@ def test_unicode_name_in_footer(self): sf = fmt.SeriesFormatter(s, name="\u05e2\u05d1\u05e8\u05d9\u05ea") sf._get_footer() # should not raise exception - @pytest.mark.xfail(using_string_dtype(), reason="Fixup when arrow is default") - def test_east_asian_unicode_series(self): + def test_east_asian_unicode_series(self, using_infer_string): # not aligned properly because of east asian width # unicode index @@ -1409,6 +1406,8 @@ def test_east_asian_unicode_series(self): "ええええ D\ndtype: object", ] ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected # unicode values @@ -1422,7 +1421,8 @@ def test_east_asian_unicode_series(self): "dtype: object", ] ) - + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected # both @@ -1439,7 +1439,8 @@ def test_east_asian_unicode_series(self): "dtype: object", ] ) - + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected # unicode footer @@ -1452,6 +1453,8 @@ def test_east_asian_unicode_series(self): "ああ あ\nいいいい いい\nう ううう\n" "えええ ええええ\nName: おおおおおおお, dtype: object" ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected # MultiIndex @@ -1495,6 +1498,8 @@ def test_east_asian_unicode_series(self): "3 ええええ\n" "Name: おおおおおおお, Length: 4, dtype: object" ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected s.index = ["ああ", "いいいい", "う", "えええ"] @@ -1503,6 +1508,8 @@ def test_east_asian_unicode_series(self): "えええ ええええ\n" "Name: おおおおおおお, Length: 4, dtype: object" ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected # Enable Unicode 
option ----------------------------------------- @@ -1516,6 +1523,8 @@ def test_east_asian_unicode_series(self): "あ a\nいい bb\nううう CCC\n" "ええええ D\ndtype: object" ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected # unicode values @@ -1527,6 +1536,8 @@ def test_east_asian_unicode_series(self): "a あ\nbb いい\nc ううう\n" "ddd ええええ\ndtype: object" ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected # both s = Series( @@ -1539,6 +1550,8 @@ def test_east_asian_unicode_series(self): "う ううう\n" "えええ ええええ\ndtype: object" ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected # unicode footer @@ -1554,6 +1567,8 @@ def test_east_asian_unicode_series(self): "えええ ええええ\n" "Name: おおおおおおお, dtype: object" ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected # MultiIndex @@ -1599,6 +1614,8 @@ def test_east_asian_unicode_series(self): "3 ええええ\n" "Name: おおおおおおお, Length: 4, dtype: object" ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected s.index = ["ああ", "いいいい", "う", "えええ"] @@ -1608,6 +1625,8 @@ def test_east_asian_unicode_series(self): "えええ ええええ\n" "Name: おおおおおおお, Length: 4, dtype: object" ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected # ambiguous unicode @@ -1621,6 +1640,8 @@ def test_east_asian_unicode_series(self): "¡¡ ううう\n" "えええ ええええ\ndtype: object" ) + if using_infer_string: + expected = expected.replace("dtype: object", "dtype: str") assert repr(s) == expected def test_float_trim_zeros(self): @@ -1770,27 +1791,34 @@ def chck_ncols(self, s): ncolsizes = len({len(line.strip()) for line in lines}) assert ncolsizes == 1 - @pytest.mark.xfail(using_string_dtype(), reason="change when arrow is default") - def test_format_explicit(self): + def test_format_explicit(self, using_infer_string): test_sers = gen_series_formatting() with option_context("display.max_rows", 4, "display.show_dimensions", False): res = repr(test_sers["onel"]) exp = "0 a\n1 a\n ..\n98 a\n99 a\ndtype: object" + if using_infer_string: + exp = exp.replace("dtype: object", "dtype: str") assert exp == res res = repr(test_sers["twol"]) exp = "0 ab\n1 ab\n ..\n98 ab\n99 ab\ndtype: object" + if using_infer_string: + exp = exp.replace("dtype: object", "dtype: str") assert exp == res res = repr(test_sers["asc"]) exp = ( "0 a\n1 ab\n ... \n4 abcde\n5 " "abcdef\ndtype: object" ) + if using_infer_string: + exp = exp.replace("dtype: object", "dtype: str") assert exp == res res = repr(test_sers["desc"]) exp = ( "5 abcdef\n4 abcde\n ... 
\n1 ab\n0 " "a\ndtype: object" ) + if using_infer_string: + exp = exp.replace("dtype: object", "dtype: str") assert exp == res def test_ncols(self): diff --git a/pandas/tests/io/formats/test_ipython_compat.py b/pandas/tests/io/formats/test_ipython_compat.py index 8512f41396906..df202dbd7d9fb 100644 --- a/pandas/tests/io/formats/test_ipython_compat.py +++ b/pandas/tests/io/formats/test_ipython_compat.py @@ -22,7 +22,8 @@ def test_publishes(self, ip): last_obj = None for obj, expected in zip(objects, expected_keys): last_obj = obj - with opt: + with cf.option_context("display.html.table_schema", True): + # Can't reuse opt on all systems GH#58055 formatted = ipython.display_formatter.format(obj) assert set(formatted[0].keys()) == expected diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 6d762fdeb8d79..dd2d85c4755af 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -741,3 +741,140 @@ def test_to_csv_iterative_compression_buffer(compression): pd.read_csv(buffer, compression=compression, index_col=0), df ) assert not buffer.closed + + +def test_new_style_float_format_basic(): + df = DataFrame({"A": [1234.56789, 9876.54321]}) + result = df.to_csv(float_format="{:.2f}", lineterminator="\n") + expected = ",A\n0,1234.57\n1,9876.54\n" + assert result == expected + + +def test_new_style_float_format_thousands(): + df = DataFrame({"A": [1234.56789, 9876.54321]}) + result = df.to_csv(float_format="{:,.2f}", lineterminator="\n") + expected = ',A\n0,"1,234.57"\n1,"9,876.54"\n' + assert result == expected + + +def test_new_style_scientific_format(): + df = DataFrame({"A": [0.000123, 0.000456]}) + result = df.to_csv(float_format="{:.2e}", lineterminator="\n") + expected = ",A\n0,1.23e-04\n1,4.56e-04\n" + assert result == expected + + +def test_new_style_with_nan(): + df = DataFrame({"A": [1.23, np.nan, 4.56]}) + result = df.to_csv(float_format="{:.2f}", na_rep="NA", lineterminator="\n") + expected = ",A\n0,1.23\n1,NA\n2,4.56\n" + assert result == expected + + +def test_new_style_with_mixed_types(): + df = DataFrame({"A": [1.23, 4.56], "B": ["x", "y"]}) + result = df.to_csv(float_format="{:.2f}", lineterminator="\n") + expected = ",A,B\n0,1.23,x\n1,4.56,y\n" + assert result == expected + + +def test_new_style_with_mixed_types_in_column(): + df = DataFrame({"A": [1.23, "text", 4.56]}) + result = df.to_csv(float_format="{:.2f}", lineterminator="\n") + expected = ",A\n0,1.23\n1,text\n2,4.56\n" + assert result == expected + + +def test_invalid_new_style_format_missing_brace(): + df = DataFrame({"A": [1.23]}) + with pytest.raises(ValueError, match="Invalid new-style format string '{:.2f"): + df.to_csv(float_format="{:.2f") + + +def test_invalid_new_style_format_specifier(): + df = DataFrame({"A": [1.23]}) + with pytest.raises(ValueError, match="Invalid new-style format string '{:.2z}'"): + df.to_csv(float_format="{:.2z}") + + +def test_old_style_format_compatibility(): + df = DataFrame({"A": [1234.56789, 9876.54321]}) + result = df.to_csv(float_format="%.2f", lineterminator="\n") + expected = ",A\n0,1234.57\n1,9876.54\n" + assert result == expected + + +def test_callable_float_format_compatibility(): + df = DataFrame({"A": [1234.56789, 9876.54321]}) + result = df.to_csv(float_format=lambda x: f"{x:,.2f}", lineterminator="\n") + expected = ',A\n0,"1,234.57"\n1,"9,876.54"\n' + assert result == expected + + +def test_no_float_format(): + df = DataFrame({"A": [1.23, 4.56]}) + result = df.to_csv(float_format=None, 
lineterminator="\n") + expected = ",A\n0,1.23\n1,4.56\n" + assert result == expected + + +def test_large_numbers(): + df = DataFrame({"A": [1e308, 2e308]}) + result = df.to_csv(float_format="{:.2e}", lineterminator="\n") + expected = ",A\n0,1.00e+308\n1,inf\n" + assert result == expected + + +def test_zero_and_negative(): + df = DataFrame({"A": [0.0, -1.23456]}) + result = df.to_csv(float_format="{:+.2f}", lineterminator="\n") + expected = ",A\n0,+0.00\n1,-1.23\n" + assert result == expected + + +def test_unicode_format(): + df = DataFrame({"A": [1.23, 4.56]}) + result = df.to_csv(float_format="{:.2f}€", encoding="utf-8", lineterminator="\n") + expected = ",A\n0,1.23€\n1,4.56€\n" + assert result == expected + + +def test_empty_dataframe(): + df = DataFrame({"A": []}) + result = df.to_csv(float_format="{:.2f}", lineterminator="\n") + expected = ",A\n" + assert result == expected + + +def test_multi_column_float(): + df = DataFrame({"A": [1.23, 4.56], "B": [7.89, 0.12]}) + result = df.to_csv(float_format="{:.2f}", lineterminator="\n") + expected = ",A,B\n0,1.23,7.89\n1,4.56,0.12\n" + assert result == expected + + +def test_invalid_float_format_type(): + df = DataFrame({"A": [1.23]}) + with pytest.raises(ValueError, match="float_format must be a string or callable"): + df.to_csv(float_format=123) + + +def test_new_style_with_inf(): + df = DataFrame({"A": [1.23, np.inf, -np.inf]}) + result = df.to_csv(float_format="{:.2f}", na_rep="NA", lineterminator="\n") + expected = ",A\n0,1.23\n1,inf\n2,-inf\n" + assert result == expected + + +def test_new_style_with_precision_edge(): + df = DataFrame({"A": [1.23456789]}) + result = df.to_csv(float_format="{:.10f}", lineterminator="\n") + expected = ",A\n0,1.2345678900\n" + assert result == expected + + +def test_new_style_with_template(): + df = DataFrame({"A": [1234.56789]}) + result = df.to_csv(float_format="Value: {:,.2f}", lineterminator="\n") + expected = ',A\n0,"Value: 1,234.57"\n' + assert result == expected diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py index 9bfd8eb9d51d5..9cb50b03e223a 100644 --- a/pandas/tests/io/generate_legacy_storage_files.py +++ b/pandas/tests/io/generate_legacy_storage_files.py @@ -147,6 +147,7 @@ def create_pickle_data(): "float": Index(np.arange(10, dtype=np.float64)), "uint": Index(np.arange(10, dtype=np.uint64)), "timedelta": timedelta_range("00:00:00", freq="30min", periods=10), + "string": Index(["foo", "bar", "baz", "qux", "quux"], dtype="string"), } index["range"] = RangeIndex(10) @@ -185,6 +186,7 @@ def create_pickle_data(): "dt": Series(date_range("20130101", periods=5)), "dt_tz": Series(date_range("20130101", periods=5, tz="US/Eastern")), "period": Series([Period("2000Q1")] * 5), + "string": Series(["foo", "bar", "baz", "qux", "quux"], dtype="string"), } mixed_dup_df = DataFrame(data) @@ -233,6 +235,12 @@ def create_pickle_data(): }, index=range(5), ), + "string": DataFrame( + { + "A": Series(["foo", "bar", "baz", "qux", "quux"], dtype="string"), + "B": Series(["one", "two", "one", "two", "three"], dtype="string"), + } + ), } cat = { diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 98f437e757e31..d895fd6e6770c 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1566,11 +1566,8 @@ def test_from_json_to_json_table_dtypes(self): result = read_json(StringIO(dfjson), orient="table") tm.assert_frame_equal(result, expected) - # TODO: We are casting to string which 
coerces None to NaN before casting back
-    # to object, ending up with incorrect na values
-    @pytest.mark.xfail(using_string_dtype(), reason="incorrect na conversion")
     @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"])
-    def test_to_json_from_json_columns_dtypes(self, orient):
+    def test_to_json_from_json_columns_dtypes(self, orient, using_infer_string):
         # GH21892 GH33205
         expected = DataFrame.from_dict(
             {
@@ -1591,6 +1588,11 @@ def test_to_json_from_json_columns_dtypes(self, orient):
         with tm.assert_produces_warning(FutureWarning, match=msg):
             dfjson = expected.to_json(orient=orient)
 
+        if using_infer_string:
+            # When this is read back in it is inferred to "str" dtype which
+            # uses NaN instead of None.
+            expected.loc[0, "Object"] = np.nan
+
         result = read_json(
             StringIO(dfjson),
             orient=orient,
diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py
index 11a30a26f91ef..469fe84a80dcd 100644
--- a/pandas/tests/io/parser/test_c_parser_only.py
+++ b/pandas/tests/io/parser/test_c_parser_only.py
@@ -19,7 +19,6 @@
 import pytest
 
 from pandas.compat import WASM
-from pandas.compat.numpy import np_version_gte1p24
 from pandas.errors import (
     ParserError,
     ParserWarning,
@@ -90,10 +89,9 @@ def test_dtype_and_names_error(c_parser_only):
 3.0 3
 """
     # fallback casting, but not castable
-    warning = RuntimeWarning if np_version_gte1p24 else None
     if not WASM:  # no fp exception support in wasm
         with pytest.raises(ValueError, match="cannot safely convert"):
-            with tm.assert_produces_warning(warning, check_stacklevel=False):
+            with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False):
                 parser.read_csv(
                     StringIO(data),
                     sep=r"\s+",
diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py
index eeb783f1957b7..ea44564e3f3e1 100644
--- a/pandas/tests/io/parser/test_textreader.py
+++ b/pandas/tests/io/parser/test_textreader.py
@@ -24,6 +24,12 @@
 )
 from pandas.io.parsers.c_parser_wrapper import ensure_dtype_objs
 
+# The only non-test way that TextReader gets called has na_values and na_fvalues
+# either both sets or both dicts, and the code assumes this is the case.
+# But the default argument in its __init__ is None, so we have to pass these
+# explicitly in tests.
+_na_value_kwargs: dict[str, set] = {"na_values": set(), "na_fvalues": set()}
+
 
 class TestTextReader:
     @pytest.fixture
@@ -32,20 +38,20 @@ def csv_path(self, datapath):
 
     def test_file_handle(self, csv_path):
         with open(csv_path, "rb") as f:
-            reader = TextReader(f)
+            reader = TextReader(f, **_na_value_kwargs)
             reader.read()
 
     def test_file_handle_mmap(self, csv_path):
         # this was never using memory_map=True
         with open(csv_path, "rb") as f:
-            reader = TextReader(f, header=None)
+            reader = TextReader(f, header=None, **_na_value_kwargs)
             reader.read()
 
     def test_StringIO(self, csv_path):
         with open(csv_path, "rb") as f:
             text = f.read()
         src = BytesIO(text)
-        reader = TextReader(src, header=None)
+        reader = TextReader(src, header=None, **_na_value_kwargs)
         reader.read()
 
     def test_encoding_mismatch_warning(self, csv_path):
@@ -58,14 +64,16 @@ def test_encoding_mismatch_warning(self, csv_path):
 
     def test_string_factorize(self):
         # should this be optional?
data = "a\nb\na\nb\na" - reader = TextReader(StringIO(data), header=None) + reader = TextReader(StringIO(data), header=None, **_na_value_kwargs) result = reader.read() assert len(set(map(id, result[0]))) == 2 def test_skipinitialspace(self): data = "a, b\na, b\na, b\na, b" - reader = TextReader(StringIO(data), skipinitialspace=True, header=None) + reader = TextReader( + StringIO(data), skipinitialspace=True, header=None, **_na_value_kwargs + ) result = reader.read() tm.assert_numpy_array_equal( @@ -78,7 +86,7 @@ def test_skipinitialspace(self): def test_parse_booleans(self): data = "True\nFalse\nTrue\nTrue" - reader = TextReader(StringIO(data), header=None) + reader = TextReader(StringIO(data), header=None, **_na_value_kwargs) result = reader.read() assert result[0].dtype == np.bool_ @@ -86,7 +94,9 @@ def test_parse_booleans(self): def test_delimit_whitespace(self): data = 'a b\na\t\t "b"\n"a"\t \t b' - reader = TextReader(StringIO(data), delim_whitespace=True, header=None) + reader = TextReader( + StringIO(data), delim_whitespace=True, header=None, **_na_value_kwargs + ) result = reader.read() tm.assert_numpy_array_equal( @@ -99,7 +109,7 @@ def test_delimit_whitespace(self): def test_embedded_newline(self): data = 'a\n"hello\nthere"\nthis' - reader = TextReader(StringIO(data), header=None) + reader = TextReader(StringIO(data), header=None, **_na_value_kwargs) result = reader.read() expected = np.array(["a", "hello\nthere", "this"], dtype=np.object_) @@ -108,7 +118,9 @@ def test_embedded_newline(self): def test_euro_decimal(self): data = "12345,67\n345,678" - reader = TextReader(StringIO(data), delimiter=":", decimal=",", header=None) + reader = TextReader( + StringIO(data), delimiter=":", decimal=",", header=None, **_na_value_kwargs + ) result = reader.read() expected = np.array([12345.67, 345.678]) @@ -117,7 +129,13 @@ def test_euro_decimal(self): def test_integer_thousands(self): data = "123,456\n12,500" - reader = TextReader(StringIO(data), delimiter=":", thousands=",", header=None) + reader = TextReader( + StringIO(data), + delimiter=":", + thousands=",", + header=None, + **_na_value_kwargs, + ) result = reader.read() expected = np.array([123456, 12500], dtype=np.int64) @@ -138,7 +156,9 @@ def test_skip_bad_lines(self): # too many lines, see #2430 for why data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r" - reader = TextReader(StringIO(data), delimiter=":", header=None) + reader = TextReader( + StringIO(data), delimiter=":", header=None, **_na_value_kwargs + ) msg = r"Error tokenizing data\. 
C error: Expected 3 fields in line 4, saw 4" with pytest.raises(parser.ParserError, match=msg): reader.read() @@ -148,6 +168,7 @@ def test_skip_bad_lines(self): delimiter=":", header=None, on_bad_lines=2, # Skip + **_na_value_kwargs, ) result = reader.read() expected = { @@ -163,13 +184,14 @@ def test_skip_bad_lines(self): delimiter=":", header=None, on_bad_lines=1, # Warn + **_na_value_kwargs, ) reader.read() def test_header_not_enough_lines(self): data = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6" - reader = TextReader(StringIO(data), delimiter=",", header=2) + reader = TextReader(StringIO(data), delimiter=",", header=2, **_na_value_kwargs) header = reader.header expected = [["a", "b", "c"]] assert header == expected @@ -185,7 +207,13 @@ def test_header_not_enough_lines(self): def test_escapechar(self): data = '\\"hello world"\n\\"hello world"\n\\"hello world"' - reader = TextReader(StringIO(data), delimiter=",", header=None, escapechar="\\") + reader = TextReader( + StringIO(data), + delimiter=",", + header=None, + escapechar="\\", + **_na_value_kwargs, + ) result = reader.read() expected = {0: np.array(['"hello world"'] * 3, dtype=object)} assert_array_dicts_equal(result, expected) @@ -208,7 +236,9 @@ def test_numpy_string_dtype(self): def _make_reader(**kwds): if "dtype" in kwds: kwds["dtype"] = ensure_dtype_objs(kwds["dtype"]) - return TextReader(StringIO(data), delimiter=",", header=None, **kwds) + return TextReader( + StringIO(data), delimiter=",", header=None, **kwds, **_na_value_kwargs + ) reader = _make_reader(dtype="S5,i4") result = reader.read() @@ -237,7 +267,7 @@ def test_pass_dtype(self): def _make_reader(**kwds): if "dtype" in kwds: kwds["dtype"] = ensure_dtype_objs(kwds["dtype"]) - return TextReader(StringIO(data), delimiter=",", **kwds) + return TextReader(StringIO(data), delimiter=",", **kwds, **_na_value_kwargs) reader = _make_reader(dtype={"one": "u1", 1: "S1"}) result = reader.read() @@ -263,7 +293,7 @@ def test_usecols(self): 10,11,12""" def _make_reader(**kwds): - return TextReader(StringIO(data), delimiter=",", **kwds) + return TextReader(StringIO(data), delimiter=",", **kwds, **_na_value_kwargs) reader = _make_reader(usecols=(1, 2)) result = reader.read() @@ -296,14 +326,14 @@ def _make_reader(**kwds): ) def test_cr_delimited(self, text, kwargs): nice_text = text.replace("\r", "\r\n") - result = TextReader(StringIO(text), **kwargs).read() - expected = TextReader(StringIO(nice_text), **kwargs).read() + result = TextReader(StringIO(text), **kwargs, **_na_value_kwargs).read() + expected = TextReader(StringIO(nice_text), **kwargs, **_na_value_kwargs).read() assert_array_dicts_equal(result, expected) def test_empty_field_eof(self): data = "a,b,c\n1,2,3\n4,," - result = TextReader(StringIO(data), delimiter=",").read() + result = TextReader(StringIO(data), delimiter=",", **_na_value_kwargs).read() expected = { 0: np.array([1, 4], dtype=np.int64), diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py index c5cac5a5caf09..80e7664d1969e 100644 --- a/pandas/tests/io/pytables/test_complex.py +++ b/pandas/tests/io/pytables/test_complex.py @@ -82,6 +82,9 @@ def test_complex_mixed_fixed(tmp_path, setup_path): tm.assert_frame_equal(df, reread) +@pytest.mark.filterwarnings( + "ignore:`alltrue` is deprecated as of NumPy 1.25.0:DeprecationWarning" +) def test_complex_mixed_table(tmp_path, setup_path): complex64 = np.array( [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 diff --git 
a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 9192804e49bd1..7bfc392af55f8 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -42,6 +42,9 @@ def _compare_with_tz(a, b): gettz_pytz = lambda x: x +@pytest.mark.filterwarnings( + "ignore:`alltrue` is deprecated as of NumPy 1.25.0:DeprecationWarning" +) @pytest.mark.parametrize("gettz", [gettz_dateutil, gettz_pytz]) def test_append_with_timezones(setup_path, gettz): # as columns @@ -332,6 +335,9 @@ def test_dst_transitions(setup_path): tm.assert_frame_equal(result, df) +@pytest.mark.filterwarnings( + "ignore:`alltrue` is deprecated as of NumPy 1.25.0:DeprecationWarning" +) def test_read_with_where_tz_aware_index(tmp_path, setup_path): # GH 11926 periods = 10 diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index fefed34894cf3..9f9304c8d1664 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -13,7 +13,6 @@ from pandas.compat import is_platform_windows from pandas.compat.pyarrow import ( - pa_version_under11p0, pa_version_under13p0, pa_version_under15p0, pa_version_under17p0, @@ -729,7 +728,7 @@ def test_to_bytes_without_path_or_buf_provided(self, pa, df_full): expected = df_full.copy() expected.loc[1, "string_with_nan"] = None - if pa_version_under11p0: + if pa_version_under13p0: expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( "M8[ns]" ) @@ -980,15 +979,12 @@ def test_additional_extension_types(self, pa): def test_timestamp_nanoseconds(self, pa): # with version 2.6, pyarrow defaults to writing the nanoseconds, so - # this should work without error - # Note in previous pyarrows(<7.0.0), only the pseudo-version 2.0 was available + # this should work without error, even for pyarrow < 13 ver = "2.6" df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1ns", periods=10)}) check_round_trip(df, pa, write_kwargs={"version": ver}) def test_timezone_aware_index(self, pa, timezone_aware_date_list): - pytest.importorskip("pyarrow", "11.0.0") - idx = 5 * [timezone_aware_date_list] df = pd.DataFrame(index=idx, data={"index_as_col": idx}) @@ -1003,7 +999,7 @@ def test_timezone_aware_index(self, pa, timezone_aware_date_list): # this use-case sets the resolution to 1 minute expected = df[:] - if pa_version_under11p0: + if pa_version_under13p0: expected.index = expected.index.as_unit("ns") if timezone_aware_date_list.tzinfo != datetime.timezone.utc: # pyarrow returns pytz.FixedOffset while pandas constructs datetime.timezone @@ -1140,7 +1136,6 @@ def test_string_inference(self, tmp_path, pa, using_infer_string): ) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif(pa_version_under11p0, reason="not supported before 11.0") def test_roundtrip_decimal(self, tmp_path, pa): # GH#54768 import pyarrow as pa @@ -1189,7 +1184,7 @@ def test_infer_string_large_string_type(self, tmp_path, pa): def test_non_nanosecond_timestamps(self, temp_file): # GH#49236 - pa = pytest.importorskip("pyarrow", "11.0.0") + pa = pytest.importorskip("pyarrow", "13.0.0") pq = pytest.importorskip("pyarrow.parquet") arr = pa.array([datetime.datetime(1600, 1, 1)], type=pa.timestamp("us")) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 4a6a5635eb68c..6f4c1602a5e64 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -8,6 +8,7 @@ time, timedelta, ) +from decimal import Decimal from io import StringIO from pathlib import Path import sqlite3 
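The interchange and parquet hunks above replace the pa_version_under11p0 gates with pytest's collection-time skip, raising the floor to pyarrow 13.0.0 (14.0.0 for the interchange tests). For reference, the pattern relies on importorskip's optional minimum-version argument:

    import pytest

    # Returns the imported module, or skips the enclosing test/module when
    # pyarrow is missing or older than the requested minimum version.
    pa = pytest.importorskip("pyarrow", "13.0.0")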
@@ -1038,6 +1039,12 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): def test_dataframe_to_sql_arrow_dtypes_missing(conn, request, nulls_fixture): # GH 52046 pytest.importorskip("pyarrow") + if isinstance(nulls_fixture, Decimal): + pytest.skip( + # GH#61773 + reason="Decimal('NaN') not supported in constructor for timestamp dtype" + ) + df = DataFrame( { "datetime": pd.array( diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index b155c0cca4aa6..90fda2c10962b 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1030,7 +1030,13 @@ def test_big_dates(self, datapath, temp_file): # {c : c[-2:] for c in columns} path = temp_file expected.index.name = "index" - expected.to_stata(path, convert_dates=date_conversion) + msg = ( + "Converting object-dtype columns of datetimes to datetime64 " + "when writing to stata is deprecated" + ) + exp_object = expected.astype(object) + with tm.assert_produces_warning(FutureWarning, match=msg): + exp_object.to_stata(path, convert_dates=date_conversion) written_and_read_again = self.read_dta(path) tm.assert_frame_equal( diff --git a/pandas/tests/plotting/frame/test_frame_subplots.py b/pandas/tests/plotting/frame/test_frame_subplots.py index b44725a01fe23..7f4009bdb5e66 100644 --- a/pandas/tests/plotting/frame/test_frame_subplots.py +++ b/pandas/tests/plotting/frame/test_frame_subplots.py @@ -6,7 +6,6 @@ import pytest from pandas.compat import is_platform_linux -from pandas.compat.numpy import np_version_gte1p24 import pandas as pd from pandas import ( @@ -423,7 +422,7 @@ def test_subplots_dup_columns_secondary_y_no_subplot(self): assert len(ax.right_ax.lines) == 5 @pytest.mark.xfail( - np_version_gte1p24 and is_platform_linux(), + is_platform_linux(), reason="Weird rounding problems", strict=False, ) @@ -438,7 +437,7 @@ def test_bar_log_no_subplots(self): tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) @pytest.mark.xfail( - np_version_gte1p24 and is_platform_linux(), + is_platform_linux(), reason="Weird rounding problems", strict=False, ) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 98e70f770896c..eb1b4f7d85a68 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -7,7 +7,6 @@ import pytest from pandas.compat import is_platform_linux -from pandas.compat.numpy import np_version_gte1p24 import pandas.util._test_decorators as td import pandas as pd @@ -277,7 +276,7 @@ def test_line_use_index_false_diff_var(self): assert label2 == "" @pytest.mark.xfail( - np_version_gte1p24 and is_platform_linux(), + is_platform_linux(), reason="Weird rounding problems", strict=False, ) @@ -290,7 +289,7 @@ def test_bar_log(self, axis, meth): tm.assert_numpy_array_equal(getattr(ax, axis).get_ticklocs(), expected) @pytest.mark.xfail( - np_version_gte1p24 and is_platform_linux(), + is_platform_linux(), reason="Weird rounding problems", strict=False, ) diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 070c756e8c928..1482da8a074eb 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -289,7 +289,7 @@ def test_margin_dropna4(self): # GH: 10772: Keep np.nan in result with dropna=False df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) actual = crosstab(df.a, df.b, margins=True, dropna=False) - expected = DataFrame([[1, 0, 1.0], [1, 3, 4.0], [0, 1, np.nan], [2, 4, 6.0]]) + expected = DataFrame([[1, 0, 1], [1, 3, 
4], [0, 1, 1], [2, 4, 6]]) expected.index = Index([1.0, 2.0, np.nan, "All"], name="a") expected.columns = Index([3, 4, "All"], name="b") tm.assert_frame_equal(actual, expected) @@ -301,11 +301,11 @@ def test_margin_dropna5(self): ) actual = crosstab(df.a, df.b, margins=True, dropna=False) expected = DataFrame( - [[1, 0, 0, 1.0], [0, 1, 0, 1.0], [0, 3, 1, np.nan], [1, 4, 0, 6.0]] + [[1, 0, 0, 1.0], [0, 1, 0, 1.0], [0, 3, 1, 4.0], [1, 4, 1, 6.0]] ) expected.index = Index([1.0, 2.0, np.nan, "All"], name="a") expected.columns = Index([3.0, 4.0, np.nan, "All"], name="b") - tm.assert_frame_equal(actual, expected) + tm.assert_frame_equal(actual, expected, check_dtype=False) def test_margin_dropna6(self): # GH: 10772: Keep np.nan in result with dropna=False @@ -326,7 +326,7 @@ def test_margin_dropna6(self): names=["b", "c"], ) expected = DataFrame( - [[1, 0, 1, 0, 0, 0, 2], [2, 0, 1, 1, 0, 1, 5], [3, 0, 2, 1, 0, 0, 7]], + [[1, 0, 1, 0, 0, 0, 2], [2, 0, 1, 1, 0, 1, 5], [3, 0, 2, 1, 0, 1, 7]], columns=m, ) expected.index = Index(["bar", "foo", "All"], name="a") @@ -349,7 +349,7 @@ def test_margin_dropna6(self): [0, 0, np.nan], [2, 0, 2.0], [1, 1, 2.0], - [0, 1, np.nan], + [0, 1, 1.0], [5, 2, 7.0], ], index=m, diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index c7b7992a78232..dfb691c785404 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -333,9 +333,7 @@ def test_no_prefix_string_cats_default_category( ): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) result = from_dummies(dummies, default_category=default_category) - expected = DataFrame(expected) - if using_infer_string: - expected[""] = expected[""].astype("str") + expected = DataFrame(expected, dtype=dummies.columns.dtype) tm.assert_frame_equal(result, expected) @@ -449,3 +447,31 @@ def test_maintain_original_index(): result = from_dummies(df) expected = DataFrame({"": list("abca")}, index=list("abcd")) tm.assert_frame_equal(result, expected) + + +def test_int_columns_with_float_default(): + # https://github.com/pandas-dev/pandas/pull/60694 + df = DataFrame( + { + 3: [1, 0, 0], + 4: [0, 1, 0], + }, + ) + with pytest.raises(ValueError, match="Trying to coerce float values to integers"): + from_dummies(df, default_category=0.5) + + +def test_object_dtype_preserved(): + # https://github.com/pandas-dev/pandas/pull/60694 + # When the input has object dtype, the result should as + # well even when infer_string is True. 
+ df = DataFrame( + { + "x": [1, 0, 0], + "y": [0, 1, 0], + }, + ) + df.columns = df.columns.astype("object") + result = from_dummies(df, default_category="z") + expected = DataFrame({"": ["x", "y", "z"]}, dtype="object") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 2a58815c1cece..a0ad91e679bcc 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -11,8 +11,6 @@ from pandas._config import using_string_dtype -from pandas.compat.numpy import np_version_gte1p25 - import pandas as pd from pandas import ( ArrowDtype, @@ -2134,13 +2132,6 @@ def test_pivot_string_func_vs_func(self, f, f_numpy, data): data = data.drop(columns="C") result = pivot_table(data, index="A", columns="B", aggfunc=f) expected = pivot_table(data, index="A", columns="B", aggfunc=f_numpy) - - if not np_version_gte1p25 and isinstance(f_numpy, list): - # Prior to 1.25, np.min/np.max would come through as amin and amax - mapper = {"amin": "min", "amax": "max", "sum": "sum", "mean": "mean"} - expected.columns = expected.columns.map( - lambda x: (mapper[x[0]], x[1], x[2]) - ) tm.assert_frame_equal(result, expected) @pytest.mark.slow @@ -2594,6 +2585,36 @@ def test_pivot_table_values_as_two_params( expected = DataFrame(data=e_data, index=e_index, columns=e_cols) tm.assert_frame_equal(result, expected) + def test_pivot_table_margins_include_nan_groups(self): + # GH#61509 + df = DataFrame( + { + "i": [1, 2, 3], + "g1": ["a", "b", "b"], + "g2": ["x", None, None], + } + ) + + result = df.pivot_table( + index="g1", + columns="g2", + values="i", + aggfunc="count", + dropna=False, + margins=True, + ) + + expected = DataFrame( + { + "x": {"a": 1.0, "b": np.nan, "All": 1.0}, + np.nan: {"a": np.nan, "b": 2.0, "All": 2.0}, + "All": {"a": 1.0, "b": 2.0, "All": 3.0}, + } + ) + expected.index.name = "g1" + expected.columns.name = "g2" + tm.assert_frame_equal(result, expected, check_dtype=False) + class TestPivot: def test_pivot(self): diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 287b7557f50f9..d2bc8f521d7bb 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -167,6 +167,12 @@ def test_logical_and(): assert False & NA is False assert NA & NA is NA + # GH#58427 + assert NA & np.bool_(True) is NA + assert np.bool_(True) & NA is NA + assert NA & np.bool_(False) is False + assert np.bool_(False) & NA is False + msg = "unsupported operand type" with pytest.raises(TypeError, match=msg): NA & 5 @@ -179,6 +185,12 @@ def test_logical_or(): assert False | NA is NA assert NA | NA is NA + # GH#58427 + assert NA | np.bool_(True) is True + assert np.bool_(True) | NA is True + assert NA | np.bool_(False) is NA + assert np.bool_(False) | NA is NA + msg = "unsupported operand type" with pytest.raises(TypeError, match=msg): NA | 5 @@ -191,6 +203,12 @@ def test_logical_xor(): assert False ^ NA is NA assert NA ^ NA is NA + # GH#58427 + assert NA ^ np.bool_(True) is NA + assert np.bool_(True) ^ NA is NA + assert NA ^ np.bool_(False) is NA + assert np.bool_(False) ^ NA is NA + msg = "unsupported operand type" with pytest.raises(TypeError, match=msg): NA ^ 5 diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index b20df43dd49a6..b7a73da7d58cd 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -9,7 +9,6 @@ import pytest from pandas._libs.tslibs import iNaT -from pandas.compat.numpy import 
+        df = DataFrame(
+            {
+                "i": [1, 2, 3],
+                "g1": ["a", "b", "b"],
+                "g2": ["x", None, None],
+            }
+        )
+
+        result = df.pivot_table(
+            index="g1",
+            columns="g2",
+            values="i",
+            aggfunc="count",
+            dropna=False,
+            margins=True,
+        )
+
+        expected = DataFrame(
+            {
+                "x": {"a": 1.0, "b": np.nan, "All": 1.0},
+                np.nan: {"a": np.nan, "b": 2.0, "All": 2.0},
+                "All": {"a": 1.0, "b": 2.0, "All": 3.0},
+            }
+        )
+        expected.index.name = "g1"
+        expected.columns.name = "g2"
+        tm.assert_frame_equal(result, expected, check_dtype=False)
+
 
 class TestPivot:
     def test_pivot(self):
diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py
index 287b7557f50f9..d2bc8f521d7bb 100644
--- a/pandas/tests/scalar/test_na_scalar.py
+++ b/pandas/tests/scalar/test_na_scalar.py
@@ -167,6 +167,12 @@ def test_logical_and():
     assert False & NA is False
     assert NA & NA is NA
 
+    # GH#58427
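+    # np.bool_ operands should follow the same Kleene logic as Python bools.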
+    assert NA & np.bool_(True) is NA
+    assert np.bool_(True) & NA is NA
+    assert NA & np.bool_(False) is False
+    assert np.bool_(False) & NA is False
+
     msg = "unsupported operand type"
     with pytest.raises(TypeError, match=msg):
         NA & 5
@@ -179,6 +185,12 @@ def test_logical_or():
     assert False | NA is NA
     assert NA | NA is NA
 
+    # GH#58427
+    assert NA | np.bool_(True) is True
+    assert np.bool_(True) | NA is True
+    assert NA | np.bool_(False) is NA
+    assert np.bool_(False) | NA is NA
+
     msg = "unsupported operand type"
     with pytest.raises(TypeError, match=msg):
         NA | 5
@@ -191,6 +203,12 @@ def test_logical_xor():
     assert False ^ NA is NA
     assert NA ^ NA is NA
 
+    # GH#58427
+    assert NA ^ np.bool_(True) is NA
+    assert np.bool_(True) ^ NA is NA
+    assert NA ^ np.bool_(False) is NA
+    assert np.bool_(False) ^ NA is NA
+
     msg = "unsupported operand type"
     with pytest.raises(TypeError, match=msg):
         NA ^ 5
diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py
index b20df43dd49a6..b7a73da7d58cd 100644
--- a/pandas/tests/scalar/test_nat.py
+++ b/pandas/tests/scalar/test_nat.py
@@ -9,7 +9,6 @@
 import pytest
 
 from pandas._libs.tslibs import iNaT
-from pandas.compat.numpy import np_version_gte1p24p3
 
 from pandas import (
     DatetimeIndex,
@@ -537,24 +536,10 @@ def test_to_numpy_alias():
     [
         Timedelta(0),
         Timedelta(0).to_pytimedelta(),
-        pytest.param(
-            Timedelta(0).to_timedelta64(),
-            marks=pytest.mark.xfail(
-                not np_version_gte1p24p3,
-                reason="td64 doesn't return NotImplemented, see numpy#17017",
-                # When this xfail is fixed, test_nat_comparisons_numpy
-                # can be removed.
-            ),
-        ),
+        Timedelta(0).to_timedelta64(),
         Timestamp(0),
         Timestamp(0).to_pydatetime(),
-        pytest.param(
-            Timestamp(0).to_datetime64(),
-            marks=pytest.mark.xfail(
-                not np_version_gte1p24p3,
-                reason="dt64 doesn't return NotImplemented, see numpy#17017",
-            ),
-        ),
+        Timestamp(0).to_datetime64(),
         Timestamp(0).tz_localize("UTC"),
         NaT,
     ],
@@ -570,18 +555,6 @@ def test_nat_comparisons(compare_operators_no_eq_ne, other):
     assert op(other, NaT) is False
 
 
-@pytest.mark.parametrize("other", [np.timedelta64(0, "ns"), np.datetime64("now", "ns")])
-def test_nat_comparisons_numpy(other):
-    # Once numpy#17017 is fixed and the xfailed cases in test_nat_comparisons
-    # pass, this test can be removed
-    assert not NaT == other
-    assert NaT != other
-    assert not NaT < other
-    assert not NaT > other
-    assert not NaT <= other
-    assert not NaT >= other
-
-
 @pytest.mark.parametrize("other_and_type", [("foo", "str"), (2, "int"), (2.0, "float")])
 @pytest.mark.parametrize(
     "symbol_and_op",
diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py
index 2c97c4a32e0aa..09ca5a71503ad 100644
--- a/pandas/tests/scalar/timestamp/test_constructors.py
+++ b/pandas/tests/scalar/timestamp/test_constructors.py
@@ -478,6 +478,13 @@ def test_now_today_unit(self, method):
 
 
 class TestTimestampConstructors:
+    def test_disallow_dt64_with_weird_unit(self):
+        # GH#25611
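+        # A datetime64 unit with a multiplier, e.g. "500m" (units of
+        # 500 minutes), should raise instead of being silently accepted.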
+ """ + from decimal import Decimal + + from pandas.tests.extension.decimal import to_decimal + + s = Series(to_decimal([1, 2.5, 3]), dtype="decimal") + + expected = Series( + [ + 3, + Decimal("2.166666666666666666666666667"), + Decimal("0.8498365855987974716713706849"), + Decimal("1"), + Decimal("3"), + ], + index=["count", "mean", "std", "min", "max"], + dtype="object", + ) + + result = s.describe(percentiles=[]) + tm.assert_series_equal(result, expected) + def test_describe_with_tz(self, tz_naive_fixture): # GH 21332 tz = tz_naive_fixture @@ -167,7 +191,7 @@ def test_numeric_result_dtype(self, any_numeric_dtype): dtype = "complex128" if is_complex_dtype(any_numeric_dtype) else None ser = Series([0, 1], dtype=any_numeric_dtype) - if dtype == "complex128" and np_version_gte1p25: + if dtype == "complex128": with pytest.raises( TypeError, match=r"^a must be an array of real numbers$" ): diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index e7d284bd47e21..35a9742d653db 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -10,7 +10,6 @@ import pytest from pandas._libs import lib -from pandas._libs.tslibs import IncompatibleFrequency import pandas as pd from pandas import ( @@ -172,10 +171,6 @@ def test_add_series_with_period_index(self): result = ts + _permute(ts[::2]) tm.assert_series_equal(result, expected) - msg = "Input has different freq=D from Period\\(freq=Y-DEC\\)" - with pytest.raises(IncompatibleFrequency, match=msg): - ts + ts.asfreq("D", how="end") - @pytest.mark.parametrize( "target_add,input_value,expected_value", [ diff --git a/pandas/tests/tslibs/test_timedeltas.py b/pandas/tests/tslibs/test_timedeltas.py index 4784a6d0d600d..8e27acdd7af75 100644 --- a/pandas/tests/tslibs/test_timedeltas.py +++ b/pandas/tests/tslibs/test_timedeltas.py @@ -104,6 +104,10 @@ def test_kwarg_assertion(kwargs): with pytest.raises(ValueError, match=re.escape(err_message)): Timedelta(**kwargs) + with pytest.raises(ValueError, match=re.escape(err_message)): + # GH#53801 'unit' misspelled as 'units' + Timedelta(1, units="hours") + class TestArrayToTimedelta64: def test_array_to_timedelta64_string_with_unit_2d_raises(self): diff --git a/pandas/util/_tester.py b/pandas/util/_tester.py index c0e9756372f47..f455e06dde8bb 100644 --- a/pandas/util/_tester.py +++ b/pandas/util/_tester.py @@ -12,7 +12,7 @@ PKG = os.path.dirname(os.path.dirname(__file__)) -def test(extra_args: list[str] | None = None, run_doctests: bool = False) -> None: +def test(extra_args: list[str] | None = None, run_doctests: bool = False) -> None: # noqa: PT028 """ Run the pandas test suite using pytest. 
+        expected = Series(
+            [
+                3,
+                Decimal("2.166666666666666666666666667"),
+                Decimal("0.8498365855987974716713706849"),
+                Decimal("1"),
+                Decimal("3"),
+            ],
+            index=["count", "mean", "std", "min", "max"],
+            dtype="object",
+        )
+
+        result = s.describe(percentiles=[])
+        tm.assert_series_equal(result, expected)
+
     def test_describe_with_tz(self, tz_naive_fixture):
         # GH 21332
         tz = tz_naive_fixture
@@ -167,7 +191,7 @@ def test_numeric_result_dtype(self, any_numeric_dtype):
         dtype = "complex128" if is_complex_dtype(any_numeric_dtype) else None
 
         ser = Series([0, 1], dtype=any_numeric_dtype)
-        if dtype == "complex128" and np_version_gte1p25:
+        if dtype == "complex128":
             with pytest.raises(
                 TypeError, match=r"^a must be an array of real numbers$"
             ):
diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py
index e7d284bd47e21..35a9742d653db 100644
--- a/pandas/tests/series/test_arithmetic.py
+++ b/pandas/tests/series/test_arithmetic.py
@@ -10,7 +10,6 @@
 import pytest
 
 from pandas._libs import lib
-from pandas._libs.tslibs import IncompatibleFrequency
 
 import pandas as pd
 from pandas import (
@@ -172,10 +171,6 @@ def test_add_series_with_period_index(self):
         result = ts + _permute(ts[::2])
         tm.assert_series_equal(result, expected)
 
-        msg = "Input has different freq=D from Period\\(freq=Y-DEC\\)"
-        with pytest.raises(IncompatibleFrequency, match=msg):
-            ts + ts.asfreq("D", how="end")
-
     @pytest.mark.parametrize(
         "target_add,input_value,expected_value",
         [
diff --git a/pandas/tests/tslibs/test_timedeltas.py b/pandas/tests/tslibs/test_timedeltas.py
index 4784a6d0d600d..8e27acdd7af75 100644
--- a/pandas/tests/tslibs/test_timedeltas.py
+++ b/pandas/tests/tslibs/test_timedeltas.py
@@ -104,6 +104,10 @@ def test_kwarg_assertion(kwargs):
     with pytest.raises(ValueError, match=re.escape(err_message)):
         Timedelta(**kwargs)
 
+    with pytest.raises(ValueError, match=re.escape(err_message)):
+        # GH#53801 'unit' misspelled as 'units'
+        Timedelta(1, units="hours")
+
 
 class TestArrayToTimedelta64:
     def test_array_to_timedelta64_string_with_unit_2d_raises(self):
diff --git a/pandas/util/_tester.py b/pandas/util/_tester.py
index c0e9756372f47..f455e06dde8bb 100644
--- a/pandas/util/_tester.py
+++ b/pandas/util/_tester.py
@@ -12,7 +12,7 @@
 
 PKG = os.path.dirname(os.path.dirname(__file__))
 
 
-def test(extra_args: list[str] | None = None, run_doctests: bool = False) -> None:
+def test(extra_args: list[str] | None = None, run_doctests: bool = False) -> None:  # noqa: PT028
     """
     Run the pandas test suite using pytest.
diff --git a/pyproject.toml b/pyproject.toml
index b17a1eacfa717..e013222f8fe79 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,10 +27,9 @@ authors = [
 license = {file = 'LICENSE'}
 requires-python = '>=3.10'
 dependencies = [
-  "numpy>=1.23.5; python_version<'3.12'",
-  "numpy>=1.26.0; python_version>='3.12'",
+  "numpy>=1.26.0",
   "python-dateutil>=2.8.2",
-  "tzdata>=2022.7"
+  "tzdata>=2023.3"
 ]
 classifiers = [
   'Development Status :: 5 - Production/Stable',
@@ -59,15 +58,15 @@ matplotlib = "pandas:plotting._matplotlib"
 
 [project.optional-dependencies]
 test = ['hypothesis>=6.84.0', 'pytest>=7.3.2', 'pytest-xdist>=3.4.0']
-pyarrow = ['pyarrow>=10.0.1']
+pyarrow = ['pyarrow>=12.0.1']
 performance = ['bottleneck>=1.3.6', 'numba>=0.59.0', 'numexpr>=2.9.0']
 computation = ['scipy>=1.12.0', 'xarray>=2024.1.1']
 fss = ['fsspec>=2023.12.2']
 aws = ['s3fs>=2023.12.2']
 gcp = ['gcsfs>=2023.12.2']
 excel = ['odfpy>=1.4.1', 'openpyxl>=3.1.2', 'python-calamine>=0.1.7', 'pyxlsb>=1.0.10', 'xlrd>=2.0.1', 'xlsxwriter>=3.2.0']
-parquet = ['pyarrow>=10.0.1']
-feather = ['pyarrow>=10.0.1']
+parquet = ['pyarrow>=12.0.1']
+feather = ['pyarrow>=12.0.1']
 iceberg = ['pyiceberg>=0.7.1']
 hdf5 = ['tables>=3.8.0']
 spss = ['pyreadstat>=1.2.6']
@@ -98,7 +97,7 @@ all = ['adbc-driver-postgresql>=0.10.0',
        'odfpy>=1.4.1',
        'openpyxl>=3.1.2',
        'psycopg2>=2.9.6',
-       'pyarrow>=10.0.1',
+       'pyarrow>=12.0.1',
        'pyiceberg>=0.7.1',
        'pymysql>=1.1.0',
        'PyQt5>=5.15.9',
@@ -318,6 +317,8 @@ ignore = [
   "ISC001",
   # if-stmt-min-max
   "PLR1730",
+  # nan-comparison
+  "PLW0177",
 
   ### TODO: Enable gradually
   # Useless statement
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 64a9ecdacfb45..3e2e637927389 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -32,7 +32,7 @@ numexpr>=2.9.0
 openpyxl>=3.1.2
 odfpy>=1.4.1
 psycopg2-binary>=2.9.6
-pyarrow>=10.0.1
+pyarrow>=12.0.1
 pyiceberg>=0.7.1
 pymysql>=1.1.0
 pyreadstat>=1.2.6
@@ -86,4 +86,4 @@ jupyterlite-pyodide-kernel
 adbc-driver-postgresql>=0.10.0
 adbc-driver-sqlite>=0.8.0
 typing_extensions; python_version<"3.11"
-tzdata>=2022.7
+tzdata>=2023.3
diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py
index a57876902ad36..68cda68e26001 100755
--- a/scripts/generate_pip_deps_from_conda.py
+++ b/scripts/generate_pip_deps_from_conda.py
@@ -24,7 +24,7 @@
 import yaml
 
 EXCLUDE = {"python", "c-compiler", "cxx-compiler"}
-REMAP_VERSION = {"tzdata": "2022.7"}
+REMAP_VERSION = {"tzdata": "2023.3"}
 CONDA_TO_PIP = {
     "versioneer": "versioneer[toml]",
     "meson": "meson[ninja]",
diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml
index 21c269f573b3d..e6135ca088772 100644
--- a/scripts/tests/data/deps_minimum.toml
+++ b/scripts/tests/data/deps_minimum.toml
@@ -55,7 +55,7 @@ matplotlib = "pandas:plotting._matplotlib"
 [project.optional-dependencies]
 test = ['hypothesis>=6.34.2', 'pytest>=7.3.2', 'pytest-xdist>=3.4.0']
 performance = ['bottleneck>=1.3.2', 'numba>=0.53.1', 'numexpr>=2.7.1']
-timezone = ['tzdata>=2022.1']
+timezone = ['tzdata>=2023.3']
 computation = ['scipy>=1.7.1', 'xarray>=0.21.0']
 fss = ['fsspec>=2021.07.0']
 aws = ['s3fs>=2021.08.0']
@@ -103,7 +103,7 @@ all = ['beautifulsoup4>=5.9.3',
        'SQLAlchemy>=1.4.16',
        'tables>=3.6.1',
        'tabulate>=0.8.9',
-       'tzdata>=2022.1',
+       'tzdata>=2023.3',
        'xarray>=0.21.0',
        'xlrd>=2.0.1',
        'xlsxwriter>=1.4.3',
diff --git a/scripts/validate_min_versions_in_sync.py b/scripts/validate_min_versions_in_sync.py
index 7908aaef3d890..a45791f6c05cd 100755
--- a/scripts/validate_min_versions_in_sync.py
+++ b/scripts/validate_min_versions_in_sync.py
@@ -37,7 +37,6 @@
 YAML_PATH = pathlib.Path("ci/deps")
 ENV_PATH = pathlib.Path("environment.yml")
 EXCLUDE_DEPS = {"tzdata", "pyqt", "pyqt5"}
-EXCLUSION_LIST = frozenset(["python=3.8[build=*_pypy]"])
 # pandas package is not available
 # in pre-commit environment
 sys.path.append("pandas/compat")
@@ -111,7 +110,6 @@ def get_yaml_map_from(
     for dependency in yaml_dic:
         if (
             isinstance(dependency, dict)
-            or dependency in EXCLUSION_LIST
             or dependency in yaml_map
         ):
             continue
@@ -124,11 +122,6 @@ def get_yaml_map_from(
             yaml_package, yaml_version2 = yaml_dependency.split(operator)
             yaml_version2 = operator + yaml_version2
             yaml_map[yaml_package] = [yaml_version1, yaml_version2]
-        elif "[build=*_pypy]" in dependency:
-            search_text = search_text.replace("[build=*_pypy]", "")
-            yaml_package, yaml_version = search_text.split(operator)
-            yaml_version = operator + yaml_version
-            yaml_map[yaml_package] = [yaml_version]
         elif operator is not None:
             yaml_package, yaml_version = search_text.split(operator)
             yaml_version = operator + yaml_version
@@ -164,8 +157,6 @@ def pin_min_versions_to_yaml_file(
 ) -> str:
     data = yaml_file_data
     for yaml_package, yaml_versions in yaml_map.items():
-        if yaml_package in EXCLUSION_LIST:
-            continue
         old_dep = yaml_package
         if yaml_versions is not None:
             old_dep = old_dep + ", ".join(yaml_versions)
diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py
index d804e15f6d48f..8475747a80367 100755
--- a/scripts/validate_unwanted_patterns.py
+++ b/scripts/validate_unwanted_patterns.py
@@ -53,6 +53,7 @@
     "_get_option",
     "_fill_limit_area_1d",
     "_make_block",
+    "_DatetimeTZBlock",
 }
 
 
diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
index 78c239ac4f690..752b7b89c799b 100644
--- a/web/pandas/community/ecosystem.md
+++ b/web/pandas/community/ecosystem.md
@@ -141,7 +141,7 @@ pd.set_option("plotting.backend", "hvplot")
 
 [Plotly's](https://plot.ly/) [Python API](https://plot.ly/python/)
 enables interactive figures and web shareability. Maps, 2D, 3D, and
-live-streaming graphs are rendered with WebGL and
+live-streaming graphs are rendered with [WebGL](https://www.khronos.org/webgl/) and
 [D3.js](https://d3js.org/). The library supports plotting directly
 from a pandas DataFrame and cloud-based collaboration. Users of
 [matplotlib, ggplot for Python, and